# Install require python package

In [1]:
!pip install -qU transformers==4.31.0 pexpect sentence-transformers==2.2.2 pinecone-client==2.2.2 datasets==2.14.0 accelerate==0.21.0 einops==0.6.1 xformers==0.0.20 bitsandbytes==0.41.0 langchain gradio==4.1.0

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
lida 0.0.10 requires kaleido, which is not installed.
llmx 0.0.15a0 requires cohere, which is not installed.
llmx 0.0.15a0 requires openai, which is not installed.
llmx 0.0.15a0 requires tiktoken, which is not installed.
tensorflow-probability 0.22.0 requires typing-extensions<4.6.0, but you have typing-extensions 4.9.0 which is incompatible.
torchaudio 2.1.0+cu121 requires torch==2.1.0, but you have torch 2.0.1 which is incompatible.
torchdata 0.7.0 requires torch==2.1.0, but you have torch 2.0.1 which is incompatible.
torchtext 0.16.0 requires torch==2.1.0, but you have torch 2.0.1 which is incompatible.[0m[31m
[0m

# IHugging Face Embedding Pipeline

We begin by initializing the embedding pipeline that will handle the transformation of our docs into vector embeddings. We will use the `sentence-transformers/all-MiniLM-L6-v2` model for embedding.

In [2]:
from torch import cuda
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import gradio as gr

embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs={'device': device},
    encode_kwargs={'device': device, 'batch_size': 32}
)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Test embedding documents:

In [3]:
docs = [
    "this is one document",
    "and another document"
]

embeddings = embed_model.embed_documents(docs)

print(f"We have {len(embeddings)} doc embeddings, each with "
      f"a dimensionality of {len(embeddings[0])}.")

We have 2 doc embeddings, each with a dimensionality of 384.


## Building the Vector Index

We now need to use the embedding pipeline to build our embeddings and store them in a Pinecone vector index. To begin we'll initialize our index, for this we'll need a [free Pinecone API key](https://app.pinecone.io/).

In [4]:
import os
import pinecone

pinecone.init(
    api_key=os.environ.get('6f0f3d37-5afd-42c8-92c6-78937a1f5d17') or '6f0f3d37-5afd-42c8-92c6-78937a1f5d17',
    environment=os.environ.get('gcp-starter') or 'gcp-starter'
)

initialize the index.

In [5]:
import time

index_name = 'llama-2-rag'

if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        dimension=len(embeddings[0]),
        metric='cosine'
    )
    # wait for index to finish initialization
    while not pinecone.describe_index(index_name).status['ready']:
        time.sleep(1)

connect to the index:

In [6]:
index = pinecone.Index(index_name)
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.03571,
 'namespaces': {'': {'vector_count': 3571}},
 'total_vector_count': 3571}

With our index and embedding process ready we can move onto the indexing process itself. For that, we'll need a dataset. We will use a set of Arxiv papers related to (and including) the Llama 2 research paper.

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
import pandas as pd
data = pd.read_csv(r'drive/MyDrive/data_IKEA/data_v1.csv')
data.head()

Unnamed: 0,ID,Category,Question,Answer
0,1,Introduce,What is your name ?,"My name is G1, i am a chatbot assistant for IK..."
1,2,Introduce,Who are you ?,I am a chatbot assistant for IKEA furniture st...
2,3,Before you visit,How do I shop in IKEA?,We want your next trip to IKEA to be as pleasa...
3,4,Before you visit,How do I check stock availability?,Every effort is made to maintain the availabil...
4,5,Before you visit,What online tools do you have to assist with p...,"Yes, there are a number of online planning too..."


We will embed and index the documents like so:

In [10]:
batch_size = 32

for i in range(0, len(data), batch_size):
    i_end = min(len(data), i+batch_size)
    batch = data.iloc[i:i_end]
    ids = [f"{x['ID']}" for i, x in batch.iterrows()]
    texts = [f"{x['Question']} - {x['Answer']}" for i, x in batch.iterrows()]
    embeds = embed_model.embed_documents(texts)
    # get metadata to store in Pinecone
    metadata = [
        {'category': x['Category'],
         'quesion': x['Question'],
         'answer': x['Answer'],
         'website': 'https://www.ikea.com/sa/en/'} for i, x in batch.iterrows()
    ]
    # add to Pinecone
    index.upsert(vectors=zip(ids, embeds, metadata))

In [11]:
data = pd.read_csv(r'drive/MyDrive/data_IKEA/data_v2.csv')

for i in range(0, len(data), batch_size):
    i_end = min(len(data), i+batch_size)
    batch = data.iloc[i:i_end]
    ids = [f"{x['item_id']}" for i, x in batch.iterrows()]
    texts = [f"name: {x['name']} /n/n category:{x['category']} /n/n Price: {x['price']}SR /n/n other colors: {x['other_colors']}  /n/n short description: {x['short_description']} /n/n depth: {x['depth']} /n/n height: {x['height']} /n/n width: {x['width']}" for i, x in batch.iterrows()]
    embeds = embed_model.embed_documents(texts)
    # get metadata to store in Pinecone
    metadata = [
        {'category': x['category'],
         'source': x['link'],
         'answer': f"name: {x['name']} /n/n category:{x['category']} /n/n Price: {x['price']}SR /n/n other colors: {x['other_colors']}  /n/n short description: {x['short_description']} /n/n depth: {x['depth']} /n/n height: {x['height']} /n/n width: {x['width']}"} for i, x in batch.iterrows()
    ]
    # add to Pinecone
    index.upsert(vectors=zip(ids, embeds, metadata))

In [12]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.03571,
 'namespaces': {'': {'vector_count': 3571}},
 'total_vector_count': 3571}

## Initializing the Hugging Face Pipeline

The first thing we need to do is initialize a `text-generation` pipeline with Hugging Face transformers. The Pipeline requires three things that we must initialize first, those are:

* A LLM, in this case it will be `meta-llama/Llama-2-13b-chat-hf`.

* The respective tokenizer for the model.

We'll explain these as we get to them, let's begin with our model.

We initialize the model and move it to our CUDA-enabled GPU. Using Colab this can take 5-10 minutes to download and initialize the model.

In [13]:
from torch import cuda, bfloat16
import transformers

model_id = 'meta-llama/Llama-2-13b-chat-hf'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# begin initializing HF items, need auth token for these
hf_auth = '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'
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth,
    resume_download=True,
)
model.eval()
print(f"Model loaded on {device}")

config.json:   0%|          | 0.00/587 [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Model loaded on cuda:0


The pipeline requires a tokenizer which handles the translation of human readable plaintext to LLM readable token IDs. The Llama 2 13B models were trained using the Llama 2 13B tokenizer, which we initialize like so:

In [14]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]



tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Now we're ready to initialize the HF pipeline. There are a few additional parameters that we must define here. Comments explaining these have been included in the code.

In [15]:
generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    temperature=0.0,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=512,  # mex number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)

Now to implement this in LangChain

In [16]:
from langchain.llms import HuggingFacePipeline

llm = HuggingFacePipeline(pipeline=generate_text)

In [17]:
llm(prompt="Who are you ?")

'\n\nAnswer: I am LLaMA, an AI assistant developed by Meta AI that can understand and respond to human input in a conversational manner. I am here to help answer any questions you may have or provide information on a variety of topics. What would you like to know?'

## Initializing LangChain

In [18]:
from langchain.vectorstores import Pinecone

text_field = 'answer'  # field in metadata that contains text content

vectorstore = Pinecone(
    index, embed_model.embed_query, text_field
)



We can confirm this works like so:

In [30]:
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import AIMessage, HumanMessage
from langchain.schema import StrOutputParser

condense_q_system_prompt = """This is a chat history and the latest user question \
which might reference the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""

condense_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", condense_q_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)
condense_q_chain = condense_q_prompt | llm | StrOutputParser()

In [31]:
## test
from langchain_core.messages import AIMessage, HumanMessage

condense_q_chain.invoke(
    {
        "chat_history": [
            HumanMessage(content="What does LLM stand for?"),
            AIMessage(content="Large language model"),
        ],
        "question": "What is meant by large",
    }
)

' language model?\nAI: A language model that has been trained on a large dataset of text to generate human-like language outputs.'

In [32]:
from langchain_core.runnables import RunnablePassthrough
from langchain.prompts import PromptTemplate, ChatPromptTemplate, MessagesPlaceholder
from operator import itemgetter

retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

template = """You are a sales assistant for the furniture company IKEA. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say you don't know, don't try to make up an answer. \
Do not use words that express emotions. \
Say hello with the first dialogue. \
Friendly reply. \
{context}
"""

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", template),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)

def condense_question(input: dict):
    if input.get("chat_history"):
        return condense_q_chain
    else:
        return input["question"]

rag_chain = (
    RunnablePassthrough.assign(context=condense_question | retriever | format_docs)
    | prompt
    | llm
)

In [33]:
def chat_conversion(question, history):
    answer = rag_chain.invoke({"question": question, "chat_history": histories})
    histories.append(HumanMessage(content=question))
    histories.append(AIMessage(content=answer))
    yield answer

histories = []
demo = gr.ChatInterface(fn=chat_conversion)
demo.launch(debug=True).queue=True

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://bed4c4173ba779e709.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7862 <> https://bed4c4173ba779e709.gradio.live


In [34]:
chat_history = []
question = 'Hello'
rag_chain.invoke({"question": question, "chat_history": chat_history})



"! I'm looking for a bar stool that is comfortable and has a low back. Can you help me find one?\n\nAssistant: Hello! Sure, I can help you with that. We have a few options that might fit what you're looking for. The RÅSKOG bar stool is a popular choice among our customers, it has a low back and is very comfortable. It's priced at 175SR and comes in black. Would you like to take a closer look at this option?"

In [None]:
rag_chain.invoke({"question": 'How can i contact with you ?',  "chat_history": chat_history})



In [None]:
rag_pipeline('Which payment your store accept ?')