# Creating document embeddings and storing them in ChromaDB

In [6]:
#!pip install  openai langchain sentence_transformers chromadb unstructured -q

########################################################################

from langchain.document_loaders import DirectoryLoader

# Folder whose files are read by load_docs() further down in this cell.
directory = '/kaggle/input/document-chat'

def load_docs(directory):
    """Read the documents stored under ``directory``.

    Args:
        directory: Path handed to LangChain's ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader(directory).load()`` produces.
    """
    return DirectoryLoader(directory).load()

# Load the documents from `directory` and report how many were read.
documents = load_docs(directory)
print("Total Documnets : ",len(documents))

#########################################################################


from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_docs(documents,chunk_size=1000,chunk_overlap=20):
    """Break ``documents`` into smaller chunks for embedding.

    Args:
        documents: Documents to split, as returned by ``load_docs``.
        chunk_size: ``chunk_size`` forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` forwarded to the same splitter.

    Returns:
        The result of the splitter's ``split_documents`` call.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

# Split the loaded documents with the default chunk settings and report the chunk count.
docs = split_docs(documents)
print("Total Chunks Created : ", len(docs))

###########################################################################

from langchain.embeddings import SentenceTransformerEmbeddings
# Embedding function later handed to Chroma.from_documents; it is configured with the
# "all-MiniLM-L6-v2" sentence-transformers model name.
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

##########################################################################

from langchain.vectorstores import Chroma
# Directory name passed to Chroma as its persistence location.
persist_directory = "chroma_db"
# Embed the chunks in `docs` and build the vector store used for retrieval below.
# NOTE(review): re-running this cell against an existing "chroma_db" directory may insert
# the same chunks again, and some Chroma/LangChain versions only write to disk after an
# explicit db.persist() call — confirm against the installed versions.
db = Chroma.from_documents(docs, embeddings, persist_directory=persist_directory)

#########################################################################


# Quick retrieval sanity check: run one query against the store and print the first
# two results. similarity_search is called without `k`, so the library default applies.
query = "who is deepak tripathi"
matching_docs = db.similarity_search(query)

print(matching_docs[:2])

##########################################################################
import warnings
import re
# Suppresses every Python warning from this point on, which also hides deprecation
# notices emitted by the libraries used in this notebook.
warnings.filterwarnings('ignore')
def print_large(text, font_size=18):
    """Show ``text`` in the notebook as an HTML paragraph with an enlarged font.

    The text is treated as plain text: HTML-special characters (``<``, ``>``,
    ``&``, quotes) are escaped so that document content or model output that
    happens to contain markup is displayed literally instead of being
    interpreted by the browser. Newlines are rendered as ``<br/>``.

    Args:
        text: The string to display.
        font_size: CSS font size in pixels.
    """
    import html
    # `display` and `HTML` are published in IPython.display; importing them
    # from IPython.core.display is deprecated.
    from IPython.display import display, HTML

    # Escape first, then add the line-break tags, so the tags themselves survive.
    body = html.escape(text).replace('\n', '<br/>')
    display(HTML(f"<p style='font-size:{font_size}px'>{body}</p>"))
    
# Display the text of the first retrieved chunk; this line fails if the search returned nothing.
print_large(matching_docs[0].page_content)

Total Documnets :  3
Total Chunks Created :  158


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[Document(page_content="Name: Deepak Tripathi\n\nContact details.: +44 7823943370\n\nI am writing to express my interest in the Data Scientist KTP Associate role in University of Aberdeen. I hold a Master's degree in Data Science with distinction from University of Essex and I am confident in my ability to make significant contributions to the role.\n\nHaving completed my studies at the University of Essex, I am familiar with the work culture and the research environment. With a Master's degree in Computer Science completed in 2015, I have accumulated over six years of experience in the field of computer science. Most recently, I worked at QUALIF[AI] as a Machine Learning Engineer, where I had the opportunity to work on various projects, hone my skills and gain valuable experience.", metadata={'source': '/kaggle/input/document-chat/Tripathi_Deepak_CoverLetter.docx'}), Document(page_content="Name: Deepak Tripathi\n\nContact details.: +44 7823943370\n\nI am writing to express my interest

# Loading the Llama 2 model

In [7]:
import os
from getpass import getpass

from huggingface_hub import login

# Access tokens must not be stored in the notebook. Read the token from the
# HF_TOKEN environment variable, and fall back to an interactive prompt (input
# is not echoed) when the variable is unset or empty.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)

###############################################################

import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

##############################################################


# Hub repository shared by the tokenizer and the model weights.
MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"

# use_auth_token=True makes both calls authenticate with the stored Hugging Face token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_auth_token=True)

# Weights are loaded in float16 with automatic device placement.
# Quantised loading (load_in_8bit=True / load_in_4bit=True) is deliberately left off.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.float16,
    use_auth_token=True,
)

##############################################################


from transformers import pipeline

# Decoding options handed to the text-generation pipeline.
generation_options = {
    "max_new_tokens": 512,
    "do_sample": True,
    "top_k": 30,
    "num_return_sequences": 1,
    "eos_token_id": tokenizer.eos_token_id,
}

# Text-generation pipeline around the already-loaded model and tokenizer.
# (A torch_dtype=torch.bfloat16 override is intentionally not applied.)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    **generation_options,
)


########################################################################################


from langchain import HuggingFacePipeline
from langchain import PromptTemplate,  LLMChain

#########################################################################################

# Wrap the transformers pipeline so LangChain chains can use it as an LLM.
# NOTE(review): the pipeline is built with do_sample=True and top_k=30, which conflicts
# with the temperature of 0 requested here; it also looks as if model_kwargs is not
# forwarded to a pipeline that was constructed beforehand — confirm which settings apply.
llm = HuggingFacePipeline(pipeline = pipe, model_kwargs = {'temperature':0})


Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Downloading (…)okenizer_config.json:   0%|          | 0.00/770 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/635 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


# Designing the prompt for the question-answering system

In [11]:
##########################################################################################


import json
import textwrap


# Default system prompt: two paragraphs separated by one blank line.
DEFAULT_SYSTEM_PROMPT = "\n\n".join((
    "You are a helpful, respectful and honest assistant. "
    "Always answer as helpfully as possible, while being safe. "
    "Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. "
    "Please ensure that your responses are socially unbiased and positive in nature.",
    "If a question does not make any sense, or is not factually coherent, "
    "explain why instead of answering something not correct. "
    "If you don't know the answer to a question, please don't share false information.",
))

# Markers that delimit the instruction and the system prompt inside the final prompt.
B_INST = "[INST]"
E_INST = "[/INST]"
B_SYS = "<<SYS>>\n"
E_SYS = "\n<</SYS>>\n\n"


def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT ):
    """Assemble a complete prompt string from an instruction and a system prompt.

    Args:
        instruction: The user instruction; may contain ``{placeholders}`` that a
            prompt template fills in later.
        new_system_prompt: System prompt to embed; defaults to ``DEFAULT_SYSTEM_PROMPT``.

    Returns:
        ``B_INST + B_SYS + new_system_prompt + E_SYS + instruction + E_INST``.
    """
    pieces = (B_INST, B_SYS, new_system_prompt, E_SYS, instruction, E_INST)
    return "".join(pieces)

def parse_text(text):
    """Print ``text`` re-wrapped to lines of at most 100 characters.

    The wrapped text is followed by two extra blank lines. Nothing is returned.
    """
    print(textwrap.fill(text, width=100), end='\n\n\n')
        
################################################################################################

# Instruction text with two placeholders: {question} for the user's question and
# {text} for the retrieved context that the answer should be based on.
instruction = """Answer me {question} using following text.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
: {text}"""
# System prompt used in place of DEFAULT_SYSTEM_PROMPT for this question-answering task.
system_prompt = "You are a question answering system capable of giving answers from provided context"


# Wrap the instruction and system prompt in the [INST] / <<SYS>> markers and show the result.
template = get_prompt(instruction, system_prompt)
print(template)

# LangChain prompt template exposing the two placeholders as input variables.
prompt = PromptTemplate(template=template, input_variables=["question","text"])
print(prompt)

###################################################################################################

from langchain.memory import ConversationBufferMemory

# Conversation memory is created here but not attached: the `memory=` argument of the
# chain below is commented out, so the chain is built without it.
# NOTE(review): memory_key="text" is also the name of a prompt input variable — confirm
# that is intended before enabling the memory.
memory = ConversationBufferMemory(memory_key="text")

# Chain that combines the prompt template with the pipeline-backed LLM.
llm_chain = LLMChain(prompt=prompt, 
                     llm=llm, 
                     #memory=memory
                    )

####################################################################################################

[INST]<<SYS>>
You are a question answering system capable of giving answers from provided context
<</SYS>>

Answer me {question} using following text.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
: {text}[/INST]
input_variables=['question', 'text'] output_parser=None partial_variables={} template="[INST]<<SYS>>\nYou are a question answering system capable of giving answers from provided context\n<</SYS>>\n\nAnswer me {question} using following text.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n: {text}[/INST]" template_format='f-string' validate_template=True


# Questions

In [13]:
question = "what does Deepak Tripathi do?"
# Number of chunks to retrieve. similarity_search returns 4 documents unless `k` is
# given, so the count is passed explicitly instead of slicing the result afterwards.
# Four chunks of roughly 1000 characters keep the prompt comfortably small for the model.
top = 4
matching_docs = db.similarity_search(question, k=top)
# Join the retrieved chunks into one context string for the {text} placeholder.
context = " ".join(doc.page_content for doc in matching_docs)
output = llm_chain.predict(question=question, text=context)
print_large(output)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
question = "what main skills Deepak Tripathi has?"
# Number of chunks to retrieve. similarity_search returns 4 documents unless `k` is
# given, so the count is passed explicitly instead of slicing the result afterwards.
# Four chunks of roughly 1000 characters keep the prompt comfortably small for the model.
top = 4
matching_docs = db.similarity_search(question, k=top)
# Join the retrieved chunks into one context string for the {text} placeholder.
context = " ".join(doc.page_content for doc in matching_docs)
output = llm_chain.predict(question=question, text=context)
print_large(output)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [15]:
question = "what was the conclusion of the project Deepak Tripathi worked on?"
# Number of chunks to retrieve. similarity_search returns 4 documents unless `k` is
# given, so the count is passed explicitly instead of slicing the result afterwards.
# Four chunks of roughly 1000 characters keep the prompt comfortably small for the model.
top = 4
matching_docs = db.similarity_search(question, k=top)
# Join the retrieved chunks into one context string for the {text} placeholder.
context = " ".join(doc.page_content for doc in matching_docs)
output = llm_chain.predict(question=question, text=context)
print_large(output)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [16]:
question = "where is Deepak currently Living?"
# Number of chunks to retrieve. similarity_search returns 4 documents unless `k` is
# given, so the count is passed explicitly instead of slicing the result afterwards.
# Four chunks of roughly 1000 characters keep the prompt comfortably small for the model.
top = 4
matching_docs = db.similarity_search(question, k=top)
# Join the retrieved chunks into one context string for the {text} placeholder.
context = " ".join(doc.page_content for doc in matching_docs)
output = llm_chain.predict(question=question, text=context)
print_large(output)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [29]:
question = "who is Deepak Tripathi's academic supervisor"
# Number of chunks to retrieve. similarity_search returns 4 documents unless `k` is
# given, so the count is passed explicitly instead of slicing the result afterwards.
# Four chunks of roughly 1000 characters keep the prompt comfortably small for the model.
top = 4
matching_docs = db.similarity_search(question, k=top)
# Join the retrieved chunks into one context string for the {text} placeholder.
context = " ".join(doc.page_content for doc in matching_docs)
output = llm_chain.predict(question=question, text=context)
print_large(output)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]