In [None]:
!pip3 install unstructured
!pip3 install langchain sentence-transformers chromadb langchainhub

!pip3 -q install git+https://github.com/huggingface/transformers # need to install from github
!pip3 -q install bitsandbytes accelerate xformers einops

!pip3 install faiss-gpu

In [None]:
import re
import os
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline

from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter

from langchain_community.document_loaders import DirectoryLoader

from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

from langchain.chains.question_answering import load_qa_chain

from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

In [None]:
# Hugging Face Hub id of the instruction-tuned model used throughout the notebook.
model_id = "mistralai/Mistral-7B-Instruct-v0.1"

# Quantization settings handed to `from_pretrained` when the model is loaded:
# 4-bit loading with the "nf4" quant type, double quantization enabled, and
# bfloat16 as the compute dtype.
_quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = transformers.BitsAndBytesConfig(**_quantization_options)

In [None]:
# Read the Hugging Face access token from the environment instead of pasting it
# into the notebook. When neither variable is set (or it is empty) `None` is
# passed, which lets the library fall back to its own credential lookup.
hf_token = (
    os.environ.get("HF_TOKEN")
    or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
    or None
)

# Load the quantized causal LM. `token=` replaces the deprecated
# `use_auth_token=` keyword.
# NOTE(review): trust_remote_code=True is kept from the original cell; confirm
# it is actually required for this checkpoint.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map='auto',
    token=hf_token,
)

# Matching tokenizer for the same checkpoint.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id, token=hf_token
)

In [None]:
# Baseline check: ask the bare model (no retrieval) the demo question.
# The prompt is written out by hand and already starts with the <s> token.
text = "<s>[INST]Does AXE help against body odor?[/INST]"

# add_special_tokens=False because the special tokens are already in `text`.
encodeds = tokenizer(text, return_tensors="pt", add_special_tokens=False)
# The model is loaded with device_map='auto', so move the input tensors to the
# model's device rather than leaving them where the tokenizer created them.
model_inputs = encodeds.to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=1000,
    do_sample=True,
    # Set explicitly so generate() does not have to fall back to eos_token_id
    # on its own and warn about it.
    pad_token_id=tokenizer.eos_token_id,
)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST]Does AXE help against body odor?[/INST] No, AXE doesn't specifically help against body odor. It's a cologne or perfume designed to give the user a certain scent. Body odor is often caused by sweat and bacteria on the skin surface, which is why people commonly use deodorants or antiperspirants to help control it.</s>


In [None]:
# Load every file in the `transcripts` directory that matches *.txt.
loader = DirectoryLoader(path="transcripts", glob="*.txt")
documents = loader.load()

# Show how many documents were loaded.
len(documents)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


146

In [None]:
# Chunking parameters: a chunk size of 512 with an overlap of one tenth of it.
chunk_size = 512
chunk_overlap = chunk_size // 10

# Split the loaded documents into chunks.
text_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
docs = text_splitter.split_documents(documents)

# Open-source sentence-transformers embedding function.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Index the chunks in a Chroma vector store.
# NOTE(review): `db` is not referenced by any later cell visible in this
# notebook (retrieval goes through the FAISS index) - confirm it is still needed.
db = Chroma.from_documents(documents=docs, embedding=embedding_function)

In [None]:
# Do not paste a real token into the notebook: export HUGGINGFACEHUB_API_TOKEN
# before starting the kernel. setdefault() keeps a token that is already
# present in the environment instead of overwriting it with an empty string.
os.environ.setdefault("HUGGINGFACEHUB_API_TOKEN", '')

In [None]:
# Name of the embedding model used for the FAISS knowledge base.
EMBEDDING_MODEL_NAME = "thenlper/gte-small"

# Embeddings are normalized so that cosine similarity can be used for search.
_encode_options = {"normalize_embeddings": True}

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    encode_kwargs=_encode_options,
    multi_process=True,
)

# Build the FAISS index over the document chunks, using the cosine distance
# strategy.
KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    documents=docs,
    embedding=embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)

modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/68.1k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/66.7M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# First user turn: persona, answering rules, and the {context} placeholder.
# The text is kept exactly as it is sent to the model, including the line break
# and the eight spaces that precede {context}.
_instructions_turn = (
    'You will be speaking from Luke Maxfield and Muneeb Shah, two famous dermatologists. '
    'Using the information from their videos contained in the context, give a comprehensive answer to the question. '
    'Respond only to the question asked, response should be concise and relevant to the question. '
    'Provide the number of the source document when relevant. '
    'Start your answer with the following phrase: "Hi! We are Doctorly." '
    'If the answer cannot be deduced from the context, do not give an answer. '
    '"Context:\n'
    '        {context}'
)

# Three-turn conversation: instructions + context, a canned assistant
# acknowledgement, then the user's actual question.
prompt_in_chat_format = [
    {"role": "user", "content": _instructions_turn},
    {"role": "assistant", "content": "Got it!"},
    {"role": "user", "content": "{question}"},
]

# Render the conversation through the tokenizer's chat template as plain text
# (not token ids); {context} and {question} are filled in later via str.format.
RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    prompt_in_chat_format,
    tokenize=False,
    add_generation_prompt=True,
)

In [None]:
# Generation settings for the RAG answers: sampling is enabled with a very low
# temperature, a mild repetition penalty is applied, and at most 300 new tokens
# are produced. return_full_text=False requests only the generated continuation.
_generation_options = {
    "do_sample": True,
    "temperature": 0.01,
    "repetition_penalty": 1.1,
    "return_full_text": False,
    "max_new_tokens": 300,
}

# Text-generation pipeline wrapping the already loaded model and tokenizer.
llm_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    **_generation_options,
)

In [None]:
def answer_with_RAG(user_query, num_docs=5):
    """Answer a question using documents retrieved from the FAISS knowledge base.

    Args:
        user_query: The question to answer, as a string.
        num_docs: How many documents to retrieve from
            KNOWLEDGE_VECTOR_DATABASE and place in the prompt. Defaults to 5.

    Returns:
        The ``generated_text`` of the first result returned by ``llm_pipeline``.
    """
    retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(
        query=user_query, k=num_docs
    )
    # Only the text of each retrieved document goes into the prompt.
    retrieved_docs_text = [doc.page_content for doc in retrieved_docs]

    # Number the documents (starting at 0) and put each on its own line; joining
    # with "\n" keeps one document's text from running straight into the next
    # "Document N:::" header.
    context = "\nExtracted documents:\n"
    context += "\n".join(
        f"Document {i}:::\n{text}" for i, text in enumerate(retrieved_docs_text)
    )

    final_prompt = RAG_PROMPT_TEMPLATE.format(question=user_query, context=context)

    # Generate the answer from the filled-in prompt.
    answer = llm_pipeline(final_prompt)[0]["generated_text"]
    return answer

In [None]:
# Demo question, the same one asked of the bare model earlier in the notebook.
user_query = 'Does AXE help against body odor?'

In [None]:
# Run the retrieval-augmented pipeline on the demo question.
answer = answer_with_RAG(user_query)
# Bare last expression so the notebook displays the generated answer.
answer

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


' Hi! We are Doctorly.\n\nAccording to Dr. Luke Maxfield and Dr. Muneeb Shah, AXE is a traditional deodorant spray that has no antiperspirant benefits. It works by masking the smell of body odor with a stronger scent. However, it does not actually kill bacteria or deodorize the skin. Instead, it simply covers up the odor with a fragrance.\n\nIf you are looking for a product that can actually help against body odor, it is recommended to use cleansers with antimicrobial agents such as 4% benzoyl peroxide or a chlorhexidine cleanser at 2%. These products can help to decolonize the bacteria on the skin and reduce body odor.\n\nIt is important to note that while these products may be effective in reducing body odor, they should not be used as a substitute for proper hygiene practices such as regular bathing and cleaning the affected areas. Additionally, it is important to consider individual sensitivity to certain ingredients and to choose products that are appropriate for your skin type.'