# RAG

## Requirements

In [1]:
%%capture
!pip install transformers accelerate bitsandbytes langchain langchain-community sentence-transformers faiss-gpu pandas gdown

## Dataset

In [2]:
!gdown --fuzzy https://drive.google.com/file/d/1Lq2zVJlN_B4kUAu4VafQ4jXMIQiAR9vI/view?usp=sharing

## Config

In [3]:
class Config:
    """Tunable settings shared by the cells of this notebook."""

    # Sentence-embedding model used to vectorize the documents.
    EMBEDDING_MODEL_NAME = "thenlper/gte-base"

    # Causal language model loaded for text generation.
    LLM_MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"

    # Top-K: number of documents to retrieve per query.
    K = 5

## Packages

In [4]:
!pip install -U langchain_huggingface

In [5]:
import nltk
import pandas as pd
import os
import re
from nltk.stem import WordNetLemmatizer
import subprocess
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
import pickle
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.vectorstores.utils import DistanceStrategy
from langchain.vectorstores.faiss import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import pipeline
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.chains.combine_documents import create_stuff_documents_chain

In [6]:
# Fetch the NLTK resources needed by the preprocessing helpers.
# Recent NLTK releases load the 'punkt_tab' tables in word_tokenize, so that
# resource is downloaded as well; 'punkt' is kept for older releases.
for resource in ('wordnet', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

In [7]:
import nltk
import subprocess

# Fallback for environments where the wordnet archive is not found on NLTK's
# search path (the paths suggest this targets Kaggle — confirm before reusing
# elsewhere): download it to the working directory, unpack it manually, and
# register that directory with NLTK.
try:
    nltk.data.find('wordnet.zip')
except LookupError:
    # nltk.data.find signals a missing resource with LookupError; catching only
    # that keeps unrelated failures (e.g. KeyboardInterrupt) visible.
    nltk.download('wordnet', download_dir='/kaggle/working/')
    # -o overwrites existing files without prompting, so re-running this cell
    # does not stall on unzip's interactive "replace?" question.
    subprocess.run(
        ["unzip", "-o", "/kaggle/working/corpora/wordnet.zip", "-d", "/kaggle/working/corpora"]
    )
    nltk.data.path.append('/kaggle/working/')

from nltk.corpus import wordnet

## Preprocessing

In [8]:
# Read the crawled IMDB records and keep the first 1000 rows as the working set.
# (Assign `DF = df` instead to work on the whole dataset.)
df = pd.read_json('IMDB_crawled.json')
DF = df.iloc[:1000]

df.head()

In [9]:
# Make sure the output directory for the generated files exists.
os.makedirs('data', exist_ok=True)

# Preprocess the data and store only the fields that are needed, because the
# embedding model's context window is limited.
# The preprocessing helpers below are reused from the project ("proj").

# Lemmatizer instance shared by the helpers defined in this cell.
lemmatizer = WordNetLemmatizer()
# English stop-word set, built once when the cell runs.
stop_words = set(stopwords.words('english'))

def remove_links(text: str):
    """Delete URL-like and e-mail-like tokens from ``text``.

    Each pattern is applied in turn and every match is replaced by the empty
    string; surrounding whitespace is left untouched.
    """
    link_patterns = (
        r'\S*http\S*',
        r'\S*www\S*',
        r'\S+\.ir\S*',
        r'\S+\.com\S*',
        r'\S+\.org\S*',
        r'\S*@\S*',
    )
    cleaned = text
    for pattern in link_patterns:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned

def remove_punctuations(text: str):
    """Return ``text`` with every character that is neither a word character nor whitespace removed."""
    non_word_non_space = re.compile(r'[^\w\s]')
    return non_word_non_space.sub('', text)

def normalize(text: str):
    """Tokenize ``text``, lowercase and lemmatize each token, and re-join with single spaces."""
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token.lower()))
    return " ".join(lemmas)

def remove_stopwords(text: str):
    """Drop English stop words from ``text``.

    The text is split with ``word_tokenize``; tokens that are members of the
    module-level ``stop_words`` set are discarded (exact, case-sensitive
    membership test) and the remaining tokens are joined with single spaces.
    """
    # Reuse the stop-word set built once at module level instead of rebuilding
    # it from the corpus for every single token.
    return " ".join(word for word in word_tokenize(text) if word not in stop_words)

def remove_characters(text: str):
    """Collapse whitespace, blank out non-word characters and drop lone lowercase letters.

    The substitutions run in order, each replacing its matches with one space:
    whitespace runs, non-word characters, a single ``a``-``z`` letter surrounded
    by whitespace, and a single ``a``-``z`` letter at the start of the string.
    """
    ordered_patterns = (
        r'\s+',
        r'\W',
        r'\s+[a-z]\s+',
        r'^[a-z]\s+',
    )
    for pattern in ordered_patterns:
        text = re.sub(pattern, ' ', text)
    return text

def preprocess_text(text: str):
    """Lowercase ``text``, drop its stop words, then lemmatize what is left.

    Link, punctuation and stray-character removal exist as separate helpers
    but are not applied in this pipeline.
    """
    lowered = text.lower()
    without_stopwords = remove_stopwords(lowered)
    return normalize(without_stopwords)

# Keep the columns of interest, drop incomplete rows and clean the summaries.
DF = (
    DF[["id", "title", "first_page_summary", "genres"]]
    .dropna()
    .assign(
        first_page_summary=lambda frame: frame["first_page_summary"].apply(preprocess_text)
    )
)

# Persist the cleaned rows; the vectorizer reads them back from this CSV.
DF.to_csv('data/imdb.csv', index=False)

# Narrow the frame to two columns for the preview below.
DF = DF[["first_page_summary", "genres"]]
DF.head()


## Vectorizer

Load the CSV file and vectorize its rows using HuggingFaceEmbeddings.
Store the results in a FAISS vectorstore.
Save the vectorstore to a pickle file for future use.

In [10]:
# load the csv
documents = CSVLoader("data/imdb.csv").load()

# load the embeddings model
#embeddings_model = HuggingFaceEmbeddings(model_name="thenlper/gte-base")
embeddings_model = HuggingFaceEmbeddings(model_name=Config.EMBEDDING_MODEL_NAME)

# save embed the documents using the model in a vectorstore
vectorstore = FAISS.from_documents(documents, embeddings_model, distance_strategy=DistanceStrategy.COSINE)


with open("data/vectorstore.pkl", "wb") as f:
     pickle.dump(vectorstore, f)

Load the vectorstore and expose it as a retriever.

In [11]:
# NOTE: pickle.load can execute arbitrary code embedded in the file. Only load
# a vectorstore pickle that this notebook itself produced.
with open("data/vectorstore.pkl", "rb") as f:
    vectorstore = pickle.load(f)

# Build the retriever from the vectorstore. The number of documents to return
# has to be passed through `search_kwargs`; a bare `k=` keyword is not a search
# setting of the retriever, so Config.K would otherwise not take effect.
retriever = vectorstore.as_retriever(search_kwargs={"k": Config.K})

## LLM

Load the quantized LLM.

In [12]:
# load the quantization config
bnb_config = BitsAndBytesConfig()

model = AutoModelForCausalLM.from_pretrained(Config.LLM_MODEL_NAME, quantization_config=bnb_config, device_map="cuda:0")
tokenizer = AutoTokenizer.from_pretrained(Config.LLM_MODEL_NAME)

# init the pipeline
READER_LLM = pipeline("text-generation", max_new_tokens=2000, model=model, tokenizer=tokenizer)

llm = HuggingFacePipeline(
    pipeline=READER_LLM,
)

Initialize the prompt template for the query chain. The query chain is used to derive a search query from the chat history. You may change the prompt as you like to get better results.

In [13]:
class LoggerStrOutputParser(StrOutputParser):
    """String output parser that also prints the text it is given.

    Note: the chains visible in this notebook are built with the plain
    ``StrOutputParser``, not with this class.
    """

    def parse(self, text: str) -> str:
        """Print ``text`` prefixed with ``QUERY: `` and return it unchanged."""
        # process the LLM output
        print(f"QUERY: {text}")
        return text

# Prompt asking the model to produce a search query for the conversation so far.
query_transform_prompt = PromptTemplate(
    template="""<|system|>You are a helpful assistant.
{messages}
<|user|>
give me the search query about the above conversation.
<|assistant|>""",
    input_variables=["messages"],
)

# Query chain: the chain input fills {messages}, the prompt is sent to the LLM
# and the result is returned as a plain string.
query_transforming_retriever_chain = (
    {"messages": RunnablePassthrough()}
    | query_transform_prompt
    | llm
    | StrOutputParser()
)

Initialize the main retrieval chain, which passes the retrieved documents to the LLM and returns its output.

In [14]:
# Answer prompt. It ends with the "|SEP|" marker so that callers can split the
# generated text on that marker.
prompt = PromptTemplate(
    template="""You are a helpful assistant.

Here are the movies you MUST choose from:

{context}
-----------------
{messages}
-----------------
Using above movies and user queries, generate a response containing most relevant movies to the user query.
your response doesnt need extra description just list them movies by mentioning title, genres and summary.
the queries can be related.
""" + "|SEP|",
    input_variables=["context", "messages"],
)

# Retrieval chain: the chain input is sent to the retriever (its result fills
# {context}) and is also passed through unchanged as {messages}.
retrieval_chain = (
    {"context": retriever, "messages": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

Write the conversation helper class for easier testing.

In [15]:
class Conversation:
    """Holds the chat history and runs each user turn through the two chains."""

    def __init__(self):
        # Chronological list of (role, text) tuples.
        self.messages = []

    def add_assistant_message(self, message):
        """Record ``message`` as an assistant turn."""
        entry = ('assistant', message)
        self.messages.append(entry)

    def add_user_message(self, message):
        """Record ``message`` as a user turn."""
        entry = ('user', message)
        self.messages.append(entry)

    def get_messages(self):
        """Return the history as newline-separated ``role: text`` lines."""
        lines = []
        for role, text in self.messages:
            lines.append(f"{role}: {text}")
        return "\n".join(lines)

    def chat(self, message):
        """Add a user turn, run the query and retrieval chains, and return the reply.

        Both chain outputs are split on ``|SEP|`` and only the last piece is
        kept. The reply is also stored in the history as an assistant turn.
        """
        self.add_user_message(message)
        history = self.get_messages()

        # Step 1: turn the conversation so far into a search query.
        query_output = query_transforming_retriever_chain.invoke(input=history)
        search_query = query_output.split("|SEP|")[-1]

        # Step 2: retrieve with that query and let the LLM write the answer.
        answer_output = retrieval_chain.invoke(search_query)
        reply = answer_output.split("|SEP|")[-1]

        self.add_assistant_message(reply)
        return reply

## Test

Talk with the RAG pipeline to see how well it performs.

In [16]:
# Start a fresh conversation and ask for a first recommendation.
c = Conversation()
A = c.chat('give me a cool gangster movie')
print(A)

In [17]:
# Follow-up turn on the same Conversation object, so the earlier exchange is
# part of the history that is sent to the query chain.
A = c.chat('give me a newer one')
print(A)