In [None]:
# ! pip install -U langchain tiktoken pypdf faiss-gpu -q
# ! pip install -U transformers InstructorEmbedding sentence_transformers -q
# ! pip install -U accelerate bitsandbytes xformers einops -q
# !pip install auto-gptq optimum -q

import warnings

import os
import glob
import textwrap
import time

import langchain
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import PromptTemplate, LLMChain
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from InstructorEmbedding import INSTRUCTOR  
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA

import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Record the installed LangChain version in the cell output for reproducibility.
print('LangChain:', langchain.__version__)

In [None]:
class CFG:
    """Central configuration for the notebook: LLM, chunking, embeddings and retrieval.

    All values are class attributes, so they can be read either from the class
    (`CFG.k`) or from the `config` instance created below.
    """

    # LLMs
    # Hugging Face repo id used for both the tokenizer and the causal LM.
    model_id = 'codegood/Mistral_Latest'
    # Generation settings. These must be plain numbers: a trailing comma after
    # the value would silently turn it into a 1-tuple such as `(0,)`.
    temperature = 0
    top_p = 0.95
    repetition_penalty = 1.15

    # splitting
    # Passed as chunk_size / chunk_overlap to the text splitter.
    split_chunk_size = 800
    split_overlap = 0

    # embeddings
    embeddings_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

    # similar passages
    # Number of passages requested from the retriever per question.
    k = 1

config = CFG()

In [None]:
%%time

from transformers import BitsAndBytesConfig

# Tokenizer and model are both resolved from the same repo id in the config.
tokenizer = AutoTokenizer.from_pretrained(config.model_id, use_fast=True)

# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_use_double_quant= True,
)

# 4-bit loading is requested only through `quantization_config`. Passing
# `load_in_4bit=True` as a separate keyword together with a quantization
# config is rejected by transformers, so the duplicate keyword is not used.
model = AutoModelForCausalLM.from_pretrained(
    config.model_id,
    device_map='auto',
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    # low_cpu_mem_usage=True,
    # NOTE(review): trust_remote_code executes code shipped with the model
    # repo; confirm the repo is trusted.
    trust_remote_code=True
)

# Length limit handed to the text-generation pipeline in a later cell.
max_len = 256

In [None]:
# Text-generation pipeline wrapped as a LangChain LLM.
pipe = pipeline(
    task = "text-generation",
    model = model,
    tokenizer = tokenizer,
    pad_token_id = tokenizer.eos_token_id,
    # Budget only the *generated* tokens. `max_length` also counts the prompt,
    # and the RAG prompt (instructions + retrieved chunk + question) can use
    # up most of `max_len` by itself, leaving little or no room for an answer.
    max_new_tokens = max_len,
    # NOTE(review): `do_sample` is not set here; confirm whether temperature
    # and top_p are meant to take effect or decoding should stay greedy.
    temperature = 0.25,
    #temperature = CFG.temperature,
    top_p = 0.9,
    #top_p = CFG.top_p,
    repetition_penalty = CFG.repetition_penalty
)

llm = HuggingFacePipeline(pipeline = pipe, batch_size=6)

#Test model
llm("<s>[INST]Where is Yeshiva university Beren campus located?[/INST]")

# Creating Vector DB

In [None]:
from langchain_community.document_loaders import TextLoader
import glob

# Update the directory path
Texts_path = "/content/drive/MyDrive/Fall 2023/Data/Data Text Files/Text files/*.txt"

# Use glob to get a list of all .txt files in the directory.
# Sorted so the document order (and the index built from it) is reproducible.
text_files = sorted(glob.glob(Texts_path))

# Always define `documents`, so the summary print below and the later cells
# do not raise NameError when no file matches the pattern.
documents = []

# Check if any text files were found
if not text_files:
    print("No text files found in the specified directory.")
else:
    # Iterate through each text file and load its content
    for file_path in text_files:
        try:
            text_loader = TextLoader(file_path)
            # `load()` returns a list; the list itself is appended because the
            # splitting cell reads `doc[0]` from every entry of `documents`.
            document = text_loader.load()
            documents.append(document)
        except Exception as e:
            # Best effort: report the failing file and continue with the rest.
            print(f"Error loading {file_path}: {str(e)}")

    # Only claim success when at least one file was actually loaded.
    if documents:
        print("Documents loaded successfully.")
print(f'We have {len(documents)} files in total')

In [None]:
# Chunk the loaded files using the sizes defined in CFG.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=CFG.split_overlap,
    chunk_size=CFG.split_chunk_size,
)

# Every entry of `documents` is the list returned by one loader call;
# only the first element of each list is handed to the splitter.
texts = text_splitter.split_documents(
    [loaded[0] for loaded in documents]
)

print(f'We have created {len(texts)} chunks from {len(documents)} documents')

In [None]:
%%time

### download embeddings model
# The model name comes from `CFG.embeddings_model_id`, the attribute the
# config class actually defines. Embeddings are normalized here so the stored
# vectors match the normalized query vectors used when the index is loaded.
embeddings = HuggingFaceEmbeddings(
    model_name = CFG.embeddings_model_id,
    model_kwargs = {"device": "cuda"},
    encode_kwargs = {'normalize_embeddings': True}
)

### create embeddings and DB
vectordb = FAISS.from_documents(
    documents = texts,
    embedding = embeddings
)

### persist vector database
vectordb.save_local("/content/drive/MyDrive/Fall 2023/Spring 2024 Katzbot/RAG/Haider_VDB")


# Load Vector DB

In [None]:
# Embedding model used to encode queries against the persisted index.
# The model name comes from `CFG.embeddings_model_id`, the attribute the
# config class actually defines.
embeddings = HuggingFaceEmbeddings(
    model_name = CFG.embeddings_model_id,
    model_kwargs = {"device": "cuda"},
    encode_kwargs = {'normalize_embeddings': True}
)

# Load the vector database
# NOTE(review): some langchain_community releases require
# `allow_dangerous_deserialization=True` for `load_local`; confirm against
# the installed version before adding it.
vectordb_loaded = FAISS.load_local(
    "/content/drive/MyDrive/Fall 2023/Spring 2024 Katzbot/RAG/Haider_VDB",
    embeddings
)

In [None]:
import pandas as pd

# Prompt text with two placeholders, {context} and {question}; it is handed
# to the RetrievalQA chain through `chain_type_kwargs` in a later cell.
prompt_template = """
Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use the following context to answer the question at the end.

{context}

Question: {question}
Answer:"""

PROMPT = PromptTemplate(
    input_variables = ["context", "question"],
    template = prompt_template,
)

# Evaluation set; later cells read its `question` and `answer` columns.
test = pd.read_csv(
    "/content/drive/MyDrive/Fall 2023/Data/Test dataset/test_data.csv"
)

In [None]:
# gpu_chain = PROMPT | llm.bind(stop=["\n\n"])

# questions = []
# context = []
# for question in test.question[:6]:
#     context = vectordb_loaded.max_marginal_relevance_search(question, k = CFG.k)[0].page_content
#     questions.append({"context": context, "question": question + "[/INST]"})

In [None]:
# # %%time
# answers = gpu_chain.batch(questions, verbose=True)
# for answer in answers:
#     print(answer)

In [None]:
# `search_type` is an argument of `as_retriever` itself; `search_kwargs` only
# carries the options forwarded to the search call (here the number of
# passages to return).
retriever = vectordb_loaded.as_retriever(
    search_type = "similarity",
    search_kwargs = {"k": CFG.k}
)

# Retrieval-augmented QA chain: retrieved passages are inserted into PROMPT
# and the retrieved source documents are returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff", # map_reduce, map_rerank, stuff, refine
    retriever = retriever,
    chain_type_kwargs = {"prompt": PROMPT},
    return_source_documents = True,
    verbose = False

)

In [None]:
### smoke-test the QA chain on a single question
# (the retriever built above is configured with search_type "similarity", not MMR)
question = "What is the address of Yeshiva University Beren campus?[/INST]"
qa_chain(question)

In [None]:
import pandas as pd
from tqdm import tqdm

# Run the QA chain on every test question and keep the first line of each answer.
responses = []
for q in tqdm(test.question):
    answer = qa_chain(q + "[/INST]")['result']
    # Strip first: a generation that begins with a newline or spaces would
    # otherwise produce an empty first line and therefore an empty prediction.
    responses.append(answer.strip().split("\n")[0])

In [None]:
# Attach the predictions as a `pred` column and export the table without the index.
test["pred"] = responses
test.to_csv(path_or_buf="Mistral_GPTQ_RAG.csv", index=False)

In [None]:
!pip install vllm

In [None]:
from vllm import LLM, SamplingParams

# NOTE(review): no max_tokens is set, so vLLM's default output length limit
# applies; confirm it is long enough for the expected answers.
sampling_params = SamplingParams(temperature=0.2, top_p=0.95)

# Reuse the configured model id instead of repeating the literal.
# NOTE: this rebinds `llm`, which earlier cells use for the HuggingFacePipeline
# wrapper; re-running those cells after this one will pick up the vLLM engine.
llm = LLM(model=CFG.model_id)

# Pass a plain list of prompt strings rather than a pandas Series.
prompts = list(test.question[:10])
outputs = llm.generate(prompts, sampling_params)

# Show each prompt next to the first completion returned for it.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


In [None]:
from random import randint

# Spot check: print one randomly chosen test question together with the
# model's response and the reference answer.
# No seed is set, so a different example is shown on every run.
# assumes `test` keeps the default 0..n-1 index, so the label lookup below
# matches the list position in `responses` — TODO confirm
idx = randint(0, len(responses)-1)
print("Question:", test.question[idx])
print("Response:", responses[idx])
print("---"*10)
print("Ground truth:", test.answer[idx])

In [None]:
from rouge import Rouge
from tabulate import tabulate

rouge = Rouge()

# `get_scores` takes the hypotheses (model outputs) first and the references
# (ground truth) second. Passing them the other way round swaps the reported
# precision and recall.
scores = rouge.get_scores(responses, test['answer'].to_list(), avg=True)

# One table row per ROUGE metric, built from its 'p' / 'r' / 'f' entries.
score_table = [
    {'Metric': metric, 'Precision': score['p'], 'Recall': score['r'], 'F1-Score': score['f']}
    for metric, score in scores.items()
]

print(tabulate(score_table, headers='keys', tablefmt='grid'))