# Retrieval System

This is based heavily on the tutorial from https://huggingface.co/learn/cookbook/en/advanced_rag as recommended in the course. We should probably credit it.

Hyperparameters to test:


* embedding models
* distance strategies for vector store
* chunk size
* overlap size
* k value for top_k
* model used for LLM
* alter generated prompt



In [None]:
!pip install -q langchain langchain-community transformers sentence-transformers faiss-gpu bitsandbytes

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd

In [None]:
# To get the value of the max sequence_length, we will query the underlying `SentenceTransformer` object used for embeddings
# A SentenceTransformer is constructed here only to read its `max_seq_length` attribute; it is not kept around.
from sentence_transformers import SentenceTransformer
print(f"Model's maximum sequence length: {SentenceTransformer('all-mpnet-base-v2').max_seq_length}")

In [None]:
# Load data
# The loop below reads the `text` and `source` columns of this CSV.
df = pd.read_csv('data.csv')

# Initialize embeddings
# embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    multi_process=True,
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)

# Split text into chunks
# Separators are listed from coarsest to finest.
TEXT_SEPARATORS = [
    "\n\n",
    "\n",
    ".",  # a missing comma here used to fuse "." and " " into the single separator ". "
    " ",
    "",
]

text_splitter = RecursiveCharacterTextSplitter(
    # Selected to stay under the 384 max size for all-mpnet-base-v2.
    # NOTE(review): with the splitter's default length function this is a
    # character count, not a token count, so it is a conservative bound - confirm.
    chunk_size=384-64,
    chunk_overlap=50,  # arbitrarily pick how much to share across chunks
    # NOTE(review): start indices are only attached when the splitter builds
    # Document objects; `split_text` below returns plain strings - confirm.
    add_start_index=True,
    strip_whitespace=True,  # If `True`, strips whitespace from the start and end of every document
    separators=TEXT_SEPARATORS,
)
texts = []
metadatas = []

for _, row in df.iterrows():
    # Only the split itself is guarded, so a failure cannot leave `texts`
    # and `metadatas` with different lengths.
    try:
        chunks = text_splitter.split_text(row['text'])
    except Exception as err:
        # Best-effort: report the row and the cause, then keep going.
        print(f"source {row['source']} corrupted: {err!r}")
        continue
    texts.extend(chunks)
    # One independent dict per chunk (list multiplication would alias a single dict).
    metadatas.extend({'source': row['source']} for _ in chunks)

In [None]:
# Function to write a list of strings to a text file
def write_list_to_file(string_list, filename):
    """Write every string in ``string_list`` to ``filename``, newline-terminated.

    The file is created or overwritten. It is always written as UTF-8 so the
    result does not depend on the platform's default encoding and non-ASCII
    text cannot fail to encode.

    Args:
        string_list: iterable of ``str`` items; each is followed by ``'\\n'``.
        filename: path of the file to write.

    Raises:
        TypeError: if an item is not a string (concatenation with ``'\\n'`` fails).
        OSError: if the file cannot be opened for writing.
    """
    with open(filename, 'w', encoding='utf-8') as file:
        file.writelines(string + '\n' for string in string_list)

# Example usage: write every chunk in `texts` to output.txt, each followed by a newline
write_list_to_file(texts, 'output.txt')

In [None]:
# `len(text)` is the number of characters in each chunk (not tokens).
lengths = [len(text) for text in texts]

# Plot the distribution of text lengths, counted as the number of characters
import matplotlib.pyplot as plt
fig = pd.Series(lengths).hist()
# Label corrected: the values plotted are character counts, not token counts.
plt.title("Distribution of document lengths in the knowledge base (in count of characters)")
plt.show()

In [None]:
# Build the FAISS vector store from the chunk texts and their parallel metadata list
from langchain_community.vectorstores.utils import DistanceStrategy

vectorstore = FAISS.from_texts(
    texts,
    embeddings,
    metadatas=metadatas,
    distance_strategy=DistanceStrategy.COSINE,
)

In [None]:
# Retrieval helper
def retrieve_top_k(query, k=5):
    """Return the chunks most similar to ``query``.

    Runs ``similarity_search`` on the module-level ``vectorstore`` and
    converts each hit into a ``(page_content, source)`` tuple, preserving the
    order in which the store returned the hits.

    Args:
        query: the query string to search with.
        k: how many results to request from the store (default 5).
    """
    hits = vectorstore.similarity_search(query, k=k)
    pairs = []
    for doc in hits:
        pairs.append((doc.page_content, doc.metadata['source']))
    return pairs

In [None]:
# # Example usage of retrieval only
# query = "When is CMU's Spring Carnival Weekend 2025?"
# top_chunks = retrieve_top_k(query, k=5)
# for i, (text, source) in enumerate(top_chunks, 1):
#     print(f"\nResult {i}:")
#     print(f"Source: {source}")
#     print(f"Text: {text}\n")

# Reader

In [None]:
def create_template():
    """Build the RAG prompt template string for the reader model.

    A system message (answer-style instructions) and a user message (context,
    few-shot examples and the question) are rendered through the module-level
    ``tokenizer``'s chat template, untokenized and with the generation prompt
    appended. The returned string still contains the ``{context}`` and
    ``{question}`` placeholders.

    The leading whitespace inside the message strings is part of the prompt
    text and is kept exactly as written.
    """
    system_text = """Using the information contained in the context,
        give a concise answer to the question.
        if possible limit your answer to single or a few words for who, when, where questions.
        wherever possible extract name, date, or title without additional explanations.
        Respond only to the question asked, response should be concise and relevant to the question.
        You should do short answers format responses. DO NOT PUT "ANSWER" BEFORE THE ANSWER.
        Don't answer in full sentences. For example, say "12" instead of "The answer is 12."
        """
    user_text = """Context:
          {context}
          ---
          Here are some examples of question answer pairs:

          Who is Pittsburgh named after?
          William Pitt

          What famous machine learning venue had its first conference in Pittsburgh in 1980?
          ICML

          What musical artist is performing at PPG Arena on October 13?
          Billie Eilish

          ---
          Don't answer in full sentences. For example, say "12" instead of "The answer is 12."
          Now here is the question for you to answer:

          {question}
          """
    chat_messages = [
        {"role": "system", "content": system_text},
        {"role": "user", "content": user_text},
    ]
    return tokenizer.apply_chat_template(
        chat_messages, tokenize=False, add_generation_prompt=True
    )

def create_prompt(template, query, top_chunks):
    """Fill ``template`` with the question and the retrieved context.

    Args:
        template: format string containing ``{question}`` and ``{context}``.
        query: the question text substituted for ``{question}``.
        top_chunks: sequence of items whose element 0 is the chunk text; the
            texts are joined with a blank line to form ``{context}``.

    Returns:
        The formatted prompt string.
    """
    chunk_texts = []
    for item in top_chunks:
        chunk_texts.append(item[0])
    joined_context = "\n\n".join(chunk_texts)
    return template.format(context=joined_context, question=query)

In [None]:
from transformers import pipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [None]:
# Checkpoint name used for both the reader tokenizer and the reader model below
READER_MODEL_NAME = "stabilityai/stablelm-zephyr-3b"

# Quantization settings handed to `from_pretrained` below:
# 4-bit loading, double quantization, "nf4" quant type, bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# `tokenizer` is also read as a module-level name by create_template()
tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(READER_MODEL_NAME, quantization_config=bnb_config)
# Put the model in evaluation mode before it is used for generation
model.eval()

In [None]:
# Create the prompt template
# (create_template() reads the module-level `tokenizer`, so the model cell must run first)
template = create_template()

# Initialize LLM Pipeline
# Text-generation pipeline over the model/tokenizer loaded above; get_answer() and
# the example cell call it with a fully formatted prompt string.
READER_LLM = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=True,
    temperature=0.01,
    repetition_penalty=1.1,
    return_full_text=False,
    max_new_tokens=300,
)

In [None]:
# One-off end-to-end check: retrieve 5 chunks, build the prompt, run the reader.
# NOTE(review): the query string starts with a space and has no subject - it looks
# truncated; confirm the intended question.
query = " in Week 3, and where was the game played?"
top_chunks = retrieve_top_k(query, k=5)
prompt = create_prompt(template, query, top_chunks)
READER_LLM(prompt)

In [None]:
from tqdm import tqdm
def get_answer(question, k=3):
    """Answer ``question`` by running retrieval followed by the reader LLM.

    Args:
        question: the question text.
        k: number of chunks to retrieve as context. Defaults to 3, the value
            that was previously hard-coded, so existing callers are unaffected.

    Returns:
        The ``generated_text`` field of the first result returned by
        ``READER_LLM`` for the formatted prompt.
    """
    top_chunks = retrieve_top_k(question, k=k)
    prompt = create_prompt(template, question, top_chunks)
    return READER_LLM(prompt)[0]['generated_text']


def generate_answers(input_file, output_file):
    """Answer every question in ``input_file`` and write the answers to ``output_file``.

    ``input_file`` is read as UTF-8 with one question per line; blank lines
    are skipped. Each answer is written as ``Question <n>: <answer>`` on its
    own line, where ``<n>`` is the 1-based line number of the question in the
    input file (so skipped blank lines leave gaps in the numbering).

    Args:
        input_file: path of the questions file.
        output_file: path of the answers file (created or overwritten, UTF-8).
    """
    with open(input_file, 'r', encoding='utf-8') as file:
        questions = file.readlines()

    answers = []
    print("Number of Q's: " + str(len(questions)))
    # Wrap the list itself (not the enumerate iterator) so tqdm knows the
    # total and can show real progress.
    for index, question in enumerate(tqdm(questions)):
        question = question.strip()
        if not question:
            continue
        line = f'Question {index + 1}: {get_answer(question)}'
        answers.append(line)
        # Print each answer as it's generated; tqdm.write keeps the progress
        # bar intact while printing.
        tqdm.write(line)

    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines(answer + '\n' for answer in answers)

In [None]:
# Usage example: answer the questions in questions.txt and write the results to model_answers.txt
generate_answers("questions.txt", "model_answers.txt")