<h1>BDS HW Solution: Building a ChatBot</h1>

Create a virtual environment if you are running this locally<br>
Install and import other packages if required

In [1]:
!pip install -qU pymupdf
!pip install -qU langchain-community
!pip install -qU langchain-google-genai
!pip install -qU langchain-text-splitters
!pip install -qU "langchain-chroma>=0.1.2"

In [4]:
import os
import fitz  # PyMuPDF
import nltk
import json
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

from langchain.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import PromptTemplate

In [5]:
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/kristop/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/kristop/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Task 1

In [6]:
# TODO: Download zip file and Unzip to get all PDFs for the chatbot knowledge base
!unzip Docs.zip
folder_name = "Docs"

Archive:  Docs.zip
   creating: Docs/
  inflating: Docs/Searchfor_Barwick.pdf  
  inflating: __MACOSX/Docs/._Searchfor_Barwick.pdf  
  inflating: Docs/Commentson_Szpak.pdf  
  inflating: __MACOSX/Docs/._Commentson_Szpak.pdf  
  inflating: Docs/CoherentNuclear_Vaidya.pdf  
  inflating: __MACOSX/Docs/._CoherentNuclear_Vaidya.pdf  
  inflating: Docs/DeuteronWaves_Yabuuchi.pdf  
  inflating: __MACOSX/Docs/._DeuteronWaves_Yabuuchi.pdf  
  inflating: Docs/Electrolysisof_Warner.pdf  
  inflating: __MACOSX/Docs/._Electrolysisof_Warner.pdf  
  inflating: Docs/Inthe_Storms.pdf   
  inflating: __MACOSX/Docs/._Inthe_Storms.pdf  
  inflating: Docs/ConcentrationPolarization_Barbieri.pdf  
  inflating: __MACOSX/Docs/._ConcentrationPolarization_Barbieri.pdf  
  inflating: Docs/Possibilityof_Zhang.pdf  
  inflating: __MACOSX/Docs/._Possibilityof_Zhang.pdf  
  inflating: Docs/ColdFusion_Arata2.pdf  
  inflating: __MACOSX/Docs/._ColdFusion_Arata2.pdf  
  inflating: Docs/Fulgurites,Bol

In [8]:
def extract_text_from_pdf(pdf_path):
  """Extracts text from a PDF file."""
  doc = fitz.open(pdf_path)
  text = "\n".join([page.get_text() for page in doc])
  return text

def process_pdf(pdf_path, output_txt):
  """Extracts text and saves the result."""
  text = extract_text_from_pdf(pdf_path)

  with open(output_txt, 'w', encoding='utf-8') as f:
      f.write(text)


# TODO: Create a directory to store the extracted text
os.mkdir("Docs_Txt")

# TODO: Use the given functions to extract text from all PDFs in the zip file and save it as a .txt file with the same filename
for fl in os.listdir("Docs"):
  if ".pdf" in fl:
    process_pdf(f"Docs/{fl}", f"Docs_Txt/{fl.split('.')[0]}.txt")

In [9]:
def chunk_text(text, chunk_size):
  """Splits text into chunks of approximately `chunk_size` words."""
  words = nltk.word_tokenize(text)
  chunks = [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
  return chunks

def generate_embeddings(model, chunks):
  """Generates embeddings for text chunks."""
  return model.encode(chunks).tolist()


# TODO: Initialize any text embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# TODO: For each given PDF:
# 1. Read the extracted .txt file
# 2. Create chunks of any size
# 3. Append to a list the chunk, its embedding and the filename
# Utilize the functions provided above
embeddings_list = []

for fl in os.listdir("Docs")[:10]:
  if ".pdf" not in fl:
    continue
  text = "" # Read the associated .txt file
  with open("Docs_Txt/"+fl.split(".")[0]+".txt", 'r', encoding='utf-8') as f:
      text = f.read()

  chunks = chunk_text(text, 256) # Creating chunks of size N

  for ch in chunks:
    embeds = generate_embeddings(model, [chunks]) # Generate embeddings for each chunk
    embeddings_list.append({
      "embedding": embeds[0],
      "chunk": ch,
      "filename": fl
  })


with open("KnowledgeBase.json", 'w', encoding='utf-8') as f:
        json.dump(embeddings_list, f, indent=4)

# Task 2

In [10]:
query1 = "Who is the founder of NYU?"

In [11]:
# TODO: Generate text embedding for the query with the same model use previously
query_embedding = model.encode([query1]).tolist()

In [12]:
# TODO: Define function to return the n most similar chunks from a Knowledge Base json file, when given a query embedding
# 1. Read from JSON file to get a list of all chunks
# 2. Use the cosine similarity function to compute similarity scores between the query and all chunks
# 3. Return the top n most similar embeddings with the chunk text and filename

def find_similar_chunks(query_embed, json_file, embedding_model, top_n):
  """Finds the top N most similar text chunks to the query."""
  with open(json_file, 'r', encoding='utf-8') as f:
      data = json.load(f)

  chunk_embeddings = np.array([item["embedding"] for item in data])
  similarities = cosine_similarity(query_embed, chunk_embeddings)[0]

  top_n_indices = np.argsort(similarities)[-top_n:][::-1]
  return [data[i] for i in top_n_indices]

In [13]:
# Prints the top 5 documents for the given 'query1'

top_results = find_similar_chunks(query_embedding, "KnowledgeBase.json", model, top_n=5)
for result in top_results:
  print(f'{result["filename"]}: {result["chunk"]}')

Inthe_Storms.pdf: Chem . 270 ( 1989 ) 451 . [ 4 ] J.O.M . Bockris , Accountability and academic freedom : The battle concerning research on cold fusion at Texas A & M Univer- sity , Accountability in Res . 8 ( 2000 ) 103–119 .
Inthe_Storms.pdf: does not make a person rational . Nevertheless , we all need to be reminded that new ideas must be considered with humility , objectivity , and knowledge , especially in the ﬁeld of cold fusion itself . Some new ideas are clearly wrong and need to be identiﬁed and improved . How this needs to be done is well understood in science . It does not and must not involve personal attack . Every kid who dreams of being a scientist knows this , so why is something so basic forgotten by some working scientists . Ideas must be evaluated by logic and knowledge . The rest of us need to keep reminding those scientists who forget how they are supposed to behave , as John would have wanted . References [ 1 ] M. Fleischmann , S. Pons and M. Hawkins , Electrochem

In [14]:
# TODO: Define LLM - locally, do NOT use an API

llm_model_name = "google/flan-t5-small"
llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model = AutoModelForSeq2SeqLM.from_pretrained(llm_model_name)
llm = pipeline("text2text-generation", model=llm_model, tokenizer=llm_tokenizer)

Device set to use mps:0


In [15]:
# TODO: Define a custom prompt that passes the query and the context to the LLM

def generate_answer(llm_model, query, context_chunks):
  """Generates an answer using the locally loaded LLM."""
  context_text = "\n".join([chunk["chunk"] for chunk in context_chunks])
  prompt = f"Context:\n{context_text}\n\nQuestion: {query}\nAnswer:"
  response = llm_model(prompt, max_length=200, num_return_sequences=1)
  return response

In [16]:
answer = generate_answer(llm, query1, top_results)
print("Query:", query1)
print("Answer:", answer)

Token indices sequence length is longer than the specified maximum sequence length for this model (1569 > 512). Running this sequence through the model will result in indexing errors


Query: Who is the founder of NYU?
Answer: [{'generated_text': 'Bernhardt Patrick John Mara Bockris'}]


In [17]:
# TODO: Repeat the same steps as above for query2 i.e. generate query embedding, get top 5 most-similar chunks and use them to obtain an answer from the LLM

query2 = "What is the difference between Hot and Cold fusion?"
query2_embedding = model.encode([query2]).tolist()
top_results = find_similar_chunks(query2_embedding, "KnowledgeBase.json", model, top_n=5)

answer = generate_answer(llm, query2, top_results)
print("Query:", query2)
print("Answer:", answer)

Query: What is the difference between Hot and Cold fusion?
Answer: [{'generated_text': 'a sprayed layer of palladium'}]


# Task 3

In [20]:
from langchain_core.documents import Document

In [18]:
# We are using the GEMINI API to try out a pre-trained model with billions of params
# TODO: Setup the Gemini API using the Gemini and Gemini LangChain links shared the HW PDF
# Once setup insert your API Key below

os.environ["GOOGLE_API_KEY"] = "AIzaSyCLZTVy9fMmd9QpoTkvKOlVUjcpzKjgEQA"

In [22]:
# Initialize the embedding model via Langchain
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embedding_model = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# TODO: Load extracted text from the text files you had stored
docs = []
for filename in os.listdir("Docs_Txt"):
    if filename.endswith(".txt"):
        loader = TextLoader(os.path.join("Docs_Txt", filename), encoding='utf-8')
        docs.extend(loader.load())

# TODO: Chunk the documents with arguments for size and overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=10)
texts = text_splitter.split_documents(docs)

# Create the vector store with ChromaDB to create our knowledge base DB in a presisting folder
db = Chroma(
    collection_name="example_collection",
    embedding_function=embedding_model,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

def chunked(iterable, size):
    """Helper function to split list into chunks of given size"""
    for i in range(0, len(iterable), size):
        yield iterable[i:i + size]
        
max_batch_size = 5461

# Split your texts into smaller batches and add them one by one
for batch in chunked(texts, max_batch_size):
    db.add_documents(documents=batch)


In [23]:
# TODO: Initialize the LLM pipeline for Gemini via Langchain, and set the initializing parameters for temperature, max_tokens etc.
llm = llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash-8b",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2
)

# TODO: Define a custom prompt template to pass the query and context to the LLM
# Note that Larger models such as Gemini are more capable at following instructions specified in the prompt
prompt_template = PromptTemplate.from_template(
    """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Answer:"""
)
# Create the RAG chain with the knowledge base and the prompt
qa_chain = RetrievalQA.from_llm(
    llm, retriever=db.as_retriever(), prompt=prompt_template
)

In [24]:
# Testing both queries with the Langchain RAG setup

print(query1)
print(qa_chain({"query": query1}))

print(query2)
print(qa_chain({"query": query2}))

Who is the founder of NYU?


  print(qa_chain({"query": query1}))


{'query': 'Who is the founder of NYU?', 'result': "Don't know."}
What is the difference between Hot and Cold fusion?
{'query': 'What is the difference between Hot and Cold fusion?', 'result': "I don't know.  The provided texts describe differences, but don't explicitly state *what* the differences are."}
