In [None]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
import requests
import json
import os

### Download the following PDF documents (or use your own):

In [None]:

# Source URLs of the PDFs and the local paths they are saved to.
# The two lists are paired by position (first URL -> first filename, and so on).
pdf_urls = [
    "https://link.springer.com/content/pdf/10.1186/s13000-024-01464-7.pdf",
    "https://proceedings.neurips.cc/paper_files/paper/2023/file/91f18a1287b398d378ef22505bf41832-Paper-Datasets_and_Benchmarks.pdf"
]

pdf_filenames = [
    "Papers/Paper1.pdf",
    "Papers/Paper2.pdf"
]

# Make sure the "Papers" directory exists; exist_ok makes re-running the cell safe
directory = "Papers"
os.makedirs(directory, exist_ok=True)

# Download and save the PDF files locally
for pdf_url, pdf_filename in zip(pdf_urls, pdf_filenames):
    # A timeout keeps the cell from hanging forever on an unresponsive server
    response = requests.get(pdf_url, timeout=60)
    # Fail loudly on HTTP errors (403, 404, ...) instead of silently saving
    # an error page under a ".pdf" name that the PDF loader would choke on later
    response.raise_for_status()
    with open(pdf_filename, "wb") as file:
        file.write(response.content)
    print(f"Downloaded and saved {pdf_filename}")

# Collect the documents loaded from every downloaded PDF in one flat list
all_docs = []

# Load each PDF with PyPDFLoader and append whatever it returns to the shared list
for pdf_filename in pdf_filenames:
    pages = PyPDFLoader(pdf_filename).load()
    all_docs += pages
    print(f"Pages loaded from {pdf_filename}: {len(pages)}.")

# Report the combined count across all loaded files
print(f"Total pages loaded from both documents: {len(all_docs)}.")

### Split the document(s) into chunks. Experiment with the chunk size as well as the chunk overlap.

In [None]:
# Configure the text splitter:
# - `chunk_size` is the maximum size of each chunk (1000 characters in this case)
# - `chunk_overlap` is how many characters neighbouring chunks share (100 characters)
splitter_settings = {"chunk_size": 1000, "chunk_overlap": 100}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Split the loaded PDF documents into chunks
docs = text_splitter.split_documents(all_docs)

# Report how many chunks were produced and show the first one as a sanity check
chunk_count = len(docs)
print(f"Current number of chunks {chunk_count}.")
print(docs[0])

### Initialize model for converting text into numerical representation (embeddings). We are using all-MiniLM-L6-v2 from HuggingFace

In [None]:
# Name of the Sentence Transformers model used to turn text into embeddings
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"

# Build the embedding function; the model is explicitly requested on the CPU
embedding_function = HuggingFaceEmbeddings(
    model_name=embedding_model,
    model_kwargs={"device": "cpu"},
)

### Convert & store PDF chunks in our vectorstore

In [None]:
# Give every chunk a deterministic ID built from its source file, page and position.
# Without explicit IDs, every run of this cell adds the chunks again to the persisted
# "./chroma_db_docs" directory, so the collection fills up with duplicates that then
# crowd the top-k retrieval results.
# NOTE(review): relies on the Chroma wrapper overwriting (upserting) entries whose ID
# already exists - confirm for the installed langchain_community / chromadb versions.
# Chunks left over from an earlier run with different splitter settings are not removed.
chunk_ids = [
    f"{doc.metadata.get('source', 'unknown')}-p{doc.metadata.get('page', 0)}-c{index}"
    for index, doc in enumerate(docs)
]

# Create a vector store for storing document embeddings (for PDF documents)
vectorstore = Chroma.from_documents(
    docs,
    embedding_function,
    ids=chunk_ids,
    persist_directory="./chroma_db_docs",
)

# Print the number of entries in the vector store
print(vectorstore._collection.count())

### Formulate a question and retrieve top k similar documents

In [None]:
# Example questions for retrieval (keep exactly one of them active)

# question = "What are key differences between Image-based and Laboratory-based Diagnostics with a focus on LLM applications?" # Question for paper 1
question = "What kinds of biases are associated with LLM-as-a-Judge models?" # Question for paper 2

# Wrap the vector store in a retriever that runs a similarity search
# and returns the 3 most similar chunks for a query
retriever_settings = {"search_type": "similarity", "search_kwargs": {"k": 3}}
retriever = vectorstore.as_retriever(**retriever_settings)
retrieved_docs = retriever.invoke(question)

# Show the raw retrieved documents, then how many came back
print(retrieved_docs)
hit_count = len(retrieved_docs)
print(f"Collected most {hit_count} similar documents.")

def format_docs(docs):
    """Join the text of the given documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values in their original order, separated by a
        blank line; an empty string when ``docs`` is empty.
    """
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)

# Merge the retrieved documents into a single context string via format_docs
context = format_docs(retrieved_docs)
print(context)  # Show the assembled context for verification

### Provide question and context to the LLM model

In [None]:

# Define headers for the HTTP request to indicate we are sending JSON data
headers = { "Content-Type": "application/json" }

# Combine the user query and context into a single prompt
user_query = f"\nQuestion: {question}\nContext: {context}"

# Define the JSON payload for the POST request
data = {
    "messages": [
        # Instructions go in a "system" message. Sent with the "assistant" role they
        # would be presented to the model as something it had said itself earlier,
        # not as instructions to follow.
        {"role": "system", "content": "You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise."},
        {"role": "user", "content": user_query}
    ],
    "temperature": 0.7,  # Controls the randomness of the output
    "max_tokens": -1,     # Maximum number of tokens in the output (unlimited in this case)
    "stream": False       # Whether to stream the output or not
}

# Make a POST request to the local server running at localhost:1234.
# timeout=(connect, read): give up quickly when no server is listening, but allow
# a long wait for the non-streamed answer to be generated.
try:
    response = requests.post(
        "http://localhost:1234/v1/chat/completions",
        headers=headers,
        data=json.dumps(data),
        timeout=(10, 600),
    )
except requests.exceptions.RequestException as error:
    # Server not running / unreachable / timed out: report it instead of a raw traceback
    print("Failed to get response:", error)
else:
    # Check if the request was successful
    if response.status_code == 200:
        # Extract the assistant's response from the JSON response
        # bot_response = response.json()  # Uncomment to see the full response
        bot_response = response.json()["choices"][0]["message"]["content"]
        print("Answer:", bot_response)
    else:
        # Print an error message if the request failed
        print("Failed to get response:", response.status_code, response.text)