In [1]:
# libraries
import fitz
import re
from langchain.vectorstores import Chroma
from dotenv import load_dotenv
import os
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
import openai
import chromadb

In [2]:
def load_pdf(file_path):
    content = []
    with fitz.open(file_path) as pdf:
        for page in pdf:
            content.append(page.get_text())
    return content

In [3]:
file_path = '../data/Introduction to Statistics.pdf'
pdf_content = load_pdf(file_path)

In [4]:
def clean_pdf(pages):
    cleaned_content = []
    for page in pages:
        # Remove excessive whitespace and possible document-related metadata
        text = re.sub(r'\s+', ' ', page)
        text = re.sub(r'^\d+\s+', '', text)  # Remove page numbers at the beginning

        # Remove the specific URLs and any preceding numbers like '1.1.3'
        text = re.sub(r'\b\d+\.\d+\.\d+\s+https://stats\.libretexts\.org/@go/page/\d+\b\s*', '', text)
        text = re.sub(r'https://stats\.libretexts\.org/@go/page/\d+\b\s*', '', text)

        # Remove BOM-like characters and any other non-ASCII characters
        text = re.sub(r'[^\x00-\x7F]+', ' ', text)

        cleaned_content.append(text)
    return cleaned_content

In [5]:
cleaned_content = clean_pdf(pdf_content)

In [6]:
cleaned_content[0:30]

['INTRODUCTORY STATISTICS Shafer and Zhang LibreTexts ',
 'Introductory Statistics Shafer and Zhang ',
 'This text is disseminated via the Open Education Resource (OER) LibreTexts Project (https://LibreTexts.org) and like the hundreds of other texts available within this powerful platform, it is freely available for reading, printing and "consuming." Most, but not all, pages in the library have licenses that may allow individuals to make changes, save, and print this book. Carefully consult the applicable license(s) before pursuing such effects. Instructors can adopt existing LibreTexts texts or Remix them to quickly build course-specific resources to meet the needs of their students. Unlike traditional textbooks, LibreTexts  web based origins allow powerful integration of advanced features and new technologies to support learning. The LibreTexts mission is to unite students, faculty and scholars in a cooperative effort to develop an easy-to-use online platform for the construction, cu

In [7]:
len(cleaned_content)

401

In [8]:
type(cleaned_content)

list

In [9]:
# getting the key
api_key = os.getenv("OPENAI_API_KEY")
# getting the model
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

  embeddings = OpenAIEmbeddings(model="text-embedding-3-large")


In [10]:

# Convert cleaned_content into a list of Document objects
documents = [Document(page_content=text) for text in cleaned_content]

# Use the documents in Chroma
client = chromadb.PersistentClient(path="./chroma_db")
db = Chroma.from_documents(documents, embeddings, persist_directory="./chroma_db", client=client)

In [11]:
user_question = "What is a normal distribution?" # User question
retrieved_docs = db.similarity_search(user_question, k=10) # k is the number of documents to retrieve

In [12]:
# Display top results
for i, doc in enumerate(retrieved_docs[:1]): # Display top results
    print(f"Document {i+1}:\n{doc.page_content[36:1000]}") # Display content

Document 1:
onding to the density function for the bell curve with parameters and is called the normal distribution with mean and standard deviation . A continuous random variable whose probabilities are described by the normal distribution with mean and standard deviation is called a normally distributed random variable, or a normal random variable for short, with mean and standard deviation . Figure shows the density function that determines the normal distribution with mean and standard deviation . We repeat an important fact about this curve: The density curve for the normal distribution is symmetric about the mean. Figure : Density Function for a Normally Distributed Random Variable with Mean and Standard Deviation Heights of -year-old men in a certain region have mean inches and standard deviation inches. These heights are approximately normally distributed. Thus the height of a randomly selected -year-old man is a normal random variable with mean and stan


In [13]:
def _get_document_prompt(docs):
    prompt = "\n"
    for doc in docs:
        prompt += "\nContent:\n"
        prompt += doc.page_content + "\n\n"
    return prompt

In [14]:
# Generate a formatted context from the retrieved documents
formatted_context = _get_document_prompt(retrieved_docs)
print("Context formatted for GPT model.")

Context formatted for GPT model.


In [15]:
prompt = f"""
## SYSTEM ROLE
You are a knowledgeable and factual chatbot designed to assist with technical questions about **Cytology**, specifically focusing on **Lung Cancer**. 
Your answers must be based exclusively on provided content from technical books provided.

## USER QUESTION
The user has asked: 
"{user_question}"

## CONTEXT
Here is the relevant content from the technical books:  
'''
{formatted_context}
'''

## GUIDELINES
1. **Accuracy**:  
   - Only use the content in the `CONTEXT` section to answer.  
   - If the answer cannot be found, explicitly state: "The provided context does not contain this information."
   - Start explain cell morphology and then divide morphology in bulletpoints (nuclie, cytoplasm, background and other aspects to consider) 
   - Follow by differential diagnosis
   - Lastly explain ancillary studies for malignant mesothelioma.

2. **Transparency**:  
   - Reference the book's name and page numbers when providing information.  
   - Do not speculate or provide opinions.  

3. **Clarity**:  
   - Use simple, professional, and concise language.  
   - Format your response in Markdown for readability.  

## TASK
1. Answer the user's question **directly** if possible.  
2. Point the user to relevant parts of the documentation.  
3. Provide the response in the following format:

## RESPONSE FORMAT
'''
# [Brief Title of the Answer]
[Answer in simple, clear text.]

**Source**:  
• [Book Title], Page(s): [...]
'''
"""
print("Prompt constructed.")

Prompt constructed.


In [16]:
# Set up GPT client and parameters
client = openai.OpenAI()
model_params = {
    'model': 'gpt-4o',
    'temperature': 0.7,  # Increase creativity
    'max_tokens': 4000,  # Allow for longer responses
    'top_p': 0.9,        # Use nucleus sampling
    'frequency_penalty': 0.5,  # Reduce repetition
    'presence_penalty': 0.6    # Encourage new topics
}

In [17]:
messages = [{'role': 'user', 'content': prompt}]
completion = client.chat.completions.create(messages=messages, **model_params, timeout=120)

In [18]:
answer = completion.choices[0].message.content
print(answer)

'''
# Understanding Normal Distribution

A normal distribution is a probability distribution corresponding to the density function for a bell curve with specific parameters: mean (\(\mu\)) and standard deviation (\(\sigma\)). A continuous random variable that follows this distribution is called a normally distributed random variable, or simply, a normal random variable. An important characteristic of the normal distribution is that its density curve is symmetric about the mean.

**Source**:  
• [Technical Book on Probability Distributions], Page(s): Not specified in provided context
'''
