### 1: Dependencies

In [1]:
# Langchain dependencies
from langchain.document_loaders.pdf import PyPDFDirectoryLoader  # Importing PDF loader from Langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter  # Importing text splitter from Langchain
from langchain.embeddings import OpenAIEmbeddings  # Importing OpenAI embeddings from Langchain
from langchain.schema import Document  # Importing Document schema from Langchain
from langchain.vectorstores.chroma import Chroma  # Importing Chroma vector store from Langchain
from dotenv import load_dotenv # Importing dotenv to get API key from .env file
from langchain.chat_models import ChatOpenAI


import os  # Importing os module for operating system functionalities
import shutil  # Importing shutil module for high-level file operations

### 2: Read PDF

In [2]:
# Relative path of the directory holding the PDF files that load_documents() reads:
DATA_PATH = r"data"

def load_documents():
    """
    Read the PDFs found under DATA_PATH.

    A PyPDFDirectoryLoader is pointed at DATA_PATH and its load() result is
    handed straight back to the caller.

    Returns:
        The value returned by PyPDFDirectoryLoader.load() (a list of Langchain
        Document objects).
    """
    return PyPDFDirectoryLoader(DATA_PATH).load()

In [3]:
# Load the PDFs under DATA_PATH and dump the raw Document list for a quick visual check.
documents = load_documents()
print(documents)

[Document(page_content='Fully automated hand tracking for Parkinson’s Disease\nDiagnosis\nCallum Macpherson\nStudent ID: 201022895\nSupervised by Luisa Cutillo, Samuel Relton, and Hui Fang\nSubmitted in accordance with the requirements for the\nmodule MATH5872M: Dissertation in Data Science and Analytics\nas part of the degree of\nMaster of Science in Data Science and Analytics\nThe University of Leeds, School of Mathematics\nSeptember 2021\nThe candidate conﬁrms that the work submitted is his/her own and that appropriate\ncredit has been given where reference has been made to the work of others.', metadata={'source': 'data\\MACPHERSON, Callum - MSc Dissertation.pdf', 'page': 0}), Document(page_content='', metadata={'source': 'data\\MACPHERSON, Callum - MSc Dissertation.pdf', 'page': 1}), Document(page_content='Acknowledgements\nI would like to thank my supervisors, Samuel Relton, Luisa Cutillo, and Hui Fang, for their\nguidance and support over the last three months. Their support and

In [4]:
type(documents)

list

### 3: Split into chunks of text

Is this step necessary or useful for my application?

In [5]:
def split_text(documents: list[Document], chunk_size: int = 400, chunk_overlap: int = 100):
    """
    Split the text content of the given list of Document objects into smaller chunks.

    Besides returning the chunks, this prints a one-line summary of how many
    chunks were produced and, when at least one chunk exists, the content and
    metadata of one example chunk.

    Args:
        documents (list[Document]): List of Document objects containing text content to split.
        chunk_size (int): Maximum size of each chunk, measured with len() (characters).
            Defaults to 400.
        chunk_overlap (int): Overlap between consecutive chunks, in characters.
            Defaults to 100.

    Returns:
        list[Document]: List of Document objects representing the split text chunks.
    """
    # Initialize text splitter with specified parameters
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,  # Size of each chunk in characters
        chunk_overlap=chunk_overlap,  # Overlap between consecutive chunks
        length_function=len,  # Function to compute the length of the text
        add_start_index=True,  # Record each chunk's start offset in its metadata
    )
    # Split documents into smaller chunks using text splitter
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Print example of page content and metadata for a chunk.
    # The example is chunk 10 when it exists; for smaller corpora fall back to
    # the last available chunk, and skip the preview when there are no chunks,
    # instead of raising IndexError.
    if chunks:
        example = chunks[min(10, len(chunks) - 1)]
        print(example.page_content)
        print(example.metadata)

    return chunks  # Return the list of split text chunks

In [6]:
chunks = split_text(documents)

Split 100 documents into 750 chunks.
detection model were YOLOv4 and YOLOv4-tiny. Similarly, the selected feature extractors
selected for the key point regression model were EfﬁcientNet-B0 and EfﬁcientNet-B4.
Results
Both YOLOv4 and YOLOv4-tiny achieve a mean average precision (mAP) of ∼100% and
mean intersection over union (IOU) of ∼85%. YOLOv4-tiny was the preferred object detection
{'source': 'data\\MACPHERSON, Callum - MSc Dissertation.pdf', 'page': 4, 'start_index': 1255}


In [7]:
# Show every chunk, each followed by blank lines so they are easy to tell apart.
for chunk in chunks:
    print(chunk, end="\n\n\n")

page_content='Fully automated hand tracking for Parkinson’s Disease\nDiagnosis\nCallum Macpherson\nStudent ID: 201022895\nSupervised by Luisa Cutillo, Samuel Relton, and Hui Fang\nSubmitted in accordance with the requirements for the\nmodule MATH5872M: Dissertation in Data Science and Analytics\nas part of the degree of\nMaster of Science in Data Science and Analytics\nThe University of Leeds, School of Mathematics' metadata={'source': 'data\\MACPHERSON, Callum - MSc Dissertation.pdf', 'page': 0, 'start_index': 0}


page_content='Master of Science in Data Science and Analytics\nThe University of Leeds, School of Mathematics\nSeptember 2021\nThe candidate conﬁrms that the work submitted is his/her own and that appropriate\ncredit has been given where reference has been made to the work of others.' metadata={'source': 'data\\MACPHERSON, Callum - MSc Dissertation.pdf', 'page': 0, 'start_index': 301}


page_content='Acknowledgements\nI would like to thank my supervisors, Samuel Relton, Lui

### 4: Save to a vector database using Chroma

In [8]:
# Directory where the Chroma store is persisted and later re-opened for querying.
CHROMA_PATH = "chroma"

In [9]:
def save_to_chroma(chunks: list[Document]):
    """
    Rebuild the Chroma store at CHROMA_PATH from the given chunks.

    Anything already present at CHROMA_PATH is removed first, so the store
    only ever reflects the chunks passed in this call. The chunks are embedded
    with OpenAIEmbeddings, persisted, and a confirmation line is printed.

    Args:
        chunks (list[Document]): Document chunks to embed and store.
    """
    # Wipe the previous store, if there is one, before rebuilding.
    store_already_exists = os.path.exists(CHROMA_PATH)
    if store_already_exists:
        shutil.rmtree(CHROMA_PATH)

    # Build a fresh store from the chunks and write it to disk.
    embeddings = OpenAIEmbeddings()
    vector_store = Chroma.from_documents(
        chunks,
        embeddings,
        persist_directory=CHROMA_PATH,
    )
    vector_store.persist()

    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

### 5: Create a Chroma Database

In [10]:
def generate_data_store():
    """
    Run the whole ingestion pipeline: load -> split -> store.

    The documents returned by load_documents() are chunked with split_text()
    and the resulting chunks are written to Chroma via save_to_chroma().
    """
    save_to_chroma(split_text(load_documents()))


In [11]:
# Load environment variables from a .env file (this is where the API key is expected to live)
load_dotenv()
# Generate the data store: load the PDFs, split them into chunks, and save them to Chroma
generate_data_store()

Split 100 documents into 750 chunks.
detection model were YOLOv4 and YOLOv4-tiny. Similarly, the selected feature extractors
selected for the key point regression model were EfﬁcientNet-B0 and EfﬁcientNet-B4.
Results
Both YOLOv4 and YOLOv4-tiny achieve a mean average precision (mAP) of ∼100% and
mean intersection over union (IOU) of ∼85%. YOLOv4-tiny was the preferred object detection
{'source': 'data\\MACPHERSON, Callum - MSc Dissertation.pdf', 'page': 4, 'start_index': 1255}


  warn_deprecated(


Saved 750 chunks to chroma.


#### Embedding example

In [12]:
ex = "apple"
ex_1 = "orange"
ex_2 = "iphone"

In [13]:
# One embedding client, reused to embed each of the three example strings in turn.
embedding_function = OpenAIEmbeddings()
vector, vector_1, vector_2 = [
    embedding_function.embed_query(text) for text in (ex, ex_1, ex_2)
]

In [14]:
vector, len(vector)

([0.007788693935724774,
  -0.023086208665530836,
  -0.007563429358468463,
  -0.027796285004661327,
  -0.004546249248985117,
  0.013031215302746993,
  -0.022075930433763543,
  -0.008491792648179218,
  0.01889492183715097,
  -0.029625708000381175,
  -0.002952331420639518,
  0.020123638051757233,
  -0.004467747604137248,
  0.009058367388528664,
  -0.02172096801315829,
  0.002046153398631913,
  0.030663290747172698,
  9.96731824885703e-05,
  0.0020973498299636266,
  -0.025502683390884448,
  -0.02110660990585516,
  -0.008130003633156626,
  0.02122948115478676,
  -0.012410031532349089,
  0.0011160836931474653,
  0.005030909512939308,
  0.010095949203997619,
  -1.3579071383542974e-05,
  0.015877740330683773,
  -0.012921996311327509,
  0.020642427562507858,
  -0.016082526987333194,
  -0.01847169719766258,
  0.005382458636335888,
  -0.019290840098969995,
  -0.009222196341319175,
  -0.012089200221185408,
  -0.008778492849901327,
  -0.005652093283337213,
  -0.006092383477546389,
  0.0104782170709

In [15]:
from langchain.evaluation import load_evaluator

evaluator = load_evaluator("pairwise_embedding_distance")

In [16]:
# run an evaluation

x = evaluator.evaluate_string_pairs(prediction=ex, prediction_b=ex_1)

In [17]:
x

{'score': 0.13536251856245263}

In [18]:
evaluator.evaluate_string_pairs(prediction=ex, prediction_b=ex_2)

{'score': 0.09693673454021046}

In [19]:
evaluator.evaluate_string_pairs(prediction=ex, prediction_b=ex)

{'score': 2.220446049250313e-16}

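A larger distance means the two strings are less similar in embedding space; comparing a string with itself gives a distance of (numerically) zero.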

### 6: Query vector database for relevant data

In [20]:
query_text = "Explain how the YOLO method works"

In [21]:
# Prompt skeleton for the RAG query: {context} is filled with the retrieved chunks
# and {question} with the user's query when the template is formatted.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""


In [22]:
# Re-open the persisted store with the same kind of embedding function used to build it.
embedding_function = OpenAIEmbeddings()
db = Chroma(
    embedding_function=embedding_function,
    persist_directory=CHROMA_PATH,
)

# Fetch the three best-matching chunks together with their relevance scores.
results = db.similarity_search_with_relevance_scores(query_text, k=3)

# Warn when nothing came back or the first hit scores below 0.7.
if not results or results[0][1] < 0.7:
    print("Unable to find matching results.")

In [23]:
from langchain.prompts import ChatPromptTemplate

In [24]:
# Join the retrieved chunk texts, separated by a "---" rule, to form the context.
context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)

# Fill the template's {context} and {question} slots and show the final prompt.
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(question=query_text, context=context_text)
print(prompt)

Human: 
Answer the question based only on the following context:

background (Wu et al. 2020).
YOLO
You Only Look Once (YOLO) was proposed by Redmon et al. (2016), and was unique to pre-
vious object detection algorithms in the sense that it attempted to solve object detection using
a regression-based approach. This single regression problem took the pixels of an image as

---

in YOLOv4 (Bochkovskiy et al. 2020), detailing all the new design choices are not possible in
this review due to project constraints. All of the core design decisions for the YOLO algorithm
have already been outlined in this section which are still essential building blocks of YOLOv4.
However, some key beneﬁts and techniques will be explained in a high-level overview.

---

The design of YOLO poses high spatial constraints on predicted bounding boxes. As the
image is split into an S ×S grid and each cell can only predict up to two objects from only one
class, the model struggles to predict multiple small objects

In [25]:
# Ask the chat model to answer the prompt built from the retrieved context.
model = ChatOpenAI()
response_text = model.predict(prompt)

# Collect the source of each retrieved chunk (None when the metadata has no "source").
sources = [match[0].metadata.get("source") for match in results]

# Show the answer followed by the list of sources.
formatted_response = "\n".join(
    (f"Response: {response_text}", f"Sources: {sources}")
)
print(formatted_response)

  warn_deprecated(
  warn_deprecated(


Response: The YOLO method works by dividing the image into a grid of S x S cells, where each cell is responsible for predicting up to two objects from a single class. The model uses a regression-based approach to predict the bounding boxes for these objects. However, this design constraint of only being able to predict two objects per cell makes it difficult for the model to accurately detect multiple small objects that are close to each other in the image space. This limitation is highlighted by the struggle of the model to detect an entire flock of birds.
Sources: ['data\\MACPHERSON, Callum - MSc Dissertation.pdf', 'data\\MACPHERSON, Callum - MSc Dissertation.pdf', 'data\\MACPHERSON, Callum - MSc Dissertation.pdf']


In [26]:
response_text

'The YOLO method works by dividing the image into a grid of S x S cells, where each cell is responsible for predicting up to two objects from a single class. The model uses a regression-based approach to predict the bounding boxes for these objects. However, this design constraint of only being able to predict two objects per cell makes it difficult for the model to accurately detect multiple small objects that are close to each other in the image space. This limitation is highlighted by the struggle of the model to detect an entire flock of birds.'

In [27]:
def query_rag(query_text, k=3, relevance_threshold=0.7):
    """
    Query a Retrieval-Augmented Generation (RAG) system using Chroma database and OpenAI.

    The persisted Chroma store at CHROMA_PATH is searched for the chunks most
    relevant to the query. If nothing is found, or the first result scores
    below the threshold, the function reports that and returns without calling
    the chat model. Otherwise the retrieved chunks are inserted into
    PROMPT_TEMPLATE and the chat model generates the answer.

    Args:
    - query_text (str): The text to query the RAG system with.
    - k (int): Number of chunks to retrieve from the store. Defaults to 3.
    - relevance_threshold (float): Minimum relevance score the first result
      must reach for the query to be answered. Defaults to 0.7.

    Returns:
    - formatted_response (str): Formatted response including the generated text and sources.
    - response_text (str): The generated response text, or the
      "Unable to find matching results." message when retrieval failed.
    """
    # Use same embedding function as was used to build the store
    embedding_function = OpenAIEmbeddings()

    # Prepare the database
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    # Search the DB.
    results = db.similarity_search_with_relevance_scores(query_text, k=k)

    # Bail out when there are no matches or the first (presumably best) match is
    # too weak. Continuing would send the model an empty or irrelevant context
    # even though the prompt tells it to answer from that context only.
    if not results or results[0][1] < relevance_threshold:
        response_text = "Unable to find matching results."
        print(response_text)
        return f"Response: {response_text}\nSources: []", response_text

    # Combine context from matching documents
    context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)

    # Create prompt template using context and query text
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    # Initialize OpenAI chat model
    model = ChatOpenAI()

    # Generate response text based on the prompt
    response_text = model.predict(prompt)

    # Get sources of the matching documents (None when a chunk has no "source" metadata)
    sources = [doc.metadata.get("source", None) for doc, _score in results]

    # Format and return response including generated text and sources
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    return formatted_response, response_text

In [28]:
formatted_response, response_text = query_rag(query_text)

In [29]:
response_text

'The YOLO method works by splitting the image into a grid and using a regression-based approach to predict bounding boxes for objects within each grid cell. Each cell can only predict up to two objects from one class, which can pose spatial constraints and make it difficult to detect multiple small objects in close proximity. This design choice can make it challenging for the model to accurately detect entire groups of objects, such as a flock of birds, in the image space.'