In [25]:
# Install package to enable importing environment variables for secret keys (e.g. API key)
!pip install python-dotenv
# Imports for RAG & Vector DB
!pip install faiss-cpu sentence-transformers



In [26]:
from openai import OpenAI
import os
from dotenv import load_dotenv

# Read the .env file first, then pick the API key up from the environment.
# The result is None when API_KEY is not defined.
load_dotenv()
api_key = os.environ.get('API_KEY')

In [27]:
# Client bound to the specific OpenAI project named by PROJECT_ID
client = OpenAI(
    api_key=api_key,
    project=os.environ.get('PROJECT_ID'),
)
# Id of the vector store that uploads are attached to and searches run against
vec_id = os.environ.get('VEC_ID')

In [28]:
def upload_pdf(filepath: str) -> None:
    """Upload one file to OpenAI and attach it to the configured vector store.

    The file at ``filepath`` is opened in binary mode and sent through
    ``client.files.create`` with purpose ``"assistants"``. The id of the
    created file is then registered with the vector store identified by the
    module-level ``vec_id``, and a confirmation line is printed.

    Args:
        filepath: Path of the file to upload.
    """
    with open(filepath, "rb") as pdf_handle:
        uploaded = client.files.create(file=pdf_handle, purpose="assistants")
    # Attaching to the vector store only needs the returned file id,
    # so the local handle can already be closed at this point.
    client.vector_stores.files.create(vector_store_id=vec_id, file_id=uploaded.id)
    print("Uploaded " + filepath)

In [None]:
slides_dir = '../data/'
# os.listdir returns entries in arbitrary order; sort for a stable upload order.
folder = sorted(os.listdir(slides_dir))
# Upload each PDF in the directory to the vector store.
# NOTE(review): every run of this cell uploads the files again — presumably this
# creates duplicate entries in the vector store; confirm before re-running.
for f in folder:
    path = os.path.join(slides_dir, f)
    # Skip sub-directories and anything that is not a PDF (e.g. hidden system files),
    # so only the slide decks end up in the vector store.
    if os.path.isfile(path) and f.lower().endswith('.pdf'):
        upload_pdf(path)

Uploaded data/Introduction to Support Vector Machines.pdf
Uploaded data/Introduction to Neural Networks.pdf
Uploaded data/(Re)-Introduction to Data Science & Control Flow.pptx.pdf
Uploaded data/Advanced Abstraction.pptx.pdf
Uploaded data/Advanced Control Flow.pptx.pdf
Uploaded data/Bayes Theorem Review.pdf
Uploaded data/Measures of Dispersion & Central Limit Theorem.pdf
Uploaded data/Random Forests.pdf
Uploaded data/Introduction to Data Processing.pptx.pdf
Uploaded data/Introduction to the Naive Bayes Classifier.pdf
Uploaded data/Introduction to Decision Trees.pdf
Uploaded data/Transformer Architecture.pdf
Uploaded data/NLP & Vector Embeddings.pdf
Uploaded data/Introduction to Unsupervised Learning Algorithms.pdf
Uploaded data/Applied LLMs & Agents.pdf
Uploaded data/Dimensionality Reduction with PCA.pdf
Uploaded data/Introduction to K-Nearest-Neighbors.pdf
Uploaded data/Feature Engineering and Wrangling.pdf


In [34]:
def ask(prompt: str) -> str:
    """Send ``prompt`` to the model with file search over the vector store.

    The request goes through ``client.responses.create`` using the
    ``gpt-4o-mini`` model and a single ``file_search`` tool that points at
    the vector store identified by the module-level ``vec_id``.

    Args:
        prompt: The question or instruction passed as the request input.

    Returns:
        The ``output_text`` attribute of the response.
    """
    file_search_tool = {"type": "file_search", "vector_store_ids": [vec_id]}
    response = client.responses.create(
        model='gpt-4o-mini',
        input=prompt,
        tools=[file_search_tool],
    )
    return response.output_text

In [35]:
# Example query against the vector store; the returned text is displayed as the cell output.
ask("Which slide mentioned Euclidean Distance?")

'The concept of Euclidean Distance is mentioned in multiple sections of the document titled "Introduction to K-Nearest-Neighbors." Here are some key mentions:\n\n1. The first mention discusses measuring distance and introduces Euclidean distance as the direct distance between two points, described as the hypotenuse of a right triangle.\n\n2. Additional details explain the mathematical aspects of Euclidean distance, including a breakdown of the triangle sides needed to calculate it, reflecting on the Pythagorean theorem.\n\nFor further details, you can refer to the "Introduction to K-Nearest-Neighbors" document.'