In [1]:
# Install package to enable importing environment variables for secret keys (e.g. API key)
!pip install python-dotenv
# Imports for RAG & Vector DB
!pip install faiss-cpu sentence-transformers

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [4]:
from openai import OpenAI
import os
from dotenv import load_dotenv

load_dotenv()
# Import variables from .env
api_key = os.getenv('API_KEY')

In [5]:
# access the specific OpenAI project
client = OpenAI(api_key=api_key,project=os.getenv('PROJECT_ID'))
# specify vector store id
vec_id = os.getenv('VEC_ID')

In [6]:
def upload_pdf(filepath: str) -> None:
  # open the pdf file and create an object which could be interpreted by openai
  with open(filepath, "rb") as file_obj:
      f = client.files.create(file=file_obj, purpose="assistants")
      # push pdf to vector store
      client.vector_stores.files.create(
          vector_store_id=vec_id,
          file_id=f.id,
      )
  print("Uploaded " + filepath)

In [8]:
slides_dir = 'data/'
folder = os.listdir(slides_dir)
# upload each pdf to vector DB
for f in folder:
    path = slides_dir + f
    if os.path.isfile(path):
        upload_pdf(path)

Uploaded data/Introduction to Support Vector Machines.pdf
Uploaded data/Introduction to Neural Networks.pdf
Uploaded data/(Re)-Introduction to Data Science & Control Flow.pptx.pdf
Uploaded data/Advanced Abstraction.pptx.pdf
Uploaded data/Advanced Control Flow.pptx.pdf
Uploaded data/Bayes Theorem Review.pdf
Uploaded data/Measures of Dispersion & Central Limit Theorem.pdf
Uploaded data/Random Forests.pdf
Uploaded data/Introduction to Data Processing.pptx.pdf
Uploaded data/Introduction to the Naive Bayes Classifier.pdf
Uploaded data/Introduction to Decision Trees.pdf
Uploaded data/Transformer Architecture.pdf
Uploaded data/NLP & Vector Embeddings.pdf
Uploaded data/Introduction to Unsupervised Learning Algorithms.pdf
Uploaded data/Applied LLMs & Agents.pdf
Uploaded data/Dimensionality Reduction with PCA.pdf
Uploaded data/Introduction to K-Nearest-Neighbors.pdf
Uploaded data/Feature Engineering and Wrangling.pdf


In [9]:
system_prompt = """
# Identity

You are a Lecture Navigator program that points the user to the Data Science & Machine Learning lecture slides containing information relevant to the user prompt.

# Instructions

* The answer should only contain the slides file name and page numbers.

* If only one slide file contains relevant information, output in the format below.
+ Slides: <slide_filename>
  Pages: <page_numbers_list>


* If more than one slide file contains relevant information, create a repeated response for the additional findings in the format above.
For example, if there are 2 files containing relevant information, the output should look like the following.
+ Slides: <slide_filename>
 Pages: <page_numbers_list>

+ Slides: <slide_filename>
  Pages: <page_numbers_list>


If there are 3 files containing relevant information, the output should look like the following.
+ Slides: <slide_filename>
  Pages: <page_numbers_list>

+ Slides: <slide_filename>
  Pages: <page_numbers_list>

+ Slides: <slide_filename>
  Pages: <page_numbers_list>


* Numbers in <page_numbers_list> should be displayed in ascending order.

* If <page_numbers_list> contains continuous numbers, shorten them into number ranges. For example, `1, 2, 3, 4, 14, 15, 16, 17` will be `1-4, 14-17`

* If the prompt is irrelevant to Data Science & Machine Learning or no relevant slides can be found, simply out `No relevant slides found.`

# Examples

<user_query>
Which lecture slides mentioned Euclidean Distance?
</user_query>

<assistant_response>
+ Slides: Introduction to K-Nearest-Neighbors
 Pages: 23, 25-31, 73
</assistant_response>
"""

In [10]:
def ask(prompt: str) -> str:
    resp = client.responses.create(
      model='gpt-4o-mini',
      instructions=system_prompt,
      input=prompt,
      tools=[{"type": "file_search", "vector_store_ids": [vec_id]}],
    )
    return resp.output_text

In [11]:
print(ask("Which lecture slides mentioned Euclidean Distance?"))

+ Slides: Introduction to K-Nearest-Neighbors.pdf
  Pages: 6, 9-11

+ Slides: Introduction to Unsupervised Learning Algorithms.pdf
  Pages: 2-3, 4-5

+ Slides: Introduction to Support Vector Machines.pdf
  Pages: 1-2


In [12]:
test_prompts = [
    "During phase one we learned about variance, how is it related to standard deviation",
    "How do you import a CSV file?",
    "What is the difference between Lasso and Ridge?",
    "What day did we learn about elbow, gap and silhouette?",
    "Is K-means the same thing as KNN?",
    "How much is the TKH monthly stipend?",
    "What is the difference between a print statement and a return statement?",
    "How can I combine 2 tables together in SQL?"
    "Can you show me the slide that explains supervised learning?",
    "Which module covers supervised learning?",
    "I need the part about labeled data training",
    "What kind of EDA uses a bar graph?",
    "What is the difference between structured data and unstructured data?",
    "A tuple is immutable, right?"
    "What date did we learn about Explanation of Data Analaysis aka EDA?"
    "Where's the slide that covers wbe scraping use API keys?",
    "After I create my repository on Github I can just hit git push, correct?",
    "What are the steps for pushing code to Github?",
    "How much did Elon Musk donate to TKH last year?",
    "What is the difference between one-hot encoding and dummy encoding?",
    "Compare decision trees vs random forests.",
    "What kind of graph uses red dots?",
]

In [13]:
import time

# Ask each question in the test prompts
for i in range(len(test_prompts)):
    print("Q" + str(i+1) + ". " + test_prompts[i] + "\n   " + ask(test_prompts[i]))
    print("---------------------------------------------------------------------------")
    # delay for 5 seconds before sending the next prompt
    time.sleep(5)

Q1. During phase one we learned about variance, how is it related to standard deviation
   + Slides: Measures of Dispersion & Central Limit Theorem.pdf
  Pages: 5-8, 18
---------------------------------------------------------------------------
Q2. How do you import a CSV file?
   + Slides: Introduction to Data Analytics II (1).pdf
  Pages: 0-1, 2-4, 5

+ Slides: Introduction to Data Processing.pptx.pdf
  Pages: 3-3
---------------------------------------------------------------------------
Q3. What is the difference between Lasso and Ridge?
   + Slides: Introduction to Neural Networks.pdf
  Pages: 2, 4, 6

+ Slides: Introduction to Support Vector Machines.pdf
  Pages: 8-10, 12

+ Slides: Feature Engineering and Wrangling.pdf
  Pages: 5-7
---------------------------------------------------------------------------
Q4. What day did we learn about elbow, gap and silhouette?
   + Slides: Introduction to Unsupervised Learning Algorithms.pdf
  Pages: 0, 1, 4
---------------------------------