In [1]:
# Install package to enable importing environment variables for secret keys (e.g. API key)
!pip install python-dotenv
# Imports for RAG & Vector DB
!pip install faiss-cpu sentence-transformers

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [4]:
from openai import OpenAI
import os
from dotenv import load_dotenv

load_dotenv()
# Import variables from .env
api_key = os.getenv('API_KEY')

In [5]:
# access the specific OpenAI project
client = OpenAI(api_key=api_key,project=os.getenv('PROJECT_ID'))
# specify vector store id
vec_id = os.getenv('VEC_ID')

In [6]:
def upload_pdf(filepath: str) -> None:
  # open the pdf file and create an object which could be interpreted by openai
  with open(filepath, "rb") as file_obj:
      f = client.files.create(file=file_obj, purpose="assistants")
      # push pdf to vector store
      client.vector_stores.files.create(
          vector_store_id=vec_id,
          file_id=f.id,
      )
  print("Uploaded " + filepath)

In [None]:
slides_dir = '../data/'
folder = os.listdir(slides_dir)
# upload each pdf to vector DB
for f in folder:
    path = slides_dir + f
    if os.path.isfile(path):
        upload_pdf(path)

Uploaded data/Introduction to Support Vector Machines.pdf
Uploaded data/Introduction to Neural Networks.pdf
Uploaded data/(Re)-Introduction to Data Science & Control Flow.pptx.pdf
Uploaded data/Advanced Abstraction.pptx.pdf
Uploaded data/Advanced Control Flow.pptx.pdf
Uploaded data/Bayes Theorem Review.pdf
Uploaded data/Measures of Dispersion & Central Limit Theorem.pdf
Uploaded data/Random Forests.pdf
Uploaded data/Introduction to Data Processing.pptx.pdf
Uploaded data/Introduction to the Naive Bayes Classifier.pdf
Uploaded data/Introduction to Decision Trees.pdf
Uploaded data/Transformer Architecture.pdf
Uploaded data/NLP & Vector Embeddings.pdf
Uploaded data/Introduction to Unsupervised Learning Algorithms.pdf
Uploaded data/Applied LLMs & Agents.pdf
Uploaded data/Dimensionality Reduction with PCA.pdf
Uploaded data/Introduction to K-Nearest-Neighbors.pdf
Uploaded data/Feature Engineering and Wrangling.pdf


In [30]:
system_prompt = """
# Identity

You are a Lecture Navigator program that points the user to the Data Science & Machine Learning lecture slides containing information relevant to the user prompt.

# Instructions

* The answer should only contain the slides file name and page numbers.

* If only one slide file contains relevant information, output in the format below.
+ Slides: <slide_filename>
  Pages: <page_numbers_list>
  Explanation:

* If more than one slide file contains relevant information, create a repeated response for the additional findings in the format above.
For example, if there are 2 files containing relevant information, the output should look like the following.
+ Slides: <slide_filename>
 Pages: <page_numbers_list>
 Explanation:

+ Slides: <slide_filename>
  Pages: <page_numbers_list>
  Explanation:

If there are 3 files containing relevant information, the output should look like the following.
+ Slides: <slide_filename>
  Pages: <page_numbers_list>
  Explanation:

+ Slides: <slide_filename>
  Pages: <page_numbers_list>
  Explanation:

+ Slides: <slide_filename>
  Pages: <page_numbers_list>
  Explanation:

* Numbers in <page_numbers_list> should be displayed in ascending order.

* If <page_numbers_list> contains continuous numbers, shorten them into number ranges. For example, `1, 2, 3, 4, 14, 15, 16, 17` will be `1-4, 14-17`

* If the prompt is irrelevant to Data Science & Machine Learning or no relevant slides can be found, simply out `No relevant slides found.`

# Examples

<user_query>
Which lecture slides mentioned Euclidean Distance?
</user_query>

<assistant_response>
+ Slides: Introduction to K-Nearest-Neighbors
 Pages: 23, 25-31, 73
</assistant_response>
"""

In [31]:
def ask(prompt: str) -> str:
    resp = client.responses.create(
      model='gpt-4o-mini',
      instructions=system_prompt,
      input=prompt,
      tools=[{"type": "file_search", "vector_store_ids": [vec_id]}],
    )
    return resp.output_text

In [32]:
print(ask("Which lecture slides mentioned Euclidean Distance?"))

+ Slides: Introduction to Unsupervised Learning Algorithms.pdf
  Pages: 1, 2, 4, 9
  Explanation: Discusses the use of squared Euclidean distance in calculating cluster variance and centroid assignment.

+ Slides: Introduction to Support Vector Machines.pdf
  Pages: 3, 4, 5
  Explanation: Explains the distance from a support vector to the hyperplane using a form akin to the Euclidean distance.

+ Slides: Introduction to K-Nearest-Neighbors.pdf
  Pages: 7, 12, 17
  Explanation: Details the calculation of Euclidean distance in kNN and its importance in measuring distances between data points.


In [33]:
print(ask("What is Euclidean Distance?"))

+ Slides: Introduction to K-Nearest-Neighbors.pdf
  Pages: 1, 3-4, 11-12, 17-19
  Explanation: These pages provide a detailed explanation of Euclidean distance, including its mathematical background using Pythagorean theorem and its application in measuring distance between points in multiple dimensions.

+ Slides: Introduction to Unsupervised Learning Algorithms.pdf
  Pages: 4-5
  Explanation: Discusses the use of squared Euclidean distance in the context of clustering algorithms, specifically K-Means, and how it relates to variance calculations within clusters.


In [34]:
test_prompts = [
    "During phase one we learned about variance, how is it related to standard deviation",
    "How do you import a CSV file?",
    "What is the difference between Lasso and Ridge?",
    "What day did we learn about elbow, gap and silhouette?",
    "Is K-means the same thing as KNN?",
    "How much is the TKH monthly stipend?",
    "What is the difference between a print statement and a return statement?",
    "How can I combine 2 tables together in SQL?"
    "Can you show me the slide that explains supervised learning?",
    "Which module covers supervised learning?",
    "I need the part about labeled data training",
    "What kind of EDA uses a bar graph?",
    "What is the difference between structured data and unstructured data?",
    "A tuple is immutable, right?"
    "What date did we learn about Explanation of Data Analaysis aka EDA?"
    "Where's the slide that covers wbe scraping use API keys?",
    "After I create my repository on Github I can just hit git push, correct?",
    "What are the steps for pushing code to Github?",
    "How much did Elon Musk donate to TKH last year?",
    "What is the difference between one-hot encoding and dummy encoding?",
    "Compare decision trees vs random forests.",
    "What kind of graph uses red dots?",
]

In [35]:
import time

# Ask each question in the test prompts
for i in range(0,5):
    print("Q" + str(i+1) + ". " + test_prompts[i])
    print("   " + ask(test_prompts[i]))
    print("---------------------------------------------------------------------------")
    # delay for 2 seconds before sending the next prompt
    time.sleep(2)

Q1. During phase one we learned about variance, how is it related to standard deviation
   + Slides: Measures of Dispersion & Central Limit Theorem.pdf
  Pages: 6-8, 14
  Explanation: These pages explain the mathematical relationship between variance and standard deviation, detailing how standard deviation is the square root of variance, thus connecting the two concepts directly.
---------------------------------------------------------------------------
Q2. How do you import a CSV file?
   + Slides: Introduction to Data Analytics II (1).pdf
  Pages: 0, 1, 2
  Explanation: This section provides detailed instructions on how to import a CSV file using the pandas library in Python, demonstrating the `pd.read_csv()` method.
---------------------------------------------------------------------------
Q3. What is the difference between Lasso and Ridge?
   + Slides: Introduction to Neural Networks.pdf
  Pages: 3, 11, 22
  Explanation: This section discusses Lasso and Ridge regression, detailin

In [25]:
for i in range(5,10):
    print("Q" + str(i+1) + ". " + test_prompts[i])
    print("   " + ask(test_prompts[i]))
    print("---------------------------------------------------------------------------")
    # delay for 2 seconds before sending the next prompt
    time.sleep(2)

Q6. How much is the TKH monthly stipend?
   No relevant slides found.
---------------------------------------------------------------------------
Q7. What is the difference between a print statement and a return statement?
   No relevant slides found.
---------------------------------------------------------------------------
Q8. How can I combine 2 tables together in SQL?Can you show me the slide that explains supervised learning?
   + Slides: Introduction to Structured Databases I.pdf
  Pages: 0-1, 5-6, 11-12
  Explanation: This section explains how to combine tables using SQL JOINs, including various types of joins like Inner Join, Left Join, Right Join, and Full Join.

+ Slides: Introduction to Decision Trees.pdf
  Pages: 15
  Explanation: This slide provides an overview of supervised learning, emphasizing how predictions are made based on input features and their relationships to target outcomes.
---------------------------------------------------------------------------
Q9. Which

In [36]:
for i in range(10,15):
    print("Q" + str(i+1) + ". " + test_prompts[i])
    print("   " + ask(test_prompts[i]))
    print("---------------------------------------------------------------------------")
    # delay for 2 seconds before sending the next prompt
    time.sleep(2)

Q11. What kind of EDA uses a bar graph?
   + Slides: Measures of Dispersion & Central Limit Theorem.pdf
  Pages: 10, 12
  Explanation: The slides discuss the use of bar charts in exploratory data analysis (EDA) to represent discrete data and showcase successes in relation to events.

+ Slides: Feature Engineering and Wrangling.pdf
  Pages: 7, 8
  Explanation: These slides highlight the significance of visualizing data types and transformations, including the application of bar graphs in EDA.
---------------------------------------------------------------------------
Q12. What is the difference between structured data and unstructured data?
   + Slides: NLP & Vector Embeddings.pdf
  Pages: 1, 2, 4
  Explanation: The slides discuss structured and unstructured data, explaining that approximately 80% of enterprise data is unstructured, and providing examples of various forms of unstructured data.

+ Slides: Feature Engineering and Wrangling.pdf
  Pages: 4, 5, 19
  Explanation: These slides

In [28]:
for i in range(15,len(test_prompts)):
    print("Q" + str(i+1) + ". " + test_prompts[i])
    print("   " + ask(test_prompts[i]))
    print("---------------------------------------------------------------------------")
    # delay for 2 seconds before sending the next prompt
    time.sleep(2)

Q16. How much did Elon Musk donate to TKH last year?
   No relevant slides found.
---------------------------------------------------------------------------
Q17. What is the difference between one-hot encoding and dummy encoding?
   + Slides: Feature Engineering and Wrangling.pdf
  Pages: 0, 1
  Explanation: The slides explain the differences between One-Hot Encoding and Dummy Encoding, highlighting that Dummy Encoding leaves one variable out to avoid multicollinearity issues when categories are closely related.
---------------------------------------------------------------------------
Q18. Compare decision trees vs random forests.
   + Slides: Random Forests.pdf  
  Pages: 1-2, 8-10, 12-13, 18  
  Explanation: These pages compare random forests and decision trees, detailing how random forests enhance decision trees by introducing bootstrapping and feature selection, leading to improved accuracy and reduced overfitting.

+ Slides: Introduction to Decision Trees.pdf  
  Pages: 1-3, 5-