In [1]:
# import libraries
import pandas as pd
import requests
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
# Download the course FAQ and flatten it into one record per question,
# tagging every record with the course it belongs to.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'

# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Build new dicts rather than editing the downloaded payload in place, so
# re-running the cell always starts from the same raw data.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

In [3]:
# Tabulate the flattened FAQ records; the explicit list pins the column order.
df = pd.DataFrame(
    data=documents,
    columns=["course", "section", "question", "text"],
)

In [4]:
# Fit a TF-IDF model on the answer text.
#   min_df=5             -> drop terms that appear in fewer than five documents
#   stop_words="english" -> use scikit-learn's built-in English stop-word list
tf_vectorizer = TfidfVectorizer(stop_words="english", min_df=5)
X = tf_vectorizer.fit_transform(df["text"])

# Vocabulary terms, in the same order as the columns of X.
word_names = tf_vectorizer.get_feature_names_out()

In [5]:
# Dense, labelled view of the TF-IDF matrix: one row per document, one column per term.
df_X = pd.DataFrame(data=X.toarray(), columns=word_names)
df_X

Unnamed: 0,01,02,03,04,05,06,09,10,100,11,...,y_val,yaml,year,yellow,yellow_tripdata_2021,yes,yml,youtube,zip,zoomcamp
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.428961
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.279891,0.000000,0.0,0.0,0.000000
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.107298,0.0,0.0,0.000000
944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
945,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.167274,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
946,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000


In [6]:
# Encode the query with the already-fitted vectorizer so it is expressed in the
# same vocabulary as the documents; the dense array is shown only for inspection.
query = "how to join slack?"
query_transform = tf_vectorizer.transform([query])
query_transform.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Map each vocabulary term to its TF-IDF weight, once for the query and once
# for the document at row 1.
query_dict = dict(zip(word_names, query_transform.toarray()[0]))
doc_dict = dict(zip(word_names, X.toarray()[1]))

# Only the last expression of a cell is rendered, so a bare name in the middle
# of the cell shows nothing; display both dictionaries explicitly instead.
display(query_dict, doc_dict)

In [7]:
# Score every document against the query using cosine similarity.
similarity_score = cosine_similarity(X, query_transform).flatten()

# argsort returns document positions ordered by ascending similarity, so the
# last five entries are the five best matches and the final one is the top hit.
top_idx = np.argsort(similarity_score)[-5:]

# Look the best match up from the ranking instead of typing a row number in by
# hand, which silently goes stale when the downloaded data or the query changes.
df.iloc[top_idx[-1]]



course                              data-engineering-zoomcamp
section                                               Project
question                 Project evaluation - Reproducibility
text        The slack thread : thttps://datatalks-club.sla...
Name: 404, dtype: object

In [9]:
# Fit one TF-IDF model per text field so a query can later be scored against
# each field separately.
feature_names = ["course", "section", "question", "text"]
metrices = {}     # field name -> sparse TF-IDF matrix for that field
vectorizers = {}  # field name -> TfidfVectorizer fitted on that field

for f in feature_names:
    # Loop-local names keep this cell from overwriting the notebook-level
    # X / tf_vectorizer that were fitted on the "text" column.
    field_vectorizer = TfidfVectorizer(min_df=5, stop_words="english")
    metrices[f] = field_vectorizer.fit_transform(df[f])
    vectorizers[f] = field_vectorizer

# Inspect the dense TF-IDF matrix of the "course" field.
metrices["course"].todense()

matrix([[0.65704068, 0.65704068, 0.        , 0.        , 0.        ,
         0.36958772],
        [0.65704068, 0.65704068, 0.        , 0.        , 0.        ,
         0.36958772],
        [0.65704068, 0.65704068, 0.        , 0.        , 0.        ,
         0.36958772],
        ...,
        [0.        , 0.        , 0.        , 0.        , 0.94609127,
         0.32390016],
        [0.        , 0.        , 0.        , 0.        , 0.94609127,
         0.32390016],
        [0.        , 0.        , 0.        , 0.        , 0.94609127,
         0.32390016]])

In [10]:
# Show the field-name -> vectorizer mapping.
vectorizers

{'course': TfidfVectorizer(min_df=5, stop_words='english'),
 'section': TfidfVectorizer(min_df=5, stop_words='english'),
 'question': TfidfVectorizer(min_df=5, stop_words='english'),
 'text': TfidfVectorizer(min_df=5, stop_words='english')}

In [13]:
query = "what is the deadline for the first homework"

# Start every document at zero, then add one cosine-similarity term per field;
# each field is scored with the vectorizer and matrix fitted on that same field.
score = np.zeros(len(df))
for f in feature_names:
    X = metrices[f]
    query_transform = vectorizers[f].transform([query])
    f_score = cosine_similarity(X, query_transform).flatten()
    score += f_score
    

In [14]:
# Positions of the five highest combined scores (ascending, so the best match is last).
idx = score.argsort()[-5:]
df.iloc[idx]

Unnamed: 0,course,section,question,text
17,data-engineering-zoomcamp,General course-related questions,Homework and Leaderboard - what is the system ...,After you submit your homework it will be grad...
870,mlops-zoomcamp,Module 2: Experiment tracking,Parameters Mismatch in Homework Q3,I was using an old version of sklearn due to w...
415,data-engineering-zoomcamp,Workshop 2 - RisingWave,Homework - dlt Exercise 3 - Merge a generator ...,"After loading, you should have a total of 8 re..."
15,data-engineering-zoomcamp,General course-related questions,Homework - Are late submissions of homework al...,"No, late submissions are not allowed. But if t..."
14,data-engineering-zoomcamp,General course-related questions,Homework - What are homework and project deadl...,You can find the latest and up-to-date deadlin...


In [22]:
# Restrict the results to documents that match every filter (here: one course).
filters = {"course": "data-engineering-zoomcamp"}

# Start with every document allowed and multiply in one 0/1 mask per filter, so
# several filters combine as a logical AND instead of the last one overwriting
# the others, and an empty `filters` leaves the scores untouched.
mask = np.ones(len(df), dtype=int)
for feature_name, value in filters.items():
    mask = mask * (df[feature_name] == value).astype(int).values

# Documents that fail any filter end up with a score of zero.
score_filtered = score * mask

In [20]:
# Positions of the five highest filtered scores (ascending, so the best match is last).
idx = score_filtered.argsort()[-5:]
df.iloc[idx]

Unnamed: 0,course,section,question,text
16,data-engineering-zoomcamp,General course-related questions,Homework - What is the homework URL in the hom...,"Answer: In short, it’s your repository on gith..."
17,data-engineering-zoomcamp,General course-related questions,Homework and Leaderboard - what is the system ...,After you submit your homework it will be grad...
415,data-engineering-zoomcamp,Workshop 2 - RisingWave,Homework - dlt Exercise 3 - Merge a generator ...,"After loading, you should have a total of 8 re..."
15,data-engineering-zoomcamp,General course-related questions,Homework - Are late submissions of homework al...,"No, late submissions are not allowed. But if t..."
14,data-engineering-zoomcamp,General course-related questions,Homework - What are homework and project deadl...,You can find the latest and up-to-date deadlin...
