In [None]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'

# Download the FAQ dataset. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() reports HTTP errors (404, 5xx)
# directly instead of as a confusing JSON decode failure on an error page.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into a single list of records, tagging
# every record with the name of the course it came from.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        # The record is tagged in place, so entries in documents_raw gain the
        # 'course' key as well.
        doc['course'] = course_name
        documents.append(doc)

In [None]:
import pandas as pd

In [None]:
# Load the flattened records into a table; `columns` selects these four
# fields and fixes their left-to-right order.
df = pd.DataFrame(
    data=documents,
    columns=['course', 'section', 'question', 'text'],
)
df.head()

In [None]:
# Preview the first rows whose course is 'data-engineering-zoomcamp'.
df.loc[df['course'] == 'data-engineering-zoomcamp'].head()

In [None]:
# Toy corpus: five short course announcements used as input to the
# vectorizers in the cells below.
docs_example = [
    "January course details, register now",
    "Course prerequisites listed in January catalog",
    "Submit January course homework by end of month",
    "Register for January course, no prerequisites",
    "January course setup: Python and Google Cloud",
]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words vectorizer created with scikit-learn's default settings.
cv = CountVectorizer()

In [None]:
# Learn the vocabulary from the example documents.
cv.fit(docs_example)

In [None]:
# Feature (term) names learned by the fitted vectorizer; used below as the
# labels for the term-count matrix.
names = cv.get_feature_names_out()
names

In [None]:
# Encode each example document as a row of term counts over the fitted
# vocabulary.
X = cv.transform(docs_example)

In [None]:
# Convert the vectorizer output to a dense array so the counts can be inspected.
X.toarray()

In [None]:
# Term-by-document table: one row per vocabulary term, one column per
# example document (columns are numbered by document position).
df_docs = pd.DataFrame(X.toarray().T, index=names)
df_docs

In [None]:
# Rebuild the bag-of-words representation, this time filtering tokens through
# scikit-learn's built-in English stop-word list. Note that `cv`, `X`, `names`
# and `df_docs` are all rebound here.
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(docs_example)
names = cv.get_feature_names_out()

# One row per remaining term, one column per example document.
df_docs = pd.DataFrame(X.toarray().T, index=names)
df_docs

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Switch from raw counts to TF-IDF weights, still using the built-in English
# stop-word list. `cv` now refers to a TfidfVectorizer (despite the name) and
# `X` / `names` are rebound; the query cells further down use these values.
cv = TfidfVectorizer(stop_words='english')
X = cv.fit_transform(docs_example)
names = cv.get_feature_names_out()

# One row per term, one column per example document; rounded for display only.
df_docs = pd.DataFrame(X.toarray().T, index=names)
df_docs.round(2)

In [None]:
# Example user question that will be encoded with the fitted vectorizer.
query = "Do I need to know python to sign up for the January course?"

In [None]:
# Encode the query with the already-fitted vectorizer (transform only, so the
# vocabulary is not refit), then show it as a dense array.
q = cv.transform([query])
q.toarray()

In [None]:
# Map each vocabulary term to its weight in the query vector (row 0 of q).
query_dict = {term: weight for term, weight in zip(names, q.toarray()[0])}
query_dict

In [None]:
# Map each vocabulary term to its weight in row 1 of X, i.e. the second
# example document.
doc_dict = {term: weight for term, weight in zip(names, X.toarray()[1])}
doc_dict

In [None]:
# Put the query and document weights side by side: one row per term, with
# columns 'query' and 'doc'.
df_qd = pd.DataFrame([query_dict, doc_dict], index=['query', 'doc']).transpose()