### Today, we will use Jeopardy as our data source

In [1]:
import json, re
import requests
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

url = 'https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/jeopardy_100.json'
resp = requests.get(url)
data = json.loads(resp.text)
jeopardy_data = pd.DataFrame(data)[['Question', 'Answer']]

In [2]:
# 2. Simple text preprocessing (lowercase, remove non-alpha, remove stopwords)
def simple_preprocess(text):
    stopwords = set([
        'the', 'and', 'is', 'in', 'to', 'of', 'a', 'for', 'on', 'with', 'as', 'by', 'an', 'at', 'from', 'that', 'this', 'it', 'be', 'or', 'are', 'was', 'but', 'not', 'which', 'have', 'has', 'had', 'were', 'their', 'they', 'you', 'he', 'she', 'we', 'his', 'her', 'its', 'them', 'our', 'us', 'can', 'will', 'would', 'should', 'could', 'may', 'might', 'do', 'does', 'did', 'so', 'if', 'then', 'than', 'about', 'into', 'out', 'up', 'down', 'over', 'under', 'again', 'further', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'only', 'own', 'same', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now'
    ])
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    tokens = [w for w in tokens if w not in stopwords]
    return ' '.join(tokens)

In [3]:
# 3. Preprocess all questions
jeopardy_data['processed_question'] = jeopardy_data['Question'].apply(simple_preprocess)

In [4]:
# 4. Fit TF-IDF vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=3)
tfidf_matrix = vectorizer.fit_transform(jeopardy_data['processed_question'])

In [5]:
#5. Show sample processed data
print(jeopardy_data[['Question', 'processed_question']].head())

                                            Question  \
0  Abraham Lincoln died across the street from th...   
1  Any pigment on the wall so faded you can barel...   
2  After the original 13, this was the 1st state ...   
3  In 1922 Warren Harding said that this "gauges ...   
4  On Jan. 19, 1977 President Ford pardoned this ...   

                                  processed_question  
0   abraham lincoln died across street theatre april  
1                      pigment wall faded barely see  
2             after original st state admitted union  
3  warren harding said gauges speed presentday li...  
4  jan president ford pardoned woman whod been co...  


In [6]:
from sklearn.metrics.pairwise import cosine_similarity

def get_best_answer(user_question, threshold=0.3):
    processed_q = simple_preprocess(user_question)
    user_vec = vectorizer.transform([processed_q])
    similarities = cosine_similarity(user_vec, tfidf_matrix)
    best_idx = similarities.argmax()
    best_score = similarities[0, best_idx]
    if best_score < threshold:
        return {
            "Matched Question": None,
            "Answer": "No good match found.",
            "Score": best_score
        }
    return {
        "Matched Question": jeopardy_data.iloc[best_idx]['Question'],
        "Answer": jeopardy_data.iloc[best_idx]['Answer'],
        "Score": best_score
    }


In [8]:
test_questions = [
    "Who was the first president of the United States?",
]

for q in test_questions:
    result = get_best_answer(q)
    print(f"Your Question: {q}")
    print(f"Matched Jeopardy Question: {result['Matched Question']}")
    print(f"Jeopardy Answer: {result['Answer']}")
    print(f"Similarity Score: {result['Score']:.2f}")
    print("-" * 60)


Your Question: Who was the first president of the United States?
Matched Jeopardy Question: Seen <a href="http://www.j-archive.com/media/2007-07-25_J_18.jpg" target="_blank">here</a>, he was the earliest president ever to be photographed, & the first to have a middle name
Jeopardy Answer: John Quincy Adams
Similarity Score: 0.73
------------------------------------------------------------
