In [32]:
import pandas as pd
import requests
import numpy as np
import re
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.naive_bayes import MultinomialNB 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score 
from datetime import datetime
import string


ModuleNotFoundError: No module named 'metrics'

In [8]:
# Download the raw Jeopardy! dataset and store it next to the notebook as
# jeopardy.json so the following cell can read it from disk.
url = "https://www.dropbox.com/scl/fi/7rbdzltgef1gl3tmhv6il/jeopardy.json?rlkey=fa3jeovy07nsfpl01hufvxkz3&st=vnwpphp6&dl=1"
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=60)
# Fail here on an HTTP error instead of silently saving an error page as
# jeopardy.json and hitting a confusing parse error when it is loaded.
response.raise_for_status()

with open("jeopardy.json", "wb") as f:
    f.write(response.content)

In [11]:
# Load the downloaded JSON file into a DataFrame and take a first look at it.
df = pd.read_json("jeopardy.json")
print(df.columns)
print(df.head())

# columns: category, air_date, question, value, answer, round, show_number
# (kept as a real comment: a bare triple-quoted string is an expression, and as
# the last statement of a cell it would be echoed back as cell output)

Index(['category', 'air_date', 'question', 'value', 'answer', 'round',
       'show_number'],
      dtype='object')
                          category    air_date  \
0                          HISTORY  2004-12-31   
1  ESPN's TOP 10 ALL-TIME ATHLETES  2004-12-31   
2      EVERYBODY TALKS ABOUT IT...  2004-12-31   
3                 THE COMPANY LINE  2004-12-31   
4              EPITAPHS & TRIBUTES  2004-12-31   

                                            question value       answer  \
0  'For the last 8 years of his life, Galileo was...  $200   Copernicus   
1  'No. 2: 1912 Olympian; football star at Carlis...  $200   Jim Thorpe   
2  'The city of Yuma in this state has a record a...  $200      Arizona   
3  'In 1963, live on "The Art Linkletter Show", t...  $200  McDonald\'s   
4  'Signer of the Dec. of Indep., framer of the C...  $200   John Adams   

       round  show_number  
0  Jeopardy!         4680  
1  Jeopardy!         4680  
2  Jeopardy!         4680  
3  Jeopardy!        

In [16]:
# Turn the formatted `value` strings (e.g. "$2,000") into plain floats.
df['value_clean'] = (
    df['value']
    .replace('[$,]', '', regex=True)  # drop "$" and thousands separators
    .replace('None', 0)               # literal 'None' strings count as 0
    .replace('', 0)                   # empty strings count as 0
    .fillna(0)                        # missing values count as 0
    .astype(float)
)

# Summary statistics, used to choose the high/low threshold.
print(df['value_clean'].describe())

# Use the median as the classification threshold. It is computed from the
# column rather than typed in by hand, so it stays correct if the data or the
# cleaning above changes (the recorded run reports a median of 600).
median_value = df['value_clean'].median()

count    216930.000000
mean        739.988476
std         639.822693
min           0.000000
25%         400.000000
50%         600.000000
75%        1000.000000
max       18000.000000
Name: value_clean, dtype: float64


In [19]:
# Binary target: 1 when a clue is worth at least the median value, else 0.
is_high = df['value_clean'] >= median_value
df['high_value'] = is_high.astype('int64')

# Share of each class, to check how balanced the two labels are.
df['high_value'].value_counts(normalize=True)

high_value
1    0.524907
0    0.475093
Name: proportion, dtype: float64

In [25]:
# clean the question text
# Translation table that deletes every character in string.punctuation. It is
# built once here instead of inside clean_text, which is called once per row.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Lower-case a question and strip its ASCII punctuation.

    Args:
        text: The value to clean. It is converted with ``str()`` first, so
            non-string input (for example ``None`` or a number) is accepted
            and cleaned as its string form.

    Returns:
        The lower-cased text with every character from ``string.punctuation``
        removed. Punctuation is deleted, not replaced by a space, so
        "well-known" becomes "wellknown". Whitespace is left untouched.
    """
    return str(text).lower().translate(_PUNCTUATION_TABLE)

# Run every question through clean_text and keep the result in a new column.
df['clean_question'] = df['question'].map(clean_text)

In [42]:
# Features are the cleaned question text; the target is the binary label.
# Both columns are created by earlier cells, so those must have been run first.
x = df['clean_question']
y = df['high_value']

# No test_size is given, so train_test_split uses its default proportion.
# The fixed random_state makes the shuffle reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=123,
)

KeyError: 'high_value'

In [38]:
# tfidf
# Turn the question text into TF-IDF feature matrices. The vectorizer is fitted
# on the training questions only; the test questions are merely transformed with
# that already-fitted vectorizer, so nothing is learned from the test set.
tfidf_vectorize = TfidfVectorizer(use_idf=True)
x_train_tf = tfidf_vectorize.fit_transform(x_train)
x_test_tf = tfidf_vectorize.transform(x_test)

In [39]:
# Multinomial Naive Bayes classifier trained on the TF-IDF training matrix
# (fit returns the fitted estimator, so construction and training are chained).
naive_bayes = MultinomialNB().fit(x_train_tf, y_train)

# Predicted high/low labels for the held-out test questions.
predictions = naive_bayes.predict(x_test_tf)

In [43]:
# Share of held-out questions whose high/low label was predicted correctly.
print('Accuracy: ', accuracy_score(y_test, predictions))
# The recorded run scored 56.42%. For context, the larger class is about 52.5%
# of the full dataset, so this is only a few points better than always
# predicting that class.

Accueracy:  0.5642136706433353


Unnamed: 0,category,air_date,question,value,answer,round,show_number,value_clean,high_value,clean_question
0,HISTORY,2004-12-31,"'For the last 8 years of his life, Galileo was...",$200,Copernicus,Jeopardy!,4680,200.0,0,for the last 8 years of his life galileo was u...
1,ESPN's TOP 10 ALL-TIME ATHLETES,2004-12-31,'No. 2: 1912 Olympian; football star at Carlis...,$200,Jim Thorpe,Jeopardy!,4680,200.0,0,no 2 1912 olympian football star at carlisle i...
2,EVERYBODY TALKS ABOUT IT...,2004-12-31,'The city of Yuma in this state has a record a...,$200,Arizona,Jeopardy!,4680,200.0,0,the city of yuma in this state has a record av...
3,THE COMPANY LINE,2004-12-31,"'In 1963, live on ""The Art Linkletter Show"", t...",$200,McDonald\'s,Jeopardy!,4680,200.0,0,in 1963 live on the art linkletter show this c...
4,EPITAPHS & TRIBUTES,2004-12-31,"'Signer of the Dec. of Indep., framer of the C...",$200,John Adams,Jeopardy!,4680,200.0,0,signer of the dec of indep framer of the const...
...,...,...,...,...,...,...,...,...,...,...
216925,RIDDLE ME THIS,2006-05-11,'This Puccini opera turns on the solution to 3...,$2000,Turandot,Double Jeopardy!,4999,2000.0,1,this puccini opera turns on the solution to 3 ...
216926,"""T"" BIRDS",2006-05-11,'In North America this term is properly applie...,$2000,a titmouse,Double Jeopardy!,4999,2000.0,1,in north america this term is properly applied...
216927,AUTHORS IN THEIR YOUTH,2006-05-11,"'In Penny Lane, where this ""Hellraiser"" grew u...",$2000,Clive Barker,Double Jeopardy!,4999,2000.0,1,in penny lane where this hellraiser grew up th...
216928,QUOTATIONS,2006-05-11,"'From Ft. Sill, Okla. he made the plea, Arizon...",$2000,Geronimo,Double Jeopardy!,4999,2000.0,1,from ft sill okla he made the plea arizona is ...
