# Winning Jeopardy



In [1]:
import pandas as pd

jeopardy = pd.read_csv("additional_files/JEOPARDY_CSV.csv")

In [2]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [4]:
column_names = jeopardy.columns
column_names

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [5]:
jeopardy.columns = ["Show Number","Air Date","Round","Category","Value","Question","Answer"]

***
### Normalizing Text

In [18]:
import re

def text_normalization(in_string):
    in_string = str(in_string)
    proc_string = in_string.lower()
    proc_string = re.sub(r"[^\w\s]","",proc_string)
    return proc_string

jeopardy["clean_question"] = jeopardy["Question"].apply(text_normalization)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(text_normalization)

jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams


***
### Normalizing Columns

In [30]:
def currency_normalization(in_string):
    proc_string = re.sub(r"[^\w\s]","",in_string)
    try:
        proc_int = int(proc_string)
    except ValueError:
        proc_int = 0
    return proc_int

In [31]:
jeopardy["clean_value"] = jeopardy["Value"].apply(currency_normalization)
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe,200
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona,200
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds,200
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams,200


In [33]:
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])
jeopardy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216930 entries, 0 to 216929
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   Show Number     216930 non-null  int64         
 1   Air Date        216930 non-null  datetime64[ns]
 2   Round           216930 non-null  object        
 3   Category        216930 non-null  object        
 4   Value           216930 non-null  object        
 5   Question        216930 non-null  object        
 6   Answer          216928 non-null  object        
 7   clean_question  216930 non-null  object        
 8   clean_answer    216930 non-null  object        
 9   clean_value     216930 non-null  int64         
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 16.6+ MB


***
### Answers in Questions

In [44]:
def count_matches(row):
    split_answer = row[8].split(" ")
    split_question = row[7].split(" ")
    match_count = 0
    try:
        split_answer.remove("the")
    except ValueError:
        split_answer = split_answer
    
    if len(split_answer) == 0:
        return 0
    for word in split_answer:
        if word in split_question:
            match_count += 1
    return match_count/len(split_answer)

In [45]:
jeopardy["answer_in_question"] = jeopardy.apply(count_matches,axis=1)

In [50]:
mean_answer_in_question = jeopardy["answer_in_question"].mean()
print(mean_answer_in_question)

0.05934282490603389


Thus we can conclude, the answer is nearly never included in the question!

***
### Recycled Questions

On average, the answer only makes up for about 6% of the question. This isn't a huge number, and means that we probably can't just hope that hearing a question will enable us to figure out the answer. We'll probably have to study.

In [60]:
question_overlap = []
terms_used = set()
jeopardy.sort_values("Air Date",ascending=True,inplace=True)
for idx, row in jeopardy.iterrows():
    split_question = row[7].split(" ")
    split_question = [q for q in split_question if len(q) > 5]
    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count += 1
        terms_used.add(word)
    if len(split_question) > 0:
        match_count = match_count/len(split_question)
    question_overlap.append(match_count)
jeopardy["question_overlap"] = question_overlap
print(jeopardy["question_overlap"].mean())

0.8726600717080424


***
### Low Value vs High Value Questions

There is about 87% overlap between terms in new questions and terms in old questions. This only looks at a small set of questions, and it doesn't look at phrases, it looks at single terms. This makes it relatively insignificant, but it does mean that it's worth looking more into the recycling of questions.

In [71]:
def categorize_value(row):
    clean_value = row[9]
    if clean_value > 800:
        value = 1
    else:
        value = 0
    return value

In [72]:
jeopardy["high_value"] = jeopardy.apply(categorize_value,axis=1)

In [73]:
def determine_counts(word):
    low_count = 0
    high_count = 0
    for idx, row in jeopardy.iterrows():
        question_split = row["clean_question"].split(" ")
        if word in question_split:
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

In [93]:
from random import choice

terms_used_list = list(terms_used)
comparison_terms = [choice(terms_used_list) for _ in range(10)]

In [95]:
print(comparison_terms)

['zamboni', 'shakespeareoriented', 'forbush', 'pimpled', 'simenon', 'sacrifice', 'cottonspinning', 'unsecured', 'obscenity', 'stoppards']


In [101]:
observed_expected = []

for term in comparison_terms:
    observed_expected.append(determine_counts(term))

observed_expected

[(1, 1),
 (0, 1),
 (1, 4),
 (1, 0),
 (0, 1),
 (11, 16),
 (1, 0),
 (0, 1),
 (6, 5),
 (2, 0)]

***
### Applying the Chi-spuared Test

In [104]:
from scipy.stats import chisquare
import numpy as np

high_value_count = jeopardy["high_value"].sum()
total_count = len(jeopardy)
low_value_count = total_count - high_value_count


chi_squared = []

for entry in observed_expected:
    total = entry[0]+entry[1]
    total_prop = total/total_count
    expt_high = total_prop*high_value_count
    expt_low = total_prop*low_value_count
    observed = np.array([entry[0],entry[1]])
    expected = np.array(expt_high,expt_low)
    chi_squared.append(chisquare(observed,expected))

In [106]:
for i in range(10):
    print(comparison_terms[i],"= ",chi_squared[i])

zamboni =  Power_divergenceResult(statistic=0.6643645342622282, pvalue=0.4150233140704003)
shakespeareoriented =  Power_divergenceResult(statistic=2.0980804794980186, pvalue=0.14748418972902788)
forbush =  Power_divergenceResult(statistic=4.839528117915998, pvalue=0.027814505121481037)
pimpled =  Power_divergenceResult(statistic=2.0980804794980186, pvalue=0.14748418972902788)
simenon =  Power_divergenceResult(statistic=2.0980804794980186, pvalue=0.14748418972902788)
sacrifice =  Power_divergenceResult(statistic=10.604012149916837, pvalue=0.0011284252979914596)
cottonspinning =  Power_divergenceResult(statistic=2.0980804794980186, pvalue=0.14748418972902788)
unsecured =  Power_divergenceResult(statistic=2.0980804794980186, pvalue=0.14748418972902788)
obscenity =  Power_divergenceResult(statistic=3.814541139566517, pvalue=0.050809520356018774)
stoppards =  Power_divergenceResult(statistic=4.196160958996037, pvalue=0.04051560264180788)


### Chi-squared results

Three of the terms had a significant difference in usage between high value and low value rows. Nevertheless, the frequencies were nearly all lower than 5, so the chi-squared test isn't as valid. It would be better to run this test with only terms that have higher frequencies.
