### Winning Jeopardy

In [1]:
import pandas as pd 

jeopardy = pd.read_csv('jeopardy.csv')

In [2]:
print(jeopardy.shape)
jeopardy.head()

(19999, 7)


Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [3]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [4]:
jeopardy.rename({'Show Number' : 'Show_number', ' Air Date' : 'Air_date', ' Round' :'Round', 
                ' Category': 'Category', ' Value': 'Value', ' Question': 'Question', ' Answer': 'Answer'}, axis=1, inplace=True)
jeopardy.columns

Index(['Show_number', 'Air_date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [5]:
import re
def normalize(message):
    message = message.lower()
    message = re.sub(r'[^A-Za-z0-9\s]', '', message)
    message = re.sub(r'\s+' , ' ', message)
    return message

In [6]:
jeopardy['clean_question'] = jeopardy['Question'].apply(normalize)
jeopardy['clean_answer']  = jeopardy['Answer'].apply(normalize)

In [7]:
def clean_convert(message):
    message = str(message)
    message = re.sub(r'[^A-Za-z0-9\s]', '', message)
    try:
        value = int(message)
    except Exception:
        value = 0 
    return value

In [8]:
jeopardy['clean_value'] = jeopardy['Value'].apply(clean_convert)

In [9]:
jeopardy['Air_date'] = pd.to_datetime(jeopardy['Air_date'], errors='coerce') 

In [10]:
jeopardy.dtypes

Show_number                int64
Air_date          datetime64[ns]
Round                     object
Category                  object
Value                     object
Question                  object
Answer                    object
clean_question            object
clean_answer              object
clean_value                int64
dtype: object

In [11]:
## How often the answer can be used for a question 

def answer_in_question(row):
    split_answer = row['clean_answer'].split()
    split_question = row['clean_question'].split()
    match_count = 0 
    if 'the' in split_answer:
        split_answer.remove('the') 
    if len(split_answer)==0:
        return 0        
    for txt in split_answer:
        if txt in split_question:
            match_count += 1 

    return match_count/len(split_answer)

    

In [12]:
#applying the function above to find out how many times answers are used in question
# and their fraction in the total words of the answer
jeopardy['answer_in_question'] = jeopardy.apply(answer_in_question, axis=1)

In [13]:
## finding the mean of the answer_in_question column
## this will give indication of how likely an answer is to be in question
mean_answer_in_question = jeopardy['answer_in_question'].mean()
mean_answer_in_question

np.float64(0.05900196524977763)

In [14]:
## Investigating recycled questions. 
## sorting the dataframe by the air_date column in order to investigate recycled questions

question_overlap = []
terms_used = set()

jeopardy = jeopardy.sort_values(by=['Air_date'], axis=0)

for row in jeopardy.iterrows():
    row = row[1]
    split_question = row['clean_question'].split(' ')
    split_question = [word for word in split_question if len(word) >=6]
    match_count = 0 
    for word in split_question:
        if word in terms_used:
            match_count +=1
        terms_used.add(word)
    if len(split_question) > 0:
        match_count = match_count/len(split_question)
    question_overlap.append(match_count)    
 
jeopardy['question_overlap'] = question_overlap

jeopardy['question_overlap'].mean()

np.float64(0.6894031359073245)

In [15]:
## Low versus high value  questions 
def low_vs_high(row):
    if row['clean_value'] > 800:
        value = 1
    else:
        value = 0 
    return value

jeopardy['high_value'] = jeopardy.apply(low_vs_high, axis = 1)


In [16]:
def word_count(word):
    low_count = 0 
    high_count = 0 
    for row in jeopardy.iterrows():
        row = row[1]
        split_question = row['clean_question'].split(' ')
        if word in split_question:
            if row['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count 



In [17]:
comparison_terms = iter(terms_used)
observed_expected = []

for _ in range(10):
    observed_expected.append(word_count(next(comparison_terms)))
    
observed_expected

[(0, 1),
 (1, 0),
 (1, 5),
 (1, 2),
 (0, 1),
 (1, 3),
 (0, 1),
 (2, 2),
 (1, 0),
 (0, 1)]

In [18]:
##  Applying Chi-square test
from scipy.stats import chisquare
import numpy as np

high_value = jeopardy['high_value']==1
high_value_count = high_value.sum()

low_value = jeopardy['high_value']==0
low_value_count = low_value.sum()

chi_squared = []
for item in observed_expected:
    total = sum(item)
    total_prop = total / jeopardy.shape[0]
    expected_high = total_prop * high_value_count
    expected_low = total_prop * low_value_count

    observed = np.array([item[0], item[1]])
    expected = np.array([expected_high, expected_low])
    chi_squared.append(chisquare(observed, expected))

In [19]:
chi_squared

[Power_divergenceResult(statistic=np.float64(0.401962846126884), pvalue=np.float64(0.5260772985705469)),
 Power_divergenceResult(statistic=np.float64(2.487792117195675), pvalue=np.float64(0.11473257634454047)),
 Power_divergenceResult(statistic=np.float64(0.42281054506129573), pvalue=np.float64(0.515537958129453)),
 Power_divergenceResult(statistic=np.float64(0.03188116723440362), pvalue=np.float64(0.8582887163235293)),
 Power_divergenceResult(statistic=np.float64(0.401962846126884), pvalue=np.float64(0.5260772985705469)),
 Power_divergenceResult(statistic=np.float64(0.02636443308440769), pvalue=np.float64(0.871013484688921)),
 Power_divergenceResult(statistic=np.float64(0.401962846126884), pvalue=np.float64(0.5260772985705469)),
 Power_divergenceResult(statistic=np.float64(0.889754963322559), pvalue=np.float64(0.3455437191483469)),
 Power_divergenceResult(statistic=np.float64(2.487792117195675), pvalue=np.float64(0.11473257634454047)),
 Power_divergenceResult(statistic=np.float64(0.40