In [95]:
import pandas as pd
import string
import numpy.random as random
import numpy as np
from scipy.stats import chisquare

In [2]:
jeopardy = pd.read_csv('jeopardy.csv')

In [3]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [4]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [5]:
# Fixing leading spaces in column names

jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value',
       'Question', 'Answer']

In [6]:
def normalize(in_str):
    
    punc = string.punctuation
    
    punc += '“'
    punc += '”'
    
    trans_table = str.maketrans('', '', punc)
    
    return in_str.lower().translate(trans_table)

In [7]:
jeopardy['clean_question'] = jeopardy['Question'].apply(normalize)
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalize)

In [8]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams


In [9]:
def norm_dollar(in_str):    
    try:
        return int(normalize(in_str))
    except ValueError:
        return 0

In [10]:
jeopardy['clean_value'] = jeopardy['Value'].apply(norm_dollar)

In [11]:
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])

In [12]:
jeopardy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 10 columns):
Show Number       19999 non-null int64
Air Date          19999 non-null datetime64[ns]
Round             19999 non-null object
Category          19999 non-null object
Value             19999 non-null object
Question          19999 non-null object
Answer            19999 non-null object
clean_question    19999 non-null object
clean_answer      19999 non-null object
clean_value       19999 non-null int64
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 1.5+ MB


In [13]:
def qa_compare(row):
    
    split_answer = set(row['clean_answer'].split())
    split_question = set(row['clean_question'].split())
    
    if not split_answer:
        return 0
    
    common = list(set(split_answer) & set(split_question))
    
    try:
        common.remove('the')
    except ValueError:
        pass
    
    return len(common) / len(split_answer)   

In [14]:
jeopardy['answer_in_question'] = jeopardy.apply(qa_compare, axis=1)

In [15]:
jeopardy['answer_in_question'].mean()

0.055226006666078672

Based on the analysis above, only 5.5% of the Jeopardy answers share words with the question.  Studying question/answer pairs like this would not be an effective strategy.

In [16]:
question_overlap = []
terms_used = set()

In [17]:
jeopardy.sort_values('Air Date', inplace=True)

In [18]:
for index, row in jeopardy.iterrows():

    split_question = list(set(row['clean_question'].split()))
    
    for word in split_question:
        if len(word) < 6:
            split_question.remove(word)
            
    match_count = len(terms_used & set(split_question))
    
    question_overlap.append(match_count)
    
    terms_used.update(set(split_question)) 

In [19]:
jeopardy['question_overlap'] = question_overlap

In [20]:
print(jeopardy['question_overlap'].mean())

6.27536376819


In [21]:
def q_value(row):
    
    if row['clean_value'] > 800:
        return 1
    else:
        return 0

In [22]:
jeopardy['high_value'] = jeopardy.apply(q_value, axis=1)

In [40]:
def high_low(word):
    
    low_count = 0
    high_count = 0
    
    for index, row in jeopardy.iterrows():
        
        q_words = row['clean_question'].split()
        
        if word in q_words:
            
            if row['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1
            
            
    return high_count, low_count

In [82]:
observed_expected = []

In [83]:
random.seed(44)
comparison_terms = choice(list(terms_used), 5)
comparison_terms

array(['trainee', 'singers', 'wilderness', 'manned', 'discoverers'],
      dtype='<U54')

In [84]:
for word in comparison_terms:
    observed_expected.append(high_low(word))

In [85]:
print(observed_expected)

[(1, 1), (2, 8), (0, 3), (2, 2), (0, 3)]


In [101]:
high_value_count = jeopardy['high_value'].sum()
low_value_count = jeopardy.shape[0] - high_value_count
chi_squared = []

In [102]:
for count in observed_expected:
    
    total = sum(count)
    total_prop = total/jeopardy.shape[0]
    expected_high = total_prop * high_value_count
    expected_low = total_prop * low_value_count
    
    observed = np.array([count[0], count[1]])
    expected = np.array([expected_high, expected_low])
    
    chi_squared.append(chisquare(observed, expected))
    

In [103]:
chi_squared

[Power_divergenceResult(statistic=0.44487748166127949, pvalue=0.50477764875459963),
 Power_divergenceResult(statistic=0.36767906209032747, pvalue=0.54427210409625948),
 Power_divergenceResult(statistic=1.2058885383806519, pvalue=0.27214791766901714),
 Power_divergenceResult(statistic=0.88975496332255899, pvalue=0.34554371914834692),
 Power_divergenceResult(statistic=1.2058885383806519, pvalue=0.27214791766901714)]