In [1]:
import pandas as pd
jeopardy = pd.read_csv('jeopardy.csv')
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [2]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [3]:
new_col = []
for col in jeopardy.columns:
    col = col.strip()
    new_col.append(col)
jeopardy.columns = new_col

In [4]:
jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [5]:
import re
def normalize_text(a_string):
    a_string = a_string.lower()
    a_string = re.sub('\W', ' ', a_string)
    return a_string
jeopardy['clean_question'] = jeopardy['Question'].apply(normalize_text)
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalize_text)

In [6]:
jeopardy['clean_answer']

0                                               copernicus
1                                               jim thorpe
2                                                  arizona
3                                               mcdonald s
4                                               john adams
5                                                  the ant
6                                           the appian way
7                                           michael jordan
8                                               washington
9                                           crate   barrel
10                                          jackie gleason
11                                                 the cud
12                                   ceylon  or sri lanka 
13                                               jim brown
14                                            the uv index
15                                                  bulova
16                                             jesse jam

In [7]:
jeopardy['clean_question']

0        for the last 8 years of his life  galileo was ...
1        no  2  1912 olympian  football star at carlisl...
2        the city of yuma in this state has a record av...
3        in 1963  live on  the art linkletter show   th...
4        signer of the dec  of indep   framer of the co...
5        in the title of an aesop fable  this insect sh...
6        built in 312 b c  to link rome   the south of ...
7        no  8  30 steals for the birmingham barons  2 ...
8        in the winter of 1971 72  a record 1 122 inche...
9        this housewares store was named for the packag...
10                                         and away we go 
11       cows regurgitate this from the first stomach t...
12       in 1000 rajaraja i of the cholas battled to ta...
13       no  1  lettered in hoops  football   lacrosse ...
14       on june 28  1994 the nat l weather service beg...
15       this company s accutron watch  introduced in 1...
16       outlaw   murdered by a traitor and a coward wh.

In [8]:
def normalize_value(a_string):
    a_string = re.sub('\W', '', a_string)
    if a_string == 'None':
        a_string = 0
    else:
        a_string = int(a_string)
    return a_string

In [9]:
jeopardy['clean_value'] = jeopardy['Value'].apply(normalize_value)

In [10]:
jeopardy['clean_date'] = pd.to_datetime(jeopardy['Air Date'])

In [11]:
def not_study(row):
    split_answer = row['clean_answer'].split()
    split_question = row['clean_question'].split()
    match_count = 0
    if 'the' in split_answer:
        split_answer.remove('the')
    if len(split_answer) != 0:
        for word in split_answer:
            if word in split_question:
                match_count += 1
            else:
                continue
    else:
        return 0
    return match_count / len(split_answer)                

In [12]:
jeopardy['answer_in_question'] = jeopardy.apply(not_study, axis=1)

In [13]:
jeopardy['answer_in_question'].mean()

0.06294645581984949

In [14]:
question_overlap = []
terms_used = set()
jeopardy = jeopardy.sort_values(by=['Air Date'])

In [16]:
for row in jeopardy.iterrows():
    split_question = row[1]['clean_question'].split(' ')
    split_question = [word for word in split_question if len(word) >= 6 ]
    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count += 1
    terms_used.add(word)
    if len(split_question) > 0:
        match_count = match_count / len(split_question)
    question_overlap.append(match_count)
jeopardy['question_overlap'] = question_overlap
jeopardy['question_overlap'].mean()

0.5254786645114725

In [17]:
def change_value(row):
    value = 1 if row['clean_value'] > 800 else 0
    return value

In [18]:
jeopardy['high_value'] = jeopardy.apply(change_value, axis=1)

In [25]:
def count_value(word):
    low_count = 0
    high_count = 0
    for row in jeopardy.iterrows():
        split_question = row[1]['clean_question'].split(' ')
        if word in split_question:
            if row[1]['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1
    return low_count, high_count

In [26]:
from random import *
comparison_terms = sample(terms_used, 10)

In [27]:
observed_expected = []
for term in comparison_terms:
    observed_expected.append(count_value(term))

In [28]:
print(observed_expected)

[(5, 3), (7, 4), (1, 0), (1, 1), (4, 0), (1, 1), (2, 0), (1, 2), (2, 0), (1, 1)]


In [29]:
high_value_count = sum(jeopardy['high_value'] == 1)

In [34]:
low_value_count = sum(jeopardy['high_value'] == 0)

In [56]:
from scipy.stats import chisquare
import numpy as np
chi_squared = []
for item in observed_expected:
    total = sum(item)
    total_prop = total / len(jeopardy)
    high_expected = total_prop * high_value_count
    low_expected = total_prop * low_value_count
    observed = np.array(high_value_count, low_value_count)
    expected = np.array(high_expected, low_expected)
    chisq, p_value = chisquare([observed, expected])
    chi_squared.append([chisq, p_value])

TypeError: data type not understood

In [54]:
print(chi_squared)

[[5727.122524602264, 0.0], [5724.545361940761, 0.0], [5733.139914335717, 0.0], [5732.2799433457, 0.0], [5730.560345319671, 0.0], [5732.2799433457, 0.0], [5732.2799433457, 0.0], [5731.42008701275, 0.0], [5732.2799433457, 0.0], [5732.2799433457, 0.0]]


In [47]:
print(high_value_count)

5734


In [48]:
print(low_value_count)

14265


In [52]:
for item in observed_expected:
    total = sum(item)
    print(total / len(jeopardy) * high_value_count, total / len(jeopardy) * low_value_count)

2.2937146857342867 5.706285314265713
3.1538576928846442 7.846142307115356
0.28671433571678584 0.7132856642832142
0.5734286714335717 1.4265713285664283
1.1468573428671434 2.8531426571328566
0.5734286714335717 1.4265713285664283
0.5734286714335717 1.4265713285664283
0.8601430071503575 2.1398569928496425
0.5734286714335717 1.4265713285664283
0.5734286714335717 1.4265713285664283
