In [136]:
import pandas as pd
import numpy as np
import re
from scipy.stats import chisquare

In [24]:
jeopardy = pd.read_csv('jeopardy.csv')
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [96]:
lst = []
for i in range(len(jeopardy.columns)):
    lst.append(jeopardy.columns[i].replace(" ", ""))    
jeopardy.columns = lst

In [26]:
def normalize_str(string):
    string = string.lower()
    string = re.sub(r'[^\w\s]', '',string)
    return string

In [29]:
jeopardy['clean_question'] = jeopardy.Question.apply(normalize_str)
jeopardy['clean_answer'] = jeopardy.Answer.apply(normalize_str)

In [30]:
jeopardy.head()

Unnamed: 0,ShowNumber,AirDate,Round,Category,Value,Question,Answer,clean_question,clean_answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams


In [31]:
def normalize_dollar(string):
    string = re.sub(r'[^\w\s]', '',string)
    try:
        integer = int(string)
    except ValueError:
        integer = 0
    return integer

In [34]:
jeopardy['clean_value'] = jeopardy.Value.apply(normalize_dollar)

In [36]:
jeopardy['AirDate'] = pd.to_datetime(jeopardy['AirDate'])

### How often the answer is deducible from the question.

In [62]:
def task_occurs(row):
    split_answer = row[8].split(" ")
    if 'the' in split_answer:
        split_answer.remove('the')
    split_question = row[7].split(" ")
    match_count = 0
    if len(split_answer) ==0:
        return 0
    for item in split_answer:
        if item in split_question:
            match_count+=1
    return match_count / len(split_answer)

In [64]:
answer_in_question = jeopardy.apply(task_occurs, axis = 1)

In [67]:
answer_in_question.mean()

0.060493257069335872

In [68]:
answer_in_question.value_counts()

0.000000    17375
0.500000     1452
0.333333      551
0.250000      170
1.000000      123
0.666667      103
0.200000       82
0.166667       28
0.400000       28
0.142857       20
0.750000       18
0.285714       10
0.600000        9
0.125000        9
0.428571        3
0.181818        2
0.800000        2
0.571429        2
0.300000        2
0.111111        2
0.307692        1
0.444444        1
0.222222        1
0.375000        1
0.100000        1
0.153846        1
0.875000        1
0.272727        1
dtype: int64

In [73]:
1 - 17375 / len(answer_in_question)

0.13120656032801636

Hm, in 13% of answers there are some prompts in questions.

### How often new questions are repeats of older questions.

In [86]:
question_overlap = []
terms_used = set()

for row in jeopardy.iterrows():
    match_count = 0
    split_question = row[1][7].split(" ")
    split_question = [term for term in split_question if len(term) > 5]
    for term in split_question:
        if term in terms_used:
            match_count+=1
        terms_used.add(term)
    if len(split_question) > 0:
        question_overlap.append(match_count / len(split_question))
    else:
        question_overlap.append(0)

In [87]:
jeopardy['question_overlap'] = question_overlap
np.mean(question_overlap)

0.69259350560885835

In [92]:
1 - 6290 / len(answer_in_question)

0.6854842742137106

In [107]:
jeopardy.columns

Index(['ShowNumber', 'AirDate', 'Round', 'Category', 'Value', 'Question',
       'Answer', 'clean_question', 'clean_answer', 'clean_value',
       'question_overlap', 'high_value'],
      dtype='object')

68% of questions are repeats! It was a bad aproach to calculate this metric, but we can see that there is no variancy in questions.

### chi-squared test

In [105]:
jeopardy['high_value'] = jeopardy['clean_value'] > 800

In [108]:
def word_value(word):
    low_count = 0
    high_count = 0
    for row in jeopardy.iterrows():
        split_question = row[1][7].split(' ')
        if word in split_question:
            if row[1][11]:
                high_count+=1
            else:
                low_count+=1
    return high_count, low_count

In [137]:
observed_expected = []
comparison_terms = list(terms_used)

for term in comparison_terms[:10]:
    observed_expected.append(list(word_value(term)))

In [138]:
comparison_terms[:10]

['monstrosity',
 '17801830',
 'astronaut',
 'rebellious',
 'sandburg',
 'peninsula',
 'fisticuffs',
 'eatinthis',
 '1885mile',
 'ounces']

In [139]:
observed_expected

[[1, 0],
 [1, 0],
 [0, 7],
 [0, 1],
 [0, 2],
 [11, 21],
 [1, 0],
 [0, 1],
 [0, 1],
 [0, 4]]

In [127]:
high_value_count = sum(jeopardy['high_value'])
low_value_count = len(jeopardy.high_value) - high_value_count

chi_squared = []
total = high_value_count + low_value_count
chi_squared.append([high_value_count, low_value_count, total])

In [143]:
total_prop = np.sum(np.array(observed_expected))/total # total in sample
total_prop

0.002550127506375319

In [146]:
observed = [0,0]
for lst in observed_expected:
    observed[0]+=lst[0]
    observed[1]+=lst[1]
observed

[14, 37]

In [147]:
expected = [high_value_count  * total_prop, low_value_count  * total_prop]

In [149]:
chi, pvalue = chisquare(observed, expected)
pvalue

0.84717027510849874

P-value more then .05 => we'll reject alternative hiposysis.

Here are some potential next steps:

* Find a better way to eliminate non-informative words than just removing words that are less than 6 characters long. Some ideas:
    * Manually create a list of words to remove, like the, than, etc.
    * Find a list of stopwords to remove.
    * Remove words that occur in more than a certain percentage (like 5%) of questions.
* Perform the chi-squared test across more terms to see what terms have larger differences. This is hard to do currently because the code is slow, but here are some ideas:
    * Use the apply method to make the code that calculates frequencies more efficient.
    * Only select terms that have high frequencies across the dataset, and ignore the others.
* Look more into the Category column and see if any interesting analysis can be done with it. Some ideas:
    * See which categories appear the most often.
    * Find the probability of each category appearing in each round.
* Use the whole Jeopardy dataset (available here) instead of the subset we used in this mission.
* Use phrases instead of single words when seeing if there's overlap between questions. Single words don't capture the whole context of the question well.