# Read and clean the data

In [3]:
import pandas as pd

In [4]:
jeopardy = pd.read_csv("jeopardy.csv")
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [5]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [6]:
jeopardy.columns = map(str.strip, jeopardy.columns)

In [7]:
import re

def normalize_text(text):
    text = text.lower()
    regex = re.compile('[^\w\s]')
    text = regex.sub('', text)
    return text

In [8]:
jeopardy["clean_question"] = jeopardy["Question"].apply(normalize_text)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalize_text)

In [9]:
def normalize_money(text):
    regex = re.compile('[^\w\s]')
    text = regex.sub('', text)
    try:
        money = int(text)
    except Exception:
        money = 0
    return money

In [10]:
jeopardy["clean_value"] = jeopardy["Value"].apply(normalize_money)

In [11]:
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])

In [12]:
jeopardy.dtypes

Show Number                int64
Air Date          datetime64[ns]
Round                     object
Category                  object
Value                     object
Question                  object
Answer                    object
clean_question            object
clean_answer              object
clean_value                int64
dtype: object

# Answer terms in question

In [13]:
def count_word_matches(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    match_count = 0
    split_answer = [word for word in split_answer if word != "the"]
    if len(split_answer) == 0:
        return 0
    for word in split_answer:
        if word in split_question:
            match_count += 1
    return match_count / len(split_answer)        

In [14]:
jeopardy["answer_in_question"] = jeopardy.apply(count_word_matches, axis = 1)

In [15]:
jeopardy["answer_in_question"].mean()

0.059877607599993714

The answer appears in the question around 6% of the time, which is extremely low. Deducing the answer just from hearing the question is probably a bad strategy to succeed at the game, and therefore we would need to study in order to have a chance at winning

# Question recycling

In [16]:
question_overlap = list()
terms_used = set()

for i, row in jeopardy.iterrows():
    split_question = [word for word in row["clean_question"].split(" ") if len(word) > 5]
    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count += 1
        else:
            terms_used.add(word)
    if len(split_question) > 0:
        match_count /= len(split_question)
    question_overlap.append(match_count)    

jeopardy["question_overlap"] = question_overlap
jeopardy["question_overlap"].mean()

0.69259350560885835

We can see that there is a lot of repetition of terms between new and old questions: About 70% of words found on newer questions have appeared before. However, since we are analysing words and not phrases this doesn't prove anything. Without futher analysis is hard to tell if Jeopardy regularly recycles questions or if it only does it every now and then 

# Terms in high prize questions

In [19]:
def determine_value(row):
    if row["clean_value"] > 800:
        return 1
    else:
        return 0

In [20]:
jeopardy["high_value"] = jeopardy.apply(determine_value, axis = 1)

In [21]:
def count_usage(word):
    high_count = 0
    low_count = 0
    for i, row in jeopardy.iterrows():
        if word in row["clean_question"].split(" "):
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
                
    return high_count, low_count            

In [27]:
observed_expected = list()
comparison_terms = list(terms_used)[:5]

for word in comparison_terms:
    observed_expected.append(count_usage(word)) 

[(0, 1), (0, 3), (0, 1), (1, 4), (0, 1)]

In [31]:
high_value_count = len(jeopardy[jeopardy["high_value"] == 1])
low_value_count = len(jeopardy[jeopardy["high_value"] == 0])

14265

In [33]:
from scipy.stats import chisquare
import numpy as np

chi_squared = list()
for element in observed_expected:
    total = sum(element)
    total_prop = total / len(jeopardy)
    observed = np.array([element[0], element[1]])
    expected = np.array([total_prop * high_value_count, total_prop * low_value_count])
    chi_squared.append(chisquare(observed, expected))

In [34]:
chi_squared

[Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=1.2058885383806519, pvalue=0.27214791766902047),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.18383953104516373, pvalue=0.66809416232506025),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686)]

At the light of the p-value obtained from the chi-squared test, we can say there is independence between the words found in a question and the prize money given as a reward. However, it is important to notice that the frequency of each word is not larger than 5, so the test loses significance. It would be much better if the test was made with words that have a higher frequency