In [1]:
import pandas as pd
jeopardy = pd.read_csv("jeopardy.csv")

jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [2]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [3]:
jeopardy.columns = ['ShowNumber', 'AirDate', 'Round', 'Category', 'Value',
       'Question', 'Answer']

In [4]:
jeopardy.head()

Unnamed: 0,ShowNumber,AirDate,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


Write a function to normalize questions and answers. It should:
- Take in a string.
- Convert the string to lowercase.
- Remove all punctuation in the string.
- Return the string.

In [5]:
import re
def normalize_que_ans(string):
    string = string.lower()
    string = re.sub("[^A-Za-z0-9\s]","", string)
    return string
    

In [6]:
# normalize the "Question" column
jeopardy["clean_question"] = jeopardy["Question"].apply(normalize_que_ans)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalize_que_ans)


In [7]:
def normalize_values(val):
    val = re.sub("[^A-Za-z0-9\s]", "", val)
    try:
        val = int(val)
    except Exception:
        val = 0
    return val

In [8]:
jeopardy["clean_values"] = jeopardy["Value"].apply(normalize_values)


In [9]:
#convert the Air Date column to a datetime column

jeopardy['AirDate'] = pd.to_datetime(jeopardy['AirDate'])

In [10]:
jeopardy.head()

Unnamed: 0,ShowNumber,AirDate,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_values
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe,200
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona,200
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds,200
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams,200


In order to figure out whether to study past questions, study general knowledge, or not study it all, it would be helpful to figure out two things:

1. How often the answer is deducible from the question.
    -  How many times words in the answer also occur in the question
2. How often new questions are repeats of older questions.
    - How often complex words (> 6 characters) reoccur

## How often the answer is deducible from the question?

In [11]:
def count_match(row):
    
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    
    
    
# remove "the" in split_answer
    if "the" in split_answer:
        split_answer.remove("the")
        
# to prevents a division by zero error     
    if len(split_answer) == 0:
        return 0
    match_count = 0
# loop through each item in split_answer
# to see if it occurs in split_question   
    for i in split_answer:
        if i in split_question:
            match_count += 1
    return match_count/len(split_answer)



Count how many times terms in **clean_answer** occur in **clean_question**.
- Use the Pandas DataFrame.apply method to apply the function to each row in jeopardy.
- Pass the axis=1 argument to apply the function across each row.
- Assign the result to the answer_in_question column

In [12]:
jeopardy["answer_in_question"] = jeopardy.apply(count_match, axis=1)

In [13]:
jeopardy["answer_in_question"]

0        0.000000
1        0.000000
2        0.000000
3        0.000000
4        0.000000
5        0.000000
6        0.000000
7        0.000000
8        0.000000
9        0.333333
10       0.000000
11       0.000000
12       0.000000
13       0.000000
14       0.500000
15       0.000000
16       0.000000
17       0.000000
18       0.000000
19       0.000000
20       0.000000
21       0.000000
22       0.000000
23       0.000000
24       0.500000
25       0.000000
26       0.000000
27       0.000000
28       0.000000
29       0.000000
           ...   
19969    0.000000
19970    0.000000
19971    0.000000
19972    0.000000
19973    0.000000
19974    0.333333
19975    0.000000
19976    0.000000
19977    0.000000
19978    0.000000
19979    0.000000
19980    0.500000
19981    0.500000
19982    0.000000
19983    0.000000
19984    0.000000
19985    0.000000
19986    0.000000
19987    0.000000
19988    0.000000
19989    0.000000
19990    0.000000
19991    0.000000
19992    0.000000
19993    0

In [14]:
jeopardy["answer_in_question"].mean()

0.06049325706933587

The answer only appears in the question about 6% of the time. This isn't a huge number, and means that we probably can't just hope that hearing a question will enable us to figure out the answer. We'll probably have to study.

## How often new questions are repeats of older questions?
- Sort jeopardy in order of ascending air date.
- Maintain a set called terms_used that will be empty initially.
- Iterate through each row of jeopardy.
- Split clean_question into words, remove any word shorter than 6 characters, and check if each word occurs in terms_used.
    - If it does, increment a counter.
    - Add each word to terms_used.

In [15]:

jeopardy = jeopardy.sort_values("AirDate")
terms_used = set()
question_overlap = []


for i, row in jeopardy.iterrows():
    split_question = row["clean_question"].split(" ")
    split_question = [q for q in split_question if len(q) > 5]
    match_count = 0 
    for word in split_question:    
        if word in terms_used:
            match_count +=1        
    for word in split_question:
        terms_used.add(word)
            
    if len(split_question)>0:
        match_count /= len(split_question)
        
    question_overlap.append(match_count)    
    
jeopardy["question_overlap"] = question_overlap

jeopardy["question_overlap"].mean()        
            

0.6876260592169802

There is about 70% overlap between terms in new questions and terms in old questions. This only looks at a small set of questions, and it doesn't look at phrases, it looks at single terms. This makes it relatively insignificant, but it does mean that it's worth looking more into the recycling of questions.

In [16]:
def value_category(row):
    value = 0 
    if row["clean_values"] > 800:
        value = 1
    return value

jeopardy["high_value"] = jeopardy.apply(value_category, axis=1)

In [22]:
def count_usage(term):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        if term in row["clean_question"].split(" "):
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

comparison_terms = list(terms_used)[:5]
observed_expected = []
for word in comparison_terms:
    observed_expected.append(count_usage(word))

observed_expected

[(0, 2), (0, 1), (1, 0), (1, 4), (1, 1)]

In [23]:
from scipy.stats import chisquare
import numpy as np

high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

chi_squared = []
for obs in observed_expected:
    total = sum(obs)
    total_prop = total / jeopardy.shape[0]
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    
    observed = np.array([obs[0], obs[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))

chi_squared

[Power_divergenceResult(statistic=0.803925692253768, pvalue=0.3699222378079571),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=0.18383953104516373, pvalue=0.6680941623250602),
 Power_divergenceResult(statistic=0.4448774816612795, pvalue=0.5047776487545996)]

- None of the terms had a significant difference in usage between high value and low value rows.
- It would be better to run this test with only terms that have higher frequencies.