In [2]:
import pandas as pd

jeopardy = pd.read_csv('jeopardy.csv')

print(jeopardy.head(5))


   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE   $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES   $200   

                                            Question      Answer  
0  For the last 8 years of his life, Galileo was ...  Copernicus  
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe  
2  The city of Yuma in this state has a record av...     Arizona  
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's  
4  Signer of the Dec. of Indep., framer of the Co...  John Adams  


## Jeopardy questions

In [5]:
new_col=[]

for col in jeopardy.columns:
    new_col.append(col.lstrip())

jeopardy.columns = new_col

In [7]:
jeopardy.dtypes

Show Number     int64
Air Date       object
Round          object
Category       object
Value          object
Question       object
Answer         object
dtype: object

In [10]:
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])
jeopardy.dtypes

Show Number             int64
Air Date       datetime64[ns]
Round                  object
Category               object
Value                  object
Question               object
Answer                 object
dtype: object

## Normalize Text

In [17]:
import string

def normalize(texte):
    text = texte.lower()
    table_puntuation = {ord(char): None for char in string.punctuation}
    text = text.translate(table_puntuation)
    return text
    

bonjourmonsieur m  loty po 


In [19]:
jeopardy['clean_question'] = jeopardy['Question'].apply(normalize)

jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalize)


In [20]:
jeopardy['clean_question'].head(5)

0    for the last 8 years of his life galileo was u...
1    no 2 1912 olympian football star at carlisle i...
2    the city of yuma in this state has a record av...
3    in 1963 live on the art linkletter show this c...
4    signer of the dec of indep framer of the const...
Name: clean_question, dtype: object

## Columns normalizing

In [23]:
def normalize_dollars(text):
    table_puntuation = {ord(char): None for char in string.punctuation}
    text = text.translate(table_puntuation)
    try:
        value = int(text)
    except:
        value=0
    return value

jeopardy['clean_value'] = jeopardy['Value'].apply(normalize_dollars)
jeopardy['clean_value'].head(5)

0    200
1    200
2    200
3    200
4    200
Name: clean_value, dtype: int64

In [21]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

## Answer in questions

In [29]:
def question_answer(row):
    split_answer = row['clean_answer'].split(' ')
    
    try:
        split_answer.remove('the')
    except:
        pass
        
    split_question = row['clean_question'].split(' ')
    
    match_count=0
    
    for word in split_answer:
        if word in split_question:
            match_count+=1
    try:
        ratio = match_count / len(split_answer)        
    except:
        ratio = 0
    
    return ratio
   
jeopardy['answer_in_question'] = jeopardy.apply(question_answer,axis=1)

mean_answer_in_question = jeopardy['answer_in_question'].mean()

mean_answer_in_question
    

0.06035277385469894

## Recycled questions

In [33]:
jeopardy_sorted = jeopardy.sort_values(by='Air Date',ascending=True)

In [45]:
terms_used = set()
question_overlap = []

for row in jeopardy_sorted.iterrows():
    split_question = row[1]['clean_question'].split(' ')
    new_split_question = [word for word in split_question if len(word)>6]
    match_count=0
    
    for wd in new_split_question:
        if terms_used.intersection(wd)==set():
            match_count+=1
            terms_used.add(wd)
    
    if len(new_split_question)>0:
        ratio = match_count/len(new_split_question)
        
    else:
        ratio=0
        
    question_overlap.append(ratio)
    
jeopardy_sorted['question_overlaps'] = question_overlap
    
mean_jeopardy_question_overlap = jeopardy_sorted['question_overlaps'].mean()

print(mean_jeopardy_question_overlap)    


0.9375968798439922


## Low values vs high values questions

In [46]:
len(terms_used)

19449

In [47]:
def high_value(row):
    if row['clean_value']>800:
        value=1
    else:
        value=0
    return value

jeopardy_sorted['high_value'] = jeopardy_sorted.apply(high_value,axis=1)


In [63]:
def high_low_count(word):
    low_count = 0
    high_count = 0
    for row in jeopardy_sorted.iterrows():
        if word in row[1]['clean_question'].split(' '):
            if row[1]['high_value']==1:
                high_count+=1
            else:
                low_count+=1
    return low_count,high_count

observed_expected = []
comparaison_terms = list(terms_used)[:5]


In [64]:
comparaison_terms

['senatorelect', 'ambroses', 'involuntary', 'fermors', 'goldwyn']

In [65]:
for word in comparaison_terms:
    observed_expected.append(high_low_count(word))
        

In [66]:
observed_expected

[(0, 1), (0, 1), (4, 0), (0, 1), (0, 1)]

## Applying the chi-squared test

In [57]:
high_value_count = jeopardy_sorted['high_value'].sum()
low_value_count = len(jeopardy_sorted['high_value'])-high_value_count


In [58]:
low_value_count

14265

In [59]:
high_value_count

5734

In [62]:
low_value_count = len(jeopardy_sorted[jeopardy_sorted['high_value']==0])
low_value_count

14265

In [71]:
from scipy.stats import chisquare

chi_squared = []

for liste in observed_expected:
    total = liste[0] + liste[1]
    total_prop = total / len(jeopardy_sorted)
    
    expected_high_value = total_prop * high_value_count
    expected_low_value = total_prop * low_value_count
                                 
    chi_high,p_value_high = chisquare(liste[1],expected_high_value)   
    chi_low,p_value_low = chisquare(liste[0],expected_low_value)                                     
   
    chi_squared.append([chi_high,p_value_high])                             
    chi_squared.append([chi_low,p_value_low])  
                             

## Next steps

In [72]:
chi_squared

[[1.7745064529124608, nan],
 [0.7132856642832142, nan],
 [1.7745064529124608, nan],
 [0.7132856642832142, nan],
 [1.1468573428671434, nan],
 [0.4609940416403926, nan],
 [1.7745064529124608, nan],
 [0.7132856642832142, nan],
 [1.7745064529124608, nan],
 [0.7132856642832142, nan]]