# Jeopardy Questions

In [1]:
import pandas as pd
jeopardy = pd.read_csv('jeopardy.csv')
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [2]:
#some columns have spaces in the front
print(jeopardy.columns) 

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')


In [3]:
# strip white spaces
jeopardy.columns = [text.strip() for text in jeopardy.columns]

print(jeopardy.columns)

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')


# Normalizing text
Remove punctuations from **Question** and **Answer** columns and convert the text to lower case

In [4]:
from string import punctuation

def normalize_text(text):
    # convert to lowercase.
    text = text.lower()
    # Remove punctuations.
    text = text.translate(str.maketrans('', '', punctuation))
    
    return text

jeopardy['clean_question'] = jeopardy['Question'].apply(normalize_text)
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalize_text)

jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams


# Normalizing columns
 - convert the **Value** column to numeric. 
 - convert the **Air Date** column to datetime.

In [5]:
def normalize_value(text):
    
    #remove punctuation from value.
    text = text.translate(str.maketrans('','',punctuation))
    try:
        text = int(text)
    except Exception:
        text = 0
       
    return text

jeopardy['clean_value'] = jeopardy['Value'].apply(normalize_value)

# convert Air Date to datetime format.
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])

jeopardy.iloc[30:120]  

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value
30,4680,2004-12-31,Double Jeopardy!,PRESIDENTIAL STATES OF BIRTH,$400,California,Nixon,california,nixon,400
31,4680,2004-12-31,Double Jeopardy!,AIRLINE TRAVEL,$400,It can be a place to leave your puppy when you...,a kennel,it can be a place to leave your puppy when you...,a kennel,400
32,4680,2004-12-31,Double Jeopardy!,THAT OLD-TIME RELIGION,$400,"He's considered the author of the Pentateuch, ...",Moses,hes considered the author of the pentateuch wh...,moses,400
33,4680,2004-12-31,Double Jeopardy!,MUSICAL TRAINS,$400,Steven Tyler of this band lent his steamin' vo...,Aerosmith,steven tyler of this band lent his steamin voc...,aerosmith,400
34,4680,2004-12-31,Double Jeopardy!,"""X""s & ""O""s",$400,Around 100 A.D. Tacitus wrote a book on how th...,oratory,around 100 ad tacitus wrote a book on how this...,oratory,400
35,4680,2004-12-31,Double Jeopardy!,PRESIDENTIAL STATES OF BIRTH,$800,1 of the 2 born in Vermont,Coolidge (or Chester Arthur),1 of the 2 born in vermont,coolidge or chester arthur,800
36,4680,2004-12-31,Double Jeopardy!,AIRLINE TRAVEL,$800,When it began on Pan Am & Qantas in the late '...,business class,when it began on pan am qantas in the late 70...,business class,800
37,4680,2004-12-31,Double Jeopardy!,THAT OLD-TIME RELIGION,$800,"Ali, who married this man's daughter Fatima, i...",Muhammed,ali who married this mans daughter fatima is c...,muhammed,800
38,4680,2004-12-31,Double Jeopardy!,MUSICAL TRAINS,$800,"During the 1954-1955 Sun sessions, Elvis climb...","the ""Mystery Train""",during the 19541955 sun sessions elvis climbed...,the mystery train,800
39,4680,2004-12-31,Double Jeopardy!,"""X""s & ""O""s",$800,"The shorter glass seen <a href=""http://www.j-a...",an old-fashioned,the shorter glass seen a hrefhttpwwwjarchiveco...,an oldfashioned,800


In [6]:
jeopardy.dtypes

Show Number                int64
Air Date          datetime64[ns]
Round                     object
Category                  object
Value                     object
Question                  object
Answer                    object
clean_question            object
clean_answer              object
clean_value                int64
dtype: object

# Answer in questions
we will count how many words in the answer match the words in the question. 
This will help us determine if it is necessary to study or one can guess the answer just by looking at the question.

In [7]:
def count_matches(row):
    
    # split question and answers ino seperate words.
    split_answer = row['clean_answer'].split(' ')
    split_question = row['clean_question'].split(' ')
    match_count = 0
    
    # check if 'the is present in answer list.
    if 'the' in split_answer:
        # if 'the' is present remove it from the answer list.
        split_answer = [word for word in split_answer if word != 'the']
    # check if length of list is 0    
    if len(split_answer) == 0:
        return 0

    # count the number of times each word in the answer appears in the question.
    for word in split_answer:
        if word in split_question:
            match_count += 1
    # percentage of words in split_answer that are present in split_question.    
    return match_count / len(split_answer) 

jeopardy['answer_in_question'] = jeopardy.apply(count_matches, axis=1)

jeopardy['answer_in_question'].mean()

0.05973712438535688

Thus, on an average 6% of the words are common between the questions and the answers. This isn't a huge number, and means that we probably can't just hope that hearing a question will enable us to figure out the answer. We'll probably have to study.

# Recycled questions
we want to see how often new questions are repeats of older ones.

In [8]:
# it is a list that calculates the proportion of total words in each question that are present in terms used.
question_overlap = []

#it is set that is used to store new words that occur in each question.
terms_used = set()
 
# iterating rows in jeopardy.
for i, row in jeopardy.iterrows():
    
    # split clean_question into individual words.
    split_question = row['clean_question'].split(' ')
    # only retaining words with length greater than 5.
    split_question = [word for word in split_question if len(word)>=6]
    
    match_count = 0
    
    # Iterating through split_question.
    for word in split_question:
        # check if word occurs in terms_used
        if word in terms_used:
            match_count += 1 # increment counter.
        # add each word in split_question to terms_used. set does not have duplicates.    
        terms_used.add(word)   
       
    # percentage of words in split_questions that are present in terms_used.
    if len(split_question) > 0:
        match_count /= len(split_question)
    
    question_overlap.append(match_count)

jeopardy['question_overlap'] = question_overlap
jeopardy['question_overlap'].mean()

0.6919577992203563

## Question overlap
There is about 70% overlap between terms in new questions and terms in old questions. This only looks at a small set of questions, and it doesn't look at phrases, it looks at single terms. This makes it relatively insignificant, but it does mean that it's worth looking more into the recycling of questions.



# Low value vs high value questions

In [9]:
# classify rows based on their question value
def determine_value(row):
    # check value
    if row['clean_value'] > 800:
        value = 1
    else:
        value = 0
       
    return value

jeopardy['high_value'] = jeopardy.apply(determine_value, axis=1)

In [10]:
# count the number of times a term occurs in low value and high value questions.
def count_usage(term):
    
   # initialise counts.
    low_count = 0 # number of times the term occurs in low value question.
    high_count = 0 # number of times the term occurs in a high value question.
    
   # Iterate through rows.
    for i, row in jeopardy.iterrows():
        
        # split clean_question into individual words.
        split_question = row['clean_question'].split(' ')
        
        # check if term occurs in split_question.
        if term in split_question:
            
            # check value of the question
            if row['high_value'] == 1:
                high_count += 1 
            else:
                low_count += 1
                   
    return high_count, low_count       
            
#we only pass the first 5 terms for terms_used.
comparison_terms = list(terms_used)[:5]
observed_high_low_count = [count_usage(term) for term in comparison_terms]
observed_high_low_count    

[(1, 0), (0, 1), (0, 1), (1, 5), (0, 2)]

# Applying the chi-squared test

In [11]:
from scipy.stats import chisquare
import numpy as np
# number of high value questions.
high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]

# number of low value questions.
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

chi_squared = []
for obs in observed_high_low_count:
    
    # total number of questions that the term occured in.
    total = sum(obs)
    
    # proportion of questions that the term occured in.
    total_prop = total / jeopardy.shape[0]
    
    # expected high value count.
    high_value_exp = total_prop * high_value_count
    
    # expected low value count.
    low_value_exp = total_prop * low_value_count
    
    observed = np.array([obs[0], obs[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))

chi_squared

[Power_divergenceResult(statistic=2.4877921171956752, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.42281054506129573, pvalue=0.51553795812945302),
 Power_divergenceResult(statistic=0.80392569225376798, pvalue=0.36992223780795708)]