In [67]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
from nltk.corpus import gutenberg, stopwords

# Jeopardy!

I want to take this dataset of Jeopardy! clues (on the show the clue is phrased as an "answer" and the contestant responds with a question; here the clue text is stored in the `Question` column) and see whether a model can correctly predict the round in which each clue appears.

### Explore Data

In [4]:
# Load the raw Jeopardy! clue archive from a path relative to this notebook.
# NOTE: every header after 'Show Number' carries a leading space in this file
# (' Air Date', ' Round', ...), so column lookups must include that space.
jeopardy_raw = pd.read_csv('../../../Data/JEOPARDY.csv')

In [5]:
# Working handle on the raw table.
# NOTE(review): pd.read_csv already returns a DataFrame, so this wrap performs no
# conversion; it only introduces a second name for the loaded data.
jeopardy_df = pd.DataFrame(jeopardy_raw)

In [6]:
# Peek at the first rows to see what each column holds.
jeopardy_df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [7]:
# (rows, columns) of the raw table.
jeopardy_df.shape

(216930, 7)

In [11]:
# Show the exact header strings -- all but 'Show Number' start with a space.
jeopardy_df.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [93]:
# Discard the fields that are not used in the rest of the analysis, keeping
# round, category, value and the clue text.
unused_columns = ['Show Number', ' Air Date', ' Answer']
filtered_df = jeopardy_df.drop(columns=unused_columns)
filtered_df.head()

Unnamed: 0,Round,Category,Value,Question
0,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ..."
1,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...
2,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...
3,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th..."
4,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co..."


In [14]:
# The target variable is still undecided, so look at each candidate column in turn
# to see which one is best suited as a classification target for this exercise.
# Start with the class distribution of ' Round'.
filtered_df[' Round'].value_counts()

Jeopardy!           107384
Double Jeopardy!    105912
Final Jeopardy!       3631
Tiebreaker               3
Name:  Round, dtype: int64

In [19]:
# Cardinality check for ' Category': number of distinct values and the most common one.
filtered_df[' Category'].describe()

count             216930
unique             27995
top       BEFORE & AFTER
freq                 547
Name:  Category, dtype: object

That's far too many categories (27,995 unique values). I don't think `Category` would be a useful target to classify by.

In [20]:
# Cardinality check for ' Value': number of distinct values and the most common one.
filtered_df[' Value'].describe()

count     216930
unique       150
top         $400
freq       42244
Name:  Value, dtype: object

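`Value` has 150 distinct levels, which is also too many for a clean target. `Round` has only four classes, so I will use the `Round` column as the target variable. Let's move on.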

### Parsing and Filtering Data

In [94]:
# clean text
def text_cleaner(text):
    """Strip markup and parenthetical asides from a clue and normalise whitespace.

    Parameters
    ----------
    text : str
        Raw clue text.

    Returns
    -------
    str
        ``text`` with every ``<...>`` span and every ``(...)`` span removed
        (both matched non-greedily, so nested parentheses are not balanced),
        and all runs of whitespace collapsed to single spaces with no leading
        or trailing whitespace.
    """
    # Raw strings keep the backslashes for the regex engine; the previous
    # non-raw '\(' is an invalid string escape that newer Python versions warn about.
    text = re.sub(r'<.*?>', '', text)    # remove html tags
    text = re.sub(r'\(.*?\)', '', text)  # remove parentheses and their contents
    return ' '.join(text.split())

# test
text_cleaner('<link> some text (some more text)')

'some text'

In [103]:
# Clean every clue. Only the ' Question' column is needed, so apply the cleaner to
# that Series directly instead of materialising a full row object per record with
# DataFrame.apply(axis=1).
filtered_df['cleaned'] = filtered_df[' Question'].apply(text_cleaner)
filtered_df.head(10)

Unnamed: 0,Round,Category,Value,Question,cleaned
0,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...","For the last 8 years of his life, Galileo was ..."
1,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,No. 2: 1912 Olympian; football star at Carlisl...
2,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,The city of Yuma in this state has a record av...
3,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...","In 1963, live on ""The Art Linkletter Show"", th..."
4,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...","Signer of the Dec. of Indep., framer of the Co..."
5,Jeopardy!,3-LETTER WORDS,$200,"In the title of an Aesop fable, this insect sh...","In the title of an Aesop fable, this insect sh..."
6,Jeopardy!,HISTORY,$400,Built in 312 B.C. to link Rome & the South of ...,Built in 312 B.C. to link Rome & the South of ...
7,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$400,"No. 8: 30 steals for the Birmingham Barons; 2,...","No. 8: 30 steals for the Birmingham Barons; 2,..."
8,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$400,"In the winter of 1971-72, a record 1,122 inche...","In the winter of 1971-72, a record 1,122 inche..."
9,Jeopardy!,THE COMPANY LINE,$400,This housewares store was named for the packag...,This housewares store was named for the packag...


In [122]:
# parse with spacy
# NOTE: the 'en' shortcut name only exists in spaCy 2.x; on spaCy 3+ load the model
# by its full package name instead (e.g. 'en_core_web_sm').
nlp = spacy.load('en')

# Parse each cleaned question. nlp.pipe streams the texts through the pipeline in
# batches and yields one Doc per input, in order, which is considerably faster than
# calling nlp() once per row through DataFrame.apply(axis=1).
filtered_df['spacy'] = list(nlp.pipe(filtered_df['cleaned']))
filtered_df.head()


Unnamed: 0,Round,Category,Value,Question,cleaned,spacy
0,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...","For the last 8 years of his life, Galileo was ...","(For, the, last, 8, years, of, his, life, ,, G..."
1,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,No. 2: 1912 Olympian; football star at Carlisl...,"(No, ., 2, :, 1912, Olympian, ;, football, sta..."
2,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,The city of Yuma in this state has a record av...,"(The, city, of, Yuma, in, this, state, has, a,..."
3,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...","In 1963, live on ""The Art Linkletter Show"", th...","(In, 1963, ,, live, on, "", The, Art, Linklette..."
4,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...","Signer of the Dec. of Indep., framer of the Co...","(Signer, of, the, Dec., of, Indep, ., ,, frame..."


In [140]:
# extract lemmas minus punctuation and stop words
def lemma_maker(text):
    """Return the lemmas of a parsed question, skipping stop words and punctuation.

    Parameters
    ----------
    text : iterable of tokens
        A parsed question (e.g. a spaCy ``Doc``). Each token must expose
        ``lemma_``, ``is_stop`` and ``is_punct``.

    Returns
    -------
    list
        ``token.lemma_`` for every token that is neither a stop word nor
        punctuation, in document order. Case is left untouched.
    """
    return [
        token.lemma_
        for token in text
        if not (token.is_stop or token.is_punct)
    ]

# One list of lemmas per question. Only the 'spacy' column is read, so apply
# lemma_maker to that Series directly rather than row by row.
just_lemmas = filtered_df['spacy'].apply(lemma_maker)
# Preview the result (previously printed `all_lemmas`, a name that is never defined
# in this notebook and therefore fails on a fresh kernel).
print(just_lemmas[:10])


0    [8, year, life, Galileo, house, arrest, espous...
1    [2, 1912, olympian, football, star, Carlisle, ...
2    [city, Yuma, state, record, average, 4,055, ho...
3    [1963, live, Art, Linkletter, company, serve, ...
4    [signer, December, Indep, framer, Constitution...
5    [title, Aesop, fable, insect, share, billing, ...
6    [build, 312, B.C., link, Rome, South, Italy, u...
7    [8, 30, steal, Birmingham, Barons, 2,306, stea...
8    [winter, 1971, 72, record, 1,122, inch, snow, ...
9    [houseware, store, name, packaging, merchandis...
dtype: object


### Bag of Words

In [189]:
# Each question holds its own list of lemmas; flatten them into one long list so
# that overall lemma frequencies can be counted.
bag = [lemma for question in just_lemmas for lemma in question]

print(len(bag))

# Keep the 2000 most frequent lemmas as the vocabulary, most common first.
lemma_counts = Counter(bag)
bag_of_words = [lemma for lemma, _count in lemma_counts.most_common(2000)]
print(bag_of_words[:100])
print(('1912' in bag_of_words)) # simple test for bag_of_words

1684334
['city', 'play', 'name', 'country', 'man', 'call', '2', 'know', 'see', 'like', 'type', 'film', 'say', 'state', 'U.S.', 'year', 'title', 'write', 'word', 'mean', 'win', 'come', 'include', 'large', 'bear', 'novel', 'find', 'term', 'New', 'time', 'star', 'work', '3', 'capital', 'president', '1', 'book', 'get', 'woman', 'go', 'old', 'take', 'famous', 'hit', 'song', 'day', 'world', 'John', 'give', 'home', 'begin', 'group', 'character', 'island', 'long', 'author', 'american', 'good', 'company', 'tell', 'people', 'century', 'end', 'tv', 'high', 'lead', 'movie', 'form', 'line', 'french', 'game', 'die', 'love', 'life', 'feature', '4', 'base', 'live', 'use', 'great', 'small', 'king', 'big', 'help', 'place', 'set', 'run', 'british', 'found', 'new', 'serve', 'water', 'hold', 'family', 'head', 'leave', 'body', 'number', 'war', 'animal']
True


In [192]:
# Feature table: one row per question (same index as filtered_df) and one integer
# count column per bag-of-words lemma, all starting at zero. Building it in a
# single constructor call avoids first creating an empty frame whose 2000 columns
# are filled with NaN and then overwritten.
df = pd.DataFrame(0, index=filtered_df.index, columns=bag_of_words)

# Carry the cleaned text and the target label alongside the counts.
df['ques_text'] = filtered_df['cleaned']
df['ques_round'] = filtered_df[' Round']

print('done')
df.head(5)

done


Unnamed: 0,city,play,name,country,man,call,2,know,see,like,...,district,soap,narrow,rescue,collect,Romans,Warren,Women,ques_text,ques_round
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"For the last 8 years of his life, Galileo was ...",Jeopardy!
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,No. 2: 1912 Olympian; football star at Carlisl...,Jeopardy!
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,The city of Yuma in this state has a record av...,Jeopardy!
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"In 1963, live on ""The Art Linkletter Show"", th...",Jeopardy!
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"Signer of the Dec. of Indep., framer of the Co...",Jeopardy!


In [None]:
# count word values for each sentence
# Set membership is O(1); testing against the 2000-item list rescans it for every lemma.
common_words = set(bag_of_words)

# Pair each question with its actual row label rather than assuming the index is
# 0..n-1; `i` is kept only for progress reporting.
for i, (row_label, question) in enumerate(zip(df.index, just_lemmas)):
    for word in question:
        if word in common_words:
            df.at[row_label, word] += 1  #df.at is faster than df.loc

    # Progress marker so a long run is visibly still alive.
    if i % 25000 == 0:
        print(i, ' done')

df.head()

0  done
25000  done
50000  done
75000  done
100000  done
125000  done
150000  done
175000  done


In [178]:
# Scratch check that list membership behaves as expected for the counting loop:
# report every candidate word that also occurs in the reference list.
test_list = ['home', 'go']
test_word = ['just', 'go']
matches = [candidate for candidate in test_word if candidate in test_list]
for word in matches:
    print(word, ' is in the list')

go  is in the list


In [None]:
# create a new dataframe with common words as features(columns) and questions as samples(rows)
# questions passed in must be parsed (e.g. the 'spacy' column): the lemma filter
# below needs token attributes, which the plain strings of 'cleaned' do not have
def bow_features(questions, common_words, sources=None, progress_every=25000):
    """Build a bag-of-words count table for a collection of parsed questions.

    Parameters
    ----------
    questions : iterable
        Parsed questions. Each item must be iterable over tokens exposing
        ``lemma_``, ``is_punct`` and ``is_stop`` (e.g. spaCy ``Doc`` objects).
    common_words : iterable of str
        Vocabulary to count; one output column per word, in the given order.
    sources : iterable, optional
        One label per question (for example the round). When given it is stored
        in a ``'text_source'`` column; pandas raises ``ValueError`` if its
        length does not match ``questions``.
    progress_every : int or None, optional
        Print a progress line every this many rows; a falsy value disables it.

    Returns
    -------
    pandas.DataFrame
        One row per question, positionally indexed from 0, with an integer
        count column for each word of ``common_words``, a ``'question'`` column
        holding the parsed question itself and, if ``sources`` was supplied,
        a ``'text_source'`` column.
    """
    questions = list(questions)
    common_words = list(common_words)
    # Map each vocabulary word to its column position for O(1) lookup.
    column_of = {word: j for j, word in enumerate(common_words)}

    # Fill a dense integer matrix first; writing into a NumPy array is far cheaper
    # than updating DataFrame cells one at a time.
    counts = np.zeros((len(questions), len(common_words)), dtype=np.int64)

    # Process each row, counting the occurrence of words in each question.
    for i, question in enumerate(questions):
        for token in question:
            # Filter out punctuation and stop words ...
            if token.is_punct or token.is_stop:
                continue
            # ... and lemmas that are not part of the vocabulary.
            j = column_of.get(token.lemma_)
            if j is not None:
                counts[i, j] += 1

        # This counter is just to make sure the kernel didn't hang.
        if progress_every and i % progress_every == 0:
            print("Processing row {}".format(i))

    df = pd.DataFrame(counts, columns=common_words)
    # dtype=object keeps each parsed question as a single cell value.
    df['question'] = pd.Series(questions, index=df.index, dtype=object)
    if sources is not None:
        df['text_source'] = list(sources)

    return df