In [1]:
import re

import pandas as pd

In [2]:
# Never truncate long cell contents (e.g. full question text) when a frame is printed
pd.options.display.max_colwidth = None

In [3]:
# Load the raw Jeopardy! question data; the path is relative to the notebook's working directory
JEOPARDY_CSV = 'jeopardy.csv'
jeopardy_data = pd.read_csv(JEOPARDY_CSV)

In [4]:
# Inspect the column names of jeopardy_data as read from the CSV
# (every name after 'Show Number' carries a leading space)
print(jeopardy_data.columns)

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')


In [5]:
# The raw CSV header pads every column name after the first with a leading
# space (' Air Date', ' Round', ...). Strip surrounding whitespace from all
# column names rather than listing each one by hand: the result is the same
# for this header, it also covers any other padded name, and the cell stays
# safe to re-run because already-clean names are left unchanged.
jeopardy_data = jeopardy_data.rename(columns=str.strip)

In [6]:
# Preview the first rows of jeopardy_data to check the cleaned-up column names
print(jeopardy_data.head())

   Show Number    Air Date      Round                         Category Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY  $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES  $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...  $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE  $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES  $200   

                                                                                                      Question  \
0             For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory   
1  No. 2: 1912 Olympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Braves   
2                     The city of Yuma in this state has a record average of 4,055 hours of sunshine each year   
3                         In 1963, live on "The Art Linkletter 

In [7]:
# A function that filters the jeopardy_data for Question that contains all of the words in a list of words
def filter_jeopardy(data, list_words, whole_word=False):
    """Return the rows of ``data`` whose 'Question' contains every given word.

    Matching is case-insensitive. A question that is not a string (for
    example a missing value) is treated as empty text, so it never matches
    a non-empty word instead of raising ``AttributeError``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Question' column.
    list_words : iterable of str
        Words that must all occur in the question. An empty iterable
        matches every row.
    whole_word : bool, default False
        When False (the original behaviour) a word matches anywhere inside
        the text, so "King" also matches "Viking". When True a word only
        matches if it is not directly preceded or followed by another
        word character.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.
    """
    if whole_word:
        # Lookarounds rather than \b so that words starting or ending with
        # punctuation (e.g. "C++") are still delimited correctly.
        patterns = [
            re.compile(r"(?<!\w)" + re.escape(word) + r"(?!\w)", re.IGNORECASE)
            for word in list_words
        ]

        def contains_all(text):
            return all(pattern.search(text) is not None for pattern in patterns)
    else:
        # Lower-case the search words once, not once per row.
        needles = [word.lower() for word in list_words]

        def contains_all(text):
            lowered = text.lower()
            return all(needle in lowered for needle in needles)

    def matches(question):
        # Non-string questions (NaN / None) are compared as empty text.
        return contains_all(question if isinstance(question, str) else "")

    # astype(bool) keeps the mask boolean even when ``data`` has no rows.
    mask = data['Question'].apply(matches).astype(bool)
    return data.loc[mask]

# Try filter_jeopardy on the full dataset: questions mentioning both "King" and "England"
filtered = filter_jeopardy(jeopardy_data, list_words=["King", "England"])

# Show only the 'Question' text of the matching rows
print(filtered.loc[:, 'Question'])

4953                    Both England's King George V & FDR put their stamp of approval on this "King of Hobbies"
6337      In retaliation for Viking raids, this "Unready" king of England attacks Norse areas of the Isle of Man
9191                    This king of England beat the odds to trounce the French in the 1415 Battle of Agincourt
11710               This Scotsman, the first Stuart king of England, was called "The Wisest Fool in Christendom"
13454                                       It's the number that followed the last king of England named William
                                                           ...                                                  
208295        In 1066 this great-great grandson of Rollo made what some call the last Viking invasion of England
208742                      Dutch-born king who ruled England jointly with Mary II & is a tasty New Zealand fish
213870                In 1781 William Herschel discovered Uranus & initially named it after this

In [12]:
# Examine the first entries of the 'Value' column (stored as text such as "$200", not as numbers)
print(jeopardy_data['Value'].head())

0    $200
1    $200
2    $200
3    $200
4    $200
Name: Value, dtype: object


In [19]:
def _value_to_float(value):
    """Convert a dollar string such as ``"$2,000"`` to a float.

    A missing value counts as 0.0. "Missing" covers both the literal string
    "None" and a NaN/None entry, which is what the column holds when the CSV
    reader treats "None" as a null marker (this depends on the pandas
    version and its default ``na_values``).
    """
    if pd.isna(value) or value == "None":
        return 0.0
    # Drop the currency sign and the thousands separators before parsing.
    return float(value.strip('$').replace(',', ''))


# Add a numeric 'Float Value' column derived from the text in 'Value'
jeopardy_data['Float Value'] = jeopardy_data['Value'].apply(_value_to_float)

In [24]:
# Average dollar value of the questions that mention "King"
filtered = filter_jeopardy(jeopardy_data, ['King'])
average_king_value = filtered['Float Value'].mean()
print(average_king_value)

771.8833850722094


In [25]:
# Count how many times each distinct answer occurs in a set of data
def get_answer_counts(data):
    """Return the number of occurrences of each value in the 'Answer' column.

    The result is the Series produced by ``value_counts()``: indexed by
    answer and ordered from the most to the least frequent.
    """
    answers = data['Answer']
    return answers.value_counts()

# Try the answer count function on the rows currently held in `filtered`
print(get_answer_counts(filtered))

Henry VIII             55
Solomon                35
Richard III            33
Louis XIV              31
David                  30
                       ..
Whopper                 1
the Mount of Olives     1
"42nd Street"           1
Martin                  1
the Spanish Armada      1
Name: Answer, Length: 5268, dtype: int64


In [26]:
# List the current columns of jeopardy_data
print(jeopardy_data.columns)

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer', 'Float Value'],
      dtype='object')


In [27]:
# Look at the first entries of the 'Air Date' column
print(jeopardy_data['Air Date'].head())

0    2004-12-31
1    2004-12-31
2    2004-12-31
3    2004-12-31
4    2004-12-31
Name: Air Date, dtype: object


In [33]:
# Count the questions containing the word "computer" in each decade.

# The year is the first four characters of the 'Air Date' text
jeopardy_data['Question Year'] = jeopardy_data['Air Date'].apply(lambda air_date: air_date[:4])

# Keep only the questions that mention "Computer" (case-insensitive)
computer = filter_jeopardy(jeopardy_data, ['Computer'])

# Number of matching questions per year
computer_by_year = (
    computer
    .groupby('Question Year')['Show Number']
    .count()
    .reset_index()
)

# Years are kept as strings, so the decade bounds are compared as text
question_year = computer_by_year['Question Year']
in_90s = (question_year < '2000') & (question_year > '1989')
in_2000s = (question_year < '2010') & (question_year > '1999')
computer_90s = computer_by_year[in_90s]
computer_2000s = computer_by_year[in_2000s]

# Report the total per decade
total_90s = computer_90s['Show Number'].sum()
total_2000s = computer_2000s['Show Number'].sum()
print(
    f'The number of questions featuring the word "computer" in the 1990s = {total_90s}\n'
    f'The number of questions containing the word "computer" in the 2000s = {total_2000s}'
)

The number of questions featuring the word "computer" in the 1990s = 98
The number of questions containing the word "computer" in the 2000s = 268


In [36]:
# Count how many questions each Category has in each Round
category_round = (
    jeopardy_data
    .groupby(['Category', 'Round'])['Show Number']
    .count()
    .reset_index()
)

# Reshape to one row per Category and one column per Round for readability
category_round_pivot = (
    category_round
    .pivot(index='Category', columns='Round', values='Show Number')
    .reset_index()
)

# Display the resulting pivot table
print(category_round_pivot)

# Look up the counts for one specific category
is_literature = category_round_pivot['Category'] == 'LITERATURE'
literature = category_round_pivot[is_literature]
print(literature)

Round                     Category  Double Jeopardy!  Final Jeopardy!  \
0       A JIM CARREY FILM FESTIVAL               NaN              NaN   
1                              "!"               NaN              NaN   
2                          "-ARES"               5.0              NaN   
3              "-ICIAN" EXPEDITION               NaN              NaN   
4                    "...OD" WORDS               5.0              NaN   
...                            ...               ...              ...   
27990                   “R” MOVIES               5.0              NaN   
27991                     “SAINTS”               4.0              NaN   
27992                      “SOUTH”               5.0              NaN   
27993                    “STREETS”               NaN              NaN   
27994                “WH”AT IS IT?               5.0              NaN   

Round  Jeopardy!  Tiebreaker  
0            5.0         NaN  
1            5.0         NaN  
2            NaN         NaN  