In [2]:
import pandas as pd
import numpy as np

# Read the CSV (path is relative to the notebook's working directory) into a DataFrame
# and preview the first rows to see what the columns look like.
data_df = pd.read_csv('JEOPARDY_DATA.csv')
data_df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,12/31/2004,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,12/31/2004,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,12/31/2004,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,12/31/2004,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,12/31/2004,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [3]:
# Show the column labels exactly as they were read from the CSV header.
data_df.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [6]:
# Most of the column names came in with a stray leading space (e.g. ' Air Date').
# Drop every space from each label; note that this also removes the space inside
# 'Show Number', giving 'ShowNumber'.
data_df.columns = [label.replace(" ", "") for label in data_df.columns]
data_df.columns

Index(['ShowNumber', 'AirDate', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [7]:
# These names are fine. Ideally they'd be show_number and air_date, but what matters
# is that they contain no spaces, since spaces make columns really awkward to reference.

In [9]:
# Check which dtype pandas assigned to each column.
data_df.dtypes

ShowNumber     int64
AirDate       object
Round         object
Category      object
Value         object
Question      object
Answer        object
dtype: object

In [15]:
# AirDate was read in as plain strings ("object" dtype); convert it to a real datetime column.
# The format string describes how the dates are written in the file: month/day/year.
#   %m = two-digit month, %d = two-digit day, %Y = four-digit year.
# Case matters: lowercase %y would mean a two-digit year instead.
# Once parsed, the dates display as year-month-day, which sorts chronologically
# and is much easier to filter on.
data_df['AirDate'] = pd.to_datetime(data_df['AirDate'], format='%m/%d/%Y')
data_df.head()

Unnamed: 0,ShowNumber,AirDate,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [18]:
# .assign returns a new DataFrame with an extra column (here: 'month').
# The lambda is called once with the whole DataFrame -- not once per row --
# and the Series it returns becomes the new column.
data_df = data_df.assign(month=lambda frame: frame['AirDate'].dt.month)
data_df.head()

Unnamed: 0,ShowNumber,AirDate,Round,Category,Value,Question,Answer,month
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,12
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,12
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,12
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,12
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,12


In [20]:
# Same values as 'month', this time built row by row with DataFrame.apply.
# axis='columns' (identical to axis=1) calls the lambda once per row, passing that row
# indexed by column name; axis=0 would call it once per column instead.
data_df['month2'] = data_df.apply(lambda row: row['AirDate'].month, axis='columns')
data_df.head()

Unnamed: 0,ShowNumber,AirDate,Round,Category,Value,Question,Answer,month,month2
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,12,12
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,12,12
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,12,12
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,12,12
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,12,12


Filters:

In [24]:
# Copy the Value column into a plain Python list and keep only the first 50 entries.
value_list = data_df['Value'].tolist()[:50]
value_list
# Eyeballing the raw values like this shows how the data is structured:
# each entry is a string (hence the "object" dtype) carrying a '$', thousands commas
# and a trailing space, all of which must be stripped before treating it as a number.

['$200 ',
 '$200 ',
 '$200 ',
 '$200 ',
 '$200 ',
 '$200 ',
 '$400 ',
 '$400 ',
 '$400 ',
 '$400 ',
 '$400 ',
 '$400 ',
 '$600 ',
 '$600 ',
 '$600 ',
 '$600 ',
 '$600 ',
 '$600 ',
 '$800 ',
 '$800 ',
 '$800 ',
 '$800 ',
 '$2,000 ',
 '$800 ',
 '$1,000 ',
 '$1,000 ',
 '$1,000 ',
 '$1,000 ',
 '$1,000 ',
 '$400 ',
 '$400 ',
 '$400 ',
 '$400 ',
 '$400 ',
 '$400 ',
 '$800 ',
 '$800 ',
 '$800 ',
 '$800 ',
 '$800 ',
 '$1,200 ',
 '$2,000 ',
 '$1,200 ',
 '$1,200 ',
 '$1,200 ',
 '$1,600 ',
 '$1,600 ',
 '$1,600 ',
 '$1,600 ',
 '$1,600 ']

In [25]:
# We want to manipulate this data, but preserve the original values:
filtered_list = list(filter(lambda num : int(num.replace("$", "")
                                        .replace(" ", "")
                                        .replace(",", "")) > 300, value_list)) # Only for values greater than 300.
filtered_list

['$400 ',
 '$400 ',
 '$400 ',
 '$400 ',
 '$400 ',
 '$400 ',
 '$600 ',
 '$600 ',
 '$600 ',
 '$600 ',
 '$600 ',
 '$600 ',
 '$800 ',
 '$800 ',
 '$800 ',
 '$800 ',
 '$2,000 ',
 '$800 ',
 '$1,000 ',
 '$1,000 ',
 '$1,000 ',
 '$1,000 ',
 '$1,000 ',
 '$400 ',
 '$400 ',
 '$400 ',
 '$400 ',
 '$400 ',
 '$400 ',
 '$800 ',
 '$800 ',
 '$800 ',
 '$800 ',
 '$800 ',
 '$1,200 ',
 '$2,000 ',
 '$1,200 ',
 '$1,200 ',
 '$1,200 ',
 '$1,600 ',
 '$1,600 ',
 '$1,600 ',
 '$1,600 ',
 '$1,600 ']