In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# episodes
df_ep = pd.read_csv('./data/simpsons_episodes.csv', error_bad_lines=False)
df_ep.shape

(600, 14)

## Set Popularity Threshold

In [3]:
# define the movie imdb threshold for how popular the movie should be
threshold = 7.5

In [4]:
# assign the categories for the movie threshold and create this as the target set of values
df_ep['target'] = df_ep.apply(lambda x: x['imdb_rating'] >= threshold, axis=1)

# y = df_ep['target']  <-- separate y later to ensure shape

In [5]:
# restrict the episode dataset to numerical features
df_ep = df_ep.dropna()

df_episode_measures = df_ep[['id',
                             'number_in_season',
                             'number_in_series',
                             'original_air_year',
                             'season',
                             'us_viewers_in_millions',
                             'views',
                             'target']]

df_episode_measures.head()

Unnamed: 0,id,number_in_season,number_in_series,original_air_year,season,us_viewers_in_millions,views,target
0,10,10,10,1990,1,30.3,50816.0,False
1,12,12,12,1990,1,30.4,62561.0,True
2,14,1,14,1990,2,33.6,59575.0,True
3,17,4,17,1990,2,26.1,64959.0,True
4,19,6,19,1990,2,25.4,50691.0,True


In [6]:
# set the episode measures index to the episode in order to combine with the character information

# df_episode_measures = df_episode_measures.set_index('id')
# df_episode_measures.head()

In [7]:
df_episode_measures.shape

(593, 8)

# Import the character memorable line information 

In [8]:
high = pd.read_csv('./data/combined_thirds/high_third.csv')
mid = pd.read_csv('./data/combined_thirds/mid_third.csv')
low = pd.read_csv('./data/combined_thirds/low_third.csv')

### High frequency memorable lines merged with episode information

In [9]:
high = high.set_index('name tuple')
high = high.T
high.head()
# high.shape

name tuple,Homer Simpson,Marge Simpson,Bart Simpson,Lisa Simpson,C. Montgomery Burns,Moe Szyslak,Seymour Skinner,Ned Flanders,Grampa Simpson,Milhouse Van Houten,Chief Wiggum,Krusty the Clown,Nelson Muntz,Lenny Leonard,Apu Nahasapeemapetilon,Waylon Smithers,Kent Brockman,Carl Carlson,Edna Krabappel-Flanders,Dr. Julius Hibbert
176,58,14,19,3,16,7,0,0,0,9,1,0,0,14,0,6,6,8,0,0
177,18,21,7,2,0,26,15,0,24,0,27,0,0,0,0,1,0,0,0,0
179,75,23,15,18,0,6,0,0,0,0,0,0,0,2,0,0,0,2,0,0
178,18,14,40,48,0,0,3,0,4,0,14,0,1,0,0,0,0,0,0,0
180,17,16,14,6,0,0,18,0,4,0,0,0,1,0,1,0,2,0,10,0


In [10]:
high.index

Index(['176', '177', '179', '178', '180', '181', '182', '183', '184', '185',
       ...
       '454', '455', '453', '456', '457', '458', '459', '462', '460', '461'],
      dtype='object', length=552)

In [11]:
high = high.reset_index()
high['id'] = high.apply(lambda x: int(x['index']), axis=1)
high = high.drop('index', axis=1)
high.head()

name tuple,Homer Simpson,Marge Simpson,Bart Simpson,Lisa Simpson,C. Montgomery Burns,Moe Szyslak,Seymour Skinner,Ned Flanders,Grampa Simpson,Milhouse Van Houten,...,Krusty the Clown,Nelson Muntz,Lenny Leonard,Apu Nahasapeemapetilon,Waylon Smithers,Kent Brockman,Carl Carlson,Edna Krabappel-Flanders,Dr. Julius Hibbert,id
0,58,14,19,3,16,7,0,0,0,9,...,0,0,14,0,6,6,8,0,0,176
1,18,21,7,2,0,26,15,0,24,0,...,0,0,0,0,1,0,0,0,0,177
2,75,23,15,18,0,6,0,0,0,0,...,0,0,2,0,0,0,2,0,0,179
3,18,14,40,48,0,0,3,0,4,0,...,0,1,0,0,0,0,0,0,0,178
4,17,16,14,6,0,0,18,0,4,0,...,0,1,0,1,0,2,0,10,0,180


In [12]:
df_high = pd.merge(df_episode_measures, high, on='id', how='outer')
df_high.shape

(596, 28)

In [13]:
y_high = df_high['target']
X_high = df_high.drop('target', axis=1)

In [14]:
y_high.to_csv('y_high.csv', index=True)
X_high.to_csv('X_high.csv', index=True)

### Low frequency memorable lines merged with episode information 

In [15]:
low = low.set_index('name tuple')
low = low.T
low = low.reset_index()
low['id'] = low.apply(lambda x: int(x['index']), axis=1)
low = low.drop('index', axis=1)

In [16]:
df_low = pd.merge(df_episode_measures, low, on='id', how='outer')
df_low.shape

(596, 68)

In [17]:
y_low = df_low['target']
X_low = df_low.drop('target', axis=1)
y_low.to_csv('y_low.csv', index=True)
X_low.to_csv('X_low.csv', index=True)

### Mid frequency memorable lines merged with episode information

In [18]:
mid = mid.set_index('name tuple')
mid = mid.T
mid = mid.reset_index()
mid['id'] = mid.apply(lambda x: int(x['index']), axis=1)
mid = mid.drop('index', axis=1)

In [19]:
df_mid = pd.merge(df_episode_measures, mid, on='id', how='outer')
df_mid.shape

(596, 28)

In [20]:
y_mid = df_mid['target']
X_mid = df_mid.drop('target', axis=1)
y_mid.to_csv('y_mid.csv', index=True)
X_mid.to_csv('X_mid.csv', index=True)

### All thirds with episode information

In [21]:
df_all_ = pd.merge(df_high, mid, on='id', how='outer')
df_all = pd.merge(df_all_, low, on='id', how='outer')
df_all = df_all.fillna(0)
df_all

Unnamed: 0,id,number_in_season,number_in_series,original_air_year,season,us_viewers_in_millions,views,target,Homer Simpson,Marge Simpson,...,Artie Ziff,George H.W. Bush,Director,Scratchy,Marty,Salesman,Dan Gillick,Larry,Dr. Marvin Monroe,Clerk
0,10,10.0,10.0,1990.0,1.0,30.30,50816.0,False,73.0,33.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
1,12,12.0,12.0,1990.0,1.0,30.40,62561.0,True,31.0,11.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,14,1.0,14.0,1990.0,2.0,33.60,59575.0,True,23.0,15.0,...,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
3,17,4.0,17.0,1990.0,2.0,26.10,64959.0,True,45.0,22.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,19,6.0,19.0,1990.0,2.0,25.40,50691.0,True,91.0,27.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
591,542,12.0,542.0,2014.0,25.0,2.69,39292.0,False,31.0,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
592,548,18.0,548.0,2014.0,25.0,3.64,55742.0,False,39.0,20.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
593,160,0.0,0.0,0.0,0.0,0.00,0.0,0,16.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
594,161,0.0,0.0,0.0,0.0,0.00,0.0,0,36.0,17.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
y_all = df_all['target']
X_all = df_all.drop('target', axis=1)

In [23]:
y_all.to_csv('y_all.csv', index=True)
X_all.to_csv('X_all.csv', index=True)