# Doctor Who Feature Extraction

This notebook seeks to work with the `Merged.csv` file created in [the WhoAggregation Notebook](WhoAggregation.ipynb) to engineer meaningful columns. These values are then saved into `Processed.csv`.

In [345]:
# Imports
import pandas as pd
import json

In [346]:
# Load the dataset
df = pd.read_csv('../Data/Merged.csv')
df.set_index('episode_id', inplace=True)

# Drop Junk Columns or columns represented by other columns
df.drop(columns=['number', 'episodenbr', 'air_date'], inplace=True)

# Remove % from Share
df['share'] = df['share'].str.replace('%','')

# Remove m from views
df['views'] = df['views'].str.replace('m','')

df = df.sort_values('date')

df.head()

Unnamed: 0_level_0,title,weekday,broadcasthour,duration,views,share,AI,chart,cast,crew,summary,date,doctorid,rating,votes,description,season
episode_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1.01,Rose,Sat,7:00pm,00:44:14,10.81,44.8,76.0,7,"[{""role"":""Doctor Who"",""name"":""Christopher Eccl...","[{""role"":""Writer"",""name"":""Russell T Davies""},{...",,2005-03-26,9,7.6,6504,When ordinary shop-worker Rose Tyler meets a m...,1
1.02,The End of the World,Sat,6:59pm,00:44:45,7.97,37.8,76.0,19,"[{""role"":""Doctor Who"",""name"":""Christopher Eccl...","[{""role"":""Writer"",""name"":""Russell T Davies""},{...",,2005-04-02,9,7.6,5684,The Doctor takes Rose to the year 5 billion to...,1
1.03,The Unquiet Dead,Sat,7:00pm,00:44:50,8.86,37.8,80.0,15,"[{""role"":""Doctor Who"",""name"":""Christopher Eccl...","[{""role"":""Writer"",""name"":""Mark Gatiss""},{""role...",,2005-04-09,9,7.6,5326,The Doctor has great expectations for his late...,1
1.04,Aliens of London,Sat,7:00pm,00:45:05,7.63,35.7,82.0,18,"[{""role"":""Doctor Who"",""name"":""Christopher Eccl...","[{""role"":""Writer"",""name"":""Russell T Davies""},{...",,2005-04-16,9,7.0,5116,The Doctor returns Rose to her own time - well...,1
1.05,World War Three,Sat,7:01pm,00:40:40,7.98,40.2,81.0,20,"[{""role"":""Doctor Who"",""name"":""Christopher Eccl...","[{""role"":""Writer"",""name"":""Russell T Davies""},{...",,2005-04-23,9,7.1,4943,The Slitheen have infiltrated Parliament and h...,1


In [347]:
# Engineer columns for the relevant doctors
df['Has 9'] = df['doctorid'] == 9
df['Has 10'] = df['doctorid'] == 10
df['Has 11'] = df['doctorid'] == 11
df['Has 12'] = df['doctorid'] == 12
df['Has 13'] = df['doctorid'] == 13

# Convert Boolean Columns to Numeric
df.replace({False: 0, True: 1}, inplace=True)

df.head()

Unnamed: 0_level_0,title,weekday,broadcasthour,duration,views,share,AI,chart,cast,crew,...,doctorid,rating,votes,description,season,Has 9,Has 10,Has 11,Has 12,Has 13
episode_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.01,Rose,Sat,7:00pm,00:44:14,10.81,44.8,76.0,7,"[{""role"":""Doctor Who"",""name"":""Christopher Eccl...","[{""role"":""Writer"",""name"":""Russell T Davies""},{...",...,9,7.6,6504,When ordinary shop-worker Rose Tyler meets a m...,1,1,0,0,0,0
1.02,The End of the World,Sat,6:59pm,00:44:45,7.97,37.8,76.0,19,"[{""role"":""Doctor Who"",""name"":""Christopher Eccl...","[{""role"":""Writer"",""name"":""Russell T Davies""},{...",...,9,7.6,5684,The Doctor takes Rose to the year 5 billion to...,1,1,0,0,0,0
1.03,The Unquiet Dead,Sat,7:00pm,00:44:50,8.86,37.8,80.0,15,"[{""role"":""Doctor Who"",""name"":""Christopher Eccl...","[{""role"":""Writer"",""name"":""Mark Gatiss""},{""role...",...,9,7.6,5326,The Doctor has great expectations for his late...,1,1,0,0,0,0
1.04,Aliens of London,Sat,7:00pm,00:45:05,7.63,35.7,82.0,18,"[{""role"":""Doctor Who"",""name"":""Christopher Eccl...","[{""role"":""Writer"",""name"":""Russell T Davies""},{...",...,9,7.0,5116,The Doctor returns Rose to her own time - well...,1,1,0,0,0,0
1.05,World War Three,Sat,7:01pm,00:40:40,7.98,40.2,81.0,20,"[{""role"":""Doctor Who"",""name"":""Christopher Eccl...","[{""role"":""Writer"",""name"":""Russell T Davies""},{...",...,9,7.1,4943,The Slitheen have infiltrated Parliament and h...,1,1,0,0,0,0


In [348]:
# Determine which notable characters / monsters are in the episode
noteable_characters = [
    'Rose Tyler', 
    'Mickey Smith', 
    'Jackie Tyler', 
    'Cyberman',
    'Dalek',
    'Commander Strax'
    'Captain Jack Harkness',
    'Sarah Jane Smith',
    'Ood',
    'Donna Noble',
    'Martha Jones',
    'Judoon',
    'Amy Pond',
    'Rory',
    'River Song',
    'Clara',
    'Weeping Angel',
    'The Silent',
    'Madame Kovarian',
    'Madame Vastra',
    'Winston Churchill',
    'Sophie',
    'Jenny Flint',
    'The War Doctor',
    'Danny Pink',
    'Bill',
    'Osgood',
    'The Master',
    'Kate Lethbridge-Stewart',
    'Zygon',
    'Sontaran',
    'Nardole',
    'Yasmin Khan',
    'Graham O\'Brien',
    'Ryan Sinclair',
    'Grace',
    'Winston Churchill'
]

def extract_cast_info(row):

    # Load information about cast
    data = row['cast']
    arr = json.loads(data)
    for role in arr:
        roleName = role['role']

        # Clean up names
        if roleName.startswith('Dalek'):
            roleName = 'Dalek'
        if roleName.startswith('Cyber'):
            roleName = 'Cyberman'
        if roleName == 'Rory Williams':
            roleName = 'Rory'
        if roleName == 'Angel Bob':
            roleName = 'Weeping Angel'
        if roleName.startswith('Judoon'):
            roleName = 'Judoon'
        if roleName == 'Silent':
            roleName = 'The Silent'
        if roleName == 'Emperor Winston Churchill' or roleName == 'Churchill':
            roleName = 'Winston Churchill'

        # Add a column for the character if we care about them
        if roleName in noteable_characters:
            row['Has ' + roleName] = 1

    # Return the row with new characters
    return row

df = df.apply(extract_cast_info, axis=1)
df.head()

Unnamed: 0_level_0,AI,Has 10,Has 11,Has 12,Has 13,Has 9,Has Amy Pond,Has Bill,Has Clara,Has Cyberman,...,doctorid,duration,rating,season,share,summary,title,views,votes,weekday
episode_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.01,76.0,0,0,0,0,1,,,,,...,9,00:44:14,7.6,1,44.8,,Rose,10.81,6504,Sat
1.02,76.0,0,0,0,0,1,,,,,...,9,00:44:45,7.6,1,37.8,,The End of the World,7.97,5684,Sat
1.03,80.0,0,0,0,0,1,,,,,...,9,00:44:50,7.6,1,37.8,,The Unquiet Dead,8.86,5326,Sat
1.04,82.0,0,0,0,0,1,,,,,...,9,00:45:05,7.0,1,35.7,,Aliens of London,7.63,5116,Sat
1.05,81.0,0,0,0,0,1,,,,,...,9,00:40:40,7.1,1,40.2,,World War Three,7.98,4943,Sat


In [349]:
# Determine key information about crew - director and writer, mostly
def extract_crew_info(row):

    data = row['crew']

    arr = json.loads(data)
    for pair in arr:
        role = pair['role']
        name = pair['name']

        # Note: This code does not adequately handle multiple writers / producers / directors
        if role == 'Writer':
            row['Writer'] = name
        elif role == 'Producer':
            row['Producer'] = name
        elif role == 'Director':
            row['Director'] = name
        elif role == 'Incidental Music' or role == 'Music' or role == 'Composer':
            row['Music'] = name

    return row

df = df.apply(extract_crew_info, axis=1)
df.head()

Unnamed: 0_level_0,AI,Director,Has 10,Has 11,Has 12,Has 13,Has 9,Has Amy Pond,Has Bill,Has Clara,...,doctorid,duration,rating,season,share,summary,title,views,votes,weekday
episode_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.01,76.0,Keith Boak,0,0,0,0,1,,,,...,9,00:44:14,7.6,1,44.8,,Rose,10.81,6504,Sat
1.02,76.0,Euros Lyn,0,0,0,0,1,,,,...,9,00:44:45,7.6,1,37.8,,The End of the World,7.97,5684,Sat
1.03,80.0,Euros Lyn,0,0,0,0,1,,,,...,9,00:44:50,7.6,1,37.8,,The Unquiet Dead,8.86,5326,Sat
1.04,82.0,Keith Boak,0,0,0,0,1,,,,...,9,00:45:05,7.0,1,35.7,,Aliens of London,7.63,5116,Sat
1.05,81.0,Keith Boak,0,0,0,0,1,,,,...,9,00:40:40,7.1,1,40.2,,World War Three,7.98,4943,Sat


In [350]:
# Fix some missing values in our source data
df.loc[df['title'].eq('The Power of Three'), ['Director']] = 'Douglas Mackinnon'
df.loc[df['title'].eq('Closing Time'), ['Producer']] = 'Denise Paul'

df.head()


Unnamed: 0_level_0,AI,Director,Has 10,Has 11,Has 12,Has 13,Has 9,Has Amy Pond,Has Bill,Has Clara,...,doctorid,duration,rating,season,share,summary,title,views,votes,weekday
episode_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.01,76.0,Keith Boak,0,0,0,0,1,,,,...,9,00:44:14,7.6,1,44.8,,Rose,10.81,6504,Sat
1.02,76.0,Euros Lyn,0,0,0,0,1,,,,...,9,00:44:45,7.6,1,37.8,,The End of the World,7.97,5684,Sat
1.03,80.0,Euros Lyn,0,0,0,0,1,,,,...,9,00:44:50,7.6,1,37.8,,The Unquiet Dead,8.86,5326,Sat
1.04,82.0,Keith Boak,0,0,0,0,1,,,,...,9,00:45:05,7.0,1,35.7,,Aliens of London,7.63,5116,Sat
1.05,81.0,Keith Boak,0,0,0,0,1,,,,...,9,00:40:40,7.1,1,40.2,,World War Three,7.98,4943,Sat


In [351]:
# Move NaN columns to use 0's instead (helps summarization)
df['Director'] = df['Director'].fillna('Other')
df['Writer'] = df['Writer'].fillna('Other')
df['Producer'] = df['Producer'].fillna('Other')
df['Music'] = df['Music'].fillna('Other')
df = df.fillna(0)

# Drop unnecessary columns now that feature extraction is over
df.drop(columns=['crew', 'cast', 'summary','AI','duration','share','votes','chart','broadcasthour'], inplace=True)

df.head()

Unnamed: 0_level_0,Director,Has 10,Has 11,Has 12,Has 13,Has 9,Has Amy Pond,Has Bill,Has Clara,Has Cyberman,...,Producer,Writer,date,description,doctorid,rating,season,title,views,weekday
episode_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.01,Keith Boak,0,0,0,0,1,0.0,0.0,0.0,0.0,...,Phil Collinson,Russell T Davies,2005-03-26,When ordinary shop-worker Rose Tyler meets a m...,9,7.6,1,Rose,10.81,Sat
1.02,Euros Lyn,0,0,0,0,1,0.0,0.0,0.0,0.0,...,Phil Collinson,Russell T Davies,2005-04-02,The Doctor takes Rose to the year 5 billion to...,9,7.6,1,The End of the World,7.97,Sat
1.03,Euros Lyn,0,0,0,0,1,0.0,0.0,0.0,0.0,...,Phil Collinson,Mark Gatiss,2005-04-09,The Doctor has great expectations for his late...,9,7.6,1,The Unquiet Dead,8.86,Sat
1.04,Keith Boak,0,0,0,0,1,0.0,0.0,0.0,0.0,...,Phil Collinson,Russell T Davies,2005-04-16,The Doctor returns Rose to her own time - well...,9,7.0,1,Aliens of London,7.63,Sat
1.05,Keith Boak,0,0,0,0,1,0.0,0.0,0.0,0.0,...,Phil Collinson,Russell T Davies,2005-04-23,The Slitheen have infiltrated Parliament and h...,9,7.1,1,World War Three,7.98,Sat


In [352]:
# Convert the multi-category weekday column to individual day columns
df = pd.concat([df, pd.get_dummies(df['weekday'], prefix='Aired')],axis=1)
df.drop(columns=['weekday'], inplace=True)

df.head()

Unnamed: 0_level_0,Director,Has 10,Has 11,Has 12,Has 13,Has 9,Has Amy Pond,Has Bill,Has Clara,Has Cyberman,...,doctorid,rating,season,title,views,Aired_Fri,Aired_Mon,Aired_Sat,Aired_Sun,Aired_Tue
episode_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.01,Keith Boak,0,0,0,0,1,0.0,0.0,0.0,0.0,...,9,7.6,1,Rose,10.81,0,0,1,0,0
1.02,Euros Lyn,0,0,0,0,1,0.0,0.0,0.0,0.0,...,9,7.6,1,The End of the World,7.97,0,0,1,0,0
1.03,Euros Lyn,0,0,0,0,1,0.0,0.0,0.0,0.0,...,9,7.6,1,The Unquiet Dead,8.86,0,0,1,0,0
1.04,Keith Boak,0,0,0,0,1,0.0,0.0,0.0,0.0,...,9,7.0,1,Aliens of London,7.63,0,0,1,0,0
1.05,Keith Boak,0,0,0,0,1,0.0,0.0,0.0,0.0,...,9,7.1,1,World War Three,7.98,0,0,1,0,0


In [353]:
# Reorder columns
first_cols = ['title', 'season', 'doctorid', 'description','Producer','Director','Writer','Music']
for col_name in reversed(first_cols):
    df = df[ [col_name] + [ col for col in df.columns if col != col_name ] ]

df.head()

Unnamed: 0_level_0,title,season,doctorid,description,Producer,Director,Writer,Music,Has 10,Has 11,...,Has Yasmin Khan,Has Zygon,date,rating,views,Aired_Fri,Aired_Mon,Aired_Sat,Aired_Sun,Aired_Tue
episode_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.01,Rose,1,9,When ordinary shop-worker Rose Tyler meets a m...,Phil Collinson,Keith Boak,Russell T Davies,Murray Gold,0,0,...,0.0,0.0,2005-03-26,7.6,10.81,0,0,1,0,0
1.02,The End of the World,1,9,The Doctor takes Rose to the year 5 billion to...,Phil Collinson,Euros Lyn,Russell T Davies,Murray Gold,0,0,...,0.0,0.0,2005-04-02,7.6,7.97,0,0,1,0,0
1.03,The Unquiet Dead,1,9,The Doctor has great expectations for his late...,Phil Collinson,Euros Lyn,Mark Gatiss,Murray Gold,0,0,...,0.0,0.0,2005-04-09,7.6,8.86,0,0,1,0,0
1.04,Aliens of London,1,9,The Doctor returns Rose to her own time - well...,Phil Collinson,Keith Boak,Russell T Davies,Murray Gold,0,0,...,0.0,0.0,2005-04-16,7.0,7.63,0,0,1,0,0
1.05,World War Three,1,9,The Slitheen have infiltrated Parliament and h...,Phil Collinson,Keith Boak,Russell T Davies,Murray Gold,0,0,...,0.0,0.0,2005-04-23,7.1,7.98,0,0,1,0,0


In [354]:
# Serialize to File for further analysis
df.to_csv('../Data/Processed.csv')