In [156]:
import pandas as pd
from os import path
import numpy as np

In [180]:
name_title_degree_clean_path = '../../data/imbd_data/name_title_degree_clean.csv'
the_oscar_award_corrected_path = '../../data/imbd_data/the_oscar_award_corrected.csv'
all_acting_path = '../../data/imbd_data/all_acting.csv'

In [158]:
def load_dataset(filepath):
    df = pd.read_csv(filepath)
    return df



def write_to_csv(df,filepath):
    '''
    input: df - a pandas DataFrame
           filepath - an output filepath as a string

    writes to a csv file
    in same diretory as this script

    returns: nothing
    '''
    # if no csv exists
    if not path.exists(filepath):
        df.to_csv(filepath,index=False)
    else:
        df.to_csv(filepath, mode='a', header=False,index=False)

In [159]:
# load data
name_title_degree_df = load_dataset(name_title_degree_clean_path)
kaggle_df = load_dataset(the_oscar_award_corrected_path)

In [160]:
# add num times nominated 
kaggle_df['num_times_nominated']=1
kaggle_num_nom = pd.DataFrame([],columns = kaggle_df.columns)
names = kaggle_df.name.unique()
for name in names:
    name_df = kaggle_df[kaggle_df.name==name]
    name_df = name_df.sort_values(by=['year_film'])
    num_times = list(range(1, len(name_df)+1))
    name_df['num_times_nominated'] = num_times
    kaggle_num_nom = kaggle_num_nom.append(name_df)

print(kaggle_num_nom)

      year_film year_ceremony ceremony   category                 name  \
0          1927          1928        1      ACTOR  Richard Barthelmess   
1          1927          1928        1      ACTOR        Emil Jannings   
2          1927          1928        1    ACTRESS       Louise Dresser   
3          1927          1928        1    ACTRESS         Janet Gaynor   
534        1937          1938       10    ACTRESS         Janet Gaynor   
...         ...           ...      ...        ...                  ...   
10415      2020          2021       93  DIRECTING           Chloé Zhao   
10416      2020          2021       93  DIRECTING    Thomas Vinterberg   
10417      2020          2021       93  DIRECTING        David Fincher   
10418      2020          2021       93  DIRECTING      Lee Isaac Chung   
10419      2020          2021       93  DIRECTING      Emerald Fennell   

                        film winner num_times_nominated  
0                  The Noose  False                  

In [161]:
test = kaggle_num_nom[kaggle_num_nom.name=='20th Century-Fox']
print(test)

     year_film year_ceremony ceremony                    category  \
602       1937          1938       10      OUTSTANDING PRODUCTION   
730       1938          1939       11      OUTSTANDING PRODUCTION   
746       1938          1939       11    SHORT SUBJECT (One-reel)   
1027      1940          1941       13      OUTSTANDING PRODUCTION   
1202      1941          1942       14  OUTSTANDING MOTION PICTURE   
1224      1941          1942       14    SHORT SUBJECT (One-reel)   
1396      1942          1943       15  OUTSTANDING MOTION PICTURE   
1402      1942          1943       15     SHORT SUBJECT (Cartoon)   
1408      1942          1943       15    SHORT SUBJECT (One-reel)   
1586      1943          1944       16  OUTSTANDING MOTION PICTURE   
1580      1943          1944       16  OUTSTANDING MOTION PICTURE   
1585      1943          1944       16  OUTSTANDING MOTION PICTURE   
1755      1944          1945       17         BEST MOTION PICTURE   
2030      1946          1947      

In [162]:
print(name_title_degree_df.head())
print(kaggle_num_nom.head())
print(len(kaggle_num_nom))

      nconst     tconst          name  birthYear  deathYear category  \
0  nm0000001  tt0025164  Fred Astaire       1899     1987.0    actor   
1  nm0000001  tt0026942  Fred Astaire       1899     1987.0    actor   
2  nm0000001  tt0027125  Fred Astaire       1899     1987.0    actor   
3  nm0000001  tt0027630  Fred Astaire       1899     1987.0    actor   
4  nm0000001  tt0028333  Fred Astaire       1899     1987.0    actor   

       primaryTitle     originalTitle  film_year gender  age  degree  
0  The Gay Divorcee  The Gay Divorcee       1934      M   35       3  
1           Roberta           Roberta       1935      M   36       6  
2           Top Hat           Top Hat       1935      M   36       6  
3  Follow the Fleet  Follow the Fleet       1936      M   37       9  
4        Swing Time        Swing Time       1936      M   37       9  
    year_film year_ceremony ceremony category                 name  \
0        1927          1928        1    ACTOR  Richard Barthelmess   
1

In [163]:
# merge kaggle nominees with name_title_degree data
df = pd.merge(name_title_degree_df, kaggle_num_nom, how='left', 
              left_on=['name','primaryTitle'], 
              right_on = ['name','film'])

In [164]:
#compare lengh with newly merged dataset 
print(len(df))
print(len(name_title_degree_df))

9354828
9354812


In [165]:
# checkout merged data
print(df.head(1))

      nconst     tconst          name  birthYear  deathYear category_x  \
0  nm0000001  tt0025164  Fred Astaire       1899     1987.0      actor   

       primaryTitle     originalTitle  film_year gender  age  degree  \
0  The Gay Divorcee  The Gay Divorcee       1934      M   35       3   

  year_film year_ceremony ceremony category_y film winner num_times_nominated  
0       NaN           NaN      NaN        NaN  NaN    NaN                 NaN  


In [166]:
# compare nominee length vs original nominee dataset
print(len(kaggle_num_nom))
print(len(df[df.ceremony.notna()]))

1729
1527


In [167]:
# checkout columns
print(df.head(1))
print(df.columns)

      nconst     tconst          name  birthYear  deathYear category_x  \
0  nm0000001  tt0025164  Fred Astaire       1899     1987.0      actor   

       primaryTitle     originalTitle  film_year gender  age  degree  \
0  The Gay Divorcee  The Gay Divorcee       1934      M   35       3   

  year_film year_ceremony ceremony category_y film winner num_times_nominated  
0       NaN           NaN      NaN        NaN  NaN    NaN                 NaN  
Index(['nconst', 'tconst', 'name', 'birthYear', 'deathYear', 'category_x',
       'primaryTitle', 'originalTitle', 'film_year', 'gender', 'age', 'degree',
       'year_film', 'year_ceremony', 'ceremony', 'category_y', 'film',
       'winner', 'num_times_nominated'],
      dtype='object')


In [170]:
# include only relevant columns
df = df[['nconst', 'tconst', 'name', 'birthYear', 'deathYear', 
         'category_x','primaryTitle', 'originalTitle', 'film_year', 
         'gender', 'age', 'degree','year_ceremony', 'ceremony', 
         'category_y','winner', 'num_times_nominated']]

In [171]:
# fix column names
df.columns = ['nconst', 'tconst', 'name', 'birthYear', 'deathYear', 
         'profession','primaryTitle', 'originalTitle', 'film_year', 
         'gender', 'age', 'degree','year_ceremony', 'ceremony', 
         'category','winner', 'num_times_nominated']

In [177]:
# data sanity check
df[df.num_times_nominated>15]

Unnamed: 0,nconst,tconst,name,birthYear,deathYear,profession,primaryTitle,originalTitle,film_year,gender,age,degree,year_ceremony,ceremony,category,winner,num_times_nominated
5928,nm0000095,tt0113819,Woody Allen,1935,,actor,Mighty Aphrodite,Mighty Aphrodite,1995,M,60,59,1996,68,WRITING (Screenplay Written Directly for the S...,False,16
5931,nm0000095,tt0118954,Woody Allen,1935,,actor,Deconstructing Harry,Deconstructing Harry,1997,M,62,69,1998,70,WRITING (Screenplay Written Directly for the S...,False,17
79097,nm0000658,tt1007029,Meryl Streep,1949,,actress,The Iron Lady,The Iron Lady,2011,F,62,136,2012,84,ACTRESS IN A LEADING ROLE,True,17
79107,nm0000658,tt1135503,Meryl Streep,1949,,actress,Julie & Julia,Julie & Julia,2009,F,60,129,2010,82,ACTRESS IN A LEADING ROLE,False,16
79124,nm0000658,tt1322269,Meryl Streep,1949,,actress,August: Osage County,August: Osage County,2013,F,64,145,2014,86,ACTRESS IN A LEADING ROLE,False,18
79136,nm0000658,tt2180411,Meryl Streep,1949,,actress,Into the Woods,Into the Woods,2014,F,65,152,2015,87,ACTRESS IN A SUPPORTING ROLE,False,19
79145,nm0000658,tt4136084,Meryl Streep,1949,,actress,Florence Foster Jenkins,Florence Foster Jenkins,2016,F,67,160,2017,89,ACTRESS IN A LEADING ROLE,False,20
79148,nm0000658,tt6294822,Meryl Streep,1949,,actress,The Post,The Post,2017,F,68,167,2018,90,ACTRESS IN A LEADING ROLE,False,21


In [181]:
write_to_csv(df,all_acting_path)