In [138]:
import pandas as pd
from os import path
import numpy as np

In [139]:
# Locations of the processed CSV files, relative to this notebook.
# The first three are read below; the last one is where the result is written.
name_basics_processed_path = '../../data/imbd_data/name_basics_processed.csv'
title_basics_processed_path = '../../data/imbd_data/title_basics_processed.csv'
title_principals_processed_path = '../../data/imbd_data/title_principals_processed.csv'
name_title_processed_path = '../../data/imbd_data/name_title_processed.csv'

In [140]:
def write_to_csv(df,filepath):
    '''
    Write a DataFrame to a csv file, appending if the file already has content.

    input: df - a pandas DataFrame
           filepath - an output filepath as a string

    If filepath does not exist, or exists but is empty (zero bytes), the
    frame is written together with its header row. Otherwise the rows are
    appended to the existing file without repeating the header.
    The index is never written.

    returns: nothing
    '''
    # A missing or zero-byte file has no header row yet, so one must be written;
    # appending header-less rows to an empty file would produce a csv whose
    # first data row is later mistaken for the header.
    needs_header = (not path.exists(filepath)) or path.getsize(filepath) == 0
    if needs_header:
        df.to_csv(filepath, index=False)
    else:
        df.to_csv(filepath, mode='a', header=False, index=False)

In [141]:
def load_dataset(filepath):
    '''
    input: filepath - path of a csv file

    returns: the file contents as a pandas DataFrame
             (read with the pandas.read_csv defaults)
    '''
    return pd.read_csv(filepath)


In [None]:
# Read the processed names, titles and title-principals csv files.
names_df = load_dataset(filepath=name_basics_processed_path)
title_df = load_dataset(filepath=title_basics_processed_path)
title_p_df = load_dataset(filepath=title_principals_processed_path)

In [None]:
# Preview the first rows. Only the last expression of a cell is displayed
# automatically, so a head() call that is followed by another statement
# has to be printed explicitly or its result is silently discarded.
print(names_df.head())
# Spot-check: rows whose primaryName is exactly 'Morgan Freeman'.
print(names_df[names_df['primaryName'].isin(['Morgan Freeman'])])

In [None]:
# Preview the first rows of title_p_df.
title_p_df.head()

In [None]:
# Preview the first rows of title_df.
title_df.head()

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line after each summary.
names_df.info()
title_df.info()

In [None]:
# Join every row of names_df with the matching rows of title_p_df on 'nconst'.
# how='left' keeps names that have no match; their title_p_df columns are NaN.
df = pd.merge(names_df, title_p_df, on='nconst', how='left')

In [None]:
# Preview the merged frame, then spot-check the rows for one known name.
print(df.head())
print(df[df['primaryName'].isin(['Morgan Freeman'])])

In [127]:
# Add the columns of title_df to each row by joining on 'tconst'.
# how='left' keeps rows whose tconst has no match in title_df (NaN title columns).
df = pd.merge(df, title_df, on='tconst', how='left')

In [128]:
# Preview the frame after the second merge.
print(df.head())

      nconst   primaryName  birthYear  deathYear     tconst category  \
0  nm0000001  Fred Astaire       1899     1987.0  tt0025164    actor   
1  nm0000001  Fred Astaire       1899     1987.0  tt0026942    actor   
2  nm0000001  Fred Astaire       1899     1987.0  tt0027125    actor   
3  nm0000001  Fred Astaire       1899     1987.0  tt0027630    actor   
4  nm0000001  Fred Astaire       1899     1987.0  tt0028333    actor   

       primaryTitle     originalTitle  startYear  
0  The Gay Divorcee  The Gay Divorcee     1934.0  
1           Roberta           Roberta     1935.0  
2           Top Hat           Top Hat     1935.0  
3  Follow the Fleet  Follow the Fleet     1936.0  
4        Swing Time        Swing Time     1936.0  


In [129]:

df = df.dropna()
df.startYear = df.startYear.astype(int)

df = df[['primaryName','birthYear','deathYear',
         'category','primaryTitle','originalTitle',
         'startYear']]

df.columns = ['name','birthYear','deathYear',
              'category','primaryTitle','originalTitle','film_year']

print(df.columns)
print(df.head(2))

Index(['name', 'birthYear', 'deathYear', 'category', 'primaryTitle',
       'originalTitle', 'film_year'],
      dtype='object')
           name  birthYear  deathYear category      primaryTitle  \
0  Fred Astaire       1899     1987.0    actor  The Gay Divorcee   
1  Fred Astaire       1899     1987.0    actor           Roberta   

      originalTitle  film_year  
0  The Gay Divorcee       1934  
1           Roberta       1935  


In [130]:
# create gender category
df['gender'] = np.where(df['category']=='actor', 'M', 'F')

In [131]:
print(df.head(1))

           name  birthYear  deathYear category      primaryTitle  \
0  Fred Astaire       1899     1987.0    actor  The Gay Divorcee   

      originalTitle  film_year gender  
0  The Gay Divorcee       1934      M  


In [132]:
# make sure years are integers
df.birthYear = df.birthYear.astype(int)
df.deathYear = df.deathYear.astype(int)
df.film_year = df.film_year.astype(int)

# create person age 
df['age'] = df.film_year-df.birthYear

In [133]:
# Show the first row and the total row count of the frame.
print(df.head(1))
print(len(df))

           name  birthYear  deathYear category      primaryTitle  \
0  Fred Astaire       1899       1987    actor  The Gay Divorcee   

      originalTitle  film_year gender  age  
0  The Gay Divorcee       1934      M   35  
2164540


In [134]:
# Spot-check: look up one known name in the final frame. An empty result
# means that person's rows were dropped somewhere upstream.
print(df[df['name'].isin(['Morgan Freeman'])])

Empty DataFrame
Columns: [name, birthYear, deathYear, category, primaryTitle, originalTitle, film_year, gender, age]
Index: []


In [135]:
# Overwrite rather than append: this frame is the complete result, and
# write_to_csv() appends when the file already exists, so every re-run of the
# notebook would add another full copy of the rows to the csv.
df.to_csv(name_title_processed_path, index=False)