In [1]:
import pandas as pd
from os import path
import numpy as np

In [2]:
name_basics_processed_path='../../data/imbd_data/name_basics_processed.csv'
title_basics_processed_path='../../data/imbd_data/title_basics_processed.csv'
title_principals_processed_path='../../data/imbd_data/title_principals_processed.csv'
name_title_processed_path='../../data/imbd_data/name_title_processed.csv'

In [3]:
def write_to_csv(df,filepath):
    '''
    input: df - a pandas DataFrame
           filepath - an output filepath as a string

    writes to a csv file
    in same diretory as this script

    returns: nothing
    '''
    # if no csv exists
    if not path.exists(filepath):
        df.to_csv(filepath,index=False)
    else:
        df.to_csv(filepath, mode='a', header=False,index=False)

In [4]:
def load_dataset(filepath):
    df = pd.read_csv(filepath)
    return df


In [5]:
names_df = load_dataset(name_basics_processed_path)
title_df = load_dataset(title_basics_processed_path)
title_p_df = load_dataset(title_principals_processed_path)

In [6]:
names_df.head()
print(names_df[names_df['primaryName'].isin(['Morgan Freeman'])])

        nconst     primaryName  birthYear  deathYear
148  nm0000151  Morgan Freeman       1937        NaN


In [7]:
title_p_df.head()

Unnamed: 0,tconst,nconst,category
0,tt0000005,nm0443482,actor
1,tt0000005,nm0653042,actor
2,tt0000007,nm0179163,actor
3,tt0000007,nm0183947,actor
4,tt0000008,nm0653028,actor


In [8]:
title_df.head()

Unnamed: 0,tconst,primaryTitle,originalTitle,startYear
0,tt0000001,Carmencita,Carmencita,1894
1,tt0000002,Le clown et ses chiens,Le clown et ses chiens,1892
2,tt0000003,Pauvre Pierrot,Pauvre Pierrot,1892
3,tt0000004,Un bon bock,Un bon bock,1892
4,tt0000005,Blacksmith Scene,Blacksmith Scene,1893


In [9]:
print(names_df.info())
print(title_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 531160 entries, 0 to 531159
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   nconst       531160 non-null  object 
 1   primaryName  531160 non-null  object 
 2   birthYear    531160 non-null  int64  
 3   deathYear    181393 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 16.2+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7195169 entries, 0 to 7195168
Data columns (total 4 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   tconst         object
 1   primaryTitle   object
 2   originalTitle  object
 3   startYear      int64 
dtypes: int64(1), object(3)
memory usage: 219.6+ MB
None


In [12]:
# merge names with title dataframe
df = names_df.merge(title_p_df, 
                    left_on=['nconst'], 
                    right_on=['nconst'], how='left')

In [13]:
print(df.head())
print(df[df['primaryName'].isin(['Morgan Freeman'])])

      nconst   primaryName  birthYear  deathYear     tconst category
0  nm0000001  Fred Astaire       1899     1987.0  tt0025164    actor
1  nm0000001  Fred Astaire       1899     1987.0  tt0026942    actor
2  nm0000001  Fred Astaire       1899     1987.0  tt0027125    actor
3  nm0000001  Fred Astaire       1899     1987.0  tt0027630    actor
4  nm0000001  Fred Astaire       1899     1987.0  tt0028333    actor
          nconst     primaryName  birthYear  deathYear     tconst category
13587  nm0000151  Morgan Freeman       1937        NaN  tt0066651    actor
13588  nm0000151  Morgan Freeman       1937        NaN  tt0079379    actor
13589  nm0000151  Morgan Freeman       1937        NaN  tt0080392    actor
13590  nm0000151  Morgan Freeman       1937        NaN  tt0080474    actor
13591  nm0000151  Morgan Freeman       1937        NaN  tt0082719    actor
...          ...             ...        ...        ...        ...      ...
14370  nm0000151  Morgan Freeman       1937        NaN  tt586

In [14]:
# merge names with title dataframe
df = df.merge(title_df, 
                    left_on=['tconst'], 
                    right_on=['tconst'], how='left')

In [15]:
print(df.head())
print(df[df['primaryName'].isin(['Morgan Freeman'])])

      nconst   primaryName  birthYear  deathYear     tconst category  \
0  nm0000001  Fred Astaire       1899     1987.0  tt0025164    actor   
1  nm0000001  Fred Astaire       1899     1987.0  tt0026942    actor   
2  nm0000001  Fred Astaire       1899     1987.0  tt0027125    actor   
3  nm0000001  Fred Astaire       1899     1987.0  tt0027630    actor   
4  nm0000001  Fred Astaire       1899     1987.0  tt0028333    actor   

       primaryTitle     originalTitle  startYear  
0  The Gay Divorcee  The Gay Divorcee     1934.0  
1           Roberta           Roberta     1935.0  
2           Top Hat           Top Hat     1935.0  
3  Follow the Fleet  Follow the Fleet     1936.0  
4        Swing Time        Swing Time     1936.0  
          nconst     primaryName  birthYear  deathYear     tconst category  \
13587  nm0000151  Morgan Freeman       1937        NaN  tt0066651    actor   
13588  nm0000151  Morgan Freeman       1937        NaN  tt0079379    actor   
13589  nm0000151  Morgan Fr

In [16]:


df = df[['primaryName','birthYear','deathYear',
         'category','primaryTitle','originalTitle',
         'startYear']]

df.columns = ['name','birthYear','deathYear',
              'category','primaryTitle','originalTitle','film_year']

print(df.columns)
print(df.head(2))

Index(['name', 'birthYear', 'deathYear', 'category', 'primaryTitle',
       'originalTitle', 'film_year'],
      dtype='object')
           name  birthYear  deathYear category      primaryTitle  \
0  Fred Astaire       1899     1987.0    actor  The Gay Divorcee   
1  Fred Astaire       1899     1987.0    actor           Roberta   

      originalTitle  film_year  
0  The Gay Divorcee     1934.0  
1           Roberta     1935.0  


In [17]:
# create gender category
df['gender'] = np.where(df['category']=='actor', 'M', 'F')

In [18]:
print(df.head(1))
print(df[df['name'].isin(['Morgan Freeman'])])

           name  birthYear  deathYear category      primaryTitle  \
0  Fred Astaire       1899     1987.0    actor  The Gay Divorcee   

      originalTitle  film_year gender  
0  The Gay Divorcee     1934.0      M  
                 name  birthYear  deathYear category  \
13587  Morgan Freeman       1937        NaN    actor   
13588  Morgan Freeman       1937        NaN    actor   
13589  Morgan Freeman       1937        NaN    actor   
13590  Morgan Freeman       1937        NaN    actor   
13591  Morgan Freeman       1937        NaN    actor   
...               ...        ...        ...      ...   
14370  Morgan Freeman       1937        NaN    actor   
14371  Morgan Freeman       1937        NaN    actor   
14372  Morgan Freeman       1937        NaN    actor   
14373  Morgan Freeman       1937        NaN    actor   
14374  Morgan Freeman       1937        NaN    actor   

                                            primaryTitle  \
13587                               The Electric C

In [19]:
# only include rows with valid film_year
df = df[df['film_year'].notna()]

# make sure years are integers
df.birthYear = df.birthYear.astype(int)
df.film_year = df.film_year.astype(int)

# create person age 
df['age'] = df.film_year-df.birthYear

In [20]:
print(df.head(1))
print(len(df))

           name  birthYear  deathYear category      primaryTitle  \
0  Fred Astaire       1899     1987.0    actor  The Gay Divorcee   

      originalTitle  film_year gender  age  
0  The Gay Divorcee       1934      M   35  
9477183


In [21]:
#test data
print(df.info())
print(df[df['name'].isin(['Morgan Freeman'])])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9477183 entries, 0 to 10194518
Data columns (total 9 columns):
 #   Column         Dtype  
---  ------         -----  
 0   name           object 
 1   birthYear      int32  
 2   deathYear      float64
 3   category       object 
 4   primaryTitle   object 
 5   originalTitle  object 
 6   film_year      int32  
 7   gender         object 
 8   age            int32  
dtypes: float64(1), int32(3), object(5)
memory usage: 614.6+ MB
None
                 name  birthYear  deathYear category  \
13587  Morgan Freeman       1937        NaN    actor   
13588  Morgan Freeman       1937        NaN    actor   
13589  Morgan Freeman       1937        NaN    actor   
13590  Morgan Freeman       1937        NaN    actor   
13591  Morgan Freeman       1937        NaN    actor   
...               ...        ...        ...      ...   
14369  Morgan Freeman       1937        NaN    actor   
14370  Morgan Freeman       1937        NaN    actor   
14371 

In [23]:
df['category'].unique()

array(['actor', 'actress'], dtype=object)

In [22]:
write_to_csv(df,name_title_processed_path)