## Data Preprocessing - IMDB Official Data
 - Description: Subsets of IMDb data are available for access to customers for personal and non-commercial use. You can hold local copies of this data, and it is subject to our terms and conditions. Please refer to the Non-Commercial Licensing and copyright/license and verify compliance.

In [1]:
import pandas as pd

In [2]:
path1 = 'IMDB_Official_Data/name.basics.tsv'
path2 = 'IMDB_Official_Data/title.akas.tsv'
path3 = 'IMDB_Official_Data/title.basics.tsv'
path4 = 'IMDB_Official_Data/title.crew.tsv'
path5 = 'IMDB_Official_Data/title.episode.tsv'
path6 = 'IMDB_Official_Data/title.principals.tsv'
path7 = 'IMDB_Official_Data/title.ratings.tsv'

In [3]:
def initial_inspection(file):
    df = pd.read_csv(file, sep='\t')
    print(f'Column attributes: {df.columns}')
    print(f'-------------------------------------------------------------------------------------------------------------------------------------------------------')
    print(df.head())
    return df

### Initial Import

#### Name basic file
 - nconst (string) - alphanumeric unique identifier of the name/person
 - primaryName (string)– name by which the person is most often credited
 - birthYear – in YYYY format
 - deathYear – in YYYY format if applicable, else '\N'
 - primaryProfession (array of strings)– the top-3 professions of the person
 - knownForTitles (array of tconsts) – titles the person is known for

In [4]:
name_basic = initial_inspection(path1)

Column attributes: Index(['nconst', 'primaryName', 'birthYear', 'deathYear', 'primaryProfession',
       'knownForTitles'],
      dtype='object')
-------------------------------------------------------------------------------------------------------------------------------------------------------
      nconst      primaryName birthYear deathYear  \
0  nm0000001     Fred Astaire      1899      1987   
1  nm0000002    Lauren Bacall      1924      2014   
2  nm0000003  Brigitte Bardot      1934        \N   
3  nm0000004     John Belushi      1949      1982   
4  nm0000005   Ingmar Bergman      1918      2007   

                    primaryProfession                           knownForTitles  
0        actor,miscellaneous,producer  tt0072308,tt0050419,tt0027125,tt0031983  
1  actress,soundtrack,archive_footage  tt0037382,tt0075213,tt0117057,tt0038355  
2   actress,music_department,producer  tt0057345,tt0049189,tt0056404,tt0054452  
3       actor,writer,music_department  tt0072562,tt0077975,

#### Title akas file
 - titleId (string) - a tconst, an alphanumeric unique identifier of the title
 - ordering (integer) – a number to uniquely identify rows for a given titleId
 - title (string) – the localized title
 - region (string) - the region for this version of the title
 - language (string) - the language of the title
 - types (array) - Enumerated set of attributes for this alternative title. One or more of the following: "alternative", "dvd", "festival", "tv", "video", "working", "original", "imdbDisplay". New values may be added in the future without warning
 - attributes (array) - Additional terms to describe this alternative title, not enumerated
 - isOriginalTitle (boolean) – 0: not original title; 1: original title

In [5]:
title_akas = initial_inspection(path2)

Column attributes: Index(['titleId', 'ordering', 'title', 'region', 'language', 'types',
       'attributes', 'isOriginalTitle'],
      dtype='object')
-------------------------------------------------------------------------------------------------------------------------------------------------------
     titleId  ordering                      title region language  \
0  tt0000001         1                 Carmencita     \N       \N   
1  tt0000001         2                 Carmencita     DE       \N   
2  tt0000001         3                 Carmencita     US       \N   
3  tt0000001         4  Carmencita - spanyol tánc     HU       \N   
4  tt0000001         5                 Καρμενσίτα     GR       \N   

         types     attributes  isOriginalTitle  
0     original             \N                1  
1           \N  literal title                0  
2  imdbDisplay             \N                0  
3  imdbDisplay             \N                0  
4  imdbDisplay             \N       

#### Title basic file
 - tconst (string) - alphanumeric unique identifier of the title
 - titleType (string) – the type/format of the title (e.g. movie, short, tvseries, tvepisode, video, etc)
 - primaryTitle (string) – the more popular title / the title used by the filmmakers on promotional materials at the point of release
 - originalTitle (string) - original title, in the original language
 - isAdult (boolean) - 0: non-adult title; 1: adult title
 - startYear (YYYY) – represents the release year of a title. In the case of TV Series, it is the series start year
 - endYear (YYYY) – TV Series end year. '\N' for all other title types
 - runtimeMinutes – primary runtime of the title, in minutes
 - genres (string array) – includes up to three genres associated with the title

In [6]:
title_basic = initial_inspection(path3)

  df = pd.read_csv(file, sep='\t')


Column attributes: Index(['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult',
       'startYear', 'endYear', 'runtimeMinutes', 'genres'],
      dtype='object')
-------------------------------------------------------------------------------------------------------------------------------------------------------
      tconst titleType            primaryTitle           originalTitle  \
0  tt0000001     short              Carmencita              Carmencita   
1  tt0000002     short  Le clown et ses chiens  Le clown et ses chiens   
2  tt0000003     short            Poor Pierrot          Pauvre Pierrot   
3  tt0000004     short             Un bon bock             Un bon bock   
4  tt0000005     short        Blacksmith Scene        Blacksmith Scene   

  isAdult startYear endYear runtimeMinutes                    genres  
0       0      1894      \N              1         Documentary,Short  
1       0      1892      \N              5           Animation,Short  
2       0      

#### Title crew file
 - tconst (string) - alphanumeric unique identifier of the title
 - directors (array of nconsts) - director(s) of the given title
 - writers (array of nconsts) – writer(s) of the given title

In [7]:
title_crew = initial_inspection(path4)

Column attributes: Index(['tconst', 'directors', 'writers'], dtype='object')
-------------------------------------------------------------------------------------------------------------------------------------------------------
      tconst  directors writers
0  tt0000001  nm0005690      \N
1  tt0000002  nm0721526      \N
2  tt0000003  nm0721526      \N
3  tt0000004  nm0721526      \N
4  tt0000005  nm0005690      \N


#### Title rating 
 - tconst (string) - alphanumeric unique identifier of the title
 - averageRating – weighted average of all the individual user ratings
 - numVotes - number of votes the title has received

In [9]:
title_rating = initial_inspection(path7)

Column attributes: Index(['tconst', 'averageRating', 'numVotes'], dtype='object')
-------------------------------------------------------------------------------------------------------------------------------------------------------
      tconst  averageRating  numVotes
0  tt0000001            5.7      2129
1  tt0000002            5.6       288
2  tt0000003            6.4      2165
3  tt0000004            5.3       184
4  tt0000005            6.2      2892


### Initial Merge

In [10]:
# Merged movie basic attributes such as title(primary/original), 
# type, start_year (released year), genre (up to three), director, 
# average rating, and number of votes received
merge_stg0 = title_akas.join(title_basic.set_index('tconst'), 
                             on='titleId')

In [11]:
movie_stg0 = merge_stg0.join(title_crew.set_index('tconst'), 
                                                on='titleId').join(title_rating.set_index('tconst'), 
                                                                   on='titleId')

In [12]:
movie_stg0.columns

Index(['titleId', 'ordering', 'title', 'region', 'language', 'types',
       'attributes', 'isOriginalTitle', 'titleType', 'primaryTitle',
       'originalTitle', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes',
       'genres', 'directors', 'writers', 'averageRating', 'numVotes'],
      dtype='object')

In [13]:
# Remove unrelevant columns
movie_stg1 = movie_stg0[['titleId', 'primaryTitle', 'originalTitle', 'titleType', 
                         'startYear', 'genres', 'directors', 'writers', 'averageRating', 'numVotes']]

In [14]:
# Remove duplicate and null value
movie_stg2 = movie_stg1[movie_stg1['titleType'] == 'movie'].drop_duplicates(subset=['primaryTitle']).dropna()

In [15]:
movie_stg2.head()

Unnamed: 0,titleId,primaryTitle,originalTitle,titleType,startYear,genres,directors,writers,averageRating,numVotes
74,tt0000009,Miss Jerry,Miss Jerry,movie,1894,Romance,nm0085156,nm0085156,5.4,218.0
763,tt0000147,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,movie,1897,"Documentary,News,Sport",nm0714557,\N,5.3,549.0
2069,tt0000502,Bohemios,Bohemios,movie,1905,\N,nm0063413,"nm0063413,nm0657268,nm0675388",3.8,20.0
2330,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,movie,1906,"Action,Adventure,Biography",nm0846879,nm0846879,6.0,970.0
2399,tt0000591,The Prodigal Son,L'enfant prodigue,movie,1907,Drama,nm0141150,nm0141150,5.6,30.0


In [16]:
movie_stg2.shape

(279145, 10)

In [17]:
len(list(movie_stg2['primaryTitle'].unique()))

279145

In [38]:
movie_stg2.head()

Unnamed: 0,titleId,primaryTitle,originalTitle,titleType,startYear,genres,directors,writers,averageRating,numVotes
74,tt0000009,Miss Jerry,Miss Jerry,movie,1894,Romance,nm0085156,nm0085156,5.4,218.0
763,tt0000147,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,movie,1897,"Documentary,News,Sport",nm0714557,\N,5.3,549.0
2069,tt0000502,Bohemios,Bohemios,movie,1905,\N,nm0063413,"nm0063413,nm0657268,nm0675388",3.8,20.0
2330,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,movie,1906,"Action,Adventure,Biography",nm0846879,nm0846879,6.0,970.0
2399,tt0000591,The Prodigal Son,L'enfant prodigue,movie,1907,Drama,nm0141150,nm0141150,5.6,30.0


In [41]:
# Unmask director and writer names

# Split `directors` into individual rows
directors_split = movie_stg2[['titleId', 'directors']].copy()
directors_split = directors_split.assign(directors=directors_split['directors'].str.split(',')).explode('directors')


In [107]:
movie_stg3 = movie_stg2.copy(deep=True)

In [62]:
name_dict = {}

for i in range(len(name_basic)):
    name_dict[name_basic.iloc[i][0]] = name_basic.iloc[i][1]

  name_dict[name_basic.iloc[i][0]] = name_basic.iloc[i][1]


In [77]:
x = 'nm0085156'
name_dict[x]

'Alexander Black'

In [109]:
# Assuming movie_stg2 and name_dict are already defined
movie_stg3 = movie_stg2.copy(deep=True)

# Test on the first 5 rows (or any small subset)
test_rows = 5  # Number of rows to test
for i in range(test_rows):  # Iterate over the first `test_rows` rows
    # Access data directly from movie_stg3
    titleId = movie_stg3.iat[i, 0]
    directors = movie_stg3.iat[i, 6]
    writers = movie_stg3.iat[i, 7]

    # Handle missing values (e.g., '\N' or NaN) before splitting
    directors_split = [] if pd.isna(directors) or directors == '\\N' else directors.split(',')
    writers_split = [] if pd.isna(writers) or writers == '\\N' else writers.split(',')

    # Replace placeholders and map names using name_dict
    directors_split = [name_dict.get(d, d) for d in directors_split]
    writers_split = [name_dict.get(w, w) for w in writers_split]

    # Join the lists back into strings
    joined_director = ','.join(directors_split)
    joined_writer = ','.join(writers_split)

    # Update the values in movie_stg3
    movie_stg3.at[i, 'directors'] = joined_director
    movie_stg3.at[i, 'writers'] = joined_writer

# Check the changes in movie_stg3
print(movie_stg3.head(test_rows))

        titleId                   primaryTitle                  originalTitle  \
74    tt0000009                     Miss Jerry                     Miss Jerry   
763   tt0000147  The Corbett-Fitzsimmons Fight  The Corbett-Fitzsimmons Fight   
2069  tt0000502                       Bohemios                       Bohemios   
2330  tt0000574    The Story of the Kelly Gang    The Story of the Kelly Gang   
2399  tt0000591               The Prodigal Son              L'enfant prodigue   

     titleType startYear                      genres  directors  \
74       movie      1894                     Romance  nm0085156   
763      movie      1897      Documentary,News,Sport  nm0714557   
2069     movie      1905                          \N  nm0063413   
2330     movie      1906  Action,Adventure,Biography  nm0846879   
2399     movie      1907                       Drama  nm0141150   

                            writers  averageRating  numVotes  
74                        nm0085156            

In [103]:
movie_stg2.head()

Unnamed: 0,titleId,primaryTitle,originalTitle,titleType,startYear,genres,directors,writers,averageRating,numVotes
74,tt0000009,Miss Jerry,Miss Jerry,movie,1894,Romance,nm0085156,nm0085156,5.4,218.0
763,tt0000147,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,movie,1897,"Documentary,News,Sport",nm0714557,\N,5.3,549.0
2069,tt0000502,Bohemios,Bohemios,movie,1905,\N,nm0063413,"nm0063413,nm0657268,nm0675388",3.8,20.0
2330,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,movie,1906,"Action,Adventure,Biography",nm0846879,nm0846879,6.0,970.0
2399,tt0000591,The Prodigal Son,L'enfant prodigue,movie,1907,Drama,nm0141150,nm0141150,5.6,30.0


In [106]:
movie_stg3.head(2)

Unnamed: 0,titleId,primaryTitle,originalTitle,titleType,startYear,genres,directors,writers,averageRating,numVotes
74,tt0000009,Miss Jerry,Miss Jerry,movie,1894,Romance,nm0085156,nm0085156,5.4,218.0
763,tt0000147,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,movie,1897,"Documentary,News,Sport",nm0714557,\N,5.3,549.0


In [26]:
movie_stg3 = movie_stg2.join(movie_with_names.set_index('titleId'), on='titleId')

In [36]:
movie_stg3.head()

Unnamed: 0,titleId,primaryTitle,originalTitle,titleType,startYear,genres,directors,writers,averageRating,numVotes,director_name,writer_name
74,tt0000009,Miss Jerry,Miss Jerry,movie,1894,Romance,nm0085156,nm0085156,5.4,218.0,Alexander Black,Alexander Black
763,tt0000147,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,movie,1897,"Documentary,News,Sport",nm0714557,\N,5.3,549.0,Enoch J. Rector,
2069,tt0000502,Bohemios,Bohemios,movie,1905,\N,nm0063413,"nm0063413,nm0657268,nm0675388",3.8,20.0,Ricardo de Baños,Ricardo de Baños
2069,tt0000502,Bohemios,Bohemios,movie,1905,\N,nm0063413,"nm0063413,nm0657268,nm0675388",3.8,20.0,Ricardo de Baños,Miguel de Palacios
2069,tt0000502,Bohemios,Bohemios,movie,1905,\N,nm0063413,"nm0063413,nm0657268,nm0675388",3.8,20.0,Ricardo de Baños,Guillermo Perrín


In [None]:
movie_stg3.iloc[3]

titleId                              tt0000502
primaryTitle                          Bohemios
originalTitle                         Bohemios
titleType                                movie
startYear                                 1905
genres                                      \N
directors                            nm0063413
writers          nm0063413,nm0657268,nm0675388
averageRating                              3.8
numVotes                                  20.0
director_name                 Ricardo de Baños
writer_name                 Miguel de Palacios
Name: 2069, dtype: object