In [54]:
import os
import numpy as np
import pandas as pd

In [55]:
# Create the output folder for the processed files.
# Passing exist_ok=True means no error is raised if the folder already exists.
os.makedirs('DATA/',exist_ok=True)

In [56]:
# Verify the folder has been created by listing the working directory
os.listdir()

['.git',
 '.gitattributes',
 '.gitignore',
 '.ipynb_checkpoints',
 'DATA',
 'DATA_Loading.ipynb',
 'LICENSE',
 'Raw_DATA',
 'README.md']

In [57]:
# Read the title-akas extract (the file name indicates it holds only US rows)
movies_us_only = pd.read_csv(
    "Raw_DATA/title-akas-us-only.csv",
    sep=',',
    low_memory=False,
    on_bad_lines='skip',  # malformed rows are skipped instead of raising
    encoding='utf-8',
)
# Column summary first, then the first rows as the cell's displayed result
movies_us_only.info()
movies_us_only.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1452564 entries, 0 to 1452563
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1452564 non-null  object
 1   ordering         1452564 non-null  int64 
 2   title            1452562 non-null  object
 3   region           1452564 non-null  object
 4   language         1452564 non-null  object
 5   types            1452564 non-null  object
 6   attributes       1452564 non-null  object
 7   isOriginalTitle  1452564 non-null  object
dtypes: int64(1), object(7)
memory usage: 88.7+ MB


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
1,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
3,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
4,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0


In [58]:
# Read the ratings table (gzip-compressed, tab-separated file)
movies_ratings = pd.read_csv(
    "Raw_DATA/title.ratings.tsv.gz",
    sep='\t',
    low_memory=False,
    on_bad_lines='skip',  # malformed rows are skipped instead of raising
    encoding='utf-8',
)
# Column summary first, then the first rows as the cell's displayed result
movies_ratings.info()
movies_ratings.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1331492 entries, 0 to 1331491
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1331492 non-null  object 
 1   averageRating  1331492 non-null  float64
 2   numVotes       1331492 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.5+ MB


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1988
1,tt0000002,5.8,265
2,tt0000003,6.5,1849
3,tt0000004,5.5,178
4,tt0000005,6.2,2632


In [28]:
# load basics (title.basics.tsv.gz, tab-separated)
movies_basics = pd.read_csv("Raw_DATA/title.basics.tsv.gz",sep='\t',low_memory=False,on_bad_lines='skip',encoding='utf-8')
movies_basics.info()
movies_basics.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10017011 entries, 0 to 10017010
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 687.8+ MB


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [59]:
# Build a boolean mask over movies_basics that is True where the title id (tconst)
# also appears in the titleId column of movies_us_only.
# (The two tables use different names for the same id: titleId and tconst.)
filter_us_movies = movies_basics['tconst'].isin(movies_us_only['titleId'])

In [60]:
# Keep only the rows flagged by the US mask, then check the resulting size and dtypes
movies_basics = movies_basics.loc[filter_us_movies]
movies_basics.info()

<class 'pandas.core.frame.DataFrame'>
Index: 203476 entries, 8 to 10016777
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          203476 non-null  object 
 1   titleType       203476 non-null  object 
 2   primaryTitle    203475 non-null  object 
 3   originalTitle   203475 non-null  object 
 4   isAdult         203476 non-null  object 
 5   startYear       199907 non-null  float64
 6   endYear         0 non-null       object 
 7   runtimeMinutes  203476 non-null  object 
 8   genres          203476 non-null  object 
dtypes: float64(1), object(8)
memory usage: 15.5+ MB


In [61]:
# According to the data dictionary, the placeholder '\N' marks a missing value, so it is replaced with NaN.
# The backslash is doubled in the literal ('\\N') so that Python reads it as a plain backslash
# followed by N rather than as the start of an escape sequence.
movies_basics = movies_basics.replace({
    '\\N':np.nan
})
# Count the missing values per column after the replacement
movies_basics.isna().sum()

tconst                 0
titleType              0
primaryTitle           1
originalTitle          1
isAdult                0
startYear           3569
endYear           203476
runtimeMinutes         0
genres                 0
dtype: int64

In [62]:
# Drop rows with null values in runtimeMinutes or genres
movies_basics = movies_basics.dropna(subset=['runtimeMinutes','genres'])
# Re-count the missing values per column
movies_basics.isna().sum()

tconst                 0
titleType              0
primaryTitle           1
originalTitle          1
isAdult                0
startYear           3569
endYear           203476
runtimeMinutes         0
genres                 0
dtype: int64

In [63]:
# Check the distinct values of column 'titleType' before filtering down to movies
movies_basics['titleType'].value_counts()

titleType
movie    203476
Name: count, dtype: int64

In [64]:
# Keep only full-length movies (titleType equal to 'movie')
filter_FullLength_movies = movies_basics['titleType'].eq('movie')
movies_basics = movies_basics.loc[filter_FullLength_movies]
# Verify: list the titleType values that remain
movies_basics['titleType'].value_counts()

titleType
movie    203476
Name: count, dtype: int64

In [65]:
# Number of titles per startYear value
movies_basics['startYear'].value_counts()

startYear
2019.0    8102
2018.0    7866
2017.0    7816
2016.0    7415
2015.0    7228
          ... 
1906.0       2
1899.0       1
1904.0       1
1897.0       1
1894.0       1
Name: count, Length: 128, dtype: int64

In [66]:
# Convert startYear to a float dtype.
# assign() returns a new frame instead of writing a column into a frame that came
# from an earlier boolean-mask selection; that in-place write is the pattern that
# can raise pandas' SettingWithCopyWarning.
movies_basics = movies_basics.assign(startYear=movies_basics['startYear'].astype(float))
# verify
movies_basics.info()

<class 'pandas.core.frame.DataFrame'>
Index: 203476 entries, 8 to 10016777
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          203476 non-null  object 
 1   titleType       203476 non-null  object 
 2   primaryTitle    203475 non-null  object 
 3   originalTitle   203475 non-null  object 
 4   isAdult         203476 non-null  object 
 5   startYear       199907 non-null  float64
 6   endYear         0 non-null       float64
 7   runtimeMinutes  203476 non-null  object 
 8   genres          203476 non-null  object 
dtypes: float64(2), object(7)
memory usage: 15.5+ MB


In [67]:
# Mask for titles whose startYear lies in 2000-2022, both ends included
filter_start_year = movies_basics['startYear'].between(2000, 2022)
filter_start_year

8           False
144         False
570         False
587         False
672         False
            ...  
10016366     True
10016544     True
10016684     True
10016693     True
10016777     True
Name: startYear, Length: 203476, dtype: bool

In [69]:
# Apply the start-year mask
movies_basics = movies_basics.loc[filter_start_year]
# verify: print the smallest and largest remaining startYear
print(movies_basics['startYear'].min(),movies_basics['startYear'].max())

2000.0 2022.0


In [71]:
# Check value_counts for genres
movies_basics['genres'].value_counts()

genres
Documentary                      21375
Drama                            17085
Comedy                            7148
Horror                            4071
Comedy,Drama                      4000
                                 ...  
Adult,Crime,Mystery                  1
Comedy,Documentary,Reality-TV        1
Biography,Music,Mystery              1
Comedy,Reality-TV,Romance            1
Biography,Fantasy,Musical            1
Name: count, Length: 1054, dtype: int64

In [74]:
# Mask for documentaries. A substring match is used because 'Documentary' also
# appears inside comma-separated genre combinations; regex=False makes the
# match a plain literal comparison.
filter_documentary = movies_basics['genres'].str.contains('Documentary', regex=False)
filter_documentary

34802       False
61114       False
67666       False
86793       False
93930       False
            ...  
10016366     True
10016544    False
10016684    False
10016693    False
10016777    False
Name: genres, Length: 121127, dtype: bool

In [75]:
# Exclude documentaries: keep the rows where the documentary mask is False
movies_basics = movies_basics[~filter_documentary]

In [76]:
# Re-check the genre counts after excluding documentaries
movies_basics['genres'].value_counts()

genres
Drama                        17085
Comedy                        7148
Horror                        4071
Comedy,Drama                  4000
Drama,Romance                 2623
                             ...  
Music,Mystery,Romance            1
History,Horror,Mystery           1
Crime,Music,Mystery              1
Crime,Fantasy,Romance            1
Biography,Fantasy,Musical        1
Name: count, Length: 854, dtype: int64

In [77]:
# Display overall info after preprocessing
movies_basics.info()
movies_basics.head()

<class 'pandas.core.frame.DataFrame'>
Index: 86979 entries, 34802 to 10016777
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          86979 non-null  object 
 1   titleType       86979 non-null  object 
 2   primaryTitle    86978 non-null  object 
 3   originalTitle   86978 non-null  object 
 4   isAdult         86979 non-null  object 
 5   startYear       86979 non-null  float64
 6   endYear         0 non-null      float64
 7   runtimeMinutes  86979 non-null  object 
 8   genres          86979 non-null  object 
dtypes: float64(2), object(7)
memory usage: 6.6+ MB


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61114,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67666,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86793,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93930,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [79]:
# Save the filtered basics table as CSV (index=False leaves the row index out of the file)
movies_basics.to_csv('DATA/basics.csv',index=False,sep=',',encoding='utf-8')

In [81]:
# Restrict movies_ratings to the titles still present in movies_basics
filter_ratings = movies_ratings['tconst'].isin(movies_basics['tconst'])
movies_ratings = movies_ratings.loc[filter_ratings]

In [82]:
# Check for null values
movies_ratings.isna().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

In [83]:
# Check the overall result
movies_ratings.info()
movies_ratings.head()

<class 'pandas.core.frame.DataFrame'>
Index: 71900 entries, 17961 to 1331462
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         71900 non-null  object 
 1   averageRating  71900 non-null  float64
 2   numVotes       71900 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 2.2+ MB


Unnamed: 0,tconst,averageRating,numVotes
17961,tt0035423,6.4,87153
40764,tt0062336,6.4,175
46645,tt0069049,6.7,7754
63640,tt0088751,5.2,336
69953,tt0096056,5.6,846


- **=> Note: the row counts differ (86,979 titles in `movies_basics` vs. 71,900 in `movies_ratings`), so some movies will have no rating in later steps.**

In [84]:
# Save the filtered ratings table as CSV (index=False leaves the row index out of the file)
movies_ratings.to_csv('DATA/ratings.csv',index=False,sep=',',encoding='utf-8')

In [86]:
# Save the US titles table.
# Writing movies_ratings here would make DATA/movies.csv a duplicate of
# DATA/ratings.csv and leave movies_us_only unsaved.
# NOTE(review): movies_us_only is presumed to be the frame meant for movies.csv — confirm.
movies_us_only.to_csv('DATA/movies.csv',index=False,sep=',',encoding='utf-8')