## IMPORTS

In [1]:
import pandas as pd
import numpy as np

## LOAD THE DATA FILES

In [2]:
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"

## LOAD AND FILTER BASICS DATA

In [3]:
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [4]:
##Make a coppy of basics
basics1 = basics.copy()

In [5]:
## Replace Missing Valuess with NaN
basics.replace({'\\N':np.nan}, inplace = True)

In [6]:
## Checcking to see if they have changed
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [7]:
## Check for duplicates
basics.duplicated().sum()

0

In [8]:
## Check missing values
basics1.isna().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            11
dtype: int64

### Dropping rows with null values for runtime

In [9]:
basics1.dropna(subset = ['runtimeMinutes'],inplace=True)
basics1.isna().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            11
dtype: int64

### Dropping rows with missing values for genres

In [10]:
basics1.dropna(subset = ['genres'],inplace=True)
basics1.isna().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres             0
dtype: int64

### Picking only Movies

In [11]:
basics1 = basics1.loc[basics1['titleType']== 'movie']

In [12]:
basics1['titleType'].value_counts()

movie    639658
Name: titleType, dtype: int64

### Specifying the years of the movies

In [13]:
basics1 = basics1.loc[basics1['startYear']>= '2000'] 
basics1 = basics1.loc[basics1['startYear']<= '2022']
basics1['startYear'].value_counts()

2022    19047
2018    18750
2017    18644
2019    18497
2016    18179
2021    17986
2015    16958
2014    16286
2020    15913
2013    15150
2012    14477
2011    13421
2010    12472
2009    11677
2008    10198
2007     8742
2006     8049
2005     7520
2004     6531
2003     5984
2002     5744
2001     5522
2000     5169
Name: startYear, dtype: int64

### Include only fictional movies (not from documentary genre)

In [14]:
is_documentary = basics1['genres'].str.contains('Documentary',case=False)
basics1 = basics1[~is_documentary]

## Inspect the AKAS Data

In [15]:
akas = pd.read_csv(akas_url, sep = '\t', low_memory = False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [16]:
## Make a copy of data
akas1 = akas.copy()

In [17]:
akas1 = akas1.loc[akas1['region']== 'US']

In [18]:
akas1['region'].value_counts()

US    1424064
Name: region, dtype: int64

### Replace "\N" with np.nan

In [19]:
akas1.replace({'\\N':np.nan}, inplace = True)

In [20]:
## Check to see if it worked
akas1.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [21]:
keepers =basics1['tconst'].isin(akas['titleId'])
keepers

11636      True
15178      True
34803      True
61116      True
67669      True
           ... 
9710750    True
9710782    True
9710834    True
9710914    True
9710924    True
Name: tconst, Length: 199662, dtype: bool

In [22]:
basics1 = basics1[keepers]
basics1

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
11636,tt0011801,movie,Tötet nicht mehr,Tötet nicht mehr,0,2019,\N,\N,"Action,Crime"
15178,tt0015414,movie,La tierra de los toros,La tierra de los toros,0,2000,\N,60,\N
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,\N,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,\N,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,\N,122,Drama
...,...,...,...,...,...,...,...,...,...
9710750,tt9916362,movie,Coven,Akelarre,0,2020,\N,92,"Drama,History"
9710782,tt9916428,movie,The Secret of China,Hong xing zhao yao Zhong guo,0,2019,\N,\N,"Adventure,History,War"
9710834,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019,\N,123,Drama
9710914,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,0,2013,\N,\N,Comedy


In [23]:
ratings = pd.read_csv(ratings_url, sep = '\t', low_memory = False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1961
1,tt0000002,5.8,263
2,tt0000003,6.5,1799
3,tt0000004,5.6,179
4,tt0000005,6.2,2600


In [24]:
ratings1 = ratings.copy()

In [25]:
keepers =ratings1['tconst'].isin(akas['titleId'])
keepers

0           True
1           True
2           True
3           True
4           True
           ...  
1292593     True
1292594     True
1292595    False
1292596    False
1292597    False
Name: tconst, Length: 1292598, dtype: bool

In [26]:
ratings1 = ratings1[keepers]
ratings1 

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1961
1,tt0000002,5.8,263
2,tt0000003,6.5,1799
3,tt0000004,5.6,179
4,tt0000005,6.2,2600
...,...,...,...
1292578,tt9916460,9.4,18
1292581,tt9916538,8.6,7
1292582,tt9916544,6.9,62
1292593,tt9916730,8.3,10


In [27]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

['title_ratings1.csv.gz',
 'title_basics1.csv.gz',
 'title_akas1.csv.gz',
 '.ipynb_checkpoints']

In [28]:
basics1.to_csv("Data/title_basics1.csv.gz",compression='gzip',index=False)
basics1.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
11636,tt0011801,movie,Tötet nicht mehr,Tötet nicht mehr,0,2019,\N,\N,"Action,Crime"
15178,tt0015414,movie,La tierra de los toros,La tierra de los toros,0,2000,\N,60,\N
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,\N,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,\N,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,\N,122,Drama


In [29]:
akas1.to_csv("Data/title_akas1.csv.gz", compression = 'gzip', index = False)
akas1.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [30]:
ratings1.to_csv("Data/title_ratings1.csv.gz", compression = 'gzip', index = False)
ratings1.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1961
1,tt0000002,5.8,263
2,tt0000003,6.5,1799
3,tt0000004,5.6,179
4,tt0000005,6.2,2600
