In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# show which files are present in the Data folder
data_dir = "Data/"
os.listdir(data_dir)

['basics.csv',
 'ratings.csv',
 'title-akas-us-only.csv',
 'title.basics.tsv.gz',
 'title.ratings.tsv.gz']

In [3]:
# load the US-only akas csv; low_memory=False parses the file in one pass
# instead of in chunks, avoiding mixed-type inference per column
us_akas_path = 'Data/title-akas-us-only.csv'
titles_us_only = pd.read_csv(us_akas_path, low_memory=False)

In [4]:
# load the gzipped, tab-separated title basics file
# (read_table uses '\t' as its default separator)
basics = pd.read_table('Data/title.basics.tsv.gz', low_memory=False)

In [5]:
# keep only the titles whose tconst appears among the US-only titleIds
us_title_ids = titles_us_only['titleId']
filter_us_titles = basics['tconst'].isin(us_title_ids)
basics = basics.loc[filter_us_titles]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"
...,...,...,...,...,...,...,...,...,...
10016872,tt9916560,tvMovie,March of Dimes Presents: Once Upon a Dime,March of Dimes Presents: Once Upon a Dime,0,1963,\N,58,Family
10016901,tt9916620,movie,The Copeland Case,The Copeland Case,0,\N,\N,\N,Drama
10016939,tt9916702,short,Loving London: The Playground,Loving London: The Playground,0,\N,\N,\N,"Drama,Short"
10016962,tt9916756,short,Pretty Pretty Black Girl,Pretty Pretty Black Girl,0,2019,\N,\N,Short


In [6]:
# the raw file marks missing values with the literal string \N;
# swap every occurrence for a real NaN so pandas treats it as missing
basics = basics.replace(to_replace='\\N', value=np.nan)

In [7]:
# drop rows with null values in runtimeMinutes and genres columns only
# (other columns, e.g. endYear, may still contain nulls afterwards)
required_columns = ['runtimeMinutes', 'genres']
basics = basics.dropna(subset=required_columns)

# verify: a bare `basics.isna().sum()` in the middle of a cell is never
# displayed, so check the result explicitly instead
assert basics[required_columns].notna().all().all()
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,,1,"Short,Sport"
...,...,...,...,...,...,...,...,...,...
10016704,tt9916214,short,Drown the Clown,Drown the Clown,0,2019,,8,"Drama,Short"
10016724,tt9916254,video,Big Tit Cream Pie 32,Big Tit Cream Pie 32,1,2015,,226,Adult
10016770,tt9916348,video,Ancient World Exposed,Ancient World Exposed,0,2019,,67,History
10016777,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History"


In [8]:
# filter basics to include only full-length movies (titleType == 'movie').
# .copy() makes the result an independent frame: a later cell assigns to its
# startYear column, and doing that on a boolean-indexed slice is what raises
# pandas' SettingWithCopyWarning.
basics_movies = basics[basics['titleType'] == 'movie'].copy()
basics = basics_movies
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"
...,...,...,...,...,...,...,...,...,...
10016366,tt9915436,movie,Vida em Movimento,Vida em Movimento,0,2019,,70,Documentary
10016544,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,,97,"Comedy,Drama,Fantasy"
10016684,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
10016693,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"


In [9]:
# silence pandas' chained-assignment (SettingWithCopy) warning for the whole
# session; it would otherwise fire when startYear is converted to float
pd.set_option('mode.chained_assignment', None)

In [10]:
# convert startYear to a float dtype.
# assign() returns a new frame rather than setting a column on a filtered
# slice, so the conversion does not trigger SettingWithCopyWarning and does
# not depend on that warning being disabled.
basics = basics.assign(startYear=basics['startYear'].astype(float))

In [11]:
# keep movies whose startYear lies in 2000..2022 (both ends included)
in_year_range = basics['startYear'].between(2000, 2022)
basics_movies = basics[in_year_range]

# carry the filtered frame forward under the working name
basics = basics_movies

# summary statistics; startYear should show min = 2000 and max = 2022
basics.describe()

Unnamed: 0,startYear
count,121127.0
mean,2013.571252
std,5.748217
min,2000.0
25%,2010.0
50%,2014.0
75%,2018.0
max,2022.0


In [12]:
# boolean mask of rows whose comma-separated 'genres' string mentions
# Documentary; regex=False makes this a plain substring match
filter_documentaries = basics['genres'].str.contains('Documentary', regex=False)

# exclude documentaries from basics
basics = basics[~filter_documentaries]

# verify changes: no remaining row may list Documentary among its genres
# (looking up value_counts()['Documentary'] would raise KeyError once the
# genre is gone, so assert on the mask instead)
assert not basics['genres'].str.contains('Documentary', regex=False).any()

In [13]:
# display .info() and .head() of basics df
# info() prints its summary directly and returns None, so there is nothing
# useful to store; head() is the last expression and is rendered as a table
basics.info()
basics.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86979 entries, 34802 to 10016777
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          86979 non-null  object 
 1   titleType       86979 non-null  object 
 2   primaryTitle    86979 non-null  object 
 3   originalTitle   86979 non-null  object 
 4   isAdult         86979 non-null  object 
 5   startYear       86979 non-null  float64
 6   endYear         0 non-null      object 
 7   runtimeMinutes  86979 non-null  object 
 8   genres          86979 non-null  object 
dtypes: float64(1), object(8)
memory usage: 6.6+ MB


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61114,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67666,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86793,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93930,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [14]:
# write the filtered basics frame to the Data folder as csv
# (to_csv's default index=True also writes the row index as the first column)
basics_csv_path = 'Data/basics.csv'
basics.to_csv(basics_csv_path)

## Load and filter title ratings file

In [15]:
# load the gzipped, tab-separated title ratings file
# (read_table uses '\t' as its default separator)
fpath = 'Data/title.ratings.tsv.gz'
ratings = pd.read_table(fpath, low_memory=False)

In [16]:
# restrict ratings to the titles that survived the basics filtering
kept_title_ids = basics['tconst']
filter_basics = ratings['tconst'].isin(kept_title_ids)
ratings = ratings.loc[filter_basics]

## Display final preview of filtered title ratings and save to csv

In [28]:
# save the filtered ratings to a csv file in the Data folder
ratings.to_csv('Data/ratings.csv')

# sanity check on basics: print the earliest startYear that was kept.
# NOTE: despite its name, basics_length holds the minimum startYear,
# not a row count; the name is kept so existing references still work.
basics_length = basics['startYear'].min()
print(basics_length)

# display preview; only a cell's last expression is rendered, so the
# preview has to come last
ratings.head()

2000.0