Project 2: Part 1 - Preprocessed Data
By Jeffrey Prichard

# Overview

In [1]:
#Imports/Settings
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

# Show up to 50 columns and 50 rows when a DataFrame is rendered.
# The full option names are spelled out: pandas resolves abbreviated names by
# pattern matching and raises an error if the pattern ever becomes ambiguous.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)

# Load/Inspect Data - titles

In [2]:
# Read the akas CSV (US-only per its filename) and preview the first rows
titles_path = "Data/title-akas-us-only.csv"
titles = pd.read_csv(filepath_or_buffer=titles_path, low_memory=False)
titles.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
1,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
3,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
4,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0


In [3]:
# Summarise titles: column names, dtypes, non-null counts and memory usage
titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1452564 entries, 0 to 1452563
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1452564 non-null  object
 1   ordering         1452564 non-null  int64 
 2   title            1452564 non-null  object
 3   region           1452564 non-null  object
 4   language         1452564 non-null  object
 5   types            1452564 non-null  object
 6   attributes       1452564 non-null  object
 7   isOriginalTitle  1452564 non-null  object
dtypes: int64(1), object(7)
memory usage: 88.7+ MB


In [4]:
# (rows, columns) of the titles DF
titles.shape

(1452564, 8)

In [5]:
# Per-column dtypes of the titles DF
titles.dtypes

titleId            object
ordering            int64
title              object
region             object
language           object
types              object
attributes         object
isOriginalTitle    object
dtype: object

# Load/Inspect - basics

In [6]:
# Read the tab-separated title basics file and preview the first rows
basics_path = "Data/title.basics.tsv"
basics = pd.read_csv(basics_path, delimiter='\t', low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [7]:
# Summarise basics: column names, dtypes and memory usage
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10017011 entries, 0 to 10017010
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 687.8+ MB


In [8]:
# (rows, columns) of the basics DF before any filtering
basics.shape

(10017011, 9)

# Clean Data

## Filter: Remove Non-US Movies from title basics

In [9]:
# Boolean mask: True where a row's tconst also occurs as a titleId in the titles DF
filter_us_titles = basics['tconst'].isin(titles['titleId'])

# Replace basics with only the rows selected by the mask
basics = basics.loc[filter_us_titles]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"


In [10]:
# Confirm the reduced row count and column summary after the US filter
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1365643 entries, 0 to 10016966
Data columns (total 9 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   tconst          1365643 non-null  object
 1   titleType       1365643 non-null  object
 2   primaryTitle    1365643 non-null  object
 3   originalTitle   1365643 non-null  object
 4   isAdult         1365643 non-null  object
 5   startYear       1365643 non-null  object
 6   endYear         1365643 non-null  object
 7   runtimeMinutes  1365643 non-null  object
 8   genres          1365643 non-null  object
dtypes: object(9)
memory usage: 104.2+ MB


We have now filtered our basics DataFrame to contain only titles that also appear in the US-only titles data. This has reduced our rows from just over 10 million to roughly 1.37 million.

## Null Values

In [11]:
# titles DF: the raw data marks missing values with the literal string "\N";
# swap every occurrence for NaN so pandas treats it as null
titles = titles.replace(to_replace='\\N', value=np.nan)
titles.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [12]:
# basics DF: swap the literal "\N" placeholder for NaN
basics = basics.replace(to_replace='\\N', value=np.nan)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,,1,"Short,Sport"


## Dropping Rows

We need to drop rows with null values in either the runtimeMinutes or genres column of our basics DF. The titles DF does not contain these columns, so no rows are dropped from it.

In [13]:
# Count nulls in the two columns of interest with a single isna() pass
null_counts = basics[["runtimeMinutes", "genres"]].isna().sum()
null_runtime = null_counts["runtimeMinutes"]
null_genres = null_counts["genres"]

# Label on one line, count on the next (sep="\n" keeps the two-line layout)
print("Null Values in 'runtimeMinutes':", null_runtime, sep="\n")
print("\nNull Values in 'genres':", null_genres, sep="\n")

Null Values in 'runtimeMinutes':
503119

Null Values in 'genres':
28616


In [14]:
# Columns that must be populated for a row to be kept
required_cols = ['runtimeMinutes', 'genres']

# how='any' (the default): discard a row when at least one required column is NaN
basics = basics.dropna(subset=required_cols, how='any')
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,,1,"Short,Sport"


In [15]:
# Column summary after dropping rows with null runtimeMinutes/genres
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 853335 entries, 0 to 10016872
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          853335 non-null  object
 1   titleType       853335 non-null  object
 2   primaryTitle    853335 non-null  object
 3   originalTitle   853335 non-null  object
 4   isAdult         853335 non-null  object
 5   startYear       844533 non-null  object
 6   endYear         20211 non-null   object
 7   runtimeMinutes  853335 non-null  object
 8   genres          853335 non-null  object
dtypes: object(9)
memory usage: 65.1+ MB


In [16]:
# Re-count nulls after the drop; both counts are expected to be 0
null_runtime = basics["runtimeMinutes"].isna().sum()
null_genres = basics["genres"].isna().sum()

# Each f-string emits the label and, on the following line, the count
print(f"Null Values in 'runtimeMinutes':\n{null_runtime}")
print(f"\nNull Values in 'genres':\n{null_genres}")

Null Values in 'runtimeMinutes':
0

Null Values in 'genres':
0


## Filter for ONLY full Length Movies

In [17]:
# Boolean mask selecting rows whose titleType is exactly 'movie'
movie_filter = basics['titleType'].eq('movie')

# Replace basics with the movie-only subset
basics = basics.loc[movie_filter]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"


## Convert DataTypes: startYear

In [18]:
# View value counts for startYear before conversion;
# dropna=False keeps NaN as its own category so missing years are counted too
basics['startYear'].value_counts(dropna=False)

2019    8102
2018    7866
2017    7816
2016    7415
2015    7228
        ... 
1906       2
1899       1
1904       1
1897       1
1894       1
Name: startYear, Length: 129, dtype: int64

In [22]:
# Convert startYear from object to float (float because the column still contains NaN).
# assign() returns a new DataFrame instead of writing a column into the filtered
# slice left by the previous cell, which avoids pandas' SettingWithCopyWarning.
basics = basics.assign(startYear=basics['startYear'].astype(float))
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 203476 entries, 8 to 10016777
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          203476 non-null  object 
 1   titleType       203476 non-null  object 
 2   primaryTitle    203476 non-null  object 
 3   originalTitle   203476 non-null  object 
 4   isAdult         203476 non-null  object 
 5   startYear       199907 non-null  float64
 6   endYear         0 non-null       object 
 7   runtimeMinutes  203476 non-null  object 
 8   genres          203476 non-null  object 
dtypes: float64(1), object(8)
memory usage: 15.5+ MB


## Filter for Movies released within 2000-2022

In [23]:
# Keep rows whose startYear lies in 2000-2022, both ends inclusive;
# rows with a NaN startYear fail the test and are excluded
in_range = basics['startYear'].between(2000, 2022)
basics = basics[in_range]

basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61114,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67666,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86793,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93930,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


## Filter OUT documentaries from genre

# True for every row whose genres string contains the literal text 'Documentary'
filter_docs = basics['genres'].str.contains('Documentary', regex=False)

# Invert the mask so only non-documentary rows remain
basics = basics.loc[~filter_docs]
basics.head()

In [25]:
# Confirmation: list the remaining genre combinations and their counts
# (dropna=False would also surface any NaN genres)
basics['genres'].value_counts(dropna=False)

Drama                        17085
Comedy                        7148
Horror                        4071
Comedy,Drama                  4000
Drama,Romance                 2623
                             ...  
Music,Mystery,Romance            1
History,Horror,Mystery           1
Crime,Music,Mystery              1
Crime,Fantasy,Romance            1
Biography,Fantasy,Musical        1
Name: genres, Length: 854, dtype: int64

# basics: Reviewing final Dataframe and Saving as CSV

In [26]:
# Final review of basics before saving: column summary, then the first rows
basics.info()
basics.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86979 entries, 34802 to 10016777
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          86979 non-null  object 
 1   titleType       86979 non-null  object 
 2   primaryTitle    86979 non-null  object 
 3   originalTitle   86979 non-null  object 
 4   isAdult         86979 non-null  object 
 5   startYear       86979 non-null  float64
 6   endYear         0 non-null      object 
 7   runtimeMinutes  86979 non-null  object 
 8   genres          86979 non-null  object 
dtypes: float64(1), object(8)
memory usage: 6.6+ MB


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61114,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67666,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86793,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93930,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [27]:
# Destination for the cleaned basics table inside the data folder
fpath = "Data/FINAL_title_basics.csv"

# index=False leaves the DataFrame's row index out of the file
basics.to_csv(path_or_buf=fpath, index=False)

# Report where the file was written
print(f"DataFrame: basics saves as {fpath}")

DataFrame: basics saves as Data/FINAL_title_basics.csv


# Load/Inspect - title ratings

In [28]:
# Load the ratings file; read_table uses a tab separator by default
file = "Data/title.ratings.tsv"
ratings = pd.read_table(file, low_memory=False)

In [31]:
# Inspect ratings: column summary (dtypes, non-null counts), then the first rows
ratings.info()
ratings.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1331492 entries, 0 to 1331491
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1331492 non-null  object 
 1   averageRating  1331492 non-null  float64
 2   numVotes       1331492 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.5+ MB


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1988
1,tt0000002,5.8,265
2,tt0000003,6.5,1849
3,tt0000004,5.5,178
4,tt0000005,6.2,2632


## Filter: remove non-US movies

In [33]:
#Create filter: True where a rating's tconst also appears as a titleId in the titles DF
filter_us_titles = ratings['tconst'].isin(titles['titleId'])

#Update ratings DF to contain only those filtered
ratings = ratings[filter_us_titles]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 504000 entries, 0 to 1331467
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         504000 non-null  object 
 1   averageRating  504000 non-null  float64
 2   numVotes       504000 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 15.4+ MB


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1988
1,tt0000002,5.8,265
4,tt0000005,6.2,2632
5,tt0000006,5.1,182
6,tt0000007,5.4,825


## Null Values

In [34]:
# ratings DF: swap the literal "\N" placeholder string for NaN
ratings = ratings.replace(to_replace='\\N', value=np.nan)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1988
1,tt0000002,5.8,265
4,tt0000005,6.2,2632
5,tt0000006,5.1,182
6,tt0000007,5.4,825


## Review // Saving as CSV

In [35]:
# Final review of ratings before saving: column summary, then the first rows
ratings.info()
ratings.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 504000 entries, 0 to 1331467
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         504000 non-null  object 
 1   averageRating  504000 non-null  float64
 2   numVotes       504000 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 15.4+ MB


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1988
1,tt0000002,5.8,265
4,tt0000005,6.2,2632
5,tt0000006,5.1,182
6,tt0000007,5.4,825


In [36]:
# Write the cleaned ratings table to the data folder
fpath = "Data/FINAL_title_ratings.csv"

# index=False leaves the DataFrame's row index out of the file
ratings.to_csv(fpath, index=False)

# Confirm, naming the frame that was actually written (ratings, not basics)
print(f"DataFrame: ratings saves as {fpath}")

DataFrame: basics saves as Data/FINAL_title_ratings.csv
