# Create Project

## Load Libraries and Functions

In [51]:
# import the libraries used throughout the notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os


## Load Data

In [2]:
# All three IMDb dataset files (gzipped TSV) live under the same host,
# so each download URL is built from that shared root.
IMDB_DATASETS_ROOT = "https://datasets.imdbws.com"
basics_url = f"{IMDB_DATASETS_ROOT}/title.basics.tsv.gz"
akas_url = f"{IMDB_DATASETS_ROOT}/title.akas.tsv.gz"
ratings_url = f"{IMDB_DATASETS_ROOT}/title.ratings.tsv.gz"

In [22]:
# Download each IMDb table. The files are tab-separated; low_memory=False
# has pandas parse each file in one pass rather than in chunks.
read_options = dict(sep='\t', low_memory=False)
basics = pd.read_csv(basics_url, **read_options)
akas = pd.read_csv(akas_url, **read_options)
ratings = pd.read_csv(ratings_url, **read_options)

# Preview the first rows of all three tables
for table in (basics, akas, ratings):
    display(table.head())


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1980
1,tt0000002,5.8,265
2,tt0000003,6.5,1833
3,tt0000004,5.6,179
4,tt0000005,6.2,2622


## Cleaning the Data

In [4]:
# Preview the first three rows of the basics and akas tables
for table in (basics, akas):
    display(table.head(3))

# Count missing values per column in ratings; every count is zero,
# so ratings has no NaN values
ratings.isnull().sum()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0


tconst           0
averageRating    0
numVotes         0
dtype: int64

### Basics: Prepare to Guidelines and Save

#### Guidelines
* [x] Eliminate movies that are null for runtimeMinutes
* [x] Eliminate movies that are null for genre
* [x] keep only titleType == 'movie'
* [x] keep startYear 2000-2022
* [x] Eliminate movies that include "Documentary" in genre (see tip below)


In [64]:
# Read the title.basics table fresh from IMDb (tab-separated file)
basics = pd.read_csv(
    basics_url,
    sep='\t',
    low_memory=False,
)


In [65]:
# summarize the freshly loaded frame: row count, column dtypes and memory usage
basics.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9926879 entries, 0 to 9926878
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 681.6+ MB


In [66]:
# The IMDb files mark missing entries with the literal string '\N';
# swap every such cell for NaN so pandas treats it as a null
basics.replace(to_replace='\\N', value=np.nan, inplace=True)

In [67]:
# re-inspect the frame after swapping the '\N' placeholders for NaN
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9926879 entries, 0 to 9926878
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 681.6+ MB


In [68]:
# 'startYear' is read in as strings; cast it to float rather than int because
# the column contains NaNs, which a plain integer column cannot hold
start_year = basics['startYear'].astype('float64')
basics['startYear'] = start_year

# verify the new dtype
basics.dtypes

tconst             object
titleType          object
primaryTitle       object
originalTitle      object
isAdult            object
startYear         float64
endYear            object
runtimeMinutes     object
genres             object
dtype: object

In [69]:
# keep only the rows whose titleType is 'movie'
is_movie = basics['titleType'].eq('movie')
basics = basics[is_movie]

In [70]:
# check the row count and per-column non-null counts after keeping only movies
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 647968 entries, 8 to 9926829
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          647968 non-null  object 
 1   titleType       647968 non-null  object 
 2   primaryTitle    647968 non-null  object 
 3   originalTitle   647968 non-null  object 
 4   isAdult         647968 non-null  object 
 5   startYear       556615 non-null  float64
 6   endYear         0 non-null       object 
 7   runtimeMinutes  408129 non-null  object 
 8   genres          575679 non-null  object 
dtypes: float64(1), object(8)
memory usage: 49.4+ MB


In [71]:
# drop every row that is missing either genres or runtimeMinutes
required_columns = ['genres', 'runtimeMinutes']
basics = basics.dropna(subset=required_columns).copy()

In [72]:
# confirm genres and runtimeMinutes have no nulls left after the filter
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 383787 entries, 8 to 9926829
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          383787 non-null  object 
 1   titleType       383787 non-null  object 
 2   primaryTitle    383787 non-null  object 
 3   originalTitle   383787 non-null  object 
 4   isAdult         383787 non-null  object 
 5   startYear       377300 non-null  float64
 6   endYear         0 non-null       object 
 7   runtimeMinutes  383787 non-null  object 
 8   genres          383787 non-null  object 
dtypes: float64(1), object(8)
memory usage: 29.3+ MB


In [73]:
# keep only rows whose startYear is 2000 through 2022, both ends included
# (the previous upper bound of 2021 silently dropped 2022, contradicting the
# stated 2000-2022 guideline); rows with a missing startYear fail the range
# test and are dropped as well
in_year_range = basics['startYear'].between(2000, 2022)
basics = basics[in_year_range]

In [74]:
# remove documentaries: flag every row whose genres string mentions
# "documentary" (case-insensitive), then keep only the unflagged rows
is_documentary = basics['genres'].str.contains('documentary', case=False)
not_documentary = ~is_documentary
basics = basics.loc[not_documentary]

In [75]:
# make sure the output folder exists (no error if it is already there)
data_dir = 'Data/'
os.makedirs(data_dir, exist_ok=True)
# list the folder's contents to confirm it was created
os.listdir(data_dir)

[]

In [76]:
# write the filtered basics table to a gzip-compressed CSV, leaving out the index
basics.to_csv(
    "Data/title_basics.csv.gz",
    compression='gzip',
    index=False,
)