# Project 3 - Part 1

In [1]:
import pandas as pd
import numpy as np
# NOTE(review): this binds `plt` to the top-level matplotlib package, not to
# matplotlib.pyplot (the module the `plt` alias conventionally refers to).
# Confirm this is intended before any plotting call relies on `plt`.
import matplotlib as plt

In [2]:
# Download locations of the IMDb dumps used below (.tsv.gz files), one per table:
# title basics, title ratings, and title akas.
basic_gz, ratings_gz, akas_gz = (
    'https://datasets.imdbws.com/title.basics.tsv.gz',
    'https://datasets.imdbws.com/title.ratings.tsv.gz',
    'https://datasets.imdbws.com/title.akas.tsv.gz',
)

In [3]:
# Read the three dumps straight from their URLs, in the same order as before.
# All files are tab-separated. low_memory=False makes pandas parse each file
# in one pass rather than in chunks, which avoids mixed-type column inference
# at the cost of a higher peak memory footprint.
basics, ratings, akas = (
    pd.read_csv(url, sep='\t', low_memory=False)
    for url in (basic_gz, ratings_gz, akas_gz)
)

**Basics DataFrame - Data Cleaning**

In [4]:
# Preview the first rows of the freshly loaded basics table.
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [5]:
# Summarise row count, column dtypes and memory usage of the basics table.
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9292084 entries, 0 to 9292083
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 638.0+ MB


In [6]:
# The dump marks missing values with the literal string '\N' (visible in the
# endYear column of the preview); swap every such cell for a real NaN so that
# pandas' null-handling methods recognise it.
basics = basics.replace(to_replace='\\N', value=np.nan)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [7]:
# Keep only the titles that have a runtime recorded; rows whose
# runtimeMinutes is null are discarded.
basics = basics[basics['runtimeMinutes'].notna()]
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2502185 entries, 0 to 9292083
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 190.9+ MB


In [8]:
# Keep only the titles that have at least one genre listed; rows whose
# genres value is null are discarded.
basics = basics[basics['genres'].notna()]
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2434368 entries, 0 to 9292083
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 185.7+ MB


In [9]:
# Count the rows per titleType to see which categories are present.
# This cell only inspects the data; it does not filter anything.
# Note the category label is lowercase 'movie'.
basics['titleType'].value_counts()

tvEpisode       1090817
short            579193
movie            369539
video            176015
tvMovie           88724
tvSeries          87277
tvSpecial         16792
tvMiniSeries      16274
tvShort            9429
videoGame           308
Name: titleType, dtype: int64

In [10]:
# Restrict the table to rows whose titleType is exactly 'movie'.
basics = basics.loc[basics['titleType'].eq('movie')]
# Sanity check: 'movie' should now be the only category left.
basics['titleType'].value_counts()

movie    369539
Name: titleType, dtype: int64

In [15]:
# Preview the first rows of the current (filtered) basics table.
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"
1172,tt0001184,movie,Don Juan de Serrallonga,Don Juan de Serrallonga,0,1910,,58,"Adventure,Drama"


In [12]:
# Re-check row count, non-null counts and dtypes of the current basics table.
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 369539 entries, 8 to 9292034
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          369539 non-null  object
 1   titleType       369539 non-null  object
 2   primaryTitle    369539 non-null  object
 3   originalTitle   369539 non-null  object
 4   isAdult         369539 non-null  object
 5   startYear       363626 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  369539 non-null  object
 8   genres          369539 non-null  object
dtypes: object(9)
memory usage: 28.2+ MB


In [14]:
# Goal: keep only startYear 2000-2022. This cell only inspects the
# distribution of startYear; the range filter itself is not applied here.
# NOTE(review): info() reports startYear as object dtype with some nulls, so
# a numeric range filter will need a type conversion first — confirm downstream.
basics['startYear'].value_counts()

2017    14225
2018    14158
2016    13844
2019    13833
2015    13345
        ...  
1896        1
1894        1
1899        1
1904        1
2026        1
Name: startYear, Length: 128, dtype: int64