# Project 3 Part 1


## Import

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
akas_url = "data/title.akas.tsv.gz"
akas= pd.read_csv(akas_url, sep='\t', low_memory= False)

In [3]:
basics_url = "data/title.basics.tsv.gz"
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)

In [4]:
ratings_url = "data/title.ratings.tsv.gz"
ratings = pd.read_csv(ratings_url, sep = "\t", low_memory = False)

## Cleaning & Chunking

In [6]:
akas.isna().sum()

titleId              0
ordering             0
title                5
region             111
language             0
types                0
attributes           0
isOriginalTitle      0
dtype: int64

In [7]:
basics.isna().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            14
dtype: int64

In [8]:
ratings.isna().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

In [9]:
akas.replace({'\\N':np.nan},inplace=True)
basics.replace({'\\N':np.nan},inplace=True)
ratings.replace({'\\N':np.nan},inplace=True)

In [10]:
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0
...,...,...,...,...,...,...,...,...
35723976,tt9916852,5,Episódio #3.20,PT,pt,,,0
35723977,tt9916852,6,Episodio #3.20,IT,it,,,0
35723978,tt9916852,7,एपिसोड #3.20,IN,hi,,,0
35723979,tt9916856,1,The Wind,DE,,imdbDisplay,,0


In [11]:
akas = akas[(akas['region']==('US'))].fillna(False)

In [12]:
basics = basics.dropna(subset=['runtimeMinutes','genres','startYear'])

In [13]:
basics['startYear'] = basics['startYear'].astype(float)
## keep startYear 2000-2022
basics = basics[(basics['startYear']>=2000)&(basics['startYear']<2022)]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
13082,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021.0,,94,Documentary
33803,tt0034413,short,Youth Gets a Break,Youth Gets a Break,0,2001.0,,20,Short
34617,tt0035235,short,Radio Dynamics,Radio Dynamics,0,2016.0,,4,Short
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
39545,tt0040241,short,Color Rhapsodie,Color Rhapsodie,0,2021.0,,6,Short
...,...,...,...,...,...,...,...,...,...
9803632,tt9916754,movie,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,0,2013.0,,49,Documentary
9803638,tt9916766,tvEpisode,Episode #10.15,Episode #10.15,0,2019.0,,43,"Family,Game-Show,Reality-TV"
9803673,tt9916840,tvEpisode,Horrid Henry's Comic Caper,Horrid Henry's Comic Caper,0,2014.0,,11,"Adventure,Animation,Comedy"
9803680,tt9916856,short,The Wind,The Wind,0,2015.0,,27,Short


In [14]:
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

In [15]:
basics = basics[(basics['titleType'] == "movie")]

In [16]:
temp = basics['tconst'].isin(akas['titleId'])
basics = basics[temp]

In [17]:
temp2 = ratings['tconst'].isin(akas['titleId'])
ratings = ratings[temp2]
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1965
1,tt0000002,5.8,263
4,tt0000005,6.2,2607
5,tt0000006,5.2,181
6,tt0000007,5.4,816
...,...,...,...
1305574,tt9916200,8.1,229
1305575,tt9916204,8.1,262
1305582,tt9916348,8.1,18
1305583,tt9916362,6.4,5306


In [18]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")


['.ipynb_checkpoints',
 'title.akas.tsv.gz',
 'title.basics.tsv.gz',
 'title.ratings.tsv.gz']

In [19]:
## Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

In [20]:
# Open saved file and preview again
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = True)
basics.info()
basics.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81624 entries, 0 to 81623
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          81624 non-null  object 
 1   titleType       81624 non-null  object 
 2   primaryTitle    81624 non-null  object 
 3   originalTitle   81624 non-null  object 
 4   isAdult         81624 non-null  int64  
 5   startYear       81624 non-null  float64
 6   endYear         0 non-null      float64
 7   runtimeMinutes  81624 non-null  int64  
 8   genres          81624 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 5.6+ MB


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [21]:
akas = pd.read_csv("Data/title_akas.csv.gz", low_memory = True)
akas.info()
akas.head()

  akas = pd.read_csv("Data/title_akas.csv.gz", low_memory = True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1432658 entries, 0 to 1432657
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1432658 non-null  object
 1   ordering         1432658 non-null  int64 
 2   title            1432658 non-null  object
 3   region           1432658 non-null  object
 4   language         1432658 non-null  object
 5   types            1432658 non-null  object
 6   attributes       1432658 non-null  object
 7   isOriginalTitle  1432658 non-null  object
dtypes: int64(1), object(7)
memory usage: 87.4+ MB


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,False,imdbDisplay,False,0
1,tt0000002,7,The Clown and His Dogs,US,False,False,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,False,imdbDisplay,False,0
3,tt0000005,1,Blacksmithing Scene,US,False,alternative,False,0
4,tt0000005,6,Blacksmith Scene #1,US,False,alternative,False,0


In [22]:
ratings = pd.read_csv("Data/title_ratings.csv.gz", low_memory = True)
ratings.info()
ratings.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 496664 entries, 0 to 496663
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         496664 non-null  object 
 1   averageRating  496664 non-null  float64
 2   numVotes       496664 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 11.4+ MB


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1965
1,tt0000002,5.8,263
2,tt0000005,6.2,2607
3,tt0000006,5.2,181
4,tt0000007,5.4,816


# Cite

 - This data is from https://www.imdb.com/interfaces/ (IMBD)