# Create Project 3

## Load files

In [13]:
title_basics = 'https://datasets.imdbws.com/title.basics.tsv.gz'

In [14]:
title_akas = 'https://datasets.imdbws.com/title.akas.tsv.gz'

In [15]:
title_rating = 'https://datasets.imdbws.com/title.ratings.tsv.gz'

## import libraries

In [16]:
# import pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# import SQL

## Read basics and preprocess

In [25]:
# create the pandas df
basics = pd.read_csv(title_basics, sep='\t', low_memory=False)


In [26]:
# look at info
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9710984 entries, 0 to 9710983
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 666.8+ MB


In [27]:
#check the head
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [28]:
# replace null values
basics.replace({'\\N':np.nan}, inplace = True)

In [29]:
# replace O values
basics.dropna(subset= ['runtimeMinutes', 'genres'], inplace=True)

In [30]:
# keep only 'Movie'
basics = basics.loc[ basics['titleType']=='movie']


In [31]:
# keep only startyear = 2000-2023
# basics = basics['startYear'].str.contains('2000-2023', case = True)
basics.drop(basics[basics['startYear'] < '2000'].index, inplace = True)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
13082,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021,,133,Documentary
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
76059,tt0077684,movie,Histórias de Combóios em Portugal,Histórias de Combóios em Portugal,0,2022,,46,Documentary


In [32]:
# keep only documentary
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

In [33]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

[]

In [34]:
## Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [35]:
# Open saved file and preview again
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"
4,tt0080155,movie,The Wonderful Years,Die wunderbaren Jahre,0,,,104,Drama


## Load AKA and preprocess

In [36]:
# create the pandas df
aka = pd.read_csv(title_akas, sep='\t', low_memory=False)


In [37]:
#look ar the column names
aka.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35344846 entries, 0 to 35344845
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.1+ GB


In [38]:
# replace null values
aka.replace({'\\N':np.nan}, inplace = True)

In [None]:
# filter I am not sure what this is all about ask next week
#basics = basics[keepers]
#basics

In [40]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers =basics['tconst'].isin(aka['titleId'])
keepers

0         True
1         True
2         True
3         True
4         True
          ... 
154060    True
154061    True
154062    True
154063    True
154064    True
Name: tconst, Length: 154065, dtype: bool

In [41]:
## Save current dataframe to file.
aka.to_csv("Data/title_aka.csv.gz",compression='gzip',index=False)

In [42]:
# Open saved file and preview again
aka = pd.read_csv("Data/title_aka.csv.gz", low_memory = False)
aka.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0.0
1,tt0000001,2,Carmencita,DE,,,literal title,0.0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0.0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0.0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0.0


## Load ratings and preprocess


In [43]:
# create the pandas df
rating = pd.read_csv(title_rating, sep='\t', low_memory=False)


In [44]:
# replace null values
rating.replace({'\\N':np.nan}, inplace = True)

In [46]:
rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1293170 entries, 0 to 1293169
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1293170 non-null  object 
 1   averageRating  1293170 non-null  float64
 2   numVotes       1293170 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 29.6+ MB


In [48]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers =rating['tconst'].isin(aka['titleId'])
keepers

0           True
1           True
2           True
3           True
4           True
           ...  
1293165     True
1293166     True
1293167    False
1293168    False
1293169    False
Name: tconst, Length: 1293170, dtype: bool

In [49]:
## Save current dataframe to file.
aka.to_csv("Data/title_rating.csv.gz",compression='gzip',index=False)

In [50]:
# Open saved file and preview again
rating = pd.read_csv("Data/title_rating.csv.gz", low_memory = False)
rating.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0.0
1,tt0000001,2,Carmencita,DE,,,literal title,0.0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0.0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0.0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0.0


# Show the info on each dataframe

In [51]:
#check the info on the basics
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154065 entries, 0 to 154064
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          154065 non-null  object 
 1   titleType       154065 non-null  object 
 2   primaryTitle    154065 non-null  object 
 3   originalTitle   154065 non-null  object 
 4   isAdult         154065 non-null  int64  
 5   startYear       149353 non-null  float64
 6   endYear         0 non-null       float64
 7   runtimeMinutes  154065 non-null  int64  
 8   genres          154065 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 10.6+ MB


In [52]:
#check the aka info
aka.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35344846 entries, 0 to 35344845
Data columns (total 8 columns):
 #   Column           Dtype  
---  ------           -----  
 0   titleId          object 
 1   ordering         int64  
 2   title            object 
 3   region           object 
 4   language         object 
 5   types            object 
 6   attributes       object 
 7   isOriginalTitle  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 2.1+ GB


In [53]:
# check the rartings info
rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35344846 entries, 0 to 35344845
Data columns (total 8 columns):
 #   Column           Dtype  
---  ------           -----  
 0   titleId          object 
 1   ordering         int64  
 2   title            object 
 3   region           object 
 4   language         object 
 5   types            object 
 6   attributes       object 
 7   isOriginalTitle  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 2.1+ GB
