# Project 2

## Import libraries

In [11]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
# Confirm the data folder was created and the expected files are present.
# The listing order is whatever the file system returns (not sorted).
os.listdir("Data/")



['title.basics.tsv.gz',
 'title.ratings.tsv.gz',
 'title-akas-us-only.csv',
 'imdb-logo-transparent.png',
 '.ipynb_checkpoints',
 'movietables.png']

### Load akas data

In [14]:
# Read the US-only akas CSV from the local Data folder into a DataFrame
akas = pd.read_csv(
    filepath_or_buffer="Data/title-akas-us-only.csv",
    low_memory=False,
)



In [15]:
# Preview the first rows of the akas table to check the columns loaded as expected
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
1,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
3,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
4,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0


In [5]:
# Summarise akas: row count, column dtypes, non-null counts and memory usage
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1452564 entries, 0 to 1452563
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1452564 non-null  object
 1   ordering         1452564 non-null  int64 
 2   title            1452564 non-null  object
 3   region           1452564 non-null  object
 4   language         1452564 non-null  object
 5   types            1452564 non-null  object
 6   attributes       1452564 non-null  object
 7   isOriginalTitle  1452564 non-null  object
dtypes: int64(1), object(7)
memory usage: 88.7+ MB


### Load basics data

In [16]:
# Read the tab-separated basics file (.tsv.gz) from the local Data folder
basics = pd.read_csv(
    filepath_or_buffer="Data/title.basics.tsv.gz",
    sep='\t',
    low_memory=False,
)





In [17]:
# Preview the first rows of the raw basics table (missing values still appear as "\N")
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


## Filter basics to contain US movies

In [18]:
# Boolean mask: True where the title's ID also appears in the US-only akas table
filter_us_titles = basics['tconst'].isin(akas['titleId'])
# Keep only the matching rows, then display the result
basics = basics.loc[filter_us_titles]
basics



Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"
...,...,...,...,...,...,...,...,...,...
10016872,tt9916560,tvMovie,March of Dimes Presents: Once Upon a Dime,March of Dimes Presents: Once Upon a Dime,0,1963,\N,58,Family
10016901,tt9916620,movie,The Copeland Case,The Copeland Case,0,\N,\N,\N,Drama
10016939,tt9916702,short,Loving London: The Playground,Loving London: The Playground,0,\N,\N,\N,"Drama,Short"
10016962,tt9916756,short,Pretty Pretty Black Girl,Pretty Pretty Black Girl,0,2019,\N,\N,Short


## Dealing with "\N"

In [19]:
# The raw data uses the literal string "\N" as its missing-value marker;
# swap it for NaN so pandas' missing-data tools (dropna, isna, ...) recognise it
basics = basics.replace(to_replace='\\N', value=np.nan)



In [20]:
# Preview the table again to confirm the "\N" markers are gone
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,,1,"Short,Sport"


### Dropping NaN in "runtimeMinutes" and "genres"

In [21]:
# Drop every row that is missing a runtime or a genre.
# The result is reassigned rather than using inplace=True: this matches the
# later dropna cell in this notebook and avoids mutating the frame in place.
col_to_check = ["runtimeMinutes", "genres"]
basics = basics.dropna(subset=col_to_check)

## Keeping only movies

In [22]:
# Keep only the rows whose titleType is exactly "movie"
basics = basics[basics["titleType"] == "movie"]
# End on the bare expression (instead of print) so Jupyter renders the frame
# as a rich table, the same way the other filter cells display their result
basics

             tconst titleType                    primaryTitle  \
8         tt0000009     movie                      Miss Jerry   
144       tt0000147     movie   The Corbett-Fitzsimmons Fight   
570       tt0000574     movie     The Story of the Kelly Gang   
587       tt0000591     movie                The Prodigal Son   
672       tt0000679     movie  The Fairylogue and Radio-Plays   
...             ...       ...                             ...   
10016366  tt9915436     movie               Vida em Movimento   
10016544  tt9915872     movie            The Last White Witch   
10016684  tt9916170     movie                   The Rehearsal   
10016693  tt9916190     movie                       Safeguard   
10016777  tt9916362     movie                           Coven   

                           originalTitle isAdult startYear endYear  \
8                             Miss Jerry       0      1894     NaN   
144        The Corbett-Fitzsimmons Fight       0      1897     NaN   
570      

In [23]:
# Preview the first rows after restricting the table to movies
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"


## Converting start year to float

In [24]:
# Convert startYear to float so it can be compared numerically in later filters
# (float rather than int because the column still contains NaN values).
# assign() builds a new frame instead of writing a column into a frame that
# came from boolean filtering, which is the pattern that triggers pandas'
# SettingWithCopyWarning.
basics = basics.assign(startYear=basics['startYear'].astype(float))
# Confirm the new dtype and the remaining non-null counts
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 203476 entries, 8 to 10016777
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          203476 non-null  object 
 1   titleType       203476 non-null  object 
 2   primaryTitle    203476 non-null  object 
 3   originalTitle   203476 non-null  object 
 4   isAdult         203476 non-null  object 
 5   startYear       199907 non-null  float64
 6   endYear         0 non-null       object 
 7   runtimeMinutes  203476 non-null  object 
 8   genres          203476 non-null  object 
dtypes: float64(1), object(8)
memory usage: 15.5+ MB


## Keeping only movies released between 2000 and 2022

In [25]:
# Keep only titles whose startYear lies in 2000-2022 (both ends included);
# rows with a missing startYear do not satisfy the condition and are dropped
basics = basics[basics["startYear"].between(2000, 2022)]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61114,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67666,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86793,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93930,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama
...,...,...,...,...,...,...,...,...,...
10016366,tt9915436,movie,Vida em Movimento,Vida em Movimento,0,2019.0,,70,Documentary
10016544,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019.0,,97,"Comedy,Drama,Fantasy"
10016684,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
10016693,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"


## Dropping NaN in "runtimeMinutes" and "genres"

In [26]:
# Drop any row lacking a runtime or a genre. The same columns were already
# cleaned earlier in the notebook; repeating the drop here ensures `genres`
# holds no NaN before it is used to build a string mask.
basics = basics.dropna(subset=['runtimeMinutes', 'genres'], how='any')


## Filtering out documentaries based on "genres"

In [27]:
# Flag every title whose genres string mentions "Documentary".
# regex=False matches the word literally; na=False treats a missing genre as
# "not a documentary", so the mask is always boolean and safe to negate with ~.
filter_documentaries = basics['genres'].str.contains('Documentary', regex=False, na=False)
# Keep everything that is NOT flagged as a documentary, then display the result
basics = basics[~filter_documentaries]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61114,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67666,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86793,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93930,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama
...,...,...,...,...,...,...,...,...,...
10016149,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019.0,,74,Drama
10016544,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019.0,,97,"Comedy,Drama,Fantasy"
10016684,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
10016693,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"


## Loading rating data

In [28]:
# Read the tab-separated ratings file from the local Data folder.
# The folder is spelled "Data/" here to match every other load cell; an
# upper-case "DATA/" only resolves on case-insensitive file systems.
rating = pd.read_csv("Data/title.ratings.tsv.gz", sep='\t', low_memory=False)



In [29]:
# Summarise the ratings table (dtypes, non-null counts), then preview its first rows
rating.info()
rating.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1331492 entries, 0 to 1331491
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1331492 non-null  object 
 1   averageRating  1331492 non-null  float64
 2   numVotes       1331492 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.5+ MB


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1988
1,tt0000002,5.8,265
2,tt0000003,6.5,1849
3,tt0000004,5.5,178
4,tt0000005,6.2,2632


## Filtering ratings to match the basics data via "tconst"

In [30]:
# Boolean mask: True where the rated title is still present in the filtered basics table
filter_basics = rating['tconst'].isin(basics['tconst'])
# Keep only those ratings, then display the result
rating = rating.loc[filter_basics]
rating



Unnamed: 0,tconst,averageRating,numVotes
17961,tt0035423,6.4,87153
40764,tt0062336,6.4,175
46645,tt0069049,6.7,7754
63640,tt0088751,5.2,336
69953,tt0096056,5.6,846
...,...,...,...
1331411,tt9914942,6.6,178
1331437,tt9915872,6.4,9
1331450,tt9916170,7.0,7
1331451,tt9916190,3.7,243


In [31]:
# Check how many ratings remain after matching against the filtered basics table
rating.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71900 entries, 17961 to 1331462
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         71900 non-null  object 
 1   averageRating  71900 non-null  float64
 2   numVotes       71900 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 2.2+ MB


In [None]:
# Output folder and base file name.
# NOTE(review): nothing in this cell uses these values yet; presumably they
# were meant for saving the cleaned tables — confirm, then finish or remove.
frame_out='Data/'
filename="cleaned"

![](Data/imdb-logo-transparent.png)