# **Movie Predictions Part 1**

**Name:** **Derek Overton**

**Date:** **2/19/2023**

**Project:** **Movie Predictions Part 1**

## **Import Data**

In [1]:
import pandas as pd
import numpy as np 
import os

In [2]:
# Load the IMDb title.basics table (tab-separated, .gz) directly from its URL.
basics = pd.read_csv(
    'https://datasets.imdbws.com/title.basics.tsv.gz',
    sep='\t',
    low_memory=False,
)

In [3]:
# Load the IMDb title.ratings table (tab-separated, .gz) directly from its URL.
ratings = pd.read_csv(
    'https://datasets.imdbws.com/title.ratings.tsv.gz',
    sep='\t',
    low_memory=False,
)

In [4]:
# Load the IMDb title.akas table (tab-separated, .gz) directly from its URL.
akas = pd.read_csv(
    'https://datasets.imdbws.com/title.akas.tsv.gz',
    sep='\t',
    low_memory=False,
)

# **Preprocessing Data**

## **AKAS**

In [8]:
# Swap every literal '\N' placeholder for np.nan so pandas treats those cells as missing.
akas = akas.replace(to_replace={'\\N': np.nan})

In [9]:
# Restrict akas to the rows whose region is "US".
akas = akas[akas['region'] == 'US']

##  **Basics**

In [10]:
# Swap every literal '\N' placeholder for np.nan, then list the columns and dtypes.
basics = basics.replace(to_replace={'\\N': np.nan})
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9691332 entries, 0 to 9691331
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 665.5+ MB


In [11]:
# Discard every row that has no runtimeMinutes value.
basics = basics.dropna(subset=['runtimeMinutes'])
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2841277 entries, 0 to 9691331
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 216.8+ MB


In [12]:
# Discard every row that has no genres value.
basics = basics.dropna(subset=['genres'])
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2765274 entries, 0 to 9691331
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 211.0+ MB


In [15]:
# Retain only the rows whose titleType is 'movie'.
basics = basics.loc[basics['titleType'] == 'movie']
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 372670 entries, 8 to 9691282
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          372670 non-null  object
 1   titleType       372670 non-null  object
 2   primaryTitle    372670 non-null  object
 3   originalTitle   372670 non-null  object
 4   isAdult         372670 non-null  object
 5   startYear       372670 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  372670 non-null  object
 8   genres          372670 non-null  object
dtypes: object(9)
memory usage: 28.4+ MB


In [13]:
# Discard every row that has no startYear value.
basics = basics.dropna(subset=['startYear'])
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2606772 entries, 0 to 9691331
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 198.9+ MB


In [16]:
# Convert startYear to an integer dtype.
# `basics` is the result of earlier boolean-mask filtering, so writing into one
# of its columns in place can raise SettingWithCopyWarning; .assign() builds a
# new frame with the converted column and rebinds the name instead.
basics = basics.assign(startYear=basics['startYear'].astype('int'))

# Keep only the movies with a startYear from 2000 through 2022
# (Series.between is inclusive on both ends by default).
basics = basics.loc[basics['startYear'].between(2000, 2022)]

# Summary statistics to confirm the min/max fall inside the requested range.
basics['startYear'].describe()

count    222379.000000
mean       2013.356837
std           5.845654
min        2000.000000
25%        2009.000000
50%        2014.000000
75%        2018.000000
max        2022.000000
Name: startYear, dtype: float64

In [18]:
# Flag every title whose genres string mentions "documentary" (case-insensitive),
# then keep only the rows that are not flagged.
documentary = basics['genres'].str.contains('documentary', case=False)
basics = basics.loc[~documentary]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
77964,tt0079644,movie,November 1828,November 1828,0,2001,,140,"Drama,War"
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"


In [19]:
# Keep only the basics rows whose tconst also appears as a titleId in the
# akas dataframe.
keepers = basics['tconst'].isin(akas['titleId'])
basics = basics.loc[keepers]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
93938,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


## **Ratings**

In [20]:
# Swap every literal '\N' placeholder for np.nan so pandas treats those cells as missing.
ratings = ratings.replace(to_replace={'\\N': np.nan})

In [21]:
# Keep only the ratings rows whose tconst also appears as a titleId in the
# akas dataframe.
keepers = ratings['tconst'].isin(akas['titleId'])
ratings = ratings.loc[keepers]
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1959
1,tt0000002,5.8,263
4,tt0000005,6.2,2596
5,tt0000006,5.1,177
6,tt0000007,5.4,815


#  **Data File Storage**

In [32]:
# Make sure the Data/ output folder exists (exist_ok avoids an error if it already does).
os.makedirs('Data/', exist_ok=True)
# List the folder's contents to confirm it is there.
os.listdir("Data/")

['.ipynb_checkpoints', 'akas.csv.gz', 'basics.csv.gz', 'ratings.csv.gz']

In [24]:
# Write each dataframe to a gzip-compressed CSV inside Data/, leaving out the index column.
basics.to_csv("Data/basics.csv.gz", index=False, compression='gzip')
ratings.to_csv("Data/ratings.csv.gz", index=False, compression='gzip')
akas.to_csv("Data/akas.csv.gz", index=False, compression='gzip')

In [25]:
# Read the saved basics file back from disk and show its first rows.
basics = pd.read_csv(
    "Data/basics.csv.gz",
    low_memory=False,
)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


In [27]:
# Read the saved akas file back from disk and show its first rows.
akas = pd.read_csv(
    "Data/akas.csv.gz",
    low_memory=False,
)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


In [28]:
# Read the saved ratings file back from disk and show its first rows.
ratings = pd.read_csv(
    "Data/ratings.csv.gz",
    low_memory=False,
)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1959
1,tt0000002,5.8,263
2,tt0000005,6.2,2596
3,tt0000006,5.1,177
4,tt0000007,5.4,815


## **File Information Summary**

In [29]:
# Column dtypes, non-null counts and memory usage of the basics table.
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86012 entries, 0 to 86011
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          86012 non-null  object 
 1   titleType       86012 non-null  object 
 2   primaryTitle    86012 non-null  object 
 3   originalTitle   86012 non-null  object 
 4   isAdult         86012 non-null  int64  
 5   startYear       86012 non-null  int64  
 6   endYear         0 non-null      float64
 7   runtimeMinutes  86012 non-null  int64  
 8   genres          86012 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 5.9+ MB


In [30]:
# Column dtypes, non-null counts and memory usage of the ratings table.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 492147 entries, 0 to 492146
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         492147 non-null  object 
 1   averageRating  492147 non-null  float64
 2   numVotes       492147 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 11.3+ MB


In [31]:
# Column dtypes, non-null counts and memory usage of the akas table.
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1421760 entries, 0 to 1421759
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   titleId          1421760 non-null  object 
 1   ordering         1421760 non-null  int64  
 2   title            1421760 non-null  object 
 3   region           1421760 non-null  object 
 4   language         3852 non-null     object 
 5   types            975783 non-null   object 
 6   attributes       46188 non-null    object 
 7   isOriginalTitle  1420415 non-null  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 86.8+ MB
