# **Movie Perdictions Part 1**

**Name:** **Derek Overton**

**Date:** **2/19/2023**

**Project:** **Movie Predictions**

## **Import Data**

In [1]:
import pandas as pd
import numpy as np 

In [2]:
# Import Basic:
bdf = pd.read_csv('https://datasets.imdbws.com/title.basics.tsv.gz', sep='\t', low_memory=False)

In [3]:
# Import Ratings:
rdf = pd.read_csv('https://datasets.imdbws.com/title.ratings.tsv.gz', sep='\t', low_memory=False)

In [4]:
# Import Askas:
aka_df = pd.read_csv('https://datasets.imdbws.com/title.akas.tsv.gz', sep='\t', low_memory=False)

In [5]:
# Review Data Set
bdf.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


- It appears that the basic data loaded correctly.

# **Preprocessing Data**

## **Replace '\N' with np.nan**

In [6]:
# Replace '\N' with np.nan for Basic:
bdf.replace({'\\N':np.nan}, inplace = True)

In [7]:
# Display Changes to '\N' with np.nan:
bdf.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [8]:
# Replace '\N' with np.nan for Ratings:
rdf.replace({'\\N':np.nan}, inplace = True)

In [9]:
# Display Changes to '\N' with np.nan:
rdf.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1954
1,tt0000002,5.8,263
2,tt0000003,6.5,1787
3,tt0000004,5.6,179
4,tt0000005,6.2,2589


In [10]:
# Replace '\N' with np.nan for AKAS:
aka_df.replace({'\\N':np.nan}, inplace = True)

In [11]:
# Display Changes to '\N' with np.nan:
aka_df.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0


## **Eliminate movies that are null for runtimeMinutes**

In [12]:
# Exclude any movie with missing values for genre or runtime:
bdf = bdf.dropna(subset=["runtimeMinutes"])

## **Eliminate movies that are null for genre**

In [13]:
# Exclude any movie with missing values for genre or runtime:
bdf = bdf.dropna(subset=["genres"])

## **Keep only titleType==Movie**

In [14]:
bdf = bdf[bdf['titleType'].str.lower() == 'movie']

## **Keep startYear 2000-2022**

In [15]:
bdf.isna().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear           6379
endYear           377587
runtimeMinutes         0
genres                 0
dtype: int64

In [16]:
bdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 377587 entries, 8 to 9634718
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          377587 non-null  object
 1   titleType       377587 non-null  object
 2   primaryTitle    377587 non-null  object
 3   originalTitle   377587 non-null  object
 4   isAdult         377587 non-null  object
 5   startYear       371208 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  377587 non-null  object
 8   genres          377587 non-null  object
dtypes: object(9)
memory usage: 28.8+ MB


In [17]:
# Fill the missing values with (0) and convert the column to integer type
bdf['startYear']=bdf['startYear'].fillna(0).astype(float)

In [18]:
# Check Column types:
bdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 377587 entries, 8 to 9634718
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          377587 non-null  object 
 1   titleType       377587 non-null  object 
 2   primaryTitle    377587 non-null  object 
 3   originalTitle   377587 non-null  object 
 4   isAdult         377587 non-null  object 
 5   startYear       377587 non-null  float64
 6   endYear         0 non-null       object 
 7   runtimeMinutes  377587 non-null  object 
 8   genres          377587 non-null  object 
dtypes: float64(1), object(8)
memory usage: 28.8+ MB


In [19]:
#Include only movies that were released 2000 - 2021 (include 2000 and 2021)
filtered_movies = bdf[(bdf['startYear'] >= 2000) & (bdf['startYear'] <= 2021)]

In [20]:
# Check Filter for Start Year:
filtered_movies.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
13082,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021.0,,133,Documentary
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
77964,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"


## **Eliminate movies that include "Documentary" in genre (see tip below)**

In [21]:
# Exclude movies that are included in the documentary category.
is_documentary = filtered_movies['genres'].str.contains('documentary',case=False)
filtered_movies = filtered_movies[~is_documentary]

## **Include only movies that were released in the United States**

In [22]:
usfilter = aka_df['region']=='US'
aka_df = aka_df[usfilter]

In [23]:
aka_df = aka_df.rename(columns={'titleId': 'tconst'})
merged_df = pd.merge(filtered_movies, aka_df, on='tconst')

In [24]:
merged_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance",37,Kate and Leopold,US,,,alternative spelling,0
1,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance",39,Kate & Leopold,US,,imdbDisplay,,0
2,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama,6,The Tango of the Widower and Its Distorting Mi...,US,,imdbDisplay,,0
3,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama,3,The Other Side of the Wind,US,,imdbDisplay,,0
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi",1,Attack of the B-Movie Monster,US,,working,,0


In [25]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90406 entries, 0 to 90405
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   tconst           90406 non-null  object 
 1   titleType        90406 non-null  object 
 2   primaryTitle     90406 non-null  object 
 3   originalTitle    90406 non-null  object 
 4   isAdult          90406 non-null  object 
 5   startYear        90406 non-null  float64
 6   endYear          0 non-null      object 
 7   runtimeMinutes   90406 non-null  object 
 8   genres           90406 non-null  object 
 9   ordering         90406 non-null  int64  
 10  title            90406 non-null  object 
 11  region           90406 non-null  object 
 12  language         877 non-null    object 
 13  types            83883 non-null  object 
 14  attributes       4217 non-null   object 
 15  isOriginalTitle  90406 non-null  object 
dtypes: float64(1), int64(1), object(14)
memory usage: 11.7+ MB

In [26]:
rdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1282932 entries, 0 to 1282931
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1282932 non-null  object 
 1   averageRating  1282932 non-null  float64
 2   numVotes       1282932 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 29.4+ MB


In [27]:
aka_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1416568 entries, 5 to 35026261
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   tconst           1416568 non-null  object
 1   ordering         1416568 non-null  int64 
 2   title            1416568 non-null  object
 3   region           1416568 non-null  object
 4   language         3833 non-null     object
 5   types            974118 non-null   object
 6   attributes       46043 non-null    object
 7   isOriginalTitle  1415223 non-null  object
dtypes: int64(1), object(7)
memory usage: 97.3+ MB
