# Movie Revenue Project Data Cleaning and Modeling Notebook

In [1]:
# Needed to pip install psycopg2
# pip install psycopg2-binary

In [2]:
# imports 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import plotly.express as px
from sqlalchemy import create_engine

In [3]:
# Imports for modeling: 
import statsmodels.api as sm
# Need to import this to deal with missing data 
from sklearn.impute import SimpleImputer
# Need these for creating pipeline 
from sklearn.pipeline import make_pipeline 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn import metrics

## Step 1: Connect to SQL server to get data

In [207]:
# Establish a connection using SQLAlchemy 

# Try following string 
# Note: 'psycopg2' is the assumed driver for postgres 
connection_url = 'postgresql+psycopg2://oahwyljl:sSrk8smQ16BCOVhHQBVWVtK2nVcCDmiF@peanut.db.elephantsql.com/oahwyljl'

try:
    # GET THE CONNECTION OBJECT (ENGINE) FOR THE DATABASE
    engine = create_engine(connection_url)
    print( f"Connection created successfully.")
    
except Exception as ex:
    print("Connection could not be made due to the following error: \n", ex)

Connection created successfully.


In [142]:
# df is the movie_info dataframe
df = pd.read_sql_query("SELECT * from movieinfo", con=engine, parse_dates = ['released'])

In [208]:
# We have 8,980 movies for now (will add more tomorrow) 
df.shape

(8980, 15)

In [215]:
# df_revenue is the dataframe containing revenue 
df_revenue = pd.read_sql_query("SELECT * from moviesgross", con=engine, parse_dates = ['release_date'])

In [217]:
# We have 14,939 movies from The Numbers 
df_revenue.shape

(14939, 9)

## Step 2: Inspect and Remove 'NaN's from movie df: 

In [230]:
# Remove movieinfo_id extra column from both datasets: 
df = df.drop('movieinfo_id', axis = 1)
df_revenue = df_revenue.drop('moviegross_id', axis = 1)

In [218]:
# Replace 'NaN' strings with none type in both dataframes 
df = df.replace('NaN', np.nan)
df_revenue = df_revenue.replace('NaN', np.nan)

In [219]:
# See how much data is missing in df and df_revunue
df.isna().sum()

movieinfo_id       0
title              5
year               5
rated           1264
released         140
runtime          114
genre             18
director         210
writer           616
actors           127
plot             111
language          60
country          196
poster           110
top_10_dir         0
dtype: int64

In [220]:
# Good - no missing values for gross revenue 
# 'Runtime' column has 114 NAN values 
df_revenue.isna().sum()

moviegross_id       0
year                0
rank                0
title               0
release_date       36
distributor      1127
genre             931
gross               0
tickets_sold        0
dtype: int64

In [223]:
# Good that 'gross' is already an integer and won't need to be recoded! 
df_revenue['gross']

0        572984769
1        224543292
2        212609036
3        183651655
4        173005945
           ...    
14934          869
14935          589
14936          516
14937          374
14938          150
Name: gross, Length: 14939, dtype: int64

## Step 3: Recode columns in df 

In [227]:
# The following columns will need to be recoded: 
# Year --> to int 
# Runtime --> to int 
# Genre --> need to convert to list and dummy code genres 
# Directors --> create extra columns code top 10, 50, 100 
# Writer --> ignore for now? Can do same thing as with Director and actors 
# Actors --> convert to list and code top 10, 50, and 100 
# Language --> code as English only, English and other(s), Foreign lang only --> then one hot encode as 2 variables 
# Country --> code as US only, US and other countries, Foreign only --> then one hot encode 
for column in df.columns: 
    print(column, df[column].dtype) 

movieinfo_id int64
title object
year object
rated object
released datetime64[ns]
runtime float64
genre object
director object
writer object
actors object
plot object
language object
country object
poster object
top_10_dir int64


In [185]:
# Function for converting 'runtime' to int 
def get_minutes(x): 
    if pd.isna(x): 
        return np.nan 
    else: 
        try: 
            return int(x.split(' ')[0])
        except: 
            return np.nan

In [193]:
# Convert 'runtime'
df['runtime'] = df['runtime'].apply(get_minutes) 

# Create columns for Top Director and Top Actors

## Directors

In [144]:
num_directors = len(df['director'].value_counts())
print(f'There are {num_directors} directors in the dataset.') 

There are 4985 directors in the dataset.


### What % of movies did the top 10, 50, 100 directors make? 

In [121]:
# Lists of the top directors: 
# Problem with this approach: need to see how many movies top 10 director made and include anyone 
# who made the same number of movies in that list too 
top_10_directors = list(df['director'].value_counts()[0:10].index)
top_50_directors = list(df['director'].value_counts()[0:50].index)
top_100_directors = list(df['director'].value_counts()[0:100].index)

In [262]:
df['director'].value_counts()[0:11]

Ridley Scott          23
Clint Eastwood        20
Ben Stassen           20
Robert Zemeckis       20
Ron Howard            20
Steven Spielberg      19
Woody Allen           17
Steven Soderbergh     15
Martin Scorsese       14
François Ozon         14
M. Night Shyamalan    13
Name: director, dtype: int64

In [283]:
# New approach: 
cutoff_10 = df['director'].value_counts()[9]
top_10_directors = list(df['director'].value_counts()[df['director'].value_counts() >= cutoff_10].index)
num_top = len((df['director'].value_counts()[df['director'].value_counts() >= cutoff_10]))

# Movies made by the top 10 directors: 
top_10 = df['director'].apply(lambda x: x in top_10_directors).sum()
print(f'There were {num_top_10} "top 10" directors, who EACH made at least {cutoff_10} movies. \n \
They made {top_10} movies ({round(top_10 / len(df) * 100, 1)}% of all movies in dataset).')

There were 10 "top 10" directors, who EACH made at least 14 movies. 
 They made 182 movies (2.0% of all movies in dataset).


In [284]:
cutoff_50 = df['director'].value_counts()[49]
top_50_directors = list(df['director'].value_counts()[df['director'].value_counts() >= cutoff_50].index)
num_top_50 = len((df['director'].value_counts()[df['director'].value_counts() >= cutoff_50]))

# Movies made by the top 50 directors: 
top_50 = df['director'].apply(lambda x: x in top_50_directors).sum()
print(f'There were {num_top_50} "top 50" directors, who EACH made at least {cutoff_50} movies.\n \
They made {top_50} movies ({round(top_50 / len(df) * 100, 1)}% of all movies in dataset).')

There were 62 "top 50" directors, who EACH made at least 8 movies.
 They made 668 movies (7.4% of all movies in dataset).


In [285]:
cutoff_100 = df['director'].value_counts()[99]
top_100_directors = list(df['director'].value_counts()[df['director'].value_counts() >= cutoff_100].index)
num_top_100 = len((df['director'].value_counts()[df['director'].value_counts() >= cutoff_100]))

# Movies made by the top 100 directors: 
top_100 = df['director'].apply(lambda x: x in top_100_directors).sum()
print(f'There were {num_top_100} "top 100" directors, who EACH made at least {cutoff_100} movies.\n \
They made {top_100} movies ({round(top_100 / len(df) * 100, 1)}% of all movies in dataset).')

There were 100 "top 100" directors, who EACH made at least 7 movies.
 They made 934 movies (10.4% of all movies in dataset).


### Create columns to code for top directors: 

In [286]:
# Function for coding top director 
def has_top_director(x, director_list): 
    if pd.isna(x): 
        return np.nan 
    elif x in director_list: 
        return 1 
    else: 
        return 0 

In [289]:
top_10_directors

['Ridley Scott',
 'Clint Eastwood',
 'Ben Stassen',
 'Robert Zemeckis',
 'Ron Howard',
 'Steven Spielberg',
 'Woody Allen',
 'Steven Soderbergh',
 'Martin Scorsese',
 'François Ozon']

In [297]:
df['director'].apply(lambda x: np.nan if pd.isna(x) elif 1 if x in top_100_directors else 0)

SyntaxError: invalid syntax (<ipython-input-297-2b95eb163fcd>, line 1)

In [298]:
df['director'].apply(lambda x: has_top_director(x, top_10_directors))

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
8975    0.0
8976    0.0
8977    0.0
8978    0.0
8979    0.0
Name: director, Length: 8980, dtype: float64

In [200]:
df['top_10_dir'] = df['director'].apply(lambda x: 1 if x in top_100_directors else 0)

In [72]:
df.head()

Unnamed: 0,movieinfo_id,title,year,rated,released,runtime,genre,director,writer,actors,plot,language,country,poster
0,1,Spider-Man: No Way Home,2021,PG-13,2021-12-17,148 min,"Action, Adventure, Fantasy",Jon Watts,"Chris McKenna, Erik Sommers, Stan Lee","Tom Holland, Zendaya, Benedict Cumberbatch","With Spider-Man's identity now revealed, Peter...",English,United States,https://m.media-amazon.com/images/M/MV5BZWMyYz...
1,2,Shang-Chi and the Legend of the Ten Rings,2021,PG-13,2021-09-03,132 min,"Action, Adventure, Fantasy",Destin Daniel Cretton,"Dave Callaham, Destin Daniel Cretton, Andrew L...","Simu Liu, Awkwafina, Tony Chiu-Wai Leung","Shang-Chi, the master of weaponry-based Kung F...","English, Mandarin",United States,https://m.media-amazon.com/images/M/MV5BNTliYj...
2,3,Venom: Let There Be Carnage,2021,PG-13,2021-10-01,97 min,"Action, Adventure, Sci-Fi",Andy Serkis,"Kelly Marcel, Tom Hardy","Tom Hardy, Woody Harrelson, Michelle Williams",Eddie Brock attempts to reignite his career by...,English,"United States, China",https://m.media-amazon.com/images/M/MV5BYTc3ZT...
3,4,Black Widow,2021,PG-13,2021-07-09,134 min,"Action, Adventure, Sci-Fi",Cate Shortland,"Eric Pearson, Jac Schaeffer, Ned Benson","Scarlett Johansson, Florence Pugh, David Harbour",Natasha Romanoff confronts the darker parts of...,"English, Russian, Norwegian, Hungarian, Macedo...",United States,https://m.media-amazon.com/images/M/MV5BNjRmND...
4,5,F9: The Fast Saga,2021,PG-13,2021-06-25,143 min,"Action, Crime, Thriller",Justin Lin,"Daniel Casey, Justin Lin, Alfredo Botello","Vin Diesel, Michelle Rodriguez, Jordana Brewster",Dom and the crew must take on an international...,English,United States,https://m.media-amazon.com/images/M/MV5BMjI0Nm...


In [113]:
df['actors']

0             Tom Holland, Zendaya, Benedict Cumberbatch
1               Simu Liu, Awkwafina, Tony Chiu-Wai Leung
2          Tom Hardy, Woody Harrelson, Michelle Williams
3       Scarlett Johansson, Florence Pugh, David Harbour
4       Vin Diesel, Michelle Rodriguez, Jordana Brewster
                              ...                       
8975              David Rosen, Alexa Jago, Jonathan Aube
8976             Tony Nardi, Timothy Webber, Tygh Runyan
8977              Steven Nelson, Honey Lauren, Jeri Ryan
8978                   Roshan Seth, Soni Razdan, Om Puri
8979       Colin Friels, Jack Thompson, Donald Pleasence
Name: actors, Length: 8980, dtype: object

In [91]:
df_revenue['gross']

0        572984769
1        224543292
2        212609036
3        183651655
4        173005945
           ...    
14934          869
14935          589
14936          516
14937          374
14938          150
Name: gross, Length: 14939, dtype: int64