In [1]:
# Import Libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score
from sqlalchemy import create_engine

from config import db_password

In [2]:
# Connect to Database
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/IMDB_Movie_Data"

engine = create_engine(db_string).connect()

In [3]:
# Load data from Database into pandas DataFrame

# # not taking genre into account
# X = pd.read_sql("""select budget, runtime, popularity, has_collection, release_month, release_day
#     from clean_kaggle where runtime is not null""", con=engine)

X = pd.read_sql("""select budget, runtime, popularity, animation, western, mystery, horror, romance,
    tv_movie, war, action, clean_kaggle.foreign, music,
    thriller, documentary, science_fiction, adventure, drama, 
    comedy, family, fantasy, history, crime, has_collection, release_month, release_day
    from clean_kaggle where runtime is not null""", con=engine)
y = pd.read_sql("select revenue from clean_kaggle where runtime is not null", con=engine)

In [4]:
# Features
X.shape

(2991, 26)

In [5]:
# Predictor Variable
y.shape

(2991, 1)

In [6]:
# Split data features/targets into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Linear Regression Model

In [7]:
# Initiate the model
model = LinearRegression().fit(X_train, y_train.values.ravel())

# Start Prediction
y_pred = model.predict(X_test)

In [8]:
# Check model scores
# median absolute error
medae = round(median_absolute_error(y_test, y_pred),2)
# r2 score
r2score = round(r2_score(y_test, y_pred),2)

print(f"Median Absolute Error: {medae}")
print(f"r-squared Score: {r2score}")

# Score with cross validation
print(cross_val_score(model, X, y, cv=3))

Median Absolute Error: 28883516.65
r-squared Score: 0.53
[0.63125495 0.60667479 0.6886015 ]


## Random Forest Regression

In [9]:
# Initiate model 
model = RandomForestRegressor(n_estimators=100).fit(X_train,y_train.values.ravel())

# Start Prediction
y_pred = model.predict(X_test)

In [10]:
# Check model scores
# median absolute error
medae = round(median_absolute_error(y_test, y_pred),2)
# r2 score
r2score = round(r2_score(y_test, y_pred),2)

print(f"Median Absolute Error: {medae}")
print(f"r-squared Score: {r2score}")

# Score with cross validation
print(cross_val_score(model, X, y, cv=3))

Median Absolute Error: 16404849.97
r-squared Score: 0.55


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[0.68349285 0.65713156 0.7171593 ]


## Final Model with K-Fold Validation

After comparing each model's scores, we choose one and use K-Folds cross-validation for the final prediction 

In [11]:
from sklearn.model_selection import KFold

# Initialize the model
final_model = RandomForestRegressor(n_estimators=200, random_state=99).fit(X_train, y_train.values.ravel())

In [12]:
# Use K-Folds cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=91)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    final_model.fit(X_train, y_train.values.ravel())
    y_pred = final_model.predict(X_test)
    print(mean_absolute_error(y_test, y_pred))
    print(r2_score(y_test, y_pred))

39714531.44461666
0.7537855304193286
39364123.177742474
0.7446281540460635
38234024.07744147
0.6901136917781454
33392858.265451502
0.8306069734956791
37729137.50316054
0.2976769245191643
46134217.081421405
0.6816485272230439
33026505.840050165
0.7488049951836215
37548403.35755853
0.637340574841372
44882902.962943144
0.6657101141599678
42220530.73928094
0.7215768072334271


In [13]:
y_pred = final_model.predict(X_test)
# Check final scores
# mean absolute error
mae = round(mean_absolute_error(y_test, y_pred),2)
# median absolute error
medae = round(median_absolute_error(y_test, y_pred),2)
# r2 score
r2score = round(r2_score(y_test, y_pred),2)

print(f"Mean Absolute Error: {mae}")
print(f"Median Absolute Error: {medae}")
print(f"r-squared Score: {r2score}")

# Score with cross validation
print(cross_val_score(final_model, X, y, cv=3))

Mean Absolute Error: 42220530.74
Median Absolute Error: 18149502.92
r-squared Score: 0.72


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[0.68530503 0.64467589 0.71711233]


## Using Random Search CV

In [14]:
from sklearn.model_selection import RandomizedSearchCV

regr = RandomForestRegressor()
n_estimators = [100,200,300,400,500,600,700,800,900,1000]
max_features = ['auto', 'sqrt']
max_depth = [2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10,20,40]


In [15]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

In [17]:
random_search = RandomizedSearchCV(regr, param_distributions=random_grid,
                                   n_iter=30, cv=5, random_state=42)
model_random_search = random_search.fit(X_train,y_train.values.ravel())
model_random_search.best_params_

{'n_estimators': 400,
 'min_samples_split': 15,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 10}

In [18]:
regr = RandomForestRegressor(n_estimators= 400,
    min_samples_split= 15,
    min_samples_leaf=1,
    max_features = 'auto',
    max_depth= 10)

model = regr.fit(X_train,y_train.values.ravel()) 

In [19]:
y_pred = model.predict(X_test)
# Check final scores
# mean absolute error
mae = round(mean_absolute_error(y_test, y_pred),2)
# median absolute error
medae = round(median_absolute_error(y_test, y_pred),2)
# r2 score
r2score = round(r2_score(y_test, y_pred),2)

print(f"Mean Absolute Error: {mae}")
print(f"Median Absolute Error: {medae}")
print(f"r-squared Score: {r2score}")

# Score with cross validation
print(cross_val_score(final_model, X, y.values.ravel(), cv=3))

Mean Absolute Error: 42263546.5
Median Absolute Error: 16036896.73
r-squared Score: 0.7
[0.68530503 0.64467589 0.71711233]
