In [1]:
# Import Libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score
from sqlalchemy import create_engine

from config import db_password

In [2]:
# Connect to Database
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/IMDB_Movie_Data"

engine = create_engine(db_string).connect()

In [3]:
# Load data from Database into pandas DataFrame
X = pd.read_sql("""select budget, runtime, popularity
    from clean_kaggle where runtime is not null""", con=engine)
y = pd.read_sql("select revenue from clean_kaggle where runtime is not null", con=engine)



In [4]:
# Features
X.shape

(2991, 3)

In [5]:
# Predictor Variable
y.shape

(2991, 1)

In [6]:
# Split data features/targets into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Linear Regression Model

In [7]:
# Initiate the model
model = LinearRegression().fit(X_train, y_train)

# Start Prediction
y_pred = model.predict(X_test)

In [8]:
# Check model scores
# mean absolute error
mae = round(mean_absolute_error(y_test, y_pred),2)
# median absolute error
medae = round(median_absolute_error(y_test, y_pred),2)
# r2 score
r2score = round(r2_score(y_test, y_pred),2)

print(f"Mean Absolute Error: {mae}")
print(f"Median Absolute Error: {medae}")
print(f"r-squared Score: {r2score}")

# Score with cross validation
print(cross_val_score(model, X, y, cv=3))

Mean Absolute Error: 41736271.38
Median Absolute Error: 18415534.08
r-squared Score: 0.5
[0.58473562 0.59238003 0.65514311]


## Random Forest Regression

In [9]:
# Initiate model 
model = RandomForestRegressor(n_estimators=100).fit(X_train,y_train)

# Start Prediction
y_pred = model.predict(X_test)

  model = RandomForestRegressor(n_estimators=100).fit(X_train,y_train)


In [10]:
# Check model scores
# mean absolute error
mae = round(mean_absolute_error(y_test, y_pred),2)
# median absolute error
medae = round(median_absolute_error(y_test, y_pred),2)
# r2 score
r2score = round(r2_score(y_test, y_pred),2)

print(f"Mean Absolute Error: {mae}")
print(f"Median Absolute Error: {medae}")
print(f"r-squared Score: {r2score}")

# Score with cross validation
print(cross_val_score(model, X, y, cv=3))

Mean Absolute Error: 40408856.31
Median Absolute Error: 18418383.29
r-squared Score: 0.43


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[0.59580724 0.5630904  0.6529214 ]


## Final Model with Validation

After comparing each model's scores, we choose one and use K-Folds cross-validation for the final prediction 

In [11]:
from sklearn.model_selection import KFold

# Initialize the model
final_model = RandomForestRegressor(n_estimators=100, random_state=16).fit(X_train, y_train)
# Use K-Folds cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=91)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    final_model.fit(X_train, y_train)
    y_pred = final_model.predict(X_test)

  final_model = RandomForestRegressor(n_estimators=100, random_state=16).fit(X_train, y_train)
  final_model.fit(X_train, y_train)
  final_model.fit(X_train, y_train)
  final_model.fit(X_train, y_train)
  final_model.fit(X_train, y_train)
  final_model.fit(X_train, y_train)


In [12]:
# Check final scores
# mean absolute error
mae = round(mean_absolute_error(y_test, y_pred),2)
# median absolute error
medae = round(median_absolute_error(y_test, y_pred),2)
# r2 score
r2score = round(r2_score(y_test, y_pred),2)

print(f"Mean Absolute Error: {mae}")
print(f"Median Absolute Error: {medae}")
print(f"r-squared Score: {r2score}")

# Score with cross validation
print(cross_val_score(final_model, X, y, cv=3))

Mean Absolute Error: 48760010.29
Median Absolute Error: 17862698.13
r-squared Score: 0.58


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[0.60009408 0.56280547 0.64822253]
