In [1]:
# Import Libraries
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LinearRegression, Ridge, BayesianRidge, Lasso
from sklearn.metrics import mean_absolute_error, median_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sqlalchemy import create_engine
from xgboost import XGBRegressor

from config import db_password

  from pandas import MultiIndex, Int64Index


In [2]:
# Connect to Database
# Percent-encode the password before embedding it in the URL: characters such as
# "@", ":", "/" or "#" would otherwise be read as URL delimiters and the
# connection string would be parsed incorrectly.
db_string = f"postgresql://postgres:{quote(db_password, safe='')}@127.0.0.1:5432/IMDB_Movie_Data"

# NOTE: despite its name, `engine` holds the open Connection returned by
# .connect(); later cells pass it to pd.read_sql via `con=`.
engine = create_engine(db_string).connect()

In [5]:
# Load data from Database into pandas DataFrame
# Features and target are fetched in ONE query so every revenue value stays on
# the same row as its features. Two independent SELECTs without ORDER BY carry
# no guarantee of returning rows in the same order, which could silently pair
# features with the wrong target.
movies_raw = pd.read_sql("""select budget, runtime, popularity, animation, western, mystery, horror, romance,
    tv_movie, war, action, clean_kaggle.foreign, music,
    thriller, documentary, science_fiction, adventure, drama, 
    comedy, family, fantasy, history, crime, revenue
    from clean_kaggle""", con=engine)

# Feature matrix: every selected column except the target.
X = movies_raw.drop(columns="revenue")
# Target kept as a one-column DataFrame, the same type pd.read_sql returned before.
y = movies_raw[["revenue"]]



In [None]:
# Partition features and target into train and test subsets (seeded with 42)
split_parts = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [5]:
# Feature matrix dimensions, displayed as (rows, columns)
X.shape

(2993, 2)

In [6]:
# Target variable (revenue) dimensions, displayed as (rows, columns).
# y is the value the models predict; the predictors are the columns of X.
y.shape

(2993, 1)

## Linear Regression Model

In [24]:
# Build an ordinary least-squares regressor and train it on the training split
model = LinearRegression()
model.fit(X_train, y_train)

# Predict the target for the held-out test features
y_pred = model.predict(X_test)

In [None]:
# Plot one weight per feature for the currently fitted `model`.
# Linear models (LinearRegression, Ridge, ...) do not define
# `feature_importances_`; their learned weights are in `coef_`. Use
# `feature_importances_` when the estimator provides it and fall back to
# `coef_` otherwise, so the cell no longer raises AttributeError.
if hasattr(model, "feature_importances_"):
    weights = np.ravel(model.feature_importances_)
else:
    # coef_ is 2-D (1, n_features) when the target was a one-column frame.
    weights = np.ravel(model.coef_)

# Sort ascending so the largest weight ends up at the top of the chart.
features = sorted(zip(X.columns, weights), key=lambda x: x[1])
cols = [f[0] for f in features]
width = [f[1] for f in features]

# Scale the figure height with the number of bars instead of a fixed 200 inches.
fig, ax = plt.subplots(figsize=(10, max(4, 0.35 * len(cols))))
ax.margins(y=0.01)

ax.barh(y=cols, width=width)
ax.set_xlabel("Weight (coefficient or importance)")
ax.set_ylabel("Feature")
ax.set_title(f"Feature weights: {type(model).__name__}")
# NOTE(review): raw coefficients depend on each feature's units, so their
# magnitudes are only comparable across features after scaling the inputs.

plt.show()

In [25]:
# Evaluate the linear regression predictions on the test split
# mae: mean absolute error, medae: median absolute error
mae, medae = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, median_absolute_error)
)
# score: r2 value reported by the estimator's own score() method
score = model.score(X_test, y_test)

print(
    f"Mean Absolute Error: {mae}",
    f"Median Absolute Error: {medae}",
    f"r-squared Score: {score}",
    sep="\n",
)

Mean Absolute Error: 2.7589452623437296
r-squared Score: 0.9992434463263903


## Ridge Regression Model

In [11]:
# Build a ridge regressor with default settings and train it on the training split
model = Ridge()
model.fit(X_train, y_train)

# Predict the target for the held-out test features
y_pred = model.predict(X_test)

In [None]:
# Evaluate the ridge regression predictions on the test split
# mae: mean absolute error, medae: median absolute error
mae, medae = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, median_absolute_error)
)
# score: r2 value reported by the estimator's own score() method
score = model.score(X_test, y_test)

print(
    f"Mean Absolute Error: {mae}",
    f"Median Absolute Error: {medae}",
    f"r-squared Score: {score}",
    sep="\n",
)

Mean Absolute Error: 2.7589452623437296
r-squared Score: 0.9992434463263903


## Bayesian Ridge Regression Model

In [13]:
# Fit the model
# BayesianRidge works on a 1-D target. Flatten the one-column target frame
# explicitly so scikit-learn does not have to coerce it (which emits a
# DataConversionWarning).
model = BayesianRidge().fit(X_train, np.ravel(y_train))

# Start Prediction
y_pred = model.predict(X_test)

In [None]:
# Evaluate the Bayesian ridge predictions on the test split
# mae: mean absolute error, medae: median absolute error
mae, medae = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, median_absolute_error)
)
# score: r2 value reported by the estimator's own score() method
score = model.score(X_test, y_test)

print(
    f"Mean Absolute Error: {mae}",
    f"Median Absolute Error: {medae}",
    f"r-squared Score: {score}",
    sep="\n",
)

Mean Absolute Error: 2.7589452623437296
r-squared Score: 0.9992434463263903


## Random Forest Regression Model

In [15]:
# Fit the model
# random_state pins the forest's bootstrap sampling and feature selection so
# the scores below are reproducible across runs (42 matches the seed used for
# the train/test split). The target is flattened to 1-D because a
# column-vector y makes RandomForestRegressor emit a DataConversionWarning.
model = RandomForestRegressor(random_state=42).fit(X_train, np.ravel(y_train))

# Start Prediction
y_pred = model.predict(X_test)

In [None]:
# Evaluate the random forest predictions on the test split
# mae: mean absolute error, medae: median absolute error
mae, medae = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, median_absolute_error)
)
# score: r2 value reported by the estimator's own score() method
score = model.score(X_test, y_test)

print(
    f"Mean Absolute Error: {mae}",
    f"Median Absolute Error: {medae}",
    f"r-squared Score: {score}",
    sep="\n",
)

Mean Absolute Error: 2.7589452623437296
r-squared Score: 0.9992434463263903


## XGBRegressor Model

In [17]:
# Build a gradient-boosted regressor with default settings and train it
model = XGBRegressor()
model.fit(X_train, y_train)

# Predict the target for the held-out test features
y_pred = model.predict(X_test)

In [None]:
# Evaluate the XGBRegressor predictions on the test split
# mae: mean absolute error, medae: median absolute error
mae, medae = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, median_absolute_error)
)
# score: r2 value reported by the estimator's own score() method
score = model.score(X_test, y_test)

print(
    f"Mean Absolute Error: {mae}",
    f"Median Absolute Error: {medae}",
    f"r-squared Score: {score}",
    sep="\n",
)

Mean Absolute Error: 2.7589452623437296
r-squared Score: 0.9992434463263903


## Final Model

After comparing each model's scores, we choose one for the final prediction.

In [None]:
# final_model = model(random_state = 16).fit(X_train, y_train)

In [None]:
# Predict values
# y_pred_final = final_model.predict(X_test)