In [33]:
import pickle as pkl
import time
from datetime import datetime

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, Ridge, LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tqdm import tqdm
pd.set_option('display.max_columns', None)

In [19]:
# Load the movie dataset from a CSV file located in the working directory.
data = pd.read_csv('final_data.csv')

In [34]:
# Preview the first rows to sanity-check the load.
data.head()

Unnamed: 0,1st_year_revenue,title,year,released,runtime,top_10_dir,top_50_dir,top_100_dir,top_10_actors,top_50_actors,top_100_actors,language_coded,country_coded,rating,Action,Adventure,Fantasy/Sci-Fi,Crime,Thriller/Mystery,Drama,Horror,Comedy,Documentary,Family/Animated,Biography/History,Romance,Music/Musical,Likely TV
0,2197561,Africa's Elephant Kingdom,1998,1998-05-08,40.0,0.0,0.0,0.0,0.0,0.0,0.0,English only,US only,Unrated,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,2637726,God's Army,2000,2000-03-10,108.0,0.0,0.0,0.0,0.0,0.0,0.0,English only,US only,PG,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2699820,The Bone Collector,1999,1999-11-05,118.0,0.0,0.0,0.0,0.0,1.0,1.0,English only,US and others,R,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2703114,Double Jeopardy,1999,1999-09-24,105.0,0.0,0.0,0.0,0.0,0.0,0.0,English only,US and others,R,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1428932,Three Kings,1999,1999-10-01,114.0,0.0,0.0,0.0,1.0,1.0,1.0,English and others,US only,R,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# List the column names available for modelling.
data.columns

Index(['1st_year_revenue', 'title', 'year', 'released', 'runtime',
       'top_10_dir', 'top_50_dir', 'top_100_dir', 'top_10_actors',
       'top_50_actors', 'top_100_actors', 'language_coded', 'country_coded',
       'rating', 'Action', 'Adventure', 'Fantasy/Sci-Fi', 'Crime',
       'Thriller/Mystery', 'Drama', 'Horror', 'Comedy', 'Documentary',
       'Family/Animated', 'Biography/History', 'Romance', 'Music/Musical',
       'Likely TV'],
      dtype='object')

### Removing categorical variables for base test

In [25]:
# Baseline model: regress first-year revenue on the two numeric columns only.
Y = data['1st_year_revenue']
X = data[['year', 'runtime']]

# Z-score each predictor using the population standard deviation (ddof=0);
# the response is left in its original units.
stand_X = (X - X.mean(axis=0)) / X.std(axis=0, ddof=0)
stand_Y = Y

# Append the intercept column after the predictors (prepend=False).
Y2 = stand_Y
X2 = sm.add_constant(stand_X, prepend=False)

mod = sm.OLS(Y2, X2)
res = mod.fit()
print(res.summary())
print(res.params)

                            OLS Regression Results                            
Dep. Variable:       1st_year_revenue   R-squared:                       0.036
Model:                            OLS   Adj. R-squared:                  0.035
Method:                 Least Squares   F-statistic:                     85.02
Date:                Wed, 07 Dec 2022   Prob (F-statistic):           5.60e-37
Time:                        20:22:28   Log-Likelihood:                -88740.
No. Observations:                4573   AIC:                         1.775e+05
Df Residuals:                    4570   BIC:                         1.775e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
year       -2.501e+06   9.58e+05     -2.610      0.0

### OLS Regression Results

#### Categories that don't need one-hot encoding (OHE)
'directors', 'actors', 'genre' — these already appear in `data` as 0/1 indicator columns.

In [36]:
# Show rows whose index label repeats an earlier one (the output below is empty).
data[data.index.duplicated()]

Unnamed: 0,1st_year_revenue,title,year,released,runtime,top_10_dir,top_50_dir,top_100_dir,top_10_actors,top_50_actors,top_100_actors,language_coded,country_coded,rating,Action,Adventure,Fantasy/Sci-Fi,Crime,Thriller/Mystery,Drama,Horror,Comedy,Documentary,Family/Animated,Biography/History,Romance,Music/Musical,Likely TV


In [48]:
# Preview one-hot encoding of the three string-valued columns; drop_first=True
# omits the first level of each. The result is only displayed, not assigned.
pd.get_dummies(data, columns= ['language_coded', 'country_coded', 'rating'], drop_first=True).reset_index(drop=True)

Unnamed: 0,1st_year_revenue,title,year,released,runtime,top_10_dir,top_50_dir,top_100_dir,top_10_actors,top_50_actors,top_100_actors,Action,Adventure,Fantasy/Sci-Fi,Crime,Thriller/Mystery,Drama,Horror,Comedy,Documentary,Family/Animated,Biography/History,Romance,Music/Musical,Likely TV,language_coded_English only,language_coded_Foreign lang,country_coded_US and others,country_coded_US only,rating_PG,rating_PG-13,rating_R,rating_Unrated
0,2197561,Africa's Elephant Kingdom,1998,1998-05-08,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1,0,0,1,0,0,0,1
1,2637726,God's Army,2000,2000-03-10,108.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,1,1,0,0,0
2,2699820,The Bone Collector,1999,1999-11-05,118.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,1,0,0,0,1,0
3,2703114,Double Jeopardy,1999,1999-09-24,105.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,1,0,0,0,1,0
4,1428932,Three Kings,1999,1999-10-01,114.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4568,37000,Heart of Champions,2021,2021-10-29,119.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,1,0,1,0,0
4569,35910,Falling for Figaro,2020,2021-10-01,104.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0,0,0,0,0,0,1
4570,35481,Mainstream,2020,2021-05-07,94.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,1,0,0,1,0
4571,33765,Cryptozoo,2021,2021-08-20,95.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,0,0,1,0,0,0,1


In [68]:
# Predictors: everything except the title, the target, and the release-date string.
X = data.drop(columns=['title', '1st_year_revenue', 'released'])

In [71]:
# Full model: all predictors, with the three string-valued columns one-hot
# encoded (first level of each dropped to avoid perfect collinearity).
Y = data['1st_year_revenue']
X = pd.get_dummies(
    data.drop(columns=['title', '1st_year_revenue', 'released']),
    columns=['language_coded', 'country_coded', 'rating'],
    drop_first=True,
)

# Z-score each predictor using the population standard deviation (ddof=0);
# the response is left in its original units.
stand_X = (X - X.mean(axis=0)) / X.std(axis=0, ddof=0)
stand_Y = Y

# Append the intercept column after the predictors (prepend=False).
Y2 = stand_Y
X2 = sm.add_constant(stand_X, prepend=False)

mod = sm.OLS(Y2, X2)
res = mod.fit()
print(res.summary())
print(res.params)

                            OLS Regression Results                            
Dep. Variable:       1st_year_revenue   R-squared:                       0.355
Model:                            OLS   Adj. R-squared:                  0.351
Method:                 Least Squares   F-statistic:                     83.50
Date:                Wed, 07 Dec 2022   Prob (F-statistic):               0.00
Time:                        20:41:44   Log-Likelihood:                -87819.
No. Observations:                4573   AIC:                         1.757e+05
Df Residuals:                    4542   BIC:                         1.759e+05
Df Model:                          30                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
year              

### Data Transformations

In [None]:
# Split into train / validation / test without shuffling: the last 20% of rows
# become the test set, then the last 20% of the remaining rows become the
# validation set.
# NOTE(review): shuffle=False only respects time if the rows are in
# chronological order — confirm before relying on this.
#
# train_test_split returns (X_train, X_test, y_train, y_test) in that order.
# The target is the `Y` series defined in the OLS cell above. random_state is
# omitted because it has no effect when shuffle=False.
X_train, X_test, y_train, y_test = train_test_split(X, Y, shuffle=False, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, shuffle=False, test_size=0.2)

In [8]:
# Columns routed to the one-hot encoder by the ColumnTransformer defined below.
# NOTE(review): the list is empty, so the 'category' transformer is given no
# columns — confirm whether this is intentional.
categorical_cols = []

In [None]:
# Text branch of the preprocessing: a single TF-IDF step with default settings.
# Pass arguments to TfidfVectorizer here to tune it.
text_preprocesser = Pipeline([('tfidf', TfidfVectorizer())])

In [None]:
# Categorical branch of the preprocessing: one-hot encoding that drops the
# first level of every feature. handle_unknown='error' is set here; change it
# to 'ignore' if an error is raised.
categorical_preprocessor = Pipeline(
    [("OHE", OneHotEncoder(drop='first', handle_unknown='error'))]
)

In [9]:
# Combine the preprocessors; each entry is (name, transformer, column selection).
# NOTE(review): no 'plot' column appears in the `data.columns` output earlier in
# this notebook — confirm the frame passed to fit() actually contains it.
# NOTE(review): `remainder` is not set, so columns not listed here are presumably
# dropped (scikit-learn's documented default) — confirm that is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_preprocesser, 'plot'),
        ('category', categorical_preprocessor, categorical_cols)
    ])

In [None]:
def _scaled_pipeline(estimator):
    """Build the modelling pipeline shared by every base model.

    Steps, in order: the notebook-level ``preprocessor``, a
    ``StandardScaler(with_mean=False)`` (scales without centering), and the
    given ``estimator``.

    Parameters
    ----------
    estimator : scikit-learn estimator
        Final step of the pipeline.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline created by ``make_pipeline``.
    """
    return make_pipeline(preprocessor, StandardScaler(with_mean=False), estimator)


lr_pipe = _scaled_pipeline(LinearRegression())

lasso_pipe = _scaled_pipeline(Lasso())

ridge_pipe = _scaled_pipeline(Ridge())

# RandomForestRegressor comes from sklearn.ensemble (imported in the first cell).
rf_pipe = _scaled_pipeline(
    RandomForestRegressor(n_estimators=300, max_features='sqrt', max_depth=5, random_state=42)
)

In [12]:
## if make_pipeline doesn't work
# pipe = Pipeline(
#         steps= [
#             ('preprocessor', preprocessor),
#             ('scaler', StandardScaler(with_mean=False)),
#             ('linearReg', LinearRegression())
#         ])

### Linear Regression Base Model

In [None]:
# Fit the linear-regression pipeline (`lr_pipe`) on the training split, then
# report Pipeline.score on every split and RMSE on validation and test.
lr_pipe.fit(X_train, y_train)
train_score = lr_pipe.score(X_train, y_train)
val_score = lr_pipe.score(X_val, y_val)
test_score = lr_pipe.score(X_test, y_test)
val_pred = lr_pipe.predict(X_val)
test_pred = lr_pipe.predict(X_test)

print('Linear Regression Results')
print("Train score:", train_score)
print("Val score:", val_score)
print("Test score:", test_score)
print("Val RMSE:", np.sqrt(mean_squared_error(y_val, val_pred)))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, test_pred)))

### Ridge Base Model

In [None]:
# Fit the ridge pipeline on the training split, then report Pipeline.score on
# every split and RMSE on validation and test.
ridge_pipe.fit(X_train, y_train)
train_score = ridge_pipe.score(X_train, y_train)
val_score = ridge_pipe.score(X_val, y_val)
test_score = ridge_pipe.score(X_test, y_test)
val_pred = ridge_pipe.predict(X_val)
test_pred = ridge_pipe.predict(X_test)

print('Ridge Regression Results')
print("Train score:", train_score)
print("Val score:", val_score)
print("Test score:", test_score)
print("Val RMSE:", np.sqrt(mean_squared_error(y_val, val_pred)))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, test_pred)))

### Lasso Base Model

In [None]:
# Fit the lasso pipeline on the training split, then report Pipeline.score on
# every split and RMSE on validation and test.
lasso_pipe.fit(X_train, y_train)
train_score = lasso_pipe.score(X_train, y_train)
val_score = lasso_pipe.score(X_val, y_val)
test_score = lasso_pipe.score(X_test, y_test)
val_pred = lasso_pipe.predict(X_val)
test_pred = lasso_pipe.predict(X_test)

print('Lasso Regression Results')
print("Train score:", train_score)
print("Val score:", val_score)
print("Test score:", test_score)
print("Val RMSE:", np.sqrt(mean_squared_error(y_val, val_pred)))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, test_pred)))

### RandomForestRegressor Base Model

In [None]:
# Fit the random-forest pipeline on the training split, then report
# Pipeline.score on every split and RMSE on validation and test.
rf_pipe.fit(X_train, y_train)
train_score = rf_pipe.score(X_train, y_train)
val_score = rf_pipe.score(X_val, y_val)
test_score = rf_pipe.score(X_test, y_test)
val_pred = rf_pipe.predict(X_val)
test_pred = rf_pipe.predict(X_test)

print('RandomForestRegressor Results')
print("Train score:", train_score)
print("Val score:", val_score)
print("Test score:", test_score)
print("Val RMSE:", np.sqrt(mean_squared_error(y_val, val_pred)))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, test_pred)))

### GridSearchCV

#### Ridge

In [None]:
# Parameters of a pipeline step are addressed as '<step name>__<parameter>';
# make_pipeline names each step after its lowercased class name, so the Ridge
# step is 'ridge'.
param_grid = {
    "ridge__alpha": np.logspace(-3, 3, 7),
}

# Start Time
start = datetime.now()

# Search over the ridge pipeline so preprocessing is refit inside every CV fold.
grid_search = GridSearchCV(ridge_pipe, param_grid=param_grid, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
print("Best params:", grid_search.best_params_)
print(f"Internal CV score: {grid_search.best_score_:.3f}")

# End Time
end = datetime.now()
print("Process took:", end - start)

#### Lasso

In [None]:
# Parameters of a pipeline step are addressed as '<step name>__<parameter>';
# make_pipeline names each step after its lowercased class name, so the Lasso
# step is 'lasso'.
param_grid = {
    "lasso__alpha": np.logspace(-3, 3, 7),
}

# Start Time
start = datetime.now()

# Search over the lasso pipeline so preprocessing is refit inside every CV fold.
grid_search = GridSearchCV(lasso_pipe, param_grid=param_grid, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
print("Best params:", grid_search.best_params_)
print(f"Internal CV score: {grid_search.best_score_:.3f}")

# End Time
end = datetime.now()
print("Process took:", end - start)

#### RandomForestRegressor

In [None]:
# Start Time
start = datetime.now()

# Parameters of a pipeline step are addressed as '<step name>__<parameter>';
# make_pipeline names each step after its lowercased class name, so the forest
# step is 'randomforestregressor'.
param_grid = {
    'randomforestregressor__n_estimators': [200, 300, 400, 500],
    'randomforestregressor__max_features': ['sqrt', 'log2'],
    'randomforestregressor__max_depth': [3, 4, 5, 6, 7],
    'randomforestregressor__random_state': [42],
}

# Search over the random-forest pipeline so preprocessing is refit inside every CV fold.
grid_search = GridSearchCV(rf_pipe, param_grid=param_grid, n_jobs=-1, verbose=10)
grid_search.fit(X_train, y_train)
print("Best params:", grid_search.best_params_)
print(f"Internal CV score: {grid_search.best_score_:.3f}")

# End Time
end = datetime.now()
print("Process took:", end - start)