In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import Lasso, Ridge, LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import pickle as pkl
from tqdm import tqdm
import time
from datetime import datetime
pd.set_option('display.max_columns', None)

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords

stopwords = stopwords.words('english')

In [3]:
data = pd.read_csv('final_data.csv')

In [4]:
data.head()

Unnamed: 0,1st_year_revenue,title,year,released,runtime,plot,top_10_dir,top_50_dir,top_100_dir,top_10_actors,top_50_actors,top_100_actors,language,country,rating,Action,Adventure,Fantasy/Sci-Fi,Crime,Thriller/Mystery,Drama,Horror,Comedy,Documentary,Family/Animated,Biography/History,Romance,Music/Musical,Likely TV
0,2197561,Africa's Elephant Kingdom,1998,1998-05-08,40.0,This film explores an elephant clan's search f...,0.0,0.0,0.0,0.0,0.0,0.0,English only,US only,Unrated,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,2637726,God's Army,2000,2000-03-10,108.0,Life as a Mormon missionary isn't what 19-year...,0.0,0.0,0.0,0.0,0.0,0.0,English only,US only,PG,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2699820,The Bone Collector,1999,1999-11-05,118.0,A quadriplegic ex-homicide detective and his p...,0.0,0.0,0.0,0.0,1.0,1.0,English only,US and others,R,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2703114,Double Jeopardy,1999,1999-09-24,105.0,A woman framed for her husband's murder suspec...,0.0,0.0,0.0,0.0,0.0,0.0,English only,US and others,R,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1428932,Three Kings,1999,1999-10-01,114.0,"In the aftermath of the Persian Gulf War, four...",0.0,0.0,0.0,1.0,1.0,1.0,English and others,US only,R,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
data.columns

Index(['1st_year_revenue', 'title', 'year', 'released', 'runtime', 'plot',
       'top_10_dir', 'top_50_dir', 'top_100_dir', 'top_10_actors',
       'top_50_actors', 'top_100_actors', 'language', 'country', 'rating',
       'Action', 'Adventure', 'Fantasy/Sci-Fi', 'Crime', 'Thriller/Mystery',
       'Drama', 'Horror', 'Comedy', 'Documentary', 'Family/Animated',
       'Biography/History', 'Romance', 'Music/Musical', 'Likely TV'],
      dtype='object')

### Removing categorical variables for base test

In [6]:
Y = data['1st_year_revenue']
X = data[['year', 'runtime']]
# X = pd.concat((
#     data,
#     pd.get_dummies(data, columns= ['language', 'country', 'rating'], drop_first=True)), axis=1)


stand_X = (X - np.mean(X, axis = 0)) / np.std(X, axis = 0)
stand_X

stand_Y = Y

X2 = stand_X
Y2 = stand_Y

X2 = sm.add_constant(X2, prepend=False)
mod = sm.OLS(Y2, X2)
res = mod.fit()
print(res.summary())
print(res.params)

                            OLS Regression Results                            
Dep. Variable:       1st_year_revenue   R-squared:                       0.036
Model:                            OLS   Adj. R-squared:                  0.035
Method:                 Least Squares   F-statistic:                     84.73
Date:                Thu, 08 Dec 2022   Prob (F-statistic):           7.39e-37
Time:                        21:09:09   Log-Likelihood:                -88645.
No. Observations:                4568   AIC:                         1.773e+05
Df Residuals:                    4565   BIC:                         1.773e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
year       -2.532e+06   9.59e+05     -2.640      0.0

### OLS Regression Results

#### Categories that don't need OHE
'directors', 'actors', 'genre'

In [7]:
Y = data['1st_year_revenue']
X = data.drop(labels=['title', '1st_year_revenue', 'released', 'plot'], axis=1)
X = pd.get_dummies(X, columns= ['language', 'country', 'rating'], drop_first=True)

stand_X = (X - np.mean(X, axis = 0)) / np.std(X, axis = 0)
stand_X

stand_Y = Y

X2 = stand_X
Y2 = stand_Y

X2 = sm.add_constant(X2, prepend=False)
mod = sm.OLS(Y2, X2)
res = mod.fit()
print(res.summary())
print(res.params)

                            OLS Regression Results                            
Dep. Variable:       1st_year_revenue   R-squared:                       0.356
Model:                            OLS   Adj. R-squared:                  0.351
Method:                 Least Squares   F-statistic:                     83.42
Date:                Thu, 08 Dec 2022   Prob (F-statistic):               0.00
Time:                        21:09:09   Log-Likelihood:                -87725.
No. Observations:                4568   AIC:                         1.755e+05
Df Residuals:                    4537   BIC:                         1.757e+05
Df Model:                          30                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
year                   1.515e+

### Data Transformations

In [8]:
data.sort_values(by=['released'], inplace=True)
y = data['1st_year_revenue']
X = data.drop(columns=['title', '1st_year_revenue'], axis=1)
# X = data.drop(['gross'], axis=1)

# Shuffle to false to handle time data
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size = 0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, shuffle=False, test_size = 0.2)

In [9]:
categorical_cols = ['language', 'country', 'rating']

In [10]:
# Text Preprocessor
text_preprocesser = Pipeline(
    steps=[
        # Input tfidf parameters
        ('tfidf', TfidfVectorizer(stop_words = stopwords, strip_accents = 'unicode', min_df = .001, max_df = 0.7))
    ])

In [11]:
# Categorical Preprocessor
categorical_preprocessor = Pipeline(
    steps=[
        # Change to 'ignore' if error raised
        ("OHE", OneHotEncoder(handle_unknown='error', drop='first'))
    ])

In [12]:
#Combine preprocessors
#Commenting out TfidfVectorizer as it does not help the model
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_preprocesser, 'plot'),
        ('category', categorical_preprocessor, categorical_cols)
    ])

In [13]:
lr_pipe = make_pipeline(
    preprocessor, 
    StandardScaler(with_mean=False), 
    LinearRegression()
)

lasso_pipe = make_pipeline(
    preprocessor, 
    StandardScaler(with_mean=False), 
    Lasso()
)

ridge_pipe = make_pipeline(
    preprocessor, 
    StandardScaler(with_mean=False), 
    Ridge()
)

rf_pipe = make_pipeline(
    preprocessor, 
    StandardScaler(with_mean=False), 
    RandomForestRegressor(random_state = 42)
)

In [14]:
## if make_pipeline doesn't work
# pipe = Pipeline(
#         steps= [
#             ('preprocessor', preprocessor),
#             ('scaler', StandardScaler(with_mean=False)),
#             ('linearReg', LinearRegression())
#         ])

### Linear Regression Base Model

In [15]:
lr_pipe.fit(X_train, y_train)
train_score = lr_pipe.score(X_train, y_train)
val_score = lr_pipe.score(X_val, y_val)
test_score = lr_pipe.score(X_test, y_test)
val_pred = lr_pipe.predict(X_val)
test_pred = lr_pipe.predict(X_test)

print('Linear Regression Results')
print("Train score:", train_score)
print("Val score:", val_score)
print("Test score:", test_score)
print("Val RMSE:", np.sqrt(mean_squared_error(y_val, val_pred)))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, test_pred)))

Linear Regression Results
Train score: 0.9999999999998147
Val score: -2.9051529837853485
Test score: -2.9740205199765373
Val RMSE: 131433296.82579961
Test RMSE: 139510389.9512548


### Ridge Base Model

In [16]:
ridge_pipe.fit(X_train, y_train)
train_score = ridge_pipe.score(X_train, y_train)
val_score = ridge_pipe.score(X_val, y_val)
test_score = ridge_pipe.score(X_test, y_test)
val_pred = ridge_pipe.predict(X_val)
test_pred = ridge_pipe.predict(X_test)

print('Ridge Regression Results')
print("Train score:", train_score)
print("Val score:", val_score)
print("Test score:", test_score)
print("Val RMSE:", np.sqrt(mean_squared_error(y_val, val_pred)))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, test_pred)))

Ridge Regression Results
Train score: 0.9999791053167697
Val score: -2.8177990272180393
Test score: -2.873622370125576
Val RMSE: 129954974.15884143
Test RMSE: 137736847.93542886


### Lasso Base Model

In [17]:
lasso_pipe.fit(X_train, y_train)
train_score = lasso_pipe.score(X_train, y_train)
val_score = lasso_pipe.score(X_val, y_val)
test_score = lasso_pipe.score(X_test, y_test)
val_pred = lasso_pipe.predict(X_val)
test_pred = lasso_pipe.predict(X_test)

print('Lasso Regression Results')
print("Train score:", train_score)
print("Val score:", val_score)
print("Test score:", test_score)
print("Val RMSE:", np.sqrt(mean_squared_error(y_val, val_pred)))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, test_pred)))

Lasso Regression Results
Train score: 0.9999982931067715
Val score: -3.3012742917554148
Test score: -3.252982563175923
Val RMSE: 137938324.01868495
Test RMSE: 144323914.9440439


### RandomForestRegressor Base Model

In [18]:
rf_pipe.fit(X_train, y_train)
train_score = rf_pipe.score(X_train, y_train)
val_score = rf_pipe.score(X_val, y_val)
test_score = rf_pipe.score(X_test, y_test)
val_pred = rf_pipe.predict(X_val)
test_pred = rf_pipe.predict(X_test)

print('RandomForestRegressor Results')
print("Train score:", train_score)
print("Val score:", val_score)
print("Test score:", test_score)
print("Val RMSE:", np.sqrt(mean_squared_error(y_val, val_pred)))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, test_pred)))

RandomForestRegressor Results
Train score: 0.8885577873490816
Val score: 0.1901107713750304
Test score: 0.22173529624320232
Val RMSE: 59854828.01090608
Test RMSE: 61738371.15202071


### GridSearchCV

#### Ridge

In [28]:
# ridge_pipe.get_params().keys()

In [20]:
param_grid = {
    "ridge__alpha": np.logspace(-5, 5, 10),
    "ridge__max_iter": np.logspace(-5, 5, 10)
#     "ridge__tol": np.logspace(1, 5, 5)

}

# Start Time
start = datetime.now()

grid_search = GridSearchCV(ridge_pipe, param_grid=param_grid, n_jobs=-1, verbose = 2)
grid_search.fit(X_train, y_train)  
print("Best params:", grid_search.best_params_)
print(f"Internal CV score: {grid_search.best_score_:.3f}")

# End Time
end = datetime.now()
print("Process took:", end - start)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best params: {'ridge__alpha': 7742.636826811277, 'ridge__max_iter': 1e-05}
Internal CV score: 0.155
Process took: 0:00:08.524077


#### Lasso

In [21]:
param_grid = {
    "lasso__alpha": np.logspace(-3, 3, 7),
    "lasso__max_iter": np.logspace(-5, 5, 10)

#     "max_iter": np.logspace(1, 5, 5)
}

# Start Time
start = datetime.now()

grid_search = GridSearchCV(lasso_pipe, param_grid=param_grid, n_jobs=-1, verbose = 2)
grid_search.fit(X_train, y_train)  
print("Best params:", grid_search.best_params_)
print(f"Internal CV score: {grid_search.best_score_:.3f}")

# End Time
end = datetime.now()
print("Process took:", end - start)

Fitting 5 folds for each of 70 candidates, totalling 350 fits
Best params: {'lasso__alpha': 0.001, 'lasso__max_iter': 1e-05}
Internal CV score: -0.001
Process took: 0:03:16.437396


  model = cd_fast.sparse_enet_coordinate_descent(


#### RandomForestRegressor

In [22]:
# rf_pipe.get_params().keys()

In [29]:
# Start Time
start = datetime.now()

param_grid = {
    'randomforestregressor__n_estimators': [100,200,300,400,500],
    'randomforestregressor__max_features': [None, 'sqrt','log2'],
    'randomforestregressor__max_depth' : [None,1,2,3,4,5,6,7,8],
    'randomforestregressor__random_state': [42]
}

grid_search = GridSearchCV(rf_pipe, param_grid=param_grid, n_jobs=-1, verbose = 10)
grid_search.fit(X_train, y_train)  
print("Best params:", grid_search.best_params_)
print(f"Internal CV score: {grid_search.best_score_:.3f}")

# End Time
end = datetime.now()
print("Process took:", end - start)

Fitting 5 folds for each of 189 candidates, totalling 945 fits
Best params: {'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'sqrt', 'randomforestregressor__n_estimators': 600, 'randomforestregressor__random_state': 42}
Internal CV score: 0.199
Process took: 0:05:13.023373


In [24]:
# grid_search.cv_results_

### Serialize Model

In [34]:
## Tfidf with RF displays best results
rf_pipe = make_pipeline(
    preprocessor, 
    StandardScaler(with_mean=False), 
    RandomForestRegressor(max_depth = None, 
                          max_features = 'sqrt', 
                          n_estimators = 600, 
                          random_state = 42)
)

filename= 'model.pkl'
pkl.dump(rf_pipe, open(filename, 'wb'))

### Final Model

In [35]:
rf_pipe.fit(X_train, y_train)
train_score = rf_pipe.score(X_train, y_train)
val_score = rf_pipe.score(X_val, y_val)
test_score = rf_pipe.score(X_test, y_test)
val_pred = rf_pipe.predict(X_val)
test_pred = rf_pipe.predict(X_test)

print('RandomForestRegressor Results')
print("Train score:", train_score)
print("Val score:", val_score)
print("Test score:", test_score)
print("Val RMSE:", np.sqrt(mean_squared_error(y_val, val_pred)))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, test_pred)))

RandomForestRegressor Results
Train score: 0.8921686306145462
Val score: 0.19171336444994835
Test score: 0.18177204589537665
Val RMSE: 59795578.90059872
Test RMSE: 63303636.125264354
