In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import ElasticNet
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.tree import DecisionTreeRegressor

# Load the preprocessed dataset and discard the 'votes' and 'avg_vote' columns.
df = pd.read_csv('../dataset/preprocessed_dataset.csv').drop(columns=['votes', 'avg_vote'])

In [2]:
# The regression target is 'revenue_with_CPI'; every remaining column is a feature.
y = df['revenue_with_CPI']
X = df.drop(columns='revenue_with_CPI')
print(X.head())

# Columns that the preprocessors route through their scaling step
# (all other columns are passed through unchanged).
numerical_columns = [
    'duration',
    'converted_budget',
    'dir_oscar_nomination',
    'writer_oscar_nomination',
    'cast_globe_nomination',
    'BAFTA_writer_nom',
    'BAFTA_dir_nom',
    'BAFTA_act_nom',
    'dir_emmy_nom',
    'writer_emmy_nom',
    'act_emmy_nom',
    'actors_films_before',
    'director_films_before',
    'writers_films_before',
]

   duration  converted_budget  dir_oscar_nomination  writer_oscar_nomination  \
0      88.0          175700.3                     0                        0   
1      59.0         3013850.0                     0                        0   
2      77.0          521727.6                     0                        0   
3      50.0         5598468.6                     0                        0   
4     300.0        10802441.1                     0                        0   

   cast_globe_nomination  BAFTA_act_nom  BAFTA_dir_nom  BAFTA_writer_nom  \
0                      0              0              0                 0   
1                      0              0              0                 0   
2                      0              0              0                 0   
3                      0              0              0                 0   
4                      0              0              0                 0   

   dir_emmy_nom  writer_emmy_nom  ...  month_published_11  mon

In [3]:

# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def mape(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Notes
    -----
    Both inputs are converted to float arrays first, so plain lists are
    accepted and two pandas Series are compared position by position rather
    than being re-aligned on their indexes. Zeros in ``y_true`` are not
    special-cased: they produce ``inf``/``nan`` in the result.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# Element-wise log(1 + x) step, used only by the linear-model preprocessor below.
log_transformer = FunctionTransformer(func=np.log1p, validate=True)

# Preprocessing without the log step: standardise the numeric columns and pass
# every other column through untouched. The transformer keeps the name
# 'log_scaler' even though no log transform is applied here.
preprocessor = ColumnTransformer(
    [
        (
            'log_scaler',
            Pipeline([('scaler', StandardScaler())]),
            numerical_columns,
        )
    ],
    remainder='passthrough',
)

# Preprocessing with the log step: log1p followed by standardisation on the
# numeric columns; the remaining columns are passed through untouched.
preprocessor_linear = ColumnTransformer(
    [
        (
            'log_scaler',
            Pipeline([('log', log_transformer), ('scaler', StandardScaler())]),
            numerical_columns,
        )
    ],
    remainder='passthrough',
)


# Random Forest

In [4]:
# Random forest: scale the numeric columns, keep the 20 features with the best
# univariate F-score, then fit the forest.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_regression, k=20)),
    ('regressor', RandomForestRegressor())
])

# Single-point grid; the trailing comments list the other candidate values
# that are currently disabled.
param_grid = {
    'regressor__n_estimators': [500],  # 300
    'regressor__max_depth': [None],  # 4, 10
    'regressor__min_samples_split': [10],  # 2
    'regressor__min_samples_leaf': [1],
    'regressor__max_features': ['sqrt'],  # 4
    'regressor__bootstrap': [True],  # False
    'regressor__criterion': ['squared_error'],  # 'poisson', 'absolute_error', 'friedman_mse'
    'regressor__random_state': [42]
}

# 10-fold cross-validated search scored on negative MSE, using all CPU cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters:", best_params)

# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# refit on the whole training set; fitting it a second time would only repeat
# the same (seeded) training run.
best_model = grid_search.best_estimator_

# Hold-out (test) metrics.
y_pred_test = best_model.predict(X_test)
final_rmse_test = rmse(y_test, y_pred_test)
final_r2_test = r2_score(y_test, y_pred_test)
final_mae_test = mean_absolute_error(y_test, y_pred_test)
final_mse_test = mean_squared_error(y_test, y_pred_test)

# Training metrics, reported alongside the test metrics to expose over-fitting.
y_pred_train = best_model.predict(X_train)
final_rmse_train = rmse(y_train, y_pred_train)
final_r2_train = r2_score(y_train, y_pred_train)
final_mae_train = mean_absolute_error(y_train, y_pred_train)
final_mse_train = mean_squared_error(y_train, y_pred_train)

print("Test set results:")
print("Final RMSE:", final_rmse_test)
print("Final MAE:", final_mae_test)
print("Final MSE:", final_mse_test)
print("Final R2:", final_r2_test)

print("\nTrain set results:")
print("Final RMSE:", final_rmse_train)
print("Final MAE:", final_mae_train)
print("Final MSE:", final_mse_train)
print("Final R2:", final_r2_train)

# Persist the fitted pipeline (preprocessing + selection + model) as a pickle,
# creating the target directory if it does not exist yet.
file_path = "../models/regression/RandomForestRegressor.pkl"
os.makedirs(os.path.dirname(file_path), exist_ok=True)

with open(file_path, 'wb') as file:
    pickle.dump(best_model, file)
    

Best parameters: {'regressor__bootstrap': True, 'regressor__criterion': 'squared_error', 'regressor__max_depth': None, 'regressor__max_features': 'sqrt', 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 500, 'regressor__random_state': 42}
Test set results:
Final RMSE: 207184974.83640644
Final MAE: 82376330.61702259
Final MSE: 4.292561379796237e+16
Final R2: 0.4230637954316091

Train set results:
Final RMSE: 163423686.50702277
Final MAE: 58019523.746770635
Final MSE: 2.6707301311545656e+16
Final R2: 0.6835693189713254


# AdaBoost

In [5]:
# AdaBoost: scale the numeric columns, keep the 20 features with the best
# univariate F-score, then fit the boosted ensemble.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_regression, k=20)),
    ('regressor', AdaBoostRegressor(random_state=42))
])

# Single-point grid; the trailing comments list the other candidate values
# that are currently disabled.
param_grid = {
    'regressor__n_estimators': [50],  # 100, 150
    'regressor__learning_rate': [0.01],  # 0.1, 1.0
    'regressor__loss': ['exponential']  # 'linear', 'square'
}

# 10-fold cross-validated search scored on negative MSE, using all CPU cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters:", best_params)

# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# refit on the whole training set; fitting it a second time would only repeat
# the same (seeded) training run.
best_model = grid_search.best_estimator_

# Hold-out (test) metrics.
y_pred_test = best_model.predict(X_test)
final_rmse_test = rmse(y_test, y_pred_test)
final_r2_test = r2_score(y_test, y_pred_test)
final_mae_test = mean_absolute_error(y_test, y_pred_test)
final_mse_test = mean_squared_error(y_test, y_pred_test)

# Training metrics, reported alongside the test metrics to expose over-fitting.
y_pred_train = best_model.predict(X_train)
final_rmse_train = rmse(y_train, y_pred_train)
final_r2_train = r2_score(y_train, y_pred_train)
final_mae_train = mean_absolute_error(y_train, y_pred_train)
final_mse_train = mean_squared_error(y_train, y_pred_train)

print("Test set results:")
print("Final RMSE:", final_rmse_test)
print("Final MAE:", final_mae_test)
print("Final MSE:", final_mse_test)
print("Final R2:", final_r2_test)

print("\nTrain set results:")
print("Final RMSE:", final_rmse_train)
print("Final MAE:", final_mae_train)
print("Final MSE:", final_mse_train)
print("Final R2:", final_r2_train)

# Persist the fitted pipeline (preprocessing + selection + model) as a pickle,
# creating the target directory if it does not exist yet.
file_path = "../models/regression/AdaBoostRegressor.pkl"
os.makedirs(os.path.dirname(file_path), exist_ok=True)

with open(file_path, 'wb') as file:
    pickle.dump(best_model, file)

Best parameters: {'regressor__learning_rate': 0.01, 'regressor__loss': 'exponential', 'regressor__n_estimators': 50}
Test set results:
Final RMSE: 213376611.619415
Final MAE: 85714908.53466384
Final MSE: 4.5529578386182664e+16
Final R2: 0.38806554349211964

Train set results:
Final RMSE: 214334578.56208
Final MAE: 82018095.73655291
Final MSE: 4.593931156738445e+16
Final R2: 0.45570660713024935


# GradientBoosting

In [6]:
# Gradient boosting: scale the numeric columns, keep the 20 features with the
# best univariate F-score, then fit the boosted trees.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_regression, k=20)),
    ('regressor', GradientBoostingRegressor(random_state=42))
])

# Single-point grid; the trailing comments list the other candidate values
# that are currently disabled.
param_grid = {
    'regressor__n_estimators': [50],  # 100, 150
    'regressor__learning_rate': [0.1],  # 1.0, 0.01
    'regressor__max_depth': [3],  # 5, 7
    'regressor__min_samples_split': [5],  # 2, 10
    'regressor__min_samples_leaf': [1],  # 2, 4
}

# 10-fold cross-validated search scored on negative MSE, using all CPU cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters:", best_params)

# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# refit on the whole training set; fitting it a second time would only repeat
# the same (seeded) training run.
best_model = grid_search.best_estimator_

# Hold-out (test) metrics.
y_pred_test = best_model.predict(X_test)
final_rmse_test = rmse(y_test, y_pred_test)
final_r2_test = r2_score(y_test, y_pred_test)
final_mae_test = mean_absolute_error(y_test, y_pred_test)
final_mse_test = mean_squared_error(y_test, y_pred_test)

# Training metrics, reported alongside the test metrics to expose over-fitting.
y_pred_train = best_model.predict(X_train)
final_rmse_train = rmse(y_train, y_pred_train)
final_r2_train = r2_score(y_train, y_pred_train)
final_mae_train = mean_absolute_error(y_train, y_pred_train)
final_mse_train = mean_squared_error(y_train, y_pred_train)

print("Test set results:")
print("Final RMSE:", final_rmse_test)
print("Final MAE:", final_mae_test)
print("Final MSE:", final_mse_test)
print("Final R2:", final_r2_test)

print("\nTrain set results:")
print("Final RMSE:", final_rmse_train)
print("Final MAE:", final_mae_train)
print("Final MSE:", final_mse_train)
print("Final R2:", final_r2_train)

# Persist the fitted pipeline (preprocessing + selection + model) as a pickle,
# creating the target directory if it does not exist yet.
file_path = "../models/regression/GradientBoostingRegressor.pkl"
os.makedirs(os.path.dirname(file_path), exist_ok=True)

with open(file_path, 'wb') as file:
    pickle.dump(best_model, file)

Best parameters: {'regressor__learning_rate': 0.1, 'regressor__max_depth': 3, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 5, 'regressor__n_estimators': 50}
Test set results:
Final RMSE: 218256233.40746337
Final MAE: 83342728.40080352
Final MSE: 4.763578342121314e+16
Final R2: 0.359757364960937

Train set results:
Final RMSE: 199650305.7017214
Final MAE: 77359532.15199389
Final MSE: 3.986024456679081e+16
Final R2: 0.5277319790904333


# ElasticNet

In [7]:
# Elastic net: log1p + scale the numeric columns (preprocessor_linear), keep
# the 20 features with the best univariate F-score, then fit the linear model.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_linear),
    ('feature_selection', SelectKBest(score_func=f_regression, k=20)),
    ('regressor', ElasticNet(random_state=42))
])

# Single-point grid; the trailing comments list the other candidate values
# that are currently disabled.
param_grid = {
    'regressor__alpha': [0.1],  # 0.5, 1.0
    'regressor__l1_ratio': [0.9]  # 0.1, 0.5
}

# 10-fold cross-validated search scored on negative MSE, using all CPU cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters:", best_params)

# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# refit on the whole training set; fitting it a second time would only repeat
# the same training run.
best_model = grid_search.best_estimator_

# Hold-out (test) metrics.
y_pred_test = best_model.predict(X_test)
final_rmse_test = rmse(y_test, y_pred_test)
final_r2_test = r2_score(y_test, y_pred_test)
final_mae_test = mean_absolute_error(y_test, y_pred_test)
final_mse_test = mean_squared_error(y_test, y_pred_test)

# Training metrics, reported alongside the test metrics to expose over-fitting.
y_pred_train = best_model.predict(X_train)
final_rmse_train = rmse(y_train, y_pred_train)
final_r2_train = r2_score(y_train, y_pred_train)
final_mae_train = mean_absolute_error(y_train, y_pred_train)
final_mse_train = mean_squared_error(y_train, y_pred_train)

print("Test set results:")
print("Final RMSE:", final_rmse_test)
print("Final MAE:", final_mae_test)
print("Final MSE:", final_mse_test)
print("Final R2:", final_r2_test)

print("\nTrain set results:")
print("Final RMSE:", final_rmse_train)
print("Final MAE:", final_mae_train)
print("Final MSE:", final_mse_train)
print("Final R2:", final_r2_train)

# Persist the fitted pipeline (preprocessing + selection + model) as a pickle,
# creating the target directory if it does not exist yet.
file_path = "../models/regression/ElasticNetRegressor.pkl"
os.makedirs(os.path.dirname(file_path), exist_ok=True)

with open(file_path, 'wb') as file:
    pickle.dump(best_model, file)

Best parameters: {'regressor__alpha': 0.1, 'regressor__l1_ratio': 0.9}
Test set results:
Final RMSE: 255072458.62690815
Final MAE: 119316715.32961605
Final MSE: 6.506195914997578e+16
Final R2: 0.12554308598954644

Train set results:
Final RMSE: 273842741.7020254
Final MAE: 118242345.18430261
Final MSE: 7.498984718288222e+16
Final R2: 0.11151306013619999


# K-Nearest Neighbors

In [8]:

# K-nearest neighbours: scale the numeric columns, keep the 20 features with
# the best univariate F-score, then fit the neighbour-based regressor.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_regression, k=20)),
    ('regressor', KNeighborsRegressor())
])

# Single-point grid; the trailing comments list the other candidate values
# that are currently disabled.
param_grid = {
    'regressor__n_neighbors': [11],  # 3, 5, 7, 9
    'regressor__weights': ['uniform'],  # 'distance'
    'regressor__algorithm': ['kd_tree'],  # 'brute', 'auto', 'ball_tree'
    'regressor__leaf_size': [30],  # 40, 50, 20
    'regressor__p': [2]  # 1; p parameter: 1 = Manhattan distance, 2 = Euclidean distance
}

# 10-fold cross-validated search scored on negative MSE, using all CPU cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters:", best_params)

# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# refit on the whole training set; fitting it a second time would only repeat
# the same training run.
best_model = grid_search.best_estimator_

# Hold-out (test) metrics.
y_pred_test = best_model.predict(X_test)
final_rmse_test = rmse(y_test, y_pred_test)
final_r2_test = r2_score(y_test, y_pred_test)
final_mae_test = mean_absolute_error(y_test, y_pred_test)
final_mse_test = mean_squared_error(y_test, y_pred_test)

# Training metrics, reported alongside the test metrics to expose over-fitting.
y_pred_train = best_model.predict(X_train)
final_rmse_train = rmse(y_train, y_pred_train)
final_r2_train = r2_score(y_train, y_pred_train)
final_mae_train = mean_absolute_error(y_train, y_pred_train)
final_mse_train = mean_squared_error(y_train, y_pred_train)

print("Test set results:")
print("Final RMSE:", final_rmse_test)
print("Final MAE:", final_mae_test)
print("Final MSE:", final_mse_test)
print("Final R2:", final_r2_test)

print("\nTrain set results:")
print("Final RMSE:", final_rmse_train)
print("Final MAE:", final_mae_train)
print("Final MSE:", final_mse_train)
print("Final R2:", final_r2_train)

# Persist the fitted pipeline (preprocessing + selection + model) as a pickle,
# creating the target directory if it does not exist yet.
file_path = "../models/regression/KNNRegressor.pkl"
os.makedirs(os.path.dirname(file_path), exist_ok=True)

with open(file_path, 'wb') as file:
    pickle.dump(best_model, file)

Best parameters: {'regressor__algorithm': 'kd_tree', 'regressor__leaf_size': 30, 'regressor__n_neighbors': 11, 'regressor__p': 2, 'regressor__weights': 'uniform'}
Test set results:
Final RMSE: 216123892.93944815
Final MAE: 83324620.34960154
Final MSE: 4.670953709930205e+16
Final R2: 0.37220645980612665

Train set results:
Final RMSE: 212777695.48503122
Final MAE: 74847311.88379893
Final MSE: 4.527434769592068e+16
Final R2: 0.46358516319445875


# DecisionTree

In [9]:


# Decision tree: scale the numeric columns, keep the 20 features with the best
# univariate F-score, then fit a single regression tree.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_regression, k=20)),
    ('regressor', DecisionTreeRegressor(random_state=42))
])

# Single-point grid; the trailing comments list the other candidate values
# that are currently disabled.
param_grid = {
    'regressor__max_depth': [10],  # 20, 30, None
    'regressor__min_samples_split': [2],  # 5, 10
    'regressor__min_samples_leaf': [4]  # 1, 2
}

# 10-fold cross-validated search scored on negative MSE, using all CPU cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters:", best_params)

# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# refit on the whole training set; fitting it a second time would only repeat
# the same (seeded) training run.
best_model = grid_search.best_estimator_

# Hold-out (test) metrics.
y_pred_test = best_model.predict(X_test)
final_rmse_test = rmse(y_test, y_pred_test)
final_r2_test = r2_score(y_test, y_pred_test)
final_mae_test = mean_absolute_error(y_test, y_pred_test)
final_mse_test = mean_squared_error(y_test, y_pred_test)

# Training metrics, reported alongside the test metrics to expose over-fitting.
y_pred_train = best_model.predict(X_train)
final_rmse_train = rmse(y_train, y_pred_train)
final_r2_train = r2_score(y_train, y_pred_train)
final_mae_train = mean_absolute_error(y_train, y_pred_train)
final_mse_train = mean_squared_error(y_train, y_pred_train)

print("Test set results:")
print("Final RMSE:", final_rmse_test)
print("Final MAE:", final_mae_test)
print("Final MSE:", final_mse_test)
print("Final R2:", final_r2_test)

print("\nTrain set results:")
print("Final RMSE:", final_rmse_train)
print("Final MAE:", final_mae_train)
print("Final MSE:", final_mse_train)
print("Final R2:", final_r2_train)

# Persist the fitted pipeline (preprocessing + selection + model) as a pickle,
# creating the target directory if it does not exist yet.
file_path = "../models/regression/DecisionTreeRegressor.pkl"
os.makedirs(os.path.dirname(file_path), exist_ok=True)

with open(file_path, 'wb') as file:
    pickle.dump(best_model, file)

Best parameters: {'regressor__max_depth': 10, 'regressor__min_samples_leaf': 4, 'regressor__min_samples_split': 2}
Test set results:
Final RMSE: 234266887.36524343
Final MAE: 88005703.68190682
Final MSE: 5.4880974515799656e+16
Final R2: 0.2623793036673354

Train set results:
Final RMSE: 192282060.24231526
Final MAE: 71758403.94561547
Final MSE: 3.697239069102935e+16
Final R2: 0.5619475502542429
