In [45]:

from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
# Load the preprocessed dataset and discard the 'votes' and 'avg_vote'
# columns in a single chained step, so they never reach the feature matrix.
df = pd.read_csv('../dataset/preprocessed_dataset.csv').drop(columns=['votes', 'avg_vote'])

In [46]:
# 'revenue_with_CPI' is the regression target; every other column is a feature.
y = df['revenue_with_CPI']
X = df.drop(columns=['revenue_with_CPI'])
print(X.head())

# Columns that the ColumnTransformer sends through the log1p + StandardScaler
# branch. The order is kept as-is because it fixes the column order of the
# transformed matrix.
numerical_columns = [
    'duration',
    'converted_budget',
    'dir_oscar_nomination',
    'writer_oscar_nomination',
    'cast_globe_nomination',
    'BAFTA_writer_nom',
    'BAFTA_dir_nom',
    'BAFTA_act_nom',
    'dir_emmy_nom',
    'writer_emmy_nom',
    'act_emmy_nom',
    'actors_films_before',
    'director_films_before',
    'writers_films_before',
]

   duration  converted_budget  dir_oscar_nomination  writer_oscar_nomination  \
0      88.0          175700.3                     0                        0   
1      59.0         3013850.0                     0                        0   
2      77.0          521727.6                     0                        0   
3      50.0         5598468.6                     0                        0   
4     300.0        10802441.1                     0                        0   

   cast_globe_nomination  BAFTA_act_nom  BAFTA_dir_nom  BAFTA_writer_nom  \
0                      0              0              0                 0   
1                      0              0              0                 0   
2                      0              0              0                 0   
3                      0              0              0                 0   
4                      0              0              0                 0   

   dir_emmy_nom  writer_emmy_nom  ...  month_published_11  mon

In [41]:


# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Root Mean Squared Error (RMSE) helper
def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values for the same samples.

    Returns:
        The square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Mean Absolute Percentage Error (MAPE) helper
def mape(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Both inputs are converted to plain float arrays before any arithmetic, so
    the comparison is positional: pandas objects are not re-aligned on their
    index (which would yield NaN for a shuffled test split compared with
    freshly indexed predictions), and plain Python lists are accepted.

    Samples whose true value is exactly zero are left out of the average,
    because the percentage error is undefined there and would otherwise turn
    the whole result into inf/NaN.

    Args:
        y_true: Ground-truth target values (array-like).
        y_pred: Predicted values (array-like, broadcastable against y_true).

    Returns:
        ``mean(|y_true - y_pred| / |y_true|) * 100`` over the samples with a
        non-zero true value, or NaN when no such sample exists (including
        empty input).
    """
    true_arr, pred_arr = np.broadcast_arrays(
        np.asarray(y_true, dtype=float),
        np.asarray(y_pred, dtype=float),
    )
    # The percentage error is only defined where the true value is non-zero.
    valid = true_arr != 0
    if not valid.any():
        return float('nan')
    relative_error = np.abs((true_arr[valid] - pred_arr[valid]) / true_arr[valid])
    return np.mean(relative_error) * 100

# Log transformer: applies np.log1p element-wise; validate=True makes the
# transformer check its input before calling the function.
log_transformer = FunctionTransformer(func=np.log1p, validate=True)

# ColumnTransformer that runs log1p followed by standardization on the
# columns listed in numerical_columns only.
preprocessor = ColumnTransformer(
    [
        (
            'log_scaler',
            Pipeline([
                ('log', log_transformer),
                ('scaler', StandardScaler()),
            ]),
            numerical_columns,
        ),
    ],
    remainder='passthrough',  # every other column is passed through unchanged
)



# Random Forest

In [30]:
# Full pipeline: preprocessing -> keep the 20 features with the best
# f_regression score -> random forest regressor.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_regression, k=20)),
    ('regressor', RandomForestRegressor())
])
# Hyper-parameter grid to explore
param_grid = {
    'regressor__n_estimators': [50, 300],
    'regressor__max_depth': [ 4, 10],
    'regressor__min_samples_split': [2, 10],
    'regressor__min_samples_leaf': [1, 4],
    'regressor__max_features': [ 'sqrt', 4],
    'regressor__bootstrap': [True, False],  # both settings are tried
    'regressor__criterion': ['squared_error', 'poisson'],  # 'absolute_error' and 'friedman_mse' are left out of the search
    'regressor__random_state': [42]  # single value: fixes the seed of every candidate
}


# 10-fold cross-validated grid search; candidates are ranked by negative MSE
# and evaluated in parallel on all available cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)

# Run the search on the training data only
grid_search.fit(X_train, y_train)

# Best parameter combination found
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refitted on the whole training set with the winning parameters. Calling
# fit() on it again would only repeat that work, so it is not done here.
best_model = grid_search.best_estimator_

# Evaluate the final model on the held-out test set
y_pred_test = best_model.predict(X_test)
final_rmse_test = rmse(y_test, y_pred_test)
final_r2_test = r2_score(y_test, y_pred_test)
final_mae_test = mean_absolute_error(y_test, y_pred_test)
final_mse_test = mean_squared_error(y_test, y_pred_test)

# Evaluate on the training set as well, to expose the train/test gap
y_pred_train = best_model.predict(X_train)
final_rmse_train = rmse(y_train, y_pred_train)
final_r2_train = r2_score(y_train, y_pred_train)
final_mae_train = mean_absolute_error(y_train, y_pred_train)
final_mse_train = mean_squared_error(y_train, y_pred_train)

# Print the results
print("Test set results:")
print("Final RMSE:", final_rmse_test)
print("Final MAE:", final_mae_test)
print("Final MSE:", final_mse_test)
print("Final R2:", final_r2_test)

print("\nTrain set results:")
print("Final RMSE:", final_rmse_train)
print("Final MAE:", final_mae_train)
print("Final MSE:", final_mse_train)
print("Final R2:", final_r2_train)

Best parameters: {'regressor__bootstrap': True, 'regressor__criterion': 'squared_error', 'regressor__max_depth': 10, 'regressor__max_features': 'sqrt', 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 300, 'regressor__random_state': 42}
Test set results:
Final RMSE: 210818343.33922222
Final MAE: 89102329.1025544
Final MSE: 4.4444373888294184e+16
Final R2: 0.40265109530598897

Train set results:
Final RMSE: 198143181.8594518
Final MAE: 81232480.62071209
Final MSE: 3.926072051738779e+16
Final R2: 0.5348351978332291


# AdaBoost

In [38]:

# Full pipeline: preprocessing -> keep the 20 features with the best
# f_regression score -> AdaBoost regressor (seeded for reproducibility).
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_regression, k=20)),
    ('regressor', AdaBoostRegressor(random_state=42))
])
# Hyper-parameter grid to explore for AdaBoostRegressor
param_grid = {
    'regressor__n_estimators': [50, 100, 150],
    'regressor__learning_rate': [0.01, 0.1, 1.0],
    'regressor__loss': ['linear', 'square', 'exponential']
}

# 10-fold cross-validated grid search; candidates are ranked by negative MSE
# and evaluated in parallel on all available cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)

# Run the search on the training data only
grid_search.fit(X_train, y_train)

# Best parameter combination found
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refitted on the whole training set with the winning parameters. Calling
# fit() on it again would only repeat that work, so it is not done here.
best_model = grid_search.best_estimator_

# Evaluate the final model on the held-out test set
y_pred_test = best_model.predict(X_test)
final_rmse_test = rmse(y_test, y_pred_test)
final_r2_test = r2_score(y_test, y_pred_test)
final_mae_test = mean_absolute_error(y_test, y_pred_test)
final_mse_test = mean_squared_error(y_test, y_pred_test)

# Evaluate on the training set as well, to expose the train/test gap
y_pred_train = best_model.predict(X_train)
final_rmse_train = rmse(y_train, y_pred_train)
final_r2_train = r2_score(y_train, y_pred_train)
final_mae_train = mean_absolute_error(y_train, y_pred_train)
final_mse_train = mean_squared_error(y_train, y_pred_train)

# Print the results
print("Test set results:")
print("Final RMSE:", final_rmse_test)
print("Final MAE:", final_mae_test)
print("Final MSE:", final_mse_test)
print("Final R2:", final_r2_test)

print("\nTrain set results:")
print("Final RMSE:", final_rmse_train)
print("Final MAE:", final_mae_train)
print("Final MSE:", final_mse_train)
print("Final R2:", final_r2_train)

Best parameters: {'regressor__learning_rate': 0.01, 'regressor__loss': 'exponential', 'regressor__n_estimators': 50}
Test set results:
Final RMSE: 213078427.5505543
Final MAE: 85459858.0560362
Final MSE: 4.540241628741682e+16
Final R2: 0.3897746493647164

Train set results:
Final RMSE: 214334476.40398216
Final MAE: 81809219.25231358
Final MSE: 4.593926777536918e+16
Final R2: 0.4557071259822685


# GradientBoosting

In [47]:
# Full pipeline for GradientBoostingRegressor: preprocessing -> keep the 20
# features with the best f_regression score -> seeded gradient boosting.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_regression, k=20)),
    ('regressor', GradientBoostingRegressor(random_state=42))
])

# Hyper-parameter grid to explore for GradientBoostingRegressor
# (3^5 = 243 candidates, each fitted on 10 folds).
param_grid = {
    'regressor__n_estimators': [50, 100, 150],
    'regressor__learning_rate': [0.01, 0.1, 1.0],
    'regressor__max_depth': [3, 5, 7],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# 10-fold cross-validated grid search; candidates are ranked by negative MSE
# and evaluated in parallel on all available cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)

# Run the search on the training data only
grid_search.fit(X_train, y_train)

# Best parameter combination found
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refitted on the whole training set with the winning parameters. Calling
# fit() on it again would only repeat that work, so it is not done here.
best_model = grid_search.best_estimator_

# Evaluate the final model on the held-out test set
y_pred_test = best_model.predict(X_test)
final_rmse_test = rmse(y_test, y_pred_test)
final_r2_test = r2_score(y_test, y_pred_test)
final_mae_test = mean_absolute_error(y_test, y_pred_test)
final_mse_test = mean_squared_error(y_test, y_pred_test)

# Evaluate on the training set as well, to expose the train/test gap
y_pred_train = best_model.predict(X_train)
final_rmse_train = rmse(y_train, y_pred_train)
final_r2_train = r2_score(y_train, y_pred_train)
final_mae_train = mean_absolute_error(y_train, y_pred_train)
final_mse_train = mean_squared_error(y_train, y_pred_train)

# Print the results
print("Test set results:")
print("Final RMSE:", final_rmse_test)
print("Final MAE:", final_mae_test)
print("Final MSE:", final_mse_test)
print("Final R2:", final_r2_test)

print("\nTrain set results:")
print("Final RMSE:", final_rmse_train)
print("Final MAE:", final_mae_train)
print("Final MSE:", final_mse_train)
print("Final R2:", final_r2_train)

Best parameters: {'regressor__learning_rate': 0.1, 'regressor__max_depth': 3, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 5, 'regressor__n_estimators': 50}
Test set results:
Final RMSE: 218257105.33626708
Final MAE: 83351584.59714966
Final MSE: 4.763616402976639e+16
Final R2: 0.359752249440512

Train set results:
Final RMSE: 199650305.7017214
Final MAE: 77359532.1519939
Final MSE: 3.986024456679081e+16
Final R2: 0.5277319790904333


# ElasticNet

In [48]:
# Full pipeline for ElasticNet: preprocessing -> keep the 20 features with
# the best f_regression score -> seeded elastic-net linear model.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_regression, k=20)),
    ('regressor', ElasticNet(random_state=42))
])

# Hyper-parameter grid to explore for ElasticNet
param_grid = {
    'regressor__alpha': [0.1, 0.5, 1.0],
    'regressor__l1_ratio': [0.1, 0.5, 0.9]
}
# 10-fold cross-validated grid search; candidates are ranked by negative MSE
# and evaluated in parallel on all available cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)

# Run the search on the training data only
grid_search.fit(X_train, y_train)

# Best parameter combination found
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refitted on the whole training set with the winning parameters. Calling
# fit() on it again would only repeat that work, so it is not done here.
best_model = grid_search.best_estimator_

# Evaluate the final model on the held-out test set
y_pred_test = best_model.predict(X_test)
final_rmse_test = rmse(y_test, y_pred_test)
final_r2_test = r2_score(y_test, y_pred_test)
final_mae_test = mean_absolute_error(y_test, y_pred_test)
final_mse_test = mean_squared_error(y_test, y_pred_test)

# Evaluate on the training set as well, to expose the train/test gap
y_pred_train = best_model.predict(X_train)
final_rmse_train = rmse(y_train, y_pred_train)
final_r2_train = r2_score(y_train, y_pred_train)
final_mae_train = mean_absolute_error(y_train, y_pred_train)
final_mse_train = mean_squared_error(y_train, y_pred_train)

# Print the results
print("Test set results:")
print("Final RMSE:", final_rmse_test)
print("Final MAE:", final_mae_test)
print("Final MSE:", final_mse_test)
print("Final R2:", final_r2_test)

print("\nTrain set results:")
print("Final RMSE:", final_rmse_train)
print("Final MAE:", final_mae_train)
print("Final MSE:", final_mse_train)
print("Final R2:", final_r2_train)

Best parameters: {'regressor__alpha': 0.1, 'regressor__l1_ratio': 0.9}
Test set results:
Final RMSE: 255072458.62690815
Final MAE: 119316715.32961605
Final MSE: 6.506195914997578e+16
Final R2: 0.12554308598954644

Train set results:
Final RMSE: 273842741.7020254
Final MAE: 118242345.18430261
Final MSE: 7.498984718288222e+16
Final R2: 0.11151306013619999


# KNeighbors

In [49]:
from sklearn.neighbors import KNeighborsRegressor

# Full pipeline for KNeighborsRegressor: preprocessing -> keep the 20 features
# with the best f_regression score -> k-nearest-neighbours regressor.
# NOTE(review): the output saved under this cell lists SVR parameters
# (C, epsilon, kernel), so it comes from an earlier SVR run of this cell and
# does not describe this model -- re-run the cell to refresh it.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_regression, k=20)),
    ('regressor', KNeighborsRegressor())
])

# Hyper-parameter grid to explore for KNeighborsRegressor
# NOTE(review): 'algorithm' and 'leaf_size' select how neighbours are searched,
# so they presumably affect speed rather than predictions; dropping them would
# shrink the grid 16x -- confirm before removing.
param_grid = {
    'regressor__n_neighbors': [3, 5, 7, 9, 11],
    'regressor__weights': ['uniform', 'distance'],
    'regressor__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'regressor__leaf_size': [20, 30, 40, 50],
    'regressor__p': [1, 2]  # p parameter: 1 = Manhattan distance, 2 = Euclidean distance
}
# 10-fold cross-validated grid search; candidates are ranked by negative MSE
# and evaluated in parallel on all available cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)

# Run the search on the training data only
grid_search.fit(X_train, y_train)

# Best parameter combination found
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refitted on the whole training set with the winning parameters. Calling
# fit() on it again would only repeat that work, so it is not done here.
best_model = grid_search.best_estimator_

# Evaluate the final model on the held-out test set
y_pred_test = best_model.predict(X_test)
final_rmse_test = rmse(y_test, y_pred_test)
final_r2_test = r2_score(y_test, y_pred_test)
final_mae_test = mean_absolute_error(y_test, y_pred_test)
final_mse_test = mean_squared_error(y_test, y_pred_test)

# Evaluate on the training set as well, to expose the train/test gap
y_pred_train = best_model.predict(X_train)
final_rmse_train = rmse(y_train, y_pred_train)
final_r2_train = r2_score(y_train, y_pred_train)
final_mae_train = mean_absolute_error(y_train, y_pred_train)
final_mse_train = mean_squared_error(y_train, y_pred_train)

# Print the results
print("Test set results:")
print("Final RMSE:", final_rmse_test)
print("Final MAE:", final_mae_test)
print("Final MSE:", final_mse_test)
print("Final R2:", final_r2_test)

print("\nTrain set results:")
print("Final RMSE:", final_rmse_train)
print("Final MAE:", final_mae_train)
print("Final MSE:", final_mse_train)
print("Final R2:", final_r2_train)

Best parameters: {'regressor__C': 10, 'regressor__epsilon': 0.01, 'regressor__kernel': 'linear'}
Test set results:
Final RMSE: 287543648.06016934
Final MAE: 101571450.52714463
Final MSE: 8.268134953975051e+16
Final R2: -0.111268069842265

Train set results:
Final RMSE: 302904636.78913206
Final MAE: 96717567.02796443
Final MSE: 9.175121898835603e+16
Final R2: -0.08707728912863


# DecisionTree

In [8]:
from sklearn.tree import DecisionTreeRegressor

# Full pipeline for DecisionTreeRegressor: preprocessing -> keep the 20
# features with the best f_regression score -> seeded decision tree.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_regression, k=20)),
    ('regressor', DecisionTreeRegressor(random_state=42))
])

# Hyper-parameter grid to explore for DecisionTreeRegressor
param_grid = {
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}
# 10-fold cross-validated grid search; candidates are ranked by negative MSE
# and evaluated in parallel on all available cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)

# Run the search on the training data only
grid_search.fit(X_train, y_train)

# Best parameter combination found
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refitted on the whole training set with the winning parameters. Calling
# fit() on it again would only repeat that work, so it is not done here.
best_model = grid_search.best_estimator_

# Evaluate the final model on the held-out test set
y_pred_test = best_model.predict(X_test)
final_rmse_test = rmse(y_test, y_pred_test)
final_r2_test = r2_score(y_test, y_pred_test)
final_mae_test = mean_absolute_error(y_test, y_pred_test)
final_mse_test = mean_squared_error(y_test, y_pred_test)

# Evaluate on the training set as well, to expose the train/test gap
y_pred_train = best_model.predict(X_train)
final_rmse_train = rmse(y_train, y_pred_train)
final_r2_train = r2_score(y_train, y_pred_train)
final_mae_train = mean_absolute_error(y_train, y_pred_train)
final_mse_train = mean_squared_error(y_train, y_pred_train)

# Print the results
print("Test set results:")
print("Final RMSE:", final_rmse_test)
print("Final MAE:", final_mae_test)
print("Final MSE:", final_mse_test)
print("Final R2:", final_r2_test)

print("\nTrain set results:")
print("Final RMSE:", final_rmse_train)
print("Final MAE:", final_mae_train)
print("Final MSE:", final_mse_train)
print("Final R2:", final_r2_train)

Best parameters: {'regressor__max_depth': 10, 'regressor__min_samples_leaf': 4, 'regressor__min_samples_split': 2}
Test set results:
Final RMSE: 234341231.2186275
Final MAE: 88080157.98294143
Final MSE: 5.491581264906225e+16
Final R2: 0.26191106620721727

Train set results:
Final RMSE: 192282060.24231523
Final MAE: 71758403.94561547
Final MSE: 3.697239069102934e+16
Final R2: 0.561947550254243
