In [38]:
from sklearn.naive_bayes import GaussianNB
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
import os
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import pickle
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import BaseEstimator, TransformerMixin


In [43]:
# Load the dataset CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../dataset/discretized_dataset.csv')


def multi_value_one_hot(df, column):
    """Append 0/1 indicator columns for a multi-valued text column.

    Each cell of ``df[column]`` is split on ``', '``; one indicator column per
    distinct token is created and named ``<column>_<token>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the multi-valued column. It is not modified.
    column : str
        Name of the column whose cells contain ``', '``-separated values.

    Returns
    -------
    pandas.DataFrame
        A new frame: all original columns (including ``column`` itself)
        followed by the indicator columns.
    """
    indicators = df[column].str.get_dummies(sep=', ')
    # Prefix every indicator with the source column name so its origin stays visible.
    indicators = indicators.rename(columns=lambda label: column + '_' + label)
    return df.join(indicators)

# Expand 'genre' into genre_<name> indicator columns, drop the raw text column,
# and store the publication month as a string (it is used as a categorical
# feature further down, not as a number).
df = (
    multi_value_one_hot(df, 'genre')
    .drop(columns=['genre'])
    .assign(month_published=lambda frame: frame['month_published'].astype(str))
)



In [44]:
# Separate the target ('revenue_cluster') from the predictors.
y = df['revenue_cluster']
X = df.drop(columns='revenue_cluster')
print(X.head())

# Predictors handled as numeric by the preprocessors.
numerical_features = [
    'duration', 'converted_budget',
    'dir_oscar_nomination', 'writer_oscar_nomination',
    'cast_globe_nomination',
    'BAFTA_writer_nom', 'BAFTA_dir_nom', 'BAFTA_act_nom',
    'dir_emmy_nom', 'writer_emmy_nom', 'act_emmy_nom',
    'actors_films_before', 'director_films_before', 'writers_films_before',
]

# Predictors handled as categorical: the three label columns plus every
# genre_<name> indicator column present in df.
categorical_features_no_genre = ['language', 'production_company', 'month_published']
categorical_features = [
    *categorical_features_no_genre,
    *(name for name in df.columns if name.startswith('genre_')),
]


   duration  converted_budget  dir_oscar_nomination  writer_oscar_nomination  \
0      88.0          175700.3                     0                        0   
1      59.0         3013850.0                     0                        0   
2      77.0          521727.6                     0                        0   
3      50.0         5598468.6                     0                        0   
4     300.0        10802441.1                     0                        0   

   cast_globe_nomination  BAFTA_act_nom  BAFTA_dir_nom  BAFTA_writer_nom  \
0                      0              0              0                 0   
1                      0              0              0                 0   
2                      0              0              0                 0   
3                      0              0              0                 0   
4                      0              0              0                 0   

   dir_emmy_nom  writer_emmy_nom  ...  genre_Horror  genre_Mus

In [45]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Applies log(1 + x) element-wise; validate=True makes the transformer check its input array.
log_transformer = FunctionTransformer(func=np.log1p, validate=True)

# Transformer that drops columns whose name starts with a specific prefix
class DropOtherColumns(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes columns whose name starts with ``prefix``.

    The input to ``transform`` must expose ``.columns`` and ``.drop`` (for
    example a pandas DataFrame) and its column labels must be strings.
    """

    def __init__(self, prefix='Other'):
        # Stored unchanged under the same name as the constructor argument.
        self.prefix = prefix

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` without the columns whose name starts with ``self.prefix``."""
        matching = list(filter(lambda name: name.startswith(self.prefix), X.columns))
        return X.drop(columns=matching, errors='ignore')

# One-hot columns that were meant to be removed after encoding.
# NOTE: neither preprocessor below consumes this list; the name is kept only so
# that any code referring to it keeps working.
columns_to_drop_after_encoding = ['language_Other', 'production_company_Other']

# NOTE(review): no "*_Other" dummy column is dropped by these preprocessors.
# The former ('drop_other', DropOtherColumns(), []) entry was bound to an empty
# column list, which ColumnTransformer skips, so it never ran. It is removed
# because it was dead code and made every pickled pipeline depend on a class
# defined in the notebook. The transformed output is unchanged.

# Default preprocessing: standardize the numeric features and one-hot encode the
# categorical ones (categories unseen during fit are encoded as all zeros).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('scaler', StandardScaler())
        ]), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# Variant used by the logistic-regression pipeline: numeric features go through
# log1p before being standardized; categorical handling is identical.
preprocessor_logistic = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('log', log_transformer),
            ('scaler', StandardScaler())
        ]), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)


# Random Forest

In [16]:


# Random-forest pipeline: oversample with SMOTENC (told the positions of the
# categorical columns in X), preprocess, keep the 20 best features by ANOVA
# F-score, then classify.
pipeline = ImbPipeline(steps=[
    ('smote', SMOTENC(categorical_features=[X.columns.get_loc(col) for col in categorical_features], random_state=42)),
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_classif, k=20)),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Single-candidate grid; trailing comments list values currently left out of the search.
param_grid = {
    'classifier__n_estimators': [300],       # left out: 100
    'classifier__max_depth': [10],           # left out: None, 4
    'classifier__min_samples_split': [10],   # left out: 2
    'classifier__min_samples_leaf': [1],
    'classifier__max_features': ['sqrt'],    # left out: 'log2', None
    'classifier__bootstrap': [True],         # left out: False
    'classifier__criterion': ['gini'],       # left out: 'entropy'
}

# 10-fold cross-validated search scored on accuracy, using all CPU cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters:", best_params)

# GridSearchCV (refit=True by default) has already retrained the winning
# configuration on all of X_train, so best_estimator_ is used as-is instead of
# paying for a second, identical fit.
best_model = grid_search.best_estimator_

# Held-out test set metrics.
y_pred_test = best_model.predict(X_test)
final_accuracy_test = accuracy_score(y_test, y_pred_test)
final_f1_test = f1_score(y_test, y_pred_test, average='weighted')
final_confusion_matrix_test = confusion_matrix(y_test, y_pred_test)
final_classification_report_test = classification_report(y_test, y_pred_test)

# Training set metrics, for comparison with the test scores (overfitting check).
y_pred_train = best_model.predict(X_train)
final_accuracy_train = accuracy_score(y_train, y_pred_train)
final_f1_train = f1_score(y_train, y_pred_train, average='weighted')
final_confusion_matrix_train = confusion_matrix(y_train, y_pred_train)
final_classification_report_train = classification_report(y_train, y_pred_train)

print("Test set results:")
print("Final Accuracy:", final_accuracy_test)
print("Final F1 Score:", final_f1_test)
print("Confusion Matrix:\n", final_confusion_matrix_test)
print("Classification Report:\n", final_classification_report_test)

print("\nTrain set results:")
print("Final Accuracy:", final_accuracy_train)
print("Final F1 Score:", final_f1_train)
print("Confusion Matrix:\n", final_confusion_matrix_train)
print("Classification Report:\n", final_classification_report_train)

# Persist the fitted pipeline, creating the target directory if needed.
file_path = "../models/classification/RandomForestClassifier.pkl"
os.makedirs(os.path.dirname(file_path), exist_ok=True)

with open(file_path, 'wb') as file:
    pickle.dump(best_model, file)

Best parameters: {'classifier__bootstrap': True, 'classifier__criterion': 'gini', 'classifier__max_depth': 10, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 300}
Test set results:
Final Accuracy: 0.4894952909925139
Final F1 Score: 0.48538038744025086
Confusion Matrix:
 [[491 316 426 179]
 [224 399  82 306]
 [243  39 943  51]
 [ 66 159  23 194]]
Classification Report:
               precision    recall  f1-score   support

           0       0.48      0.35      0.40      1412
           1       0.44      0.39      0.41      1011
           2       0.64      0.74      0.69      1276
           3       0.27      0.44      0.33       442

    accuracy                           0.49      4141
   macro avg       0.46      0.48      0.46      4141
weighted avg       0.50      0.49      0.49      4141


Train set results:
Final Accuracy: 0.5545761893262497
Final F1 Score: 0.5535559989629932
Confusion Matr

# Logistic Regression

In [17]:
# Logistic-regression pipeline: oversample with SMOTENC, preprocess with the
# log1p + scaling variant, keep the 20 best features by ANOVA F-score, then classify.
# SMOTENC is given positional indices of the categorical columns (as in the other
# model cells) rather than column names, which only newer imbalanced-learn
# releases accept.
pipeline = ImbPipeline(steps=[
    ('smote', SMOTENC(categorical_features=[X.columns.get_loc(col) for col in categorical_features], random_state=42)),
    ('preprocessor', preprocessor_logistic),
    ('feature_selection', SelectKBest(score_func=f_classif, k=20)),
    ('classifier', LogisticRegression())
])

# Single-candidate grid; trailing comments list values currently left out of the search.
param_grid = {
    'classifier__penalty': ['l2'],        # left out: 'l1'
    'classifier__C': [0.1],               # left out: 1, 10, 100, 0.001, 0.01
    'classifier__solver': ['liblinear']
}

# 10-fold cross-validated search scored on accuracy, using all CPU cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy', n_jobs=-1)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters:", best_params)


Best parameters: {'classifier__C': 0.1, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}


In [18]:

# GridSearchCV (refit=True by default) has already retrained the winning
# configuration on all of X_train, so best_estimator_ is used as-is instead of
# paying for a second fit.
best_model = grid_search.best_estimator_

# Held-out test set metrics.
y_pred_test = best_model.predict(X_test)
final_accuracy_test = accuracy_score(y_test, y_pred_test)
final_f1_test = f1_score(y_test, y_pred_test, average='weighted')
final_confusion_matrix_test = confusion_matrix(y_test, y_pred_test)
final_classification_report_test = classification_report(y_test, y_pred_test)

# Training set metrics, for comparison with the test scores (overfitting check).
y_pred_train = best_model.predict(X_train)
final_accuracy_train = accuracy_score(y_train, y_pred_train)
final_f1_train = f1_score(y_train, y_pred_train, average='weighted')
final_confusion_matrix_train = confusion_matrix(y_train, y_pred_train)
final_classification_report_train = classification_report(y_train, y_pred_train)

print("Test set results:")
print("Final Accuracy:", final_accuracy_test)
print("Final F1 Score:", final_f1_test)
print("Confusion Matrix:\n", final_confusion_matrix_test)
print("Classification Report:\n", final_classification_report_test)

print("\nTrain set results:")
print("Final Accuracy:", final_accuracy_train)
print("Final F1 Score:", final_f1_train)
print("Confusion Matrix:\n", final_confusion_matrix_train)
print("Classification Report:\n", final_classification_report_train)

# Persist the fitted pipeline, creating the target directory if needed.
file_path = "../models/classification/LogisticRegression.pkl"

os.makedirs(os.path.dirname(file_path), exist_ok=True)
with open(file_path, 'wb') as file:
    pickle.dump(best_model, file)

Test set results:
Final Accuracy: 0.47162521130161794
Final F1 Score: 0.452990705939036
Confusion Matrix:
 [[ 342  262  550  258]
 [ 170  357  133  351]
 [ 123   38 1033   82]
 [  52  133   36  221]]
Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.24      0.33      1412
           1       0.45      0.35      0.40      1011
           2       0.59      0.81      0.68      1276
           3       0.24      0.50      0.33       442

    accuracy                           0.47      4141
   macro avg       0.45      0.48      0.43      4141
weighted avg       0.49      0.47      0.45      4141


Train set results:
Final Accuracy: 0.4757908717701038
Final F1 Score: 0.46055253961944304
Confusion Matrix:
 [[1432 1010 2093 1115]
 [ 685 1457  456 1445]
 [ 501  185 4105  314]
 [ 209  529  141  887]]
Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.25      0.34      5650
     

# AdaBoost

In [46]:

# AdaBoost pipeline: oversample with SMOTENC (told the positions of the
# categorical columns in X), preprocess, keep the 20 best features by ANOVA
# F-score, then classify.
pipeline = ImbPipeline(steps=[
    ('smote', SMOTENC(categorical_features=[X.columns.get_loc(col) for col in categorical_features], random_state=42)),
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_classif, k=20)),
    ('classifier', AdaBoostClassifier(random_state=42))
])

# Single-candidate grid; trailing comments list values currently left out of the search.
# NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases — confirm
# the installed version before upgrading.
param_grid = {
    'classifier__n_estimators': [100],      # number of estimators; left out: 150, 50
    'classifier__learning_rate': [0.1],     # learning rate; left out: 1.0, 0.01
    'classifier__algorithm': ['SAMME.R']    # boosting algorithm; left out: 'SAMME'
}

# 10-fold cross-validated search scored on accuracy, using all CPU cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy', n_jobs=-1)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters:", best_params)

# GridSearchCV (refit=True by default) has already retrained the winning
# configuration on all of X_train, so best_estimator_ is used as-is instead of
# paying for a second, identical fit.
best_model = grid_search.best_estimator_

# Held-out test set metrics.
y_pred_test = best_model.predict(X_test)
final_accuracy_test = accuracy_score(y_test, y_pred_test)
final_f1_test = f1_score(y_test, y_pred_test, average='weighted')
final_confusion_matrix_test = confusion_matrix(y_test, y_pred_test)
final_classification_report_test = classification_report(y_test, y_pred_test)

# Training set metrics, for comparison with the test scores (overfitting check).
y_pred_train = best_model.predict(X_train)
final_accuracy_train = accuracy_score(y_train, y_pred_train)
final_f1_train = f1_score(y_train, y_pred_train, average='weighted')
final_confusion_matrix_train = confusion_matrix(y_train, y_pred_train)
final_classification_report_train = classification_report(y_train, y_pred_train)

print("Test set results:")
print("Final Accuracy:", final_accuracy_test)
print("Final F1 Score:", final_f1_test)
print("Confusion Matrix:\n", final_confusion_matrix_test)
print("Classification Report:\n", final_classification_report_test)

print("\nTrain set results:")
print("Final Accuracy:", final_accuracy_train)
print("Final F1 Score:", final_f1_train)
print("Confusion Matrix:\n", final_confusion_matrix_train)
print("Classification Report:\n", final_classification_report_train)

# Persist the fitted pipeline, creating the target directory if needed.
file_path = "../models/classification/AdaBoostClassifier.pkl"

os.makedirs(os.path.dirname(file_path), exist_ok=True)

with open(file_path, 'wb') as file:
    pickle.dump(best_model, file)

Best parameters: {'classifier__algorithm': 'SAMME.R', 'classifier__learning_rate': 0.1, 'classifier__n_estimators': 100}
Test set results:
Final Accuracy: 0.5006037189084762
Final F1 Score: 0.5004335213983511
Confusion Matrix:
 [[547 342 368 155]
 [236 472  65 238]
 [300  50 889  37]
 [ 63 193  21 165]]
Classification Report:
               precision    recall  f1-score   support

           0       0.48      0.39      0.43      1412
           1       0.45      0.47      0.46      1011
           2       0.66      0.70      0.68      1276
           3       0.28      0.37      0.32       442

    accuracy                           0.50      4141
   macro avg       0.47      0.48      0.47      4141
weighted avg       0.51      0.50      0.50      4141


Train set results:
Final Accuracy: 0.507486114465105
Final F1 Score: 0.5077914953267204
Confusion Matrix:
 [[2164 1351 1443  692]
 [ 896 1936  232  979]
 [1116  214 3598  177]
 [ 240  741   77  708]]
Classification Report:
            

# SVC

In [20]:
# SVC pipeline: oversample with SMOTENC (told the positions of the categorical
# columns in X), preprocess, keep the 20 best features by ANOVA F-score, then classify.
pipeline = ImbPipeline(steps=[
    ('smote', SMOTENC(categorical_features=[X.columns.get_loc(col) for col in categorical_features], random_state=42)),
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_classif, k=20)),
    ('classifier', SVC())
])

# Single-candidate grid; trailing comments list values currently left out of the search.
param_grid = {
    'classifier__C': [0.1],           # regularization parameter; left out: 1, 10
    'classifier__kernel': ['poly'],   # kernel type; left out: 'rbf', 'sigmoid', 'linear'
    'classifier__gamma': ['auto'],    # kernel coefficient for 'rbf', 'poly' and 'sigmoid'; left out: 'scale'
    'classifier__degree': [2]         # polynomial degree for 'poly'; left out: 3, 4
}

# 10-fold cross-validated search scored on accuracy, using all CPU cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy', n_jobs=-1)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters:", best_params)

# GridSearchCV (refit=True by default) has already retrained the winning
# configuration on all of X_train, so best_estimator_ is used as-is instead of
# paying for a second, identical fit.
best_model = grid_search.best_estimator_

# Held-out test set metrics.
y_pred_test = best_model.predict(X_test)
final_accuracy_test = accuracy_score(y_test, y_pred_test)
final_f1_test = f1_score(y_test, y_pred_test, average='weighted')
final_confusion_matrix_test = confusion_matrix(y_test, y_pred_test)
final_classification_report_test = classification_report(y_test, y_pred_test)

# Training set metrics, for comparison with the test scores (overfitting check).
y_pred_train = best_model.predict(X_train)
final_accuracy_train = accuracy_score(y_train, y_pred_train)
final_f1_train = f1_score(y_train, y_pred_train, average='weighted')
final_confusion_matrix_train = confusion_matrix(y_train, y_pred_train)
final_classification_report_train = classification_report(y_train, y_pred_train)

print("Test set results:")
print("Final Accuracy:", final_accuracy_test)
print("Final F1 Score:", final_f1_test)
print("Confusion Matrix:\n", final_confusion_matrix_test)
print("Classification Report:\n", final_classification_report_test)

print("\nTrain set results:")
print("Final Accuracy:", final_accuracy_train)
print("Final F1 Score:", final_f1_train)
print("Confusion Matrix:\n", final_confusion_matrix_train)
print("Classification Report:\n", final_classification_report_train)

# Persist the fitted pipeline, creating the target directory if needed.
file_path = "../models/classification/SVCclassifier.pkl"

os.makedirs(os.path.dirname(file_path), exist_ok=True)

with open(file_path, 'wb') as file:
    pickle.dump(best_model, file)

Best parameters: {'classifier__C': 0.1, 'classifier__degree': 2, 'classifier__gamma': 'auto', 'classifier__kernel': 'poly'}
Test set results:
Final Accuracy: 0.4549625694276745
Final F1 Score: 0.47013848165566446
Confusion Matrix:
 [[543 413 194 262]
 [217 456  39 299]
 [346 124 701 105]
 [ 69 179  10 184]]
Classification Report:
               precision    recall  f1-score   support

           0       0.46      0.38      0.42      1412
           1       0.39      0.45      0.42      1011
           2       0.74      0.55      0.63      1276
           3       0.22      0.42      0.28       442

    accuracy                           0.45      4141
   macro avg       0.45      0.45      0.44      4141
weighted avg       0.50      0.45      0.47      4141


Train set results:
Final Accuracy: 0.46021492393141755
Final F1 Score: 0.4758189416407024
Confusion Matrix:
 [[2159 1695  726 1070]
 [ 822 1934  120 1167]
 [1426  466 2827  386]
 [ 266  752   45  703]]
Classification Report:
      

# KNNeighbors

In [21]:
# k-nearest-neighbours pipeline: oversample with SMOTENC (told the positions of
# the categorical columns in X), preprocess, keep the 20 best features by ANOVA
# F-score, then classify.
pipeline = ImbPipeline(steps=[
    ('smote', SMOTENC(categorical_features=[X.columns.get_loc(col) for col in categorical_features], random_state=42)),
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_classif, k=20)),
    ('classifier', KNeighborsClassifier())
])

# Single-candidate grid; trailing comments list values currently left out of the search.
param_grid = {
    'classifier__n_neighbors': [9],            # number of neighbours considered; left out: 3, 5
    'classifier__weights': ['uniform'],        # weighting used in prediction; left out: 'distance'
    'classifier__algorithm': ['ball_tree'],    # neighbour-search algorithm; left out: 'kd_tree', 'brute', 'auto'
    'classifier__leaf_size': [20],             # leaf size of the search tree; left out: 30, 10
    'classifier__p': [1],                      # power parameter of the Minkowski distance; left out: 2
}

# 10-fold cross-validated search scored on accuracy, using all CPU cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy', n_jobs=-1)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters:", best_params)

# GridSearchCV (refit=True by default) has already retrained the winning
# configuration on all of X_train, so best_estimator_ is used as-is instead of
# paying for a second, identical fit.
best_model = grid_search.best_estimator_

# Held-out test set metrics.
y_pred_test = best_model.predict(X_test)
final_accuracy_test = accuracy_score(y_test, y_pred_test)
final_f1_test = f1_score(y_test, y_pred_test, average='weighted')
final_confusion_matrix_test = confusion_matrix(y_test, y_pred_test)
final_classification_report_test = classification_report(y_test, y_pred_test)

# Training set metrics, for comparison with the test scores (overfitting check).
y_pred_train = best_model.predict(X_train)
final_accuracy_train = accuracy_score(y_train, y_pred_train)
final_f1_train = f1_score(y_train, y_pred_train, average='weighted')
final_confusion_matrix_train = confusion_matrix(y_train, y_pred_train)
final_classification_report_train = classification_report(y_train, y_pred_train)

print("Test set results:")
print("Final Accuracy:", final_accuracy_test)
print("Final F1 Score:", final_f1_test)
print("Confusion Matrix:\n", final_confusion_matrix_test)
print("Classification Report:\n", final_classification_report_test)

print("\nTrain set results:")
print("Final Accuracy:", final_accuracy_train)
print("Final F1 Score:", final_f1_train)
print("Confusion Matrix:\n", final_confusion_matrix_train)
print("Classification Report:\n", final_classification_report_train)

# Persist the fitted pipeline, creating the target directory if needed.
file_path = "../models/classification/KNNClassifier.pkl"

os.makedirs(os.path.dirname(file_path), exist_ok=True)

with open(file_path, 'wb') as file:
    pickle.dump(best_model, file)

Best parameters: {'classifier__algorithm': 'ball_tree', 'classifier__leaf_size': 20, 'classifier__n_neighbors': 9, 'classifier__p': 1, 'classifier__weights': 'uniform'}
Test set results:
Final Accuracy: 0.46099975851243663
Final F1 Score: 0.4658417860212128
Confusion Matrix:
 [[635 310 317 150]
 [315 385  70 241]
 [402  77 754  43]
 [ 93 194  20 135]]
Classification Report:
               precision    recall  f1-score   support

           0       0.44      0.45      0.44      1412
           1       0.40      0.38      0.39      1011
           2       0.65      0.59      0.62      1276
           3       0.24      0.31      0.27       442

    accuracy                           0.46      4141
   macro avg       0.43      0.43      0.43      4141
weighted avg       0.47      0.46      0.47      4141


Train set results:
Final Accuracy: 0.5812001931900507
Final F1 Score: 0.5847938502630485
Confusion Matrix:
 [[3224  950  917  559]
 [1011 2159  224  649]
 [1187  306 3469  143]
 [ 372  5

# GaussianNB

In [22]:
# Gaussian naive-Bayes pipeline: oversample with SMOTENC (told the positions of
# the categorical columns in X), preprocess, keep the 20 best features by ANOVA
# F-score, then classify.
pipeline = ImbPipeline(steps=[
    ('smote', SMOTENC(categorical_features=[X.columns.get_loc(col) for col in categorical_features], random_state=42)),
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_classif, k=20)),
    ('classifier', GaussianNB())
])

# Single-candidate grid; the trailing comment lists values currently left out of the search.
param_grid = {
    'classifier__var_smoothing': [1e-9]   # variance smoothing parameter; left out: 1e-8, 1e-7
}

# 10-fold cross-validated search scored on accuracy, using all CPU cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy', n_jobs=-1)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters:", best_params)

# GridSearchCV (refit=True by default) has already retrained the winning
# configuration on all of X_train, so best_estimator_ is used as-is instead of
# paying for a second, identical fit.
best_model = grid_search.best_estimator_

# Held-out test set metrics.
y_pred_test = best_model.predict(X_test)
final_accuracy_test = accuracy_score(y_test, y_pred_test)
final_f1_test = f1_score(y_test, y_pred_test, average='weighted')
final_confusion_matrix_test = confusion_matrix(y_test, y_pred_test)
final_classification_report_test = classification_report(y_test, y_pred_test)

# Training set metrics, for comparison with the test scores (overfitting check).
y_pred_train = best_model.predict(X_train)
final_accuracy_train = accuracy_score(y_train, y_pred_train)
final_f1_train = f1_score(y_train, y_pred_train, average='weighted')
final_confusion_matrix_train = confusion_matrix(y_train, y_pred_train)
final_classification_report_train = classification_report(y_train, y_pred_train)

print("Test set results:")
print("Final Accuracy:", final_accuracy_test)
print("Final F1 Score:", final_f1_test)
print("Confusion Matrix:\n", final_confusion_matrix_test)
print("Classification Report:\n", final_classification_report_test)

print("\nTrain set results:")
print("Final Accuracy:", final_accuracy_train)
print("Final F1 Score:", final_f1_train)
print("Confusion Matrix:\n", final_confusion_matrix_train)
print("Classification Report:\n", final_classification_report_train)

# Persist the fitted pipeline, creating the target directory if needed.
file_path = "../models/classification/GaussianNBClassifier.pkl"

os.makedirs(os.path.dirname(file_path), exist_ok=True)

with open(file_path, 'wb') as file:
    pickle.dump(best_model, file)

Best parameters: {'classifier__var_smoothing': 1e-09}
Test set results:
Final Accuracy: 0.32987201159140306
Final F1 Score: 0.33835751278735104
Confusion Matrix:
 [[237 181 281 713]
 [ 94 168  83 666]
 [126  82 652 416]
 [ 40  69  24 309]]
Classification Report:
               precision    recall  f1-score   support

           0       0.48      0.17      0.25      1412
           1       0.34      0.17      0.22      1011
           2       0.63      0.51      0.56      1276
           3       0.15      0.70      0.24       442

    accuracy                           0.33      4141
   macro avg       0.40      0.39      0.32      4141
weighted avg       0.45      0.33      0.34      4141


Train set results:
Final Accuracy: 0.32818159864766966
Final F1 Score: 0.3337875556224307
Confusion Matrix:
 [[ 900  725 1170 2855]
 [ 377  678  334 2654]
 [ 589  319 2614 1583]
 [ 128  268  126 1244]]
Classification Report:
               precision    recall  f1-score   support

           0       