# Addestramento con bilanciamento (Undersampling) 

In [25]:
import pandas as pd

* Prendo il dataset risultante dalla fase di analisi e preprocessing.

In [26]:
ds = pd.read_csv("DataSet/games1.csv")
print(ds.columns.tolist())

['required_age', 'achievements', 'platform_count', 'is_free', 'price_log', 'trimester', 'success_class_encoded', 'top1_genre_freq', 'top2_genre_freq', 'genre_Indie', 'genre_Action', 'genre_Adventure', 'genre_Casual', 'genre_Free To Play', 'genre_RPG', 'genre_Strategy', 'genre_Simulation', 'genre_Early Access', 'genre_Massively Multiplayer', 'genre_Sports', 'genre_Racing', 'cat_Captions available', 'cat_Co-op', 'cat_Cross-Platform Multiplayer', 'cat_Family Sharing', 'cat_Full controller support', 'cat_In-App Purchases', 'cat_Includes Source SDK', 'cat_Includes level editor', 'cat_LAN Co-op', 'cat_LAN PvP', 'cat_MMO', 'cat_Multi-player', 'cat_Online Co-op', 'cat_Online PvP', 'cat_PvP', 'cat_Shared/Split Screen', 'cat_Shared/Split Screen Co-op', 'cat_Shared/Split Screen PvP', 'cat_Single-player', 'cat_Steam Trading Cards', 'publishers_freq', 'developer_freq', 'pub_top_Big Fish Games', 'pub_top_Conglomerate 5', 'pub_top_Daedalic Entertainment', 'pub_top_Devolver Digital', 'pub_top_DigiPen 

In [27]:
ds.to_csv('DataSet/games2.csv', index=False)

# Inizio fase di Addestramento
* divido la variabile target dal dataset
* Eseguo Undersampling (casuale) per ribilanciare il dataset altamente sbilanciato

In [28]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# Features e target
X = ds.drop(columns=['success_class_encoded'])
y = ds['success_class_encoded']

print("Distribuzione originale:", Counter(y))

# Definisci undersampler (stratificato, random_state per riproducibilità)
rus = RandomUnderSampler(random_state=42)

# Applica undersampling
X_res, y_res = rus.fit_resample(X, y)

print("Distribuzione dopo undersampling:", Counter(y_res))


Distribuzione originale: Counter({1: 13274, 0: 10733, 2: 5441, 3: 844})
Distribuzione dopo undersampling: Counter({0: 844, 1: 844, 2: 844, 3: 844})


In [29]:
ds.to_csv('DataSet/games2.csv', index=False)
y.to_csv('DataSet/target.csv', index=False)

print(X.columns.tolist())

['required_age', 'achievements', 'platform_count', 'is_free', 'price_log', 'trimester', 'top1_genre_freq', 'top2_genre_freq', 'genre_Indie', 'genre_Action', 'genre_Adventure', 'genre_Casual', 'genre_Free To Play', 'genre_RPG', 'genre_Strategy', 'genre_Simulation', 'genre_Early Access', 'genre_Massively Multiplayer', 'genre_Sports', 'genre_Racing', 'cat_Captions available', 'cat_Co-op', 'cat_Cross-Platform Multiplayer', 'cat_Family Sharing', 'cat_Full controller support', 'cat_In-App Purchases', 'cat_Includes Source SDK', 'cat_Includes level editor', 'cat_LAN Co-op', 'cat_LAN PvP', 'cat_MMO', 'cat_Multi-player', 'cat_Online Co-op', 'cat_Online PvP', 'cat_PvP', 'cat_Shared/Split Screen', 'cat_Shared/Split Screen Co-op', 'cat_Shared/Split Screen PvP', 'cat_Single-player', 'cat_Steam Trading Cards', 'publishers_freq', 'developer_freq', 'pub_top_Big Fish Games', 'pub_top_Conglomerate 5', 'pub_top_Daedalic Entertainment', 'pub_top_Devolver Digital', 'pub_top_DigiPen Institute of Technology',

* Divido in set di addestramento e di test (train\test) per valutare il modello.

In [30]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.2, stratify=y_res, random_state=42
)
# stratify mantiene le proporzioni delle classi nel train/test
# 20% dei dati per il test

## Definizione modello Random Forest
* Metriche

In [31]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=500,       # numero alberi
    max_depth=None,
    min_samples_split=5,   # minimo campioni per split
    class_weight="balanced", # gestisce sbilanciamento
    random_state=42,
    n_jobs=-1           # usa tutti i core CPU
)

## Addestramento

In [32]:
model.fit(X_train, y_train)

### Salvo modello

In [33]:
import joblib
joblib.dump(model, "Modello/rf_model_future_game.pkl")

['Modello/rf_model_future_game.pkl']

## Valutazione performance

In [34]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.96      0.81       169
           1       0.55      0.51      0.53       169
           2       0.44      0.28      0.34       169
           3       0.66      0.71      0.68       169

    accuracy                           0.62       676
   macro avg       0.59      0.62      0.59       676
weighted avg       0.59      0.62      0.59       676

[[162   1   0   6]
 [ 30  87  35  17]
 [ 28  55  47  39]
 [ 11  14  24 120]]


* analisi importanza delle variabili che pesano di più nella predizione

In [35]:
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
print(feat_importances.sort_values(ascending=False).head(20))


price_log                      0.137306
achievements                   0.099646
is_free                        0.077831
publishers_freq                0.061854
cat_Family Sharing             0.060092
cat_Steam Trading Cards        0.044619
developer_freq                 0.041582
trimester                      0.038834
top2_genre_freq                0.035288
platform_count                 0.025575
top1_genre_freq                0.024593
genre_Free To Play             0.024483
cat_Multi-player               0.020661
cat_Full controller support    0.019543
cat_Online Co-op               0.017028
genre_Action                   0.016822
genre_Adventure                0.016462
genre_Casual                   0.016321
required_age                   0.016290
genre_RPG                      0.015421
dtype: float64


In [36]:
print(ds['price_log'].value_counts())
print(X_train['price_log'].value_counts())

price_log
0.000000    13915
2.396986     1671
1.790091     1652
0.688135     1495
3.044046     1484
            ...  
1.479329        1
1.738710        1
1.266948        1
5.303255        1
0.756122        1
Name: count, Length: 292, dtype: int64
price_log
0.000000    1058
3.044046     218
2.396986     125
2.771964     123
1.790091     117
            ... 
2.052841       1
3.156575       1
0.565314       1
3.409166       1
2.369309       1
Name: count, Length: 97, dtype: int64
