In [None]:
import pandas as pd

# Carica il file CSV
df = pd.read_csv("data/fundamentals_ready_multiclass.csv")

# Conta quante osservazioni ha ogni classe
class_counts = df["buffett_class"].value_counts().sort_index()

# Mostra la distribuzione
print("📊 Distribuzione delle classi:")
for cls, count in class_counts.items():
    print(f"Classe {cls}: {count} stock")

# Totale
print(f"\nTotale stock nel dataset: {df.shape[0]}")

📊 Distribuzione delle classi:
Classe 0: 661 stock
Classe 1: 8 stock
Classe 2: 8 stock
Classe 3: 7 stock

Totale stock nel dataset: 684


In [3]:
import pandas as pd

# Carica il dataset multiclass
df = pd.read_csv("data/fundamentals_ready_multiclass.csv")

# Crea colonna binaria
df["buffett_binary"] = df["buffett_class"].apply(lambda x: 1 if x in [1, 2, 3] else 0)

# Salva nuovo file
df.to_csv("data/fundamentals_ready_binary.csv", index=False)

# Mostra distribuzione binaria
print(df["buffett_binary"].value_counts())


buffett_binary
0    661
1     23
Name: count, dtype: int64


In [10]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.utils import resample
from sklearn.metrics import make_scorer, classification_report, f1_score, recall_score, precision_score

# === Carica dataset ===
df = pd.read_csv("data/fundamentals_ready_binary.csv")
X = df.drop(columns=["Ticker", "buffett_class", "buffett_binary"], errors="ignore")
y = df["buffett_binary"]

# === Bilanciamento prima del CV
df_full = pd.concat([X, y], axis=1)
df_0 = df_full[df_full.buffett_binary == 0]
df_1 = df_full[df_full.buffett_binary == 1]

df_0_under = resample(df_0, replace=False, n_samples=300, random_state=42)
df_1_over = resample(df_1, replace=True, n_samples=100, random_state=42)

df_bal = pd.concat([df_0_under, df_1_over])
X_bal = df_bal.drop(columns="buffett_binary")
y_bal = df_bal["buffett_binary"]

# === K-Fold Stratificato
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# === Modello base
model = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    use_label_encoder=False,
    max_depth=6,
    n_estimators=300,
    learning_rate=0.2,
    colsample_bytree=0.8,
    subsample=1.0,
    gamma=0
)

# === Scorer per classe 1
scoring = {
    "f1": make_scorer(f1_score, pos_label=1),
    "recall": make_scorer(recall_score, pos_label=1),
    "precision": make_scorer(precision_score, pos_label=1),
    "accuracy": "accuracy"
}

# === Valutazione cross-validation
cv_results = cross_validate(model, X_bal, y_bal, cv=skf, scoring=scoring)

# === Mostra risultati
print("📊 K-FOLD VALIDATION RISULTATI (classe 1):")
for metric in ["f1", "precision", "recall", "accuracy"]:
    values = cv_results[f"test_{metric}"]
    print(f"{metric:<10}: {values.mean():.3f} ± {values.std():.3f}")

# Salva il modello completo addestrato su tutti i dati bilanciati
model.fit(X_bal, y_bal)  # retraining su tutto
import os, joblib
os.makedirs("models", exist_ok=True)
joblib.dump(model, "models/buffett_model_balanced.pkl")
print("✅ Modello finale salvato: models/buffett_model_balanced.pkl")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


📊 K-FOLD VALIDATION RISULTATI (classe 1):
f1        : 0.944 ± 0.017
precision : 0.894 ± 0.030
recall    : 1.000 ± 0.000
accuracy  : 0.970 ± 0.010
✅ Modello finale salvato: models/buffett_model_balanced.pkl


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [11]:
import pandas as pd
import joblib
import shap
import numpy as np

# === Carica modello e dataset ===
model = joblib.load("models/buffett_model_balanced.pkl")
df = pd.read_csv("data/fundamentals_ready_binary.csv")
X = df.drop(columns=["Ticker", "buffett_class", "buffett_binary"], errors="ignore")
tickers = df["Ticker"].tolist()

# === Crea explainer SHAP
explainer = shap.Explainer(model, X)
shap_values = explainer(X)

# === Funzione per generare spiegazione testuale
def explain_stock(index, top_n=5, threshold=0.02):
    ticker = tickers[index]
    pred_prob = model.predict_proba([X.iloc[index]])[0][1]
    shap_row = shap_values[index]
    
    # Filtra i contributi positivi significativi
    pos_contribs = [(X.columns[i], shap_row.values[i]) for i in range(len(shap_row.values)) if shap_row.values[i] > threshold]
    pos_contribs = sorted(pos_contribs, key=lambda x: -x[1])[:top_n]

    if pred_prob < 0.5:
        return f"🔴 {ticker} → Non Buffett-like. Motivo principale: mancano contributi positivi rilevanti."

    if not pos_contribs:
        return f"🟡 {ticker} → Buffett-like ma nessuna feature dominante oltre soglia."

    # Genera frase
    components = [f"{feat} elevato" if X.iloc[index][feat] > X[feat].median() else f"{feat} basso" for feat, _ in pos_contribs]
    frase = ", ".join(components)
    return f"✅ {ticker} → Buffett-like per: {frase} (probabilità: {pred_prob:.2f})"

# === Esempio su 3 stock
for i in range(3):
    print(explain_stock(i))


🔴 A → Non Buffett-like. Motivo principale: mancano contributi positivi rilevanti.
✅ AAPL → Buffett-like per: Market Cap elevato, Volume elevato, P/B elevato, Curr R basso, ROE elevato (probabilità: 0.98)
🔴 ABBV → Non Buffett-like. Motivo principale: mancano contributi positivi rilevanti.
