In [None]:
# --- Installazioni necessarie (per Kaggle) ---
# !pip install -q xgboost catboost lightgbm

# --- Import di base ---
import numpy as np
import pandas as pd
import os
import sys
import json
import warnings
from collections import Counter
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# --- Import Modelli e Utility ---
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

# --- Settings ---
# Silence every warning for the rest of the kernel session.
# NOTE(review): a blanket 'ignore' also hides any convergence or deprecation
# warnings the libraries imported above may emit — consider a narrower filter.
warnings.filterwarnings('ignore')

In [None]:
# --- Clona il repository ---
GIT_REPO_URL = "https://github.com/Gabriele-mp/FDS-DITTO-DATI.git"
REPO_NAME = GIT_REPO_URL.split('/')[-1].replace('.git', '')

if not os.path.exists(REPO_NAME):
    print(f"Clonazione repository: {GIT_REPO_URL}...")
    !git clone -q {GIT_REPO_URL}
else:
    print(f"Repository {REPO_NAME} giÃ  presente.")

# --- Aggiungi 'src' al path di sistema ---
SRC_PATH = os.path.join(os.getcwd(), REPO_NAME, 'src')
if SRC_PATH not in sys.path:
    sys.path.append(SRC_PATH)
    print(f"Aggiunto '{SRC_PATH}' a sys.path")

print("Repository e path pronti.")

In [None]:
# --- Imports from the project's GitHub repository ---
# These modules are resolved through the 'src' directory that the previous
# cell appended to sys.path.
try:
    # Loading and cleaning of the raw battle files
    from data_processing import load_and_clean_data

    # Turns a cleaned frame plus an extractor into a feature DataFrame
    from train_utils import build_feature_dataframe

    # All feature-engineering extractors used by the pipeline
    from feature_builder import (
        extract_features_v8,
        extract_features_v21,
        extract_moveset_features,
        extract_features_CRITICAL_MISSING
    )
except ImportError as e:
    print(f"Errore nell'import: {e}")
    print("Verifica che il path nella Cella 2 sia corretto e che src/__init__.py esista.")
    # Re-raise: every later cell needs these names, so continuing would only
    # surface the problem later as a harder-to-read NameError.
    raise
else:
    print("✅ Funzioni importate con successo dal repository GitHub!")

In [None]:
# --- Global configuration ---
SEED = 123        # shared random seed for shuffling, CV splits and models
N_SPLITS = 5      # number of cross-validation folds
COMPETITION_NAME = 'fds-pokemon-battles-prediction-2025'
DATA_PATH = os.path.join('../input', COMPETITION_NAME)

# --- Load and clean the training data ---
train_file_path = os.path.join(DATA_PATH, 'train.jsonl')
df_train_shuffled = load_and_clean_data(
    train_file_path,
    seed=SEED,
    is_train=True,
)

# Stratified, shuffled splitter reused by every cross-validated step below
kfold = StratifiedKFold(
    n_splits=N_SPLITS,
    shuffle=True,
    random_state=SEED,
)

print(f"\nDati di training pronti: {df_train_shuffled.shape}")

## 🤖 Spiegazione Modello 1: Stacking Ensemble a 3 Livelli

Questa submission è generata da un **Ensemble di Stacking** progettato per massimizzare l'accuratezza combinando le previsioni di 3 modelli base eterogenei.

**Architettura:**
* **Livello 0 (Feature Sets):**
    * **Set 1 (v8):** Un set compatto di ~30 feature (usato da LR).
    * **Set 2 (v21):** Un set ampio di ~100+ feature (usato da LGBM).
    * **Set 3 (Mega-Set):** Un set "supremo" di 145 feature (v21 + Moveset + Critical), ridotto a 120 tramite feature selection (usato dal secondo LGBM).
* **Livello 1 (Modelli Base):**
    1.  `lr_v8`: **LogisticRegression** ottimizzata (su Set v8).
    2.  `lgbm_v21`: **LightGBM** (su Set v21).
    3.  `lgbm_mega`: **LightGBM** (sul Mega-Set pulito).
* **Livello 2 (Meta-Modello):**
    * Le previsioni Out-of-Fold (OOF) dei 3 modelli base diventano le *meta-features* che addestrano una **LogisticRegression** finale, che funge da "giudice" per pesare e combinare i risultati.

**CV Attesa:** 0.8528

In [None]:
print("--- Inizio Pipeline di Training ---")

# -------------------------------------------------------------------
# Phase 1 - build the training feature sets
# -------------------------------------------------------------------
print("\n[Fase 1/6] Generazione Feature Set di Training...")


def _train_features(extractor):
    """Run ``build_feature_dataframe`` on the training frame with ``extractor``.

    Returns whatever ``build_feature_dataframe`` returns; the call sites below
    unpack it as a (features, target) pair.
    """
    return build_feature_dataframe(df_train_shuffled, extractor, is_test_set=False)


# Set 1 (for the logistic regression); this call also provides the target
# that every later phase reuses.
X_train_v8, y_train = _train_features(extract_features_v8)

# Set 2 (for the first LightGBM); the second return value is discarded
X_train_v21, _ = _train_features(extract_features_v21)

# Extra blocks that are merged into the Mega-Set in phase 2
X_train_moveset, _ = _train_features(extract_moveset_features)
X_train_ultimate, _ = _train_features(extract_features_CRITICAL_MISSING)

# -------------------------------------------------------------------
# Phase 2 - assemble and prune the "Mega-Set"
# -------------------------------------------------------------------
print("\n[Fase 2/6] Creazione e Pulizia 'Mega-Set'...")

# Give each block a fresh 0..n-1 index so the column-wise concat below
# aligns rows by position.
X_train_v21_safe = X_train_v21.reset_index(drop=True)
X_train_moveset_safe = X_train_moveset.reset_index(drop=True)
X_train_ultimate_safe = X_train_ultimate.reset_index(drop=True)

mega_blocks = [X_train_v21_safe, X_train_moveset_safe, X_train_ultimate_safe]
X_train_MEGASUPERSET = pd.concat(mega_blocks, axis=1)

# When several blocks expose the same column name, keep the first occurrence.
repeated_name_mask = X_train_MEGASUPERSET.columns.duplicated()
X_train_MEGASUPERSET = X_train_MEGASUPERSET.loc[:, ~repeated_name_mask]
print(f"Shape Mega-Set (grezzo): {X_train_MEGASUPERSET.shape}")

# Feature selection: rank columns by the importances of one LightGBM fit.
# NOTE(review): the selector sees the whole training set, so the
# cross-validated scores computed later on the selected columns may be
# optimistic - confirm this is acceptable.
lgbm_selector = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    num_leaves=31,
    random_state=SEED,
    verbose=-1,
)
lgbm_selector.fit(X_train_MEGASUPERSET, y_train)
importances = pd.Series(
    lgbm_selector.feature_importances_,
    index=X_train_MEGASUPERSET.columns,
)

# Keep the 120 highest-ranked columns; the same list is reused on the test set.
top_120_features = importances.nlargest(120).index
X_train_MEGA_SELECTED = X_train_MEGASUPERSET[top_120_features]
print(f"Shape Mega-Set (pulito): {X_train_MEGA_SELECTED.shape}")

# -------------------------------------------------------------------
# Phase 3 - tune the logistic regression
# -------------------------------------------------------------------
print("\n[Fase 3/6] Ottimizzazione Iperparametri (LR)...")

# Only the regularisation strength C of the 'model' pipeline step is searched.
param_grid_lr = {'model__C': [1.0, 5.0, 10.0, 15.0, 20.0, 25.0]}

# Standardise the features, then fit an L2-penalised logistic regression.
lr_steps = [
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(penalty='l2', solver='saga', max_iter=5000, random_state=SEED)),
]
model_lr_v8_pipeline = Pipeline(lr_steps)

grid_lr = GridSearchCV(
    estimator=model_lr_v8_pipeline,
    param_grid=param_grid_lr,
    cv=kfold,
    scoring='accuracy',
    n_jobs=-1,
    verbose=0,
)
grid_lr.fit(X_train_v8, y_train)

# Best pipeline found by the search; used as the first base model.
lr_v8_OPTIMIZED = grid_lr.best_estimator_

print(f"LR Ottimizzato. Miglior Score: {grid_lr.best_score_:.4f}, Migliori Parametri: {grid_lr.best_params_}")

# -------------------------------------------------------------------
# Phase 4 - declare the level-1 base models
# -------------------------------------------------------------------
print("\n[Fase 4/6] Definizione Modelli Base L1...")

# Both LightGBM base models share the same hyper-parameters and differ only
# in the feature set they are trained on.
lgbm_base_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    num_leaves=31,
    random_state=SEED,
    verbose=-1,
)
lgbm_v21 = LGBMClassifier(**lgbm_base_params)
lgbm_mega = LGBMClassifier(**lgbm_base_params)

# name -> (estimator, training features); insertion order is the order in
# which the models are trained and in which the meta-feature columns appear.
base_models = {
    'lr_v8': (lr_v8_OPTIMIZED, X_train_v8),            # tuned LR on the v8 set
    'lgbm_v21': (lgbm_v21, X_train_v21),               # LightGBM on the v21 set
    'lgbm_mega': (lgbm_mega, X_train_MEGA_SELECTED),   # LightGBM on the pruned Mega-Set
}

print(f"Definiti {len(base_models)} modelli base: {list(base_models.keys())}")

# ===================================================================
# 5. STACKING RUN (OUT-OF-FOLD PREDICTIONS)
# ===================================================================
print("\n[Fase 5/6] Esecuzione Stacking (OOF)...")

# The splitter yields positional row numbers. Indexing a pandas Series with
# ``y_train[idx]`` would be a label lookup, which can select different rows
# than the ``.iloc`` slices of the feature frames when the index is not
# 0..n-1. A NumPy view makes the target indexing positional in every case.
y_train_values = np.asarray(y_train)
n_train_rows = len(y_train_values)

# Every base model must see exactly one feature row per target value.
for name, (model, X_data) in base_models.items():
    if len(X_data) != n_train_rows:
        raise ValueError(
            f"Il feature set di '{name}' ha {len(X_data)} righe, attese {n_train_rows}."
        )

# One column of out-of-fold class-1 probabilities per base model.
X_meta_df = pd.DataFrame(
    np.zeros((n_train_rows, len(base_models))),
    columns=list(base_models),
)
final_base_models = {}  # models refitted on the whole training set

fold_splits = kfold.split(y_train_values, y_train_values)
for fold, (train_idx, val_idx) in enumerate(tqdm(fold_splits, total=kfold.get_n_splits(), desc="Folds")):
    y_train_fold = y_train_values[train_idx]

    for name, (model, X_data) in base_models.items():
        X_train_fold = X_data.iloc[train_idx]
        X_val_fold = X_data.iloc[val_idx]

        # Fit on the training part of the fold, predict the held-out part.
        model.fit(X_train_fold, y_train_fold)
        # X_meta_df has a default 0..n-1 index, so the positional val_idx
        # values are also valid labels for .loc.
        X_meta_df.loc[val_idx, name] = model.predict_proba(X_val_fold)[:, 1]

print("Addestramento modelli base finali su tutti i dati...")
for name, (model, X_data) in tqdm(base_models.items(), desc="Modelli Finali L1"):
    # The value returned by fit() is stored; these are the models used to
    # score the test set.
    final_base_models[name] = model.fit(X_data, y_train)

print("✅ Meta-Features (X_meta_df) create.")

# -------------------------------------------------------------------
# Phase 6 - train the level-2 meta-model
# -------------------------------------------------------------------
print("\n[Fase 6/6] Addestramento Meta-Modello L2...")

# All three base models are kept as inputs of the meta-model.
optimal_models = ['lr_v8', 'lgbm_v21', 'lgbm_mega']
X_meta_df_optimal = X_meta_df[optimal_models].copy()

# Final meta-model: a logistic regression over the out-of-fold probabilities.
meta_model = LogisticRegression(max_iter=1000, random_state=SEED)
final_ensemble_model = meta_model.fit(X_meta_df_optimal, y_train)

# Cross-validated accuracy of the meta-model on the same meta-features,
# averaged over the folds.
fold_accuracies = cross_val_score(
    meta_model,
    X_meta_df_optimal,
    y_train,
    cv=kfold,
    scoring='accuracy',
    n_jobs=-1,
)
final_cv_score = fold_accuracies.mean()

print(f"âœ… Pipeline di Training completata.")
print(f"ðŸŽ¯ CV Score Finale (stimato): {final_cv_score:.4f}")

In [None]:
print("--- Inizio Pipeline di Predizione (Test Set) ---")

# -------------------------------------------------------------------
# Phase 1 - load the test data
# -------------------------------------------------------------------
print("\n[Fase 1/3] Caricamento Dati di Test...")
test_file_path = os.path.join(DATA_PATH, 'test.jsonl')
df_test_raw = load_and_clean_data(test_file_path, is_train=False)
battle_ids = df_test_raw['battle_id']  # identifier column of the submission

# -------------------------------------------------------------------
# Phase 2 - build the test feature sets
# -------------------------------------------------------------------
print("\n[Fase 2/3] Generazione Feature Set di Test...")


def _test_features_like(extractor, train_frame):
    """Build test features with ``extractor``, using ``train_frame``'s columns.

    The result is restricted to the training columns, in the training order.
    """
    features, _ = build_feature_dataframe(df_test_raw, extractor, is_test_set=True)
    return features[train_frame.columns]


# Set 1 (LR)
X_test_v8 = _test_features_like(extract_features_v8, X_train_v8)

# Set 2 (LGBM)
X_test_v21 = _test_features_like(extract_features_v21, X_train_v21)

# Extra blocks for the Mega-Set
X_test_moveset = _test_features_like(extract_moveset_features, X_train_moveset)
X_test_ultimate = _test_features_like(extract_features_CRITICAL_MISSING, X_train_ultimate)

# Rebuild the Mega-Set exactly as on the training side: fresh positional
# index, column-wise concat, first occurrence of each column name.
X_test_v21_safe = X_test_v21.reset_index(drop=True)
X_test_moveset_safe = X_test_moveset.reset_index(drop=True)
X_test_ultimate_safe = X_test_ultimate.reset_index(drop=True)

test_mega_blocks = [X_test_v21_safe, X_test_moveset_safe, X_test_ultimate_safe]
X_test_MEGASUPERSET = pd.concat(test_mega_blocks, axis=1)
test_repeated_name_mask = X_test_MEGASUPERSET.columns.duplicated()
X_test_MEGASUPERSET = X_test_MEGASUPERSET.loc[:, ~test_repeated_name_mask]

# Apply the feature list selected during training (top_120_features)
X_test_MEGA_SELECTED = X_test_MEGASUPERSET[top_120_features]
print(f"Shape X_test_MEGA_SELECTED: {X_test_MEGA_SELECTED.shape}")

# ===================================================================
# 3. SUBMISSION GENERATION
# ===================================================================
print("\n[Fase 3/3] Generazione Submission Finale...")

# Test features for each base model, keyed like final_base_models
test_data_map = {
    'lr_v8': X_test_v8,
    'lgbm_v21': X_test_v21,
    'lgbm_mega': X_test_MEGA_SELECTED
}

# Every model the meta-model expects needs both a fitted estimator and a test
# feature set; report a gap explicitly instead of passing None to predict_proba.
missing_inputs = [
    name for name in optimal_models
    if name not in final_base_models or name not in test_data_map
]
if missing_inputs:
    raise KeyError(f"Modelli base o feature di test mancanti: {missing_inputs}")

# Level-2 test meta-features: one column of class-1 probabilities per base
# model, built in the column order the meta-model was trained with.
X_meta_test_df = pd.DataFrame({
    name: final_base_models[name].predict_proba(test_data_map[name])[:, 1]
    for name in tqdm(optimal_models, desc="Predizioni Test L1")
})

# Final level-2 prediction
final_predictions = final_ensemble_model.predict(X_meta_test_df[optimal_models])

# Cast once and reuse the integer labels everywhere: np.bincount rejects label
# dtypes that cannot be safely cast to int (e.g. float), and the submission
# column needs integers as well.
final_labels = np.asarray(final_predictions).astype(int)

if len(final_labels) != len(battle_ids):
    raise ValueError(
        f"Numero di predizioni ({len(final_labels)}) diverso dal numero di battle_id ({len(battle_ids)})."
    )

# --- Build the submission file ---
submission_df = pd.DataFrame({
    'battle_id': battle_ids,
    'player_won': final_labels
})

submission_filename = 'submission.csv'
submission_df.to_csv(submission_filename, index=False)

print(f"\n✅ File '{submission_filename}' creato con successo!")
print(f"📊 Predizioni: {len(final_labels)}")
# minlength=2 keeps both class counts visible even if one class is never predicted
print(f"📈 Distribuzione (0 vs 1): {np.bincount(final_labels, minlength=2)}")
print(f"🎯 CV Score Atteso: {final_cv_score:.4f}")

display(submission_df.head(10))