Importing libraries and models

In [None]:
import pandas as pd
import numpy as np
import os
import joblib 
import warnings
from tqdm import tqdm


from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


# Silence every warning for the rest of the session.
# NOTE(review): this also hides convergence/deprecation warnings raised by the
# models fitted below — consider narrowing the filter while debugging.
warnings.filterwarnings('ignore')

# Single seed reused as `random_state` by the seeded models and the CV splitter.
SEED = 123 

Loading Feature Sets

In [None]:
print("Loading feature sets from 'data/processed/'...")

# One feature-set version per family of base models; the shape report below
# states which model consumes which version.
try:
    X_train_v8 = pd.read_csv('../data/processed/v8_train_features.csv')
    X_train_v20 = pd.read_csv('../data/processed/v20_train_features.csv')
    X_train_v19 = pd.read_csv('../data/processed/v19_train_features.csv')
    X_train_v2 = pd.read_csv('../data/processed/v2_train_features.csv')

    # squeeze() turns the one-column target frame into a Series
    # (assumes the target CSV has a single column — TODO confirm).
    y_train = pd.read_csv('../data/processed/train_target.csv').squeeze()
except FileNotFoundError as e:
    print(f"ERROR: File not found. {e}")
    print("!!! Be sure to execute notebook 02_Feature_Engineering.ipynb first!!!")
    # Re-raise: without the data every later cell would fail with an unrelated
    # NameError, hiding the real cause during "Run All".
    raise
else:
    print("Dati caricati con successo.")
    print(f"Shape y_train: {y_train.shape}")
    print(f"Shape X_train_v8 (for LR): {X_train_v8.shape}")
    print(f"Shape X_train_v20 (for XGB): {X_train_v20.shape}")
    print(f"Shape X_train_v19 (for RF/CAT/kNN): {X_train_v19.shape}")
    print(f"Shape X_train_v2 (for XGB-v2): {X_train_v2.shape}")

Defining and Training the Base Models

In [None]:
# Registry of base learners for the stack:
#   name -> (unfitted estimator, feature matrix that estimator is trained on)
base_models = {}


def _scaled(estimator):
    """Return a Pipeline that runs StandardScaler ('scaler') before ``estimator`` ('model')."""
    return Pipeline([
        ('scaler', StandardScaler()),
        ('model', estimator),
    ])


# --- 1. Logistic regression on the v8 features ---
model_lr_v8 = _scaled(
    LogisticRegression(
        penalty='l2',
        C=10.0,
        solver='saga',
        max_iter=5000,
        random_state=SEED,
    )
)
base_models['lr_v8'] = (model_lr_v8, X_train_v8)

# --- 2. XGBoost on the v20 features ---
# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases —
# confirm it is still accepted by the installed version.
xgb_v20_params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_lambda=5,
    use_label_encoder=False,
    random_state=SEED,
)
model_xgb_v20 = XGBClassifier(**xgb_v20_params)
base_models['xgb_v20'] = (model_xgb_v20, X_train_v20)

# --- 3. Random forest on the v19 features ---
model_rf_v19 = _scaled(
    RandomForestClassifier(
        n_estimators=400,
        max_depth=10,
        max_features=0.5,
        min_samples_split=2,
        min_samples_leaf=1,
        n_jobs=-1,
        random_state=SEED,
    )
)
base_models['rf_v19'] = (model_rf_v19, X_train_v19)

# --- 4. CatBoost on the v19 features ---
cat_v19_params = dict(
    iterations=300,
    learning_rate=0.03,
    depth=8,
    l2_leaf_reg=7,
    eval_metric='Accuracy',
    verbose=0,
    random_state=SEED,
)
model_cat_v19 = CatBoostClassifier(**cat_v19_params)
base_models['cat_v19'] = (model_cat_v19, X_train_v19)

# --- 5. k-nearest neighbours on the v19 features ---
model_knn_v19 = _scaled(
    KNeighborsClassifier(
        n_neighbors=45,
        weights='uniform',
        metric='manhattan',
        n_jobs=-1,
    )
)
base_models['knn_v19'] = (model_knn_v19, X_train_v19)

# Progress report for the five models registered so far.
print(f"Definiti {len(base_models)} modelli base pronti per lo stacking.")
print(f"Modelli nello stack: {list(base_models.keys())}")

# --- 6. XGBoost on the v2 features ---
# The long decimals look like the output of an automated hyper-parameter search
# (presumably — the search itself is not part of this notebook).
xgb_v2_params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=1133,
    learning_rate=0.013127281348238786,
    max_depth=3,
    subsample=0.7016566351370807,
    colsample_bytree=0.788820517112247,
    alpha=1.3799959101127168,
    reg_lambda=0.8263346953150125,
    use_label_encoder=False,
    random_state=SEED,
)
model_xgb_v2 = XGBClassifier(**xgb_v2_params)

# Register it next to the models already present in `base_models`.
base_models['xgb_v2'] = (model_xgb_v2, X_train_v2)

print(f"\nADDED Modello 'xgb_v2'. Total base models: {len(base_models)}")
print(f"Models: {list(base_models.keys())}")


In [None]:
N_SPLITS = 5
kfold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# The same positional fold indices are applied to every feature set, so each
# one must have exactly one row per target value.
for name, (_, X_data) in base_models.items():
    if len(X_data) != len(y_train):
        raise ValueError(
            f"Feature set for '{name}' has {len(X_data)} rows but y_train has {len(y_train)}."
        )

# Out-of-fold (OOF) probabilities: one row per training sample, one column per
# base model. The array is filled positionally and wrapped in a DataFrame only
# once it is complete.
model_names = list(base_models.keys())
oof_preds = np.zeros((len(y_train), len(model_names)))

final_base_models = {}

print(f"Avvio Stacking (OOF) con {N_SPLITS} folds...")

# The splitter stratifies on y only; X is just a placeholder of the right length.
fold_splits = kfold.split(np.zeros(len(y_train)), y_train)

for fold, (train_idx, val_idx) in enumerate(tqdm(fold_splits, total=N_SPLITS, desc="Folds")):

    y_train_fold = y_train.iloc[train_idx]

    for col, name in enumerate(model_names):
        model, X_data = base_models[name]

        # Fit on the training part of the fold ...
        model.fit(X_data.iloc[train_idx], y_train_fold)

        # ... and store the positive-class probability for the held-out rows.
        oof_preds[val_idx, col] = model.predict_proba(X_data.iloc[val_idx])[:, 1]

# Meta-feature table consumed by the ensemble analysis below.
X_meta_df = pd.DataFrame(oof_preds, columns=model_names)

print("\nCreation meta features (X_meta_df) completed.")

# Refit every base model on the whole training set. The same estimator objects
# are refit in place; this relies on fit() returning the fitted estimator.
print("Addestramento modelli base finali su tutti i dati di training...")
for name, (model, X_data) in tqdm(base_models.items(), desc="Modelli Finali"):
    final_base_models[name] = model.fit(X_data, y_train)

print("Modelli base finali addestrati.")
display(X_meta_df.head())

Ensemble Analysis - Selecting the Optimal Models

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
import warnings

warnings.filterwarnings('ignore')

rule = "=" * 80
print(rule)
print("ANALISI ENSEMBLE v6 - SELEZIONE MODELLI")
print(rule)

# ----------------------------------------------------------------------------
# METHOD 1: INDIVIDUAL PERFORMANCE (OOF)
# ----------------------------------------------------------------------------
print("\n" + rule)
print("METODO 1: PERFORMANCE INDIVIDUALI DEI MODELLI BASE")
print(rule)

# Accuracy of each base model on its own out-of-fold probabilities, predicting
# the positive class when the probability is above 0.5
# (assumes y_train is encoded as 0/1 — TODO confirm).
individual_scores = {
    name: ((X_meta_df[name] > 0.5).astype(int) == y_train).mean()
    for name in X_meta_df.columns
}

# Same scores, ordered from the most to the least accurate model.
sorted_scores = dict(
    sorted(individual_scores.items(), key=lambda item: item[1], reverse=True)
)

print("\nINDIVIDUAL OOF ACCURACY:")
print("-" * 50)
for name, score in sorted_scores.items():
    print(f"{name:10s}: {score:.4f}")

# Horizontal bar chart of the ranking, zoomed in on the observed score range.
keys = list(sorted_scores)
values = [sorted_scores[key] for key in keys]

fig, ax = plt.subplots(figsize=(10, 5))
ax.barh(keys, values, color='steelblue')
ax.set_xlabel('OOF Accuracy')
ax.set_title('Individual performance base models (v6)')
ax.set_xlim(min(values) - 0.01, max(values) + 0.01)
for position, score in enumerate(values):
    # Write the exact value just past the end of each bar.
    ax.text(score, position, f' {score:.4f}', va='center')
plt.tight_layout()
plt.savefig("v6_individual_performance.png")
plt.show()

# ============================================================================
# METHOD 2: PREDICTION CORRELATION
# ============================================================================
print("\n" + "="*80)
print("METHOD 2: PREDICTION CORRELATION ANALYSIS")
print("="*80)

# Pairwise correlation between the OOF probabilities of the base models.
correlation_matrix = X_meta_df.corr()
print("\nCORRELATION MATRIX:")
print(correlation_matrix.round(3))


plt.figure(figsize=(10, 7))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".3f"
)
plt.title(f"Base Models Correlation (Stack {list(X_meta_df.columns)})")
plt.savefig("v6_correlation_heatmap.png")
plt.show()


print("\n⚠️  COPPIE AD ALTA CORRELAZIONE (>0.90 - Potenziale Ridondanza):")
print("-" * 50)
# Visit each unordered pair of models once (upper triangle of the matrix).
high_corr_pairs = []
for i, j in combinations(range(len(correlation_matrix.columns)), 2):
    corr_value = correlation_matrix.iloc[i, j]
    if corr_value > 0.90:
        pair = (correlation_matrix.columns[i], correlation_matrix.columns[j], corr_value)
        high_corr_pairs.append(pair)
        print(f"{pair[0]:10s} <-> {pair[1]:10s}: {pair[2]:.4f}")

if not high_corr_pairs:
    print("✓ Nessuna coppia con correlazione >0.90 (Buona diversità!)")

# Average correlation of each model with all the other models
# (its own diagonal entry is dropped first).
avg_corr = {
    col: correlation_matrix[col].drop(col).mean()
    for col in correlation_matrix.columns
}

# Computed once, outside the print loop; None when there are no models at all.
lowest_avg_corr = min(avg_corr.values(), default=None)

print("\n📊 MEDIA CORRELAZIONE CON ALTRI MODELLI:")
print("-" * 50)
for name, avg in sorted(avg_corr.items(), key=lambda x: x[1]):
    print(f"{name:10s}: {avg:.4f} {'⭐ (Più diverso)' if avg == lowest_avg_corr else ''}")

# ============================================================================
# METHOD 3: BACKWARD ELIMINATION
# ============================================================================
print("\n" + "="*80)
print("METODO 3: BACKWARD ELIMINATION")
print("="*80)


# Meta-learner used to score every subset of base-model columns. It is created
# unconditionally: guarding on `locals()` would silently reuse whatever stale
# object is left in the kernel and make the scores depend on execution history.
meta_model = LogisticRegression(random_state=SEED, max_iter=1000)

# `kfold` is the seeded StratifiedKFold created in the stacking cell; the same
# splitter is reused for every cross_val_score call in this analysis.


# Baseline: cross-validated accuracy of the meta-model on all base models.
baseline_score = cross_val_score(
    meta_model, X_meta_df, y_train, cv=kfold, scoring='accuracy', n_jobs=-1
).mean()

print(f"📌 BASELINE (Tutti i {len(X_meta_df.columns)} modelli): {baseline_score:.4f}")
print("-" * 50)

# Drop one base model at a time and measure the change against the baseline.
removal_impact = {}
for col_to_remove in X_meta_df.columns:
    X_reduced = X_meta_df.drop(columns=[col_to_remove])
    score = cross_val_score(
        meta_model, X_reduced, y_train, cv=kfold, scoring='accuracy', n_jobs=-1
    ).mean()
    impact = score - baseline_score
    removal_impact[col_to_remove] = {'score': score, 'impact': impact}
    emoji = "📉" if impact < 0 else "📈" if impact > 0 else "➡️"
    print(f"{emoji} Senza {col_to_remove:10s}: {score:.4f} (Δ = {impact:+.4f})")


# The removal candidate is the model whose absence yields the highest score.
least_damaging = max(removal_impact.items(), key=lambda x: x[1]['impact'])
print(f"\n💡 CANDIDATE: {least_damaging[0]}")
print(f"    Score WITHOUT it: {least_damaging[1]['score']:.4f}")
print(f"    Impact: {least_damaging[1]['impact']:+.4f}")

if least_damaging[1]['impact'] >= 0:
    print("    ✓ Rimuoverlo MIGLIORA o non peggiora il modello!")
else:
    print(f"    ⚠️  Rimuoverlo peggiora il modello di {abs(least_damaging[1]['impact']):.4f}")

# ============================================================================
# METHOD 4: FORWARD SELECTION
# ============================================================================
print("\n" + "="*80)
print("METHOD 4: FORWARD SELECTION")
print("="*80)

# Rank the base models by individual OOF accuracy, best first.
sorted_models = sorted(individual_scores.items(), key=lambda x: x[1], reverse=True)

# Start from the single best model; every other model is a candidate.
selected = [sorted_models[0][0]]
remaining = [m[0] for m in sorted_models[1:]]

print(f"📌 START WITH: {selected[0]} (Accuracy OOF: {sorted_models[0][1]:.4f})")
print("-" * 50)

# History of (model list, CV score) after each accepted step, used for the plot.
forward_history = []
current_score = cross_val_score(
    meta_model, X_meta_df[selected], y_train, cv=kfold, scoring='accuracy', n_jobs=-1
).mean()
forward_history.append({'models': selected.copy(), 'score': current_score})

print(f"CV Score con [{', '.join(selected)}]: {current_score:.4f}\n")

# Greedy loop: at each step add the candidate that raises the CV score the
# most, and stop as soon as no candidate gives a strict improvement.
while remaining:
    best_addition = None
    best_score_so_far = current_score

    for candidate in remaining:
        test_set = selected + [candidate]
        score = cross_val_score(
            meta_model, X_meta_df[test_set], y_train, cv=kfold, scoring='accuracy', n_jobs=-1
        ).mean()

        if score > best_score_so_far:
            best_score_so_far = score
            best_addition = candidate

    if best_addition is None:
        print("\n⛔ STOP: Nessun modello migliora ulteriormente il CV score")
        break

    selected.append(best_addition)
    remaining.remove(best_addition)
    improvement = best_score_so_far - current_score
    current_score = best_score_so_far
    forward_history.append({'models': selected.copy(), 'score': current_score})

    # Signed format, consistent with the delta printed by the backward elimination.
    print(f"➕ Added {best_addition:10s}: {current_score:.4f} (Δ = {improvement:+.4f})")

print(f"\n🏆 MIGLIOR COMBINAZIONE (Forward): {selected}")
print(f"    CV Score: {current_score:.4f}")

# Plot the CV score after each accepted step against the all-models baseline.
fig, ax = plt.subplots(figsize=(10, 5))
scores = [h['score'] for h in forward_history]
# Tick labels show at most the first two model names to keep them readable.
labels = [f"{i+1}: {', '.join(h['models'][:2])}..." if len(h['models']) > 2
          else f"{i+1}: {', '.join(h['models'])}"
          for i, h in enumerate(forward_history)]
ax.plot(range(1, len(scores)+1), scores, marker='o', linewidth=2, markersize=8)
ax.set_xlabel('Numero di Modelli nell\'Ensemble')
ax.set_ylabel('CV Accuracy')
ax.set_title('Forward Selection: Andamento CV Score')
ax.set_xticks(range(1, len(scores)+1))
ax.set_xticklabels(labels, rotation=45, ha='right')
ax.grid(alpha=0.3)
ax.axhline(baseline_score, color='red', linestyle='--', label=f'Baseline (tutti): {baseline_score:.4f}')
ax.legend()
plt.tight_layout()
plt.savefig("v6_forward_selection.png")
plt.show()

# ============================================================================
# METHOD 5: TRY ALL POSSIBLE COMBINATIONS
# ============================================================================
print("\n" + "="*80)
print("METODO 5: VALUTAZIONE TUTTE LE COMBINAZIONI")
print("="*80)
print(f"(Con {len(X_meta_df.columns)} modelli ci sono {2**len(X_meta_df.columns)-1} combinazioni possibili)\n")

all_models = list(X_meta_df.columns)
all_combinations = []

# Cross-validate the meta-model on every non-empty subset of base models,
# smallest subsets first.
for size in range(1, len(all_models) + 1):
    print(f"Test combinazioni da {size} modelli...")
    for combo in combinations(all_models, size):
        score = cross_val_score(
            meta_model, X_meta_df[list(combo)], y_train, cv=kfold, scoring='accuracy', n_jobs=-1
        ).mean()
        all_combinations.append({
            'models': list(combo),
            'n_models': len(combo),
            'score': score
        })

# Best score first. sorted() is stable, so among equal scores the subsets keep
# their generation order and smaller subsets are listed first.
all_combinations_sorted = sorted(all_combinations, key=lambda x: x['score'], reverse=True)

print("\n🏆 TOP 10 COMBINAZIONI:")
print("-" * 80)
for i, entry in enumerate(all_combinations_sorted[:10], 1):
    models_str = ', '.join(entry['models'])
    print(f"{i:2d}. [{entry['n_models']} modelli] {entry['score']:.4f} - {models_str}")

# One pass over the results: collect the scores of each ensemble size and track
# the best subset for that size.
print("\n📊 MIGLIOR COMBINAZIONE PER NUMERO DI MODELLI:")
print("-" * 80)
best_by_size = {}
scores_by_size = {}
for entry in all_combinations:
    size = entry['n_models']
    scores_by_size.setdefault(size, []).append(entry['score'])
    if size not in best_by_size or entry['score'] > best_by_size[size]['score']:
        best_by_size[size] = entry

sizes = sorted(best_by_size.keys())
for size in sizes:
    entry = best_by_size[size]
    models_str = ', '.join(entry['models'])
    print(f"{size} modelli: {entry['score']:.4f} - [{models_str}]")

# Scatter of every subset's score by ensemble size, with the per-size best
# highlighted and the all-models baseline as reference.
fig, ax = plt.subplots(figsize=(12, 6))
for size in sizes:
    scores_for_size = scores_by_size[size]
    ax.scatter([size] * len(scores_for_size), scores_for_size, alpha=0.3, s=50)

best_scores = [best_by_size[s]['score'] for s in sizes]
ax.plot(sizes, best_scores, 'ro-', linewidth=2, markersize=10, label='Best per size')
ax.axhline(baseline_score, color='green', linestyle='--', linewidth=2, label=f'Baseline (tutti): {baseline_score:.4f}')
ax.set_xlabel('Numero di Modelli nell\'Ensemble')
ax.set_ylabel('CV Accuracy')
ax.set_title('Tutte le Combinazioni: CV Score vs Numero di Modelli')
ax.set_xticks(sizes)
ax.legend()
ax.grid(alpha=0.3)
plt.tight_layout()
plt.savefig("v6_all_combinations.png")
plt.show()

# ============================================================================
# FINAL REPORT
# ============================================================================
print("\n" + "="*80)
print("📋 REPORT AND RECOMMENDATION (v6)")
print("="*80)

print(f"\n1️⃣  BASELINE (All the {len(X_meta_df.columns)} models):")
print(f"    CV Score: {baseline_score:.4f}")

print("\n2️⃣  BACKWARD ELIMINATION suggests:")
print(f"    Remove: {least_damaging[0]}")
print(f"    Score: {least_damaging[1]['score']:.4f}")

print("\n3️⃣  FORWARD SELECTION suggests:")
print(f"    Models: {selected}")
print(f"    Score: {current_score:.4f}")

print("\n4️⃣  BEST COMBINATION (Brute Force):")
best_overall = all_combinations_sorted[0]
print(f"    Models: {best_overall['models']}")
print(f"    Score: {best_overall['score']:.4f}")

# Compare the best subset with the all-models baseline. The brute-force search
# also evaluated the full model set, so the best score cannot be lower than the
# baseline: only "clear gain" and "negligible difference" can occur.
# NOTE(review): the best score is selected on the same CV folds it is reported
# on, so it is an optimistic estimate.
improvement = best_overall['score'] - baseline_score
if improvement > 0.0005:  # Meaningful gain over the baseline
    print("\n✅ RACCOMANDAZIONE: Usa la combinazione ottimale trovata")
    print(f"    Miglioramento: +{improvement:.4f}")
    print(f"    Modelli da usare: {best_overall['models']}")
else:  # Negligible difference
    print("\n➡️  RACCOMANDAZIONE: Baseline vs Ottimale sono equivalenti")
    print(f"    Differenza trascurabile: {improvement:.4f}")
    if len(best_overall['models']) < len(X_meta_df.columns):
        print(f"    Suggerisco: Usa {best_overall['models']} (più semplice)")
    else:
        print("    Suggerisco: Mantieni tutti (più robusto)")

print("\n" + "="*80)
print("💡 SUGGERIMENTI AGGIUNTIVI:")
print("="*80)
print("- Se hai modelli con correlazione >0.95, considera di rimuovere il più debole")
print("- XGBoost e CatBoost tendono ad essere correlati (entrambi gradient boosting)")
print("- Il meta-modello (LogReg) può dare pesi diversi ai modelli automaticamente")
print("- Se il dataset è piccolo, meno modelli = meno overfitting del meta-modello")
print("="*80)

# ============================================================================
# CREATE OPTIMAL SUBSET
# ============================================================================
# Keep only the OOF columns of the best-scoring subset; the saving cell writes
# this table and the matching fitted base models to disk.
print("\n🔧 Creazione X_meta_df ottimizzato...")
optimal_models = best_overall['models']
X_meta_df_optimal = X_meta_df[optimal_models].copy()
print(f"    X_meta_df_optimal creato con modelli: {optimal_models}")
print("    Usa 'X_meta_df_optimal' nella cella successiva per il meta-modello finale")
print("="*80)

Saving for the Ensemble Notebook

In [None]:
import joblib
import os

print("--- Saving results ---")

# Both output folders must exist before anything is written.
for out_dir in ('../models', '../data/processed'):
    os.makedirs(out_dir, exist_ok=True)

# --- Base models: persist only those selected for the optimal ensemble ---
print(f"Salvataggio dei {len(optimal_models)} modelli base ottimali in /models/...")

for name in optimal_models:
    if name not in final_base_models:
        # Report the gap and carry on with the remaining models.
        print(f"ATTENZIONE: Modello {name} non trovato in final_base_models.")
        continue
    SAVE_PATH = f'../models/{name}.joblib'
    joblib.dump(final_base_models[name], SAVE_PATH)
    print(f"Salvato: {SAVE_PATH}")

# --- Meta-features of the optimal subset and the target ---
print("\nSalvataggio delle meta-features e target per il meta-modello...")

META_OPTIMAL_PATH = '../data/processed/meta_features_optimal_train.csv'
# NOTE(review): same path the target was loaded from at the top of the
# notebook, so this write overwrites the input file.
TARGET_PATH = '../data/processed/train_target.csv'

X_meta_df_optimal.to_csv(META_OPTIMAL_PATH, index=False)
y_train.to_csv(TARGET_PATH, index=False)

print(f"Salvate meta-features ottimali: {META_OPTIMAL_PATH}")
print(f"Salvato target: {TARGET_PATH}")

print("\n--- Salvataggio completato! ---")
print("Ora sei pronto per creare il notebook 07_Ensemble_Submission.ipynb")