# 📊 Exploratory Data Analysis - Advanced

Questo notebook esegue un'analisi esplorativa **avanzata** con analisi multi-target e correlazioni sofisticate.

**Prerequisito**: Eseguire prima `eda_basic.ipynb` per l'analisi preliminare.

## 🎯 Obiettivi Avanzati:
1. **Multi-Target Analysis**: Analisi comparativa di più target (AI_Prezzo_Ridistribuito, AI_Prezzo_MQ)
2. **Correlazioni Complete**: Pearson, Spearman, Kendall per variabili numeriche
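3. **Associazioni Categoriche**: Cramér's V, Chi-quadrato, Mutual Information *(funzioni importate; il calcolo non è ancora incluso nelle celle di questo notebook)*
4. **Correlazioni Miste**: Correlation Ratio per numeriche-categoriche *(funzione importata; il calcolo non è ancora incluso nelle celle di questo notebook)*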
5. **Feature Importance**: Ranking comparativo delle feature per ciascun target
6. **Visualizzazioni Avanzate**: Heatmap, scatter plots multi-target

## 📚 Tecniche Utilizzate:
- **Pearson**: Correlazione lineare
- **Spearman**: Correlazione monotonica
- **Kendall**: Concordanza ordinale (calcolo attualmente disabilitato nella sezione 6 perché più lento)
- **Cramér's V**: Associazione variabili categoriche
- **Correlation Ratio (η)**: Relazione categorica-numerica
- **Mutual Information**: Dipendenza non-lineare

## 1️⃣ Setup e Import Avanzati

In [None]:
# Import librerie base
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path

# Import librerie avanzate
from scipy import stats as scipy_stats
from scipy.stats import chi2_contingency, f_oneway
from sklearn.feature_selection import mutual_info_regression

# Import del modulo utilities
from eda_utils import (
    setup_plotting_style,
    setup_output_dir,
    load_config_and_data,
    get_target_column,
    print_dataset_summary,
    cramers_v,
    correlation_ratio,
    save_plot
)

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides numerical warnings raised by the statistics
# routines used later in the notebook — confirm that is intended.
warnings.filterwarnings('ignore')

# Apply the plotting style provided by eda_utils.setup_plotting_style (defined
# outside this file), then set the seaborn colour palette to "husl". The order
# matters if the helper sets its own palette — TODO confirm.
setup_plotting_style()
sns.set_palette("husl")

print("✅ Setup avanzato completato! Librerie per analisi multi-target caricate.")

## 2️⃣ Caricamento Dati e Setup Multi-Target

In [None]:
# Create (or reuse) the directory that receives every CSV and image produced
# by this notebook.
output_dir = setup_output_dir('eda_comprehensive_outputs')

# Load the project configuration together with the raw dataset.
config, df = load_config_and_data(
    data_path='../data/raw/raw.parquet',
    config_path='../config/config.yaml',
)

# The primary target comes from the configuration; the secondary one is fixed
# in this notebook as an alternative target.
target_secondary = 'AI_Prezzo_MQ'
target_primary = get_target_column(config)
targets = [target_primary, target_secondary]

print(f"\n🎯 Target configurati:")
print(f"  • Primario: {target_primary}")
print(f"  • Secondario: {target_secondary}")

# Report every configured target that is absent from the dataset; later cells
# check for presence again before using a target.
for target in targets:
    if target in df.columns:
        continue
    print(f"\n⚠️  WARNING: Target '{target}' non trovato nel dataset!")

## 3️⃣ Analisi Multi-Target Comparativa

In [None]:
print("📊 ANALISI COMPARATIVA MULTI-TARGET")
print("="*60)

# Fixed column layout of the comparison table. Passing it to the DataFrame
# constructor keeps the schema (including 'Target') even when no target is
# available, so set_index() below cannot fail with a KeyError.
stat_columns = [
    'Target', 'count', 'mean', 'median', 'std', 'min', 'max',
    'skewness', 'kurtosis', 'missing_pct',
]

# One dict of descriptive statistics per target found in the dataset.
multi_target_stats = []

for target in targets:
    if target not in df.columns:
        continue

    # Statistics are computed on the non-missing values only; the share of
    # missing values is taken from the unfiltered column.
    target_data = df[target].dropna()

    stats_dict = {
        'Target': target,
        'count': len(target_data),
        'mean': target_data.mean(),
        'median': target_data.median(),
        'std': target_data.std(),
        'min': target_data.min(),
        'max': target_data.max(),
        'skewness': target_data.skew(),
        'kurtosis': target_data.kurtosis(),
        'missing_pct': (df[target].isnull().mean() * 100)
    }
    multi_target_stats.append(stats_dict)

    print(f"\n📈 {target}:")
    print(f"  Conteggio: {stats_dict['count']:,}")
    print(f"  Media: {stats_dict['mean']:,.2f}")
    print(f"  Mediana: {stats_dict['median']:,.2f}")
    print(f"  Std Dev: {stats_dict['std']:,.2f}")
    print(f"  Range: [{stats_dict['min']:,.2f}, {stats_dict['max']:,.2f}]")
    print(f"  Skewness: {stats_dict['skewness']:.3f}")
    print(f"  Kurtosis: {stats_dict['kurtosis']:.3f}")

# Build the comparison table, indexed by target name.
comparison_df = pd.DataFrame(multi_target_stats, columns=stat_columns).set_index('Target')

if comparison_df.empty:
    # Nothing to compare: skip the CSV instead of crashing or writing a
    # header-only file.
    print("\n⚠️  Nessun target disponibile: confronto non salvato")
else:
    comparison_df.to_csv(output_dir / 'multi_target_comparison.csv')
    print(f"\n💾 Confronto salvato in {output_dir}/multi_target_comparison.csv")

# Last expression of the cell: rendered by the notebook as a table.
comparison_df

## 4️⃣ Visualizzazioni Comparative

In [None]:
# Histogram + box plot for every target that actually exists in the dataset.
# The grid has one row per available target, so a missing target no longer
# leaves an empty row and more than two targets no longer overflow a fixed
# 2x2 grid.
available_targets = [t for t in targets if t in df.columns]

if not available_targets:
    print("\n⚠️  Nessun target disponibile: distribuzioni non generate")
else:
    n_target_rows = len(available_targets)
    # squeeze=False keeps `axes` two-dimensional even with a single row.
    # With two targets the figure size is (14, 10), as before.
    fig, axes = plt.subplots(
        n_target_rows, 2, figsize=(14, 5 * n_target_rows), squeeze=False
    )
    fig.suptitle('Distribuzione Multi-Target', fontsize=16, fontweight='bold')

    for row, target in enumerate(available_targets):
        target_data = df[target].dropna()
        hist_ax, box_ax = axes[row]
        # Colour follows the target's position in `targets`, so a target keeps
        # its colour even when an earlier one is missing.
        colour = f'C{targets.index(target)}'

        # Histogram
        hist_ax.hist(target_data, bins=50, edgecolor='black', alpha=0.7, color=colour)
        hist_ax.set_xlabel(target)
        hist_ax.set_ylabel('Frequenza')
        hist_ax.set_title(f'Distribuzione {target}')
        hist_ax.grid(True, alpha=0.3)

        # Box plot
        box_ax.boxplot(target_data, vert=True)
        box_ax.set_ylabel(target)
        box_ax.set_title(f'Box Plot {target}')
        box_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    save_plot('target_distributions_comparison', output_dir, dpi=100)
    print("✅ Distribuzione multi-target salvata")

In [None]:
# Scatter plot of the first target against the second, annotated with their
# Pearson correlation. Skipped when any configured target is missing.
if not all(t in df.columns for t in targets):
    print("\n⚠️  Non è possibile creare scatter plot: target mancanti")
else:
    plt.figure(figsize=(10, 8))

    # Keep only the rows where both targets are present.
    valid_data = df[targets].dropna()
    x_name, y_name = targets[0], targets[1]
    x_values = valid_data[x_name]
    y_values = valid_data[y_name]

    plt.scatter(
        x_values,
        y_values,
        alpha=0.5,
        s=20,
        edgecolors='k',
        linewidth=0.5,
    )
    plt.xlabel(x_name)
    plt.ylabel(y_name)
    plt.title(f'Relazione tra {x_name} e {y_name}')
    plt.grid(True, alpha=0.3)

    # Pearson correlation between the two targets, shown in a text box placed
    # in axes coordinates (top-left corner).
    corr = x_values.corr(y_values)
    annotation_box = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
    plt.text(
        0.05,
        0.95,
        f'Correlazione Pearson: {corr:.4f}',
        transform=plt.gca().transAxes,
        bbox=annotation_box,
        verticalalignment='top',
    )

    save_plot('targets_scatter_plot', output_dir, dpi=100)
    print(f"\n✅ Scatter plot salvato (correlazione: {corr:.4f})")

## 5️⃣ Preparazione Dati per Correlazioni Avanzate

In [None]:
print("🔧 PREPARAZIONE DATI PER CORRELAZIONI AVANZATE")
print("="*60)

# Thresholds used by this cell.
max_categories = 50   # categorical columns with more distinct values are dropped
max_features = 100    # upper bound on the number of numeric features kept

# Split the columns by dtype.
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# The targets are not features: take them out of the numeric list.
numeric_cols = [name for name in numeric_cols if name not in targets]

# Numeric columns with at most one distinct value carry no information.
distinct_counts = {name: df[name].nunique() for name in numeric_cols}
constant_cols = [name for name, n_distinct in distinct_counts.items() if n_distinct <= 1]
numeric_cols = [name for name in numeric_cols if name not in constant_cols]

# Keep only categorical columns with a manageable number of categories.
categorical_cols = [
    name for name in categorical_cols if df[name].nunique() <= max_categories
]

print(f"\n📊 Feature identificate:")
print(f"  • Numeriche: {len(numeric_cols)}")
print(f"  • Categoriche (≤{max_categories} categorie): {len(categorical_cols)}")
print(f"  • Costanti (rimosse): {len(constant_cols)}")

# Cap the number of numeric features for performance, keeping the ones with
# the largest variance.
if len(numeric_cols) > max_features:
    print(f"\n⚠️  Troppe feature numeriche. Selezionando top {max_features} per varianza...")
    variances = df[numeric_cols].var().sort_values(ascending=False)
    numeric_cols = variances.head(max_features).index.tolist()
    print(f"✅ Selezionate {len(numeric_cols)} feature con maggiore varianza")

## 6️⃣ Correlazioni Avanzate per Ciascun Target

In [None]:
print("📊 CALCOLO CORRELAZIONI MULTIPLE")
print("="*60)

# Only the first 50 numeric features are analysed, to bound runtime.
max_analysis_features = 50
# Minimum number of complete rows required to analyse a target.
min_rows = 10
# Fixed result layout: keeps the columns (and so the sort key) available even
# when no feature could be evaluated.
result_columns = ['Feature', 'Pearson', 'Spearman', 'Abs_Pearson', 'Abs_Spearman']

# For each available target, compute Pearson and Spearman correlations against
# the analysed features and save them as a CSV sorted by |Pearson|.
for target in targets:
    if target not in df.columns:
        continue

    print(f"\n🎯 Target: {target}")
    print("-" * 60)

    # Drop incomplete rows over the analysed columns only. Dropping over every
    # numeric column would discard rows because of gaps in features that are
    # never evaluated below.
    analysis_cols = numeric_cols[:max_analysis_features]
    corr_data = df[analysis_cols + [target]].dropna()

    if len(corr_data) < min_rows:
        print(f"❌ Dati insufficienti per {target}")
        continue

    results = []

    for col in analysis_cols:
        try:
            pearson, _ = scipy_stats.pearsonr(corr_data[col], corr_data[target])
            spearman, _ = scipy_stats.spearmanr(corr_data[col], corr_data[target])
            # Kendall's tau is not computed: the call was disabled in the
            # original notebook with the remark that it is slower.
        except (ValueError, TypeError) as exc:
            # Report the feature that could not be evaluated instead of
            # silently dropping it; any other exception is a real bug and
            # is allowed to surface.
            print(f"  ⚠️  {col}: correlazione non calcolabile ({exc})")
            continue

        results.append({
            'Feature': col,
            'Pearson': pearson,
            'Spearman': spearman,
            'Abs_Pearson': abs(pearson),
            'Abs_Spearman': abs(spearman)
        })

    correlations_df = pd.DataFrame(results, columns=result_columns)
    if correlations_df.empty:
        print(f"⚠️  Nessuna correlazione calcolata per {target}")
        continue

    correlations_df = correlations_df.sort_values('Abs_Pearson', ascending=False)

    output_file = output_dir / f'advanced_correlations_{target}.csv'
    correlations_df.to_csv(output_file, index=False)

    print(f"📈 Top 10 correlazioni (Pearson):")
    print(correlations_df[['Feature', 'Pearson', 'Spearman']].head(10).to_string())
    print(f"\n💾 Salvato in {output_file.name}")

## 7️⃣ Matrice di Correlazioni Complete

In [None]:
print("🔲 CALCOLO MATRICI DI CORRELAZIONE COMPLETE")
print("="*60)

# Number of features kept for the matrices (bounded for performance).
top_n_features = 30

if target_primary not in df.columns:
    print("\n⚠️  Target primario non trovato, skip matrici complete")
else:
    # Rank the numeric features by |Pearson| against the primary target,
    # using only rows complete over all numeric columns and the target.
    subset_data = df[numeric_cols + [target_primary]].dropna()
    correlations_with_target = subset_data.corr()[target_primary].drop(target_primary)
    top_features = (
        correlations_with_target.abs().nlargest(top_n_features).index.tolist()
    )

    # Append every available target that is not already in the list.
    for target in targets:
        if target not in df.columns or target in top_features:
            continue
        top_features.append(target)

    subset = df[top_features].dropna()

    print(f"\n📊 Calcolando matrici per top {len(top_features)} feature...")

    # (method, output file name, confirmation message), processed in order.
    matrix_specs = (
        ('pearson', 'correlation_matrix_pearson.csv', "✅ Matrice Pearson salvata"),
        ('spearman', 'correlation_matrix_spearman.csv', "✅ Matrice Spearman salvata"),
    )
    corr_matrices = {}
    for method, file_name, done_message in matrix_specs:
        corr_matrices[method] = subset.corr(method=method)
        corr_matrices[method].to_csv(output_dir / file_name)
        print(done_message)

    # Keep the matrices under their own names; the heatmap cell looks for
    # `corr_pearson`.
    corr_pearson = corr_matrices['pearson']
    corr_spearman = corr_matrices['spearman']

    matrix_rows, matrix_cols = corr_pearson.shape[0], corr_pearson.shape[1]
    print(f"\n📊 Dimensione matrici: {matrix_rows} × {matrix_cols}")

## 8️⃣ Visualizzazioni Avanzate

In [None]:
# Correlation heatmap over a reduced feature set. It runs only if the
# correlation-matrix cell has defined `corr_pearson`.
if 'corr_pearson' not in locals():
    print("\n⚠️  Dati correlazione non disponibili per heatmap")
else:
    print("📊 Creazione heatmap correlazioni...")

    # At most the first 20 entries of top_features, to keep the plot readable;
    # slicing already copes with shorter lists.
    top_20 = top_features[:20]
    subset_small = df[top_20].dropna()
    corr_small = subset_small.corr(method='pearson')

    heatmap_options = dict(
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=1,
        cbar_kws={"shrink": 0.8},
    )

    plt.figure(figsize=(14, 12))
    sns.heatmap(corr_small, **heatmap_options)
    plt.title(
        f'Heatmap Correlazioni - Top {len(top_20)} Feature',
        fontsize=14,
        fontweight='bold',
    )
    plt.tight_layout()

    # Saved at a reduced DPI to keep the file size reasonable.
    save_plot('correlation_heatmap_complete', output_dir, dpi=100)
    print("✅ Heatmap salvata (ottimizzata)")

In [None]:
# Side-by-side bar chart of Pearson vs Spearman correlation with the primary
# target, for the first entries of `top_features` (defined by the
# correlation-matrix cell).
if target_primary in df.columns:
    print("📊 Confronto metodi di correlazione...")

    top_15 = top_features[:15]
    comparison_data = []

    for feat in top_15:
        # `top_features` also contains the targets (appended at the end);
        # none of them is a feature, so skip them all, not only the primary.
        if feat in targets:
            continue

        # Pairwise deletion: use every row where this feature and the target
        # are both present.
        valid = df[[feat, target_primary]].dropna()
        if len(valid) < 2:
            print(f"  ⚠️  {feat}: dati insufficienti, feature saltata")
            continue

        try:
            pearson, _ = scipy_stats.pearsonr(valid[feat], valid[target_primary])
            spearman, _ = scipy_stats.spearmanr(valid[feat], valid[target_primary])
        except (ValueError, TypeError) as exc:
            # Report the failure instead of hiding it; other exception types
            # are real bugs and are allowed to surface.
            print(f"  ⚠️  {feat}: correlazione non calcolabile ({exc})")
            continue

        comparison_data.append({
            # str() so that non-string column labels can be truncated too.
            'Feature': str(feat)[:30],
            'Pearson': pearson,
            'Spearman': spearman
        })

    if comparison_data:
        comp_df = pd.DataFrame(comparison_data)

        # Grouped bars: Pearson on the left, Spearman on the right of each tick.
        fig, ax = plt.subplots(figsize=(12, 8))
        x = np.arange(len(comp_df))
        width = 0.35

        ax.bar(x - width/2, comp_df['Pearson'], width, label='Pearson', alpha=0.8)
        ax.bar(x + width/2, comp_df['Spearman'], width, label='Spearman', alpha=0.8)

        ax.set_xlabel('Feature')
        ax.set_ylabel('Correlazione')
        ax.set_title(f'Confronto Metodi Correlazione - Top Features vs {target_primary}')
        ax.set_xticks(x)
        ax.set_xticklabels(comp_df['Feature'], rotation=45, ha='right')
        ax.legend()
        ax.grid(True, alpha=0.3)
        ax.axhline(y=0, color='k', linestyle='-', linewidth=0.5)

        plt.tight_layout()
        save_plot('correlation_methods_comparison', output_dir, dpi=100)
        print("✅ Confronto metodi salvato")
    else:
        print("⚠️  Nessuna correlazione calcolabile: confronto non generato")

## 9️⃣ Feature Importance Comparativa

In [None]:
print("🏆 FEATURE IMPORTANCE COMPARATIVA TRA TARGET")
print("="*60)

# Ranks the first 50 numeric features by |Pearson correlation| against each
# target and plots the top 10 per target side by side.
if len(targets) >= 2 and all(t in df.columns for t in targets):
    # target name -> Series of the 10 largest |Pearson| values.
    importance_data = {}

    for target in targets:
        valid_data = df[numeric_cols[:50] + [target]].dropna()
        if len(valid_data) < 10:
            # Say why the target is missing from the comparison instead of
            # dropping it silently.
            print(f"\n⚠️  Dati insufficienti per {target}: escluso dal confronto")
            continue

        # NaN correlations (e.g. a column that is constant on the retained
        # rows) cannot be ranked, so they are removed before sorting.
        corrs = (
            valid_data.corr()[target]
            .drop(target)
            .abs()
            .dropna()
            .sort_values(ascending=False)
        )
        if corrs.empty:
            print(f"\n⚠️  Nessuna correlazione calcolabile per {target}: escluso dal confronto")
            continue

        importance_data[target] = corrs.head(10)

        print(f"\n🎯 Top 10 feature per {target}:")
        for i, (feat, val) in enumerate(corrs.head(10).items(), 1):
            # str() so that non-string column labels can be truncated too.
            print(f"  {i:2d}. {str(feat)[:40]:<40} | {val:.4f}")

    # Comparative plot: one panel per target. With two targets the figure is
    # 16x6, as before; more targets simply add panels.
    if len(importance_data) >= 2:
        n_panels = len(importance_data)
        fig, axes = plt.subplots(1, n_panels, figsize=(8 * n_panels, 6))
        fig.suptitle('Feature Importance - Confronto Multi-Target',
                     fontsize=14, fontweight='bold')

        for idx, (target, importance) in enumerate(importance_data.items()):
            importance.plot(kind='barh', ax=axes[idx], color=f'C{idx}')
            axes[idx].set_title(f'Top 10 Feature - {target}')
            axes[idx].set_xlabel('|Correlazione Pearson|')
            # Largest value at the top of the horizontal bar chart.
            axes[idx].invert_yaxis()
            axes[idx].grid(True, alpha=0.3, axis='x')

        plt.tight_layout()
        save_plot('feature_importance_comparison', output_dir, dpi=100)
        print("\n✅ Confronto feature importance salvato")
    else:
        print("\n⚠️  Confronto non generato: servono almeno due target con dati sufficienti")
else:
    print("\n⚠️  Non è possibile creare confronto: target insufficienti")

## 🔟 Riepilogo Finale

In [None]:
# Closing report: dataset size, analysed targets, feature counts and the list
# of files found in the output directory.
separator = "=" * 60
print("\n" + separator)
print("🎉 ANALISI ESPLORATIVA AVANZATA COMPLETATA")
print(separator)

row_count, column_count = df.shape[0], df.shape[1]
print(f"\n📊 Analisi eseguita:")
print(f"  • Dataset: {row_count:,} righe × {column_count} colonne")
print(f"  • Target analizzati: {len(targets)}")
for target in targets:
    if target not in df.columns:
        continue
    print(f"    - {target}")
print(f"  • Feature numeriche: {len(numeric_cols)}")
print(f"  • Feature categoriche: {len(categorical_cols)}")

# Inventory of the output directory, split into CSV files and images.
print(f"\n💾 File generati in {output_dir}/:")
output_files = sorted(output_dir.glob('*'))
csv_files = [entry for entry in output_files if entry.suffix == '.csv']
img_files = [entry for entry in output_files if entry.suffix in ('.png', '.jpg')]

file_groups = (
    (f"\n  📄 CSV Files ({len(csv_files)}):", csv_files),
    (f"\n  🖼️  Immagini ({len(img_files)}):", img_files),
)
for heading, entries in file_groups:
    print(heading)
    for entry in entries:
        # File size in kilobytes.
        size_kb = entry.stat().st_size / 1024
        print(f"    • {entry.name} ({size_kb:.1f} KB)")

closing_lines = (
    f"\n✅ Analisi avanzata completata con successo!",
    f"\n📝 Note:",
    f"  • Tutte le immagini sono ottimizzate (DPI=100)",
    f"  • Le correlazioni sono calcolate su dati senza valori mancanti",
    f"  • Feature importance basata su correlazione Pearson",
    f"\n💡 Prossimi passi: Preprocessing e Training del modello",
)
for line in closing_lines:
    print(line)