# 📊 EDA - Stimatrix Project Analysis

**Obiettivo**: Analisi esplorativa del dataset allineata alle scelte del progetto:
- ✅ Analisi dataset raw completo
- ✅ Effetto dei filtri configurati (anno>=2022, zone escluse, no ville)
- ✅ Confronto pre/post filtri
- ✅ Analisi target e feature chiave
- ✅ Validazione scelte preprocessing

**Output**: `eda_project_outputs/`

## 🔧 Setup

In [None]:
# Imports
import sys
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path.cwd().parent / "src"))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings

# Project imports
from utils.config import load_config
from preprocessing.pipeline import apply_data_filters
from utils.logger import get_logger

# Suppress every warning for the rest of the session to keep cell output readable.
# NOTE(review): this also hides deprecation warnings from pandas/matplotlib.
warnings.filterwarnings('ignore')
logger = get_logger(__name__)

# Plot settings
# NOTE(review): 'seaborn-v0_8-*' style names presumably need a recent Matplotlib — confirm the pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# IPython magic (not plain Python): render figures inline in the notebook output.
%matplotlib inline

print("‚úÖ Setup completato")

In [None]:
# Configuration: input locations and output folder.
# All paths are relative, so they resolve against the notebook's working directory.
CONFIG_PATH = "../config/config.yaml"
RAW_DATA_PATH = "../data/raw/raw.parquet"
OUTPUT_DIR = Path("eda_project_outputs")
# exist_ok=True makes re-running this cell harmless when the folder is already there.
OUTPUT_DIR.mkdir(exist_ok=True)

# Helper used by every plotting cell to persist the current figure
def save_plot(name, dpi=100):
    """Save the current pyplot figure as ``<name>.png`` inside OUTPUT_DIR.

    The layout is tightened first so labels are not clipped, then the figure
    is written with a tight bounding box and a confirmation line is printed.

    Args:
        name: File name without extension.
        dpi: Resolution passed to ``plt.savefig``.
    """
    destination = OUTPUT_DIR / f"{name}.png"
    plt.tight_layout()
    plt.savefig(destination, dpi=dpi, bbox_inches='tight')
    print(f"üíæ Salvato: {name}.png")


print(f"üìÇ Output directory: {OUTPUT_DIR}")

## 📦 1. Load Data & Config

In [None]:
# Read the project configuration
config = load_config(CONFIG_PATH)
print("‚úÖ Config caricato")

# List the configured data filters, skipping unset values and purely
# descriptive entries
filters = config.get('data_filters', {})
descriptive_keys = ('description', 'experiment_name')
print("\nüéØ FILTRI CONFIGURATI:")
for filter_name, filter_value in filters.items():
    if filter_value is None or filter_name in descriptive_keys:
        continue
    print(f"  - {filter_name}: {filter_value}")

In [None]:
# Read the raw parquet file and report its shape and in-memory footprint
df_raw = pd.read_parquet(RAW_DATA_PATH)
n_rows, n_cols = df_raw.shape
memory_mb = df_raw.memory_usage(deep=True).sum() / 1024**2
print(f"‚úÖ Dataset raw caricato: {n_rows:,} righe √ó {n_cols} colonne")
print(f"   Memoria: {memory_mb:.1f} MB")

## 📊 2. Dataset Raw - Overview

In [None]:
# Basic information about the raw dataset: size, covered period, dtype mix
print("=" * 80)
print("DATASET RAW - OVERVIEW")
print("=" * 80)
print(f"\nDimensioni: {len(df_raw):,} righe √ó {len(df_raw.columns)} colonne")
# The temporal cells of this notebook treat 'A_AnnoStipula' as optional;
# apply the same guard here so the overview does not fail with a KeyError
# when the column is missing.
if 'A_AnnoStipula' in df_raw.columns:
    print(f"Periodo: {df_raw['A_AnnoStipula'].min()} - {df_raw['A_AnnoStipula'].max()}")
else:
    print("Periodo: n/d (colonna 'A_AnnoStipula' assente)")
print(f"\nTipi di dato:")
print(df_raw.dtypes.value_counts())

In [None]:
# Missing values overview: absolute count and percentage per column,
# most incomplete columns first
null_counts = df_raw.isna().sum()
missing_df = (
    null_counts.to_frame('Missing_Count')
    .assign(Missing_Pct=100 * null_counts / len(df_raw))
    .sort_values('Missing_Pct', ascending=False)
)

print(f"\nüìä MISSING VALUES (Top 20):")
print(missing_df.head(20))

# Persist the full table (not just the top 20)
missing_csv = OUTPUT_DIR / "01_missing_values_raw.csv"
missing_df.to_csv(missing_csv)
print(f"\nüíæ Salvato: 01_missing_values_raw.csv")

## 💰 3. Target Analysis (Raw)

In [None]:
# Target column: AI_Prezzo_Ridistribuito (rows without a target are dropped
# before computing any statistic)
target_col = 'AI_Prezzo_Ridistribuito'
target = df_raw[target_col].dropna()

# Compute every statistic once; the same values feed the printout and the CSV
target_summary = {
    'count': len(target),
    'mean': target.mean(),
    'median': target.median(),
    'std': target.std(),
    'min': target.min(),
    'max': target.max(),
    'skewness': target.skew(),
    'kurtosis': target.kurtosis(),
}
coverage_pct = len(target) / len(df_raw) * 100
cv_pct = target_summary['std'] / target_summary['mean'] * 100

print("=" * 80)
print(f"TARGET: {target_col}")
print("=" * 80)
print(f"\nNon-null: {target_summary['count']:,} ({coverage_pct:.1f}%)")
print(f"\nStatistiche:")
print(f"  Mean:       ‚Ç¨{target_summary['mean']:,.0f}")
print(f"  Median:     ‚Ç¨{target_summary['median']:,.0f}")
print(f"  Std:        ‚Ç¨{target_summary['std']:,.0f} ({cv_pct:.0f}% CV)")
print(f"  Min:        ‚Ç¨{target_summary['min']:,.0f}")
print(f"  Max:        ‚Ç¨{target_summary['max']:,.0f}")
print(f"\nDistribuzione:")
print(f"  Skewness:   {target_summary['skewness']:.2f}")
print(f"  Kurtosis:   {target_summary['kurtosis']:.2f}")
print(f"\nQuartili:")
quantile_levels = (0.25, 0.50, 0.75, 0.90, 0.95, 0.99)
for level in quantile_levels:
    print(f"  Q{int(level*100):02d}:  ‚Ç¨{target.quantile(level):>12,.0f}")

# Persist the statistics as a two-column table
stats_df = pd.DataFrame({
    'Statistic': list(target_summary),
    'Value': list(target_summary.values()),
})
stats_df.to_csv(OUTPUT_DIR / "02_target_statistics_raw.csv", index=False)
print(f"\nüíæ Salvato: 02_target_statistics_raw.csv")

In [None]:
# Four views of the raw target: histogram, log10 histogram, boxplot, normal Q-Q
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax_hist, ax_log), (ax_box, ax_qq) = axes
avg_price = target.mean()
mid_price = target.median()

# Histogram with mean/median reference lines
ax_hist.hist(target, bins=50, edgecolor='black', alpha=0.7)
ax_hist.set_xlabel('Prezzo (‚Ç¨)')
ax_hist.set_ylabel('Frequenza')
ax_hist.set_title('Distribuzione Target (Raw)')
ax_hist.axvline(avg_price, color='r', linestyle='--', label=f'Mean: ‚Ç¨{avg_price:,.0f}')
ax_hist.axvline(mid_price, color='g', linestyle='--', label=f'Median: ‚Ç¨{mid_price:,.0f}')
ax_hist.legend()

# Histogram of log10(price + 1); the +1 keeps zero prices finite
log_price = np.log10(target + 1)
ax_log.hist(log_price, bins=50, edgecolor='black', alpha=0.7, color='orange')
ax_log.set_xlabel('log10(Prezzo + 1)')
ax_log.set_ylabel('Frequenza')
ax_log.set_title('Distribuzione Target (Log Scale)')

# Boxplot
ax_box.boxplot(target, vert=True)
ax_box.set_ylabel('Prezzo (‚Ç¨)')
ax_box.set_title('Boxplot Target')
ax_box.grid(True, alpha=0.3)

# Q-Q plot against the normal distribution
stats.probplot(target, dist="norm", plot=ax_qq)
ax_qq.set_title('Q-Q Plot (vs Normal)')
ax_qq.grid(True, alpha=0.3)

save_plot("03_target_distribution_raw")
plt.show()

## 📅 4. Distribuzione Temporale

In [None]:
# Transactions per deed year, with a text bar chart (one block per 2%);
# skipped entirely when the year column is absent
if 'A_AnnoStipula' in df_raw.columns:
    n_total = len(df_raw)

    print("=" * 80)
    print("DISTRIBUZIONE TEMPORALE")
    print("=" * 80)

    year_counts = df_raw['A_AnnoStipula'].value_counts().sort_index()
    print("\nTransazioni per anno:")
    for anno, n_deeds in year_counts.items():
        share = n_deeds / n_total * 100
        print(f"  {anno}: {n_deeds:>6,} ({share:>5.1f}%) {'‚ñà' * int(share / 2)}")

    # Persist the yearly counts
    year_counts.to_csv(OUTPUT_DIR / "04_temporal_distribution.csv")
    print(f"\nüíæ Salvato: 04_temporal_distribution.csv")

In [None]:
# Temporal plots: transactions per year (left) and yearly price trend (right).
# `year_counts` only exists when the year column is present in the raw data,
# so the whole figure is skipped in that case instead of raising a NameError.
if 'A_AnnoStipula' in df_raw.columns:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Bar chart of transactions per year
    year_counts.plot(kind='bar', ax=axes[0], color='steelblue', edgecolor='black')
    axes[0].set_xlabel('Anno Stipula')
    axes[0].set_ylabel('Numero Transazioni')
    axes[0].set_title('Distribuzione Temporale')
    axes[0].tick_params(axis='x', rotation=45)
    axes[0].grid(True, alpha=0.3)

    # Mean and median price per year
    if target_col in df_raw.columns:
        yearly_price = df_raw.groupby('A_AnnoStipula')[target_col].agg(['mean', 'median'])
        axes[1].plot(yearly_price.index, yearly_price['mean'], marker='o', label='Mean', linewidth=2)
        axes[1].plot(yearly_price.index, yearly_price['median'], marker='s', label='Median', linewidth=2)
        axes[1].set_xlabel('Anno Stipula')
        axes[1].set_ylabel('Prezzo (‚Ç¨)')
        axes[1].set_title('Trend Prezzi per Anno')
        axes[1].legend()
        axes[1].grid(True, alpha=0.3)
    else:
        # Nothing to draw: hide the empty panel rather than saving blank axes
        axes[1].axis('off')

    save_plot("05_temporal_analysis")
    plt.show()
else:
    print("Colonna 'A_AnnoStipula' assente: grafico temporale saltato")

## 🗺️ 5. Distribuzione Zone OMI

In [None]:
# Zone analysis: transactions per OMI zone, flagging the zones that the
# configured filters will exclude
if 'AI_ZonaOmi' in df_raw.columns:
    # `.get(key, [])` still returns None when the key is present but set to
    # null in the config, so fall back to an empty list explicitly. Looked up
    # once here instead of on every loop iteration.
    zone_escluse = filters.get('zone_escluse') or []
    n_total = len(df_raw)

    print("=" * 80)
    print("ZONE OMI")
    print("=" * 80)

    zone_counts = df_raw['AI_ZonaOmi'].value_counts()
    print(f"\nTotale zone uniche: {df_raw['AI_ZonaOmi'].nunique()}")
    print("\nDistribuzione:")
    for zone, count in zone_counts.items():
        pct = count / n_total * 100
        marker = "‚ùå" if zone in zone_escluse else "‚úÖ"
        print(f"  {marker} {zone}: {count:>6,} ({pct:>5.1f}%)")

    # Recap of the excluded zones that actually occur in the data
    if zone_escluse:
        print(f"\n‚ö†Ô∏è  Zone DA ESCLUDERE nella config: {zone_escluse}")
        for zone in zone_escluse:
            if zone in zone_counts.index:
                count = zone_counts[zone]
                pct = count / n_total * 100
                print(f"     {zone}: {count:,} transazioni ({pct:.1f}%)")

In [None]:
# Price statistics per OMI zone, largest zones first
if 'AI_ZonaOmi' in df_raw.columns and target_col in df_raw.columns:
    summary_funcs = ['count', 'mean', 'median', 'std', 'min', 'max']
    price_by_zone = df_raw.groupby('AI_ZonaOmi')[target_col]

    # cv = coefficient of variation in percent, derived from the rounded
    # std/mean columns
    zone_stats = (
        price_by_zone.agg(summary_funcs)
        .round(0)
        .assign(cv=lambda t: (t['std'] / t['mean'] * 100).round(2))
        .sort_values('count', ascending=False)
    )

    print("\nüìä Statistiche Prezzo per Zona:")
    print(zone_stats)

    # Persist the table
    zone_stats.to_csv(OUTPUT_DIR / "06_zone_statistics_raw.csv")
    print(f"\nüíæ Salvato: 06_zone_statistics_raw.csv")

In [None]:
# Zone plots: transactions per zone (top) and mean price per zone (bottom).
# `zone_counts` and `zone_stats` are only created when the zone and target
# columns exist, so the figure is guarded by the same conditions.
if 'AI_ZonaOmi' in df_raw.columns and target_col in df_raw.columns:
    fig, axes = plt.subplots(2, 1, figsize=(14, 10))

    # Transaction count per zone
    zone_counts.plot(kind='barh', ax=axes[0], color='steelblue', edgecolor='black')
    axes[0].set_xlabel('Numero Transazioni')
    axes[0].set_ylabel('Zona OMI')
    axes[0].set_title('Distribuzione Transazioni per Zona')
    axes[0].grid(True, alpha=0.3, axis='x')

    # Highlight the zones that the config excludes. The bars are taken from
    # `Axes.patches` (one rectangle per zone, in index order) rather than
    # from `get_children()`, whose ordering is an implementation detail.
    # `or []` covers a key that is present but null in the config.
    zone_escluse = filters.get('zone_escluse') or []
    for zone, bar in zip(zone_counts.index, axes[0].patches):
        if zone in zone_escluse:
            bar.set_color('red')
            bar.set_alpha(0.5)

    # Mean price per zone
    zone_stats['mean'].plot(kind='barh', ax=axes[1], color='orange', edgecolor='black')
    axes[1].set_xlabel('Prezzo Medio (‚Ç¨)')
    axes[1].set_ylabel('Zona OMI')
    axes[1].set_title('Prezzo Medio per Zona')
    axes[1].grid(True, alpha=0.3, axis='x')

    save_plot("07_zone_analysis")
    plt.show()
else:
    print("Colonne zona/target assenti: grafico zone saltato")

## 🏠 6. Distribuzione Tipologie

In [None]:
# Building-typology analysis: transactions per typology, flagging the ones
# that the configured filters will exclude
if 'AI_IdTipologiaEdilizia' in df_raw.columns:
    # `or []` covers a key that is present but null in the config.
    tipo_escluse = filters.get('tipologie_escluse') or []
    # Compare as strings on BOTH sides: the config may list the ids as
    # numbers or as strings, and the data column may use either type too.
    tipo_escluse_str = {str(t) for t in tipo_escluse}
    n_total = len(df_raw)

    print("=" * 80)
    print("TIPOLOGIE EDILIZIE")
    print("=" * 80)

    tipo_counts = df_raw['AI_IdTipologiaEdilizia'].value_counts().sort_index()
    print("\nDistribuzione:")
    for tipo, count in tipo_counts.items():
        pct = count / n_total * 100
        marker = "‚ùå" if str(tipo) in tipo_escluse_str else "‚úÖ"
        print(f"  {marker} Tipologia {tipo}: {count:>6,} ({pct:>5.1f}%)")

    # Recap of the excluded typologies that actually occur in the data
    if tipo_escluse:
        print(f"\n‚ö†Ô∏è  Tipologie DA ESCLUDERE nella config: {tipo_escluse}")
        # Convert the column once instead of once per excluded typology
        tipo_as_str = df_raw['AI_IdTipologiaEdilizia'].astype(str)
        for tipo in tipo_escluse:
            count = (tipo_as_str == str(tipo)).sum()
            if count > 0:
                pct = count / n_total * 100
                print(f"     Tipologia {tipo}: {count:,} transazioni ({pct:.1f}%)")

## 🔍 7. EFFETTO FILTRI - Confronto Pre/Post

In [None]:
# Apply the configured filters and report how many rows they remove
print("=" * 80)
print("APPLICAZIONE FILTRI")
print("=" * 80)

df_filtered = apply_data_filters(df_raw, config)

initial_rows, final_rows = len(df_raw), len(df_filtered)
removed = initial_rows - final_rows
pct_removed = removed / initial_rows * 100
pct_kept = final_rows / initial_rows * 100

print(f"\nüìä RISULTATO:")
print(f"  Dataset iniziale:  {initial_rows:>8,} righe (100.0%)")
print(f"  Dataset finale:    {final_rows:>8,} righe ({pct_kept:>5.1f}%)")
print(f"  Rimossi:           {removed:>8,} righe ({pct_removed:>5.1f}%)")

# Severity of the reduction: up to 30% is moderate, up to 50% deserves
# attention, above 50% is a warning
if pct_removed <= 30:
    verdict = f"\n‚úÖ Rimozione moderata: {pct_removed:.1f}% dei dati"
elif pct_removed <= 50:
    verdict = f"\n‚ö†Ô∏è  Attenzione: Rimossi {pct_removed:.1f}% dei dati"
else:
    verdict = f"\n‚ö†Ô∏è  WARNING: Rimossi {pct_removed:.1f}% dei dati!"
print(verdict)

In [None]:
# Compare the target statistics before and after filtering
print("\n" + "=" * 80)
print("CONFRONTO TARGET: RAW vs FILTERED")
print("=" * 80)


def _describe_target(series):
    """Return count, mean, median, std, min, max, skewness, kurtosis (in that order)."""
    return [
        len(series),
        series.mean(),
        series.median(),
        series.std(),
        series.min(),
        series.max(),
        series.skew(),
        series.kurtosis(),
    ]


target_raw = df_raw[target_col].dropna()
target_filt = df_filtered[target_col].dropna()

comparison = pd.DataFrame(
    {
        'Raw': _describe_target(target_raw),
        'Filtered': _describe_target(target_filt),
    },
    index=['Count', 'Mean', 'Median', 'Std', 'Min', 'Max', 'Skewness', 'Kurtosis'],
)

# Absolute and relative change. The relative change is taken against the
# magnitude of the raw value: dividing by a negative baseline (possible for
# skewness/kurtosis) would flip the sign of the percentage, and a zero
# baseline is reported as NaN (undefined) instead of +/-inf.
comparison['Delta'] = comparison['Filtered'] - comparison['Raw']
baseline = comparison['Raw'].abs().replace(0, np.nan)
comparison['Delta_Pct'] = 100 * comparison['Delta'] / baseline

print("\n", comparison.round(2))

# Persist the comparison table
comparison.to_csv(OUTPUT_DIR / "08_target_comparison_raw_vs_filtered.csv")
print(f"\nüíæ Salvato: 08_target_comparison_raw_vs_filtered.csv")

In [None]:
# Raw vs filtered target: overlaid histograms, boxplots, KDE and Q-Q plots
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Overlaid histograms
axes[0, 0].hist(target_raw, bins=50, alpha=0.5, label='Raw', edgecolor='black')
axes[0, 0].hist(target_filt, bins=50, alpha=0.5, label='Filtered', edgecolor='black')
axes[0, 0].set_xlabel('Prezzo (‚Ç¨)')
axes[0, 0].set_ylabel('Frequenza')
axes[0, 0].set_title('Confronto Distribuzioni: Raw vs Filtered')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Side-by-side boxplots. The tick labels are set on the axis instead of via
# boxplot's `labels=` keyword, which is deprecated in recent Matplotlib
# (renamed to `tick_labels`); this form works on old and new versions alike.
axes[0, 1].boxplot([target_raw, target_filt])
axes[0, 1].set_xticklabels(['Raw', 'Filtered'])
axes[0, 1].set_ylabel('Prezzo (‚Ç¨)')
axes[0, 1].set_title('Boxplot Comparison')
axes[0, 1].grid(True, alpha=0.3)

# Kernel density estimates
target_raw.plot(kind='density', ax=axes[1, 0], label='Raw', linewidth=2)
target_filt.plot(kind='density', ax=axes[1, 0], label='Filtered', linewidth=2)
axes[1, 0].set_xlabel('Prezzo (‚Ç¨)')
axes[1, 0].set_ylabel('Densit√†')
axes[1, 0].set_title('Kernel Density Estimation')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# Overlaid Q-Q plots. Each probplot call adds two lines to the axes (the
# sample points, then the fitted line), so the raw points are line 0 and the
# filtered points are line 2.
qq_ax = axes[1, 1]
stats.probplot(target_raw, dist="norm", plot=qq_ax)
raw_points = qq_ax.get_lines()[0]
raw_points.set_marker('o')
raw_points.set_markersize(3)
raw_points.set_alpha(0.5)
raw_points.set_label('Raw')
stats.probplot(target_filt, dist="norm", plot=qq_ax)
filt_points = qq_ax.get_lines()[2]
filt_points.set_marker('s')
filt_points.set_markersize(3)
filt_points.set_alpha(0.5)
filt_points.set_label('Filtered')
qq_ax.set_title('Q-Q Plot Comparison')
qq_ax.legend()
qq_ax.grid(True, alpha=0.3)

save_plot("09_distribution_comparison_raw_vs_filtered")
plt.show()

In [None]:
# Per-zone row counts before and after filtering
if 'AI_ZonaOmi' in df_raw.columns:
    print("\n" + "=" * 80)
    print("CONFRONTO ZONE: RAW vs FILTERED")
    print("=" * 80)

    zone_raw = df_raw['AI_ZonaOmi'].value_counts()
    zone_filt = df_filtered['AI_ZonaOmi'].value_counts()

    # Zones that disappear after filtering have no 'Filtered' entry: count
    # them as 0 before deriving the removed rows and their share
    zone_comp = (
        pd.DataFrame({'Raw': zone_raw, 'Filtered': zone_filt})
        .fillna(0)
        .astype(int)
        .assign(Removed=lambda t: t['Raw'] - t['Filtered'])
        .assign(Removed_Pct=lambda t: 100 * t['Removed'] / t['Raw'])
        .sort_values('Raw', ascending=False)
    )

    print("\n", zone_comp)

    # Persist the comparison table
    zone_comp.to_csv(OUTPUT_DIR / "10_zone_comparison_raw_vs_filtered.csv")
    print(f"\nüíæ Salvato: 10_zone_comparison_raw_vs_filtered.csv")

## 📈 8. Top Correlazioni con Target

In [None]:
# Correlation of every numeric feature with the target (filtered dataset)
numeric_cols = [
    c for c in df_filtered.select_dtypes(include=[np.number]).columns
    if c != target_col  # exclude the target itself
]

correlations = []
for col in numeric_cols:
    try:
        corr = df_filtered[[col, target_col]].corr().iloc[0, 1]
    except Exception as exc:
        # Still best-effort (one bad column must not stop the analysis), but
        # no longer silent, and KeyboardInterrupt/SystemExit now propagate.
        print(f"Correlazione saltata per {col}: {exc}")
        continue
    # NaN means the correlation is undefined (e.g. constant column): skip it
    if pd.notna(corr):
        correlations.append({
            'Feature': col,
            'Correlation': corr,
            'Abs_Correlation': abs(corr)
        })

# Naming the columns explicitly keeps the sort (and the later cells that read
# these columns) working even when no correlation could be computed.
corr_df = pd.DataFrame(
    correlations, columns=['Feature', 'Correlation', 'Abs_Correlation']
).sort_values('Abs_Correlation', ascending=False)

print("=" * 80)
print("TOP 30 CORRELAZIONI CON TARGET")
print("=" * 80)
print("\n", corr_df.head(30))

# Persist the full ranking
corr_df.to_csv(OUTPUT_DIR / "11_correlations_with_target.csv", index=False)
print(f"\nüíæ Salvato: 11_correlations_with_target.csv")

In [None]:
# Horizontal bar chart of the 20 strongest correlations
# (negative in red, non-negative in blue)
top_corr = corr_df.head(20)
bar_positions = range(len(top_corr))
bar_colors = [
    'steelblue' if value >= 0 else 'red'
    for value in top_corr['Correlation']
]

corr_fig, corr_ax = plt.subplots(figsize=(12, 8))
corr_ax.barh(bar_positions, top_corr['Correlation'], color=bar_colors, edgecolor='black')
corr_ax.set_yticks(bar_positions)
corr_ax.set_yticklabels(top_corr['Feature'])
corr_ax.set_xlabel('Correlazione con Target')
corr_ax.set_title('Top 20 Correlazioni con Target (Filtered Dataset)')
corr_ax.axvline(x=0, color='black', linewidth=0.8)
corr_ax.grid(True, alpha=0.3, axis='x')

save_plot("12_top_correlations")
plt.show()

## 🎯 9. Feature Droppate nella Config

In [None]:
# Columns that the config asks to drop, checked against the filtered dataset
drop_cols = config.get('feature_pruning', {}).get('drop_columns', [])

# NOTE(review): the per-category counts below are hard-coded text, not
# derived from the config — confirm they are still accurate.
category_notes = (
    "  - ID e chiavi esterne: ~12 colonne",
    "  - Superfici ridondanti: ~5 colonne",
    "  - Indicatori ISTAT ridondanti: ~7 colonne",
    "  - OmiValori ridondanti: ~4 colonne",
    "  - Metadata e tecniche: ~13 colonne",
    "  - Codici catastali: ~8 colonne",
    "  - Poco predittive: ~7 colonne",
)

print("=" * 80)
print("FEATURE DROPPATE NELLA CONFIG")
print("=" * 80)
print(f"\nTotale feature da droppare: {len(drop_cols)}")
print("\nCategorie:")
print("\n".join(category_notes))

# Split the configured columns into those present in the dataset and those not
present_drops, missing_drops = [], []
for name in drop_cols:
    bucket = present_drops if name in df_filtered.columns else missing_drops
    bucket.append(name)

print(f"\nPresenti nel dataset: {len(present_drops)}/{len(drop_cols)}")
if missing_drops:
    shown_limit = 10
    print(f"\n‚ö†Ô∏è  Colonne non trovate (gi√† rimosse o mai presenti):")
    for col in missing_drops[:shown_limit]:
        print(f"     - {col}")
    hidden = len(missing_drops) - shown_limit
    if hidden > 0:
        print(f"     ... altre {hidden} colonne")

## 📊 10. Summary Report

In [None]:
# Build the final summary report, save it as JSON and echo it
import json


def _target_section(series):
    """Summarise a target series with plain Python numbers (JSON-friendly)."""
    return {
        'count': int(len(series)),
        'mean': float(series.mean()),
        'median': float(series.median()),
        'std': float(series.std()),
        'skewness': float(series.skew()),
        'kurtosis': float(series.kurtosis()),
    }


n_raw = len(df_raw)
n_filtered = len(df_filtered)
n_removed = n_raw - n_filtered

report = {
    'dataset': {
        'raw_rows': n_raw,
        'raw_cols': len(df_raw.columns),
        'filtered_rows': n_filtered,
        'filtered_cols': len(df_filtered.columns),
        'rows_removed': n_removed,
        'rows_removed_pct': 100 * n_removed / n_raw,
    },
    'target_raw': _target_section(target_raw),
    'target_filtered': _target_section(target_filt),
    'filters_applied': {
        filter_key: filters.get(filter_key)
        for filter_key in ('anno_min', 'zone_escluse', 'tipologie_escluse')
    },
    'top_correlations': corr_df.head(10).to_dict('records')
}

# Serialise once; the same text goes to the file and to the cell output
report_text = json.dumps(report, indent=2)
with open(OUTPUT_DIR / "00_summary_report.json", 'w') as f:
    f.write(report_text)

print("=" * 80)
print("SUMMARY REPORT")
print("=" * 80)
print(report_text)
print(f"\nüíæ Salvato: 00_summary_report.json")

## ✅ Conclusioni

### File Generati

1. `00_summary_report.json` - Report completo in JSON
2. `01_missing_values_raw.csv` - Missing values overview
3. `02_target_statistics_raw.csv` - Statistiche target raw
4. `03_target_distribution_raw.png` - Distribuzione target raw
5. `04_temporal_distribution.csv` - Distribuzione temporale
6. `05_temporal_analysis.png` - Analisi temporale
7. `06_zone_statistics_raw.csv` - Statistiche zone
8. `07_zone_analysis.png` - Analisi zone
9. `08_target_comparison_raw_vs_filtered.csv` - Confronto target
10. `09_distribution_comparison_raw_vs_filtered.png` - Confronto distribuzioni
11. `10_zone_comparison_raw_vs_filtered.csv` - Confronto zone
12. `11_correlations_with_target.csv` - Correlazioni complete
13. `12_top_correlations.png` - Top 20 correlazioni

### Prossimi Passi

1. Verificare effetto filtri su performance modello
2. Analizzare preprocessed data (feature contestuali, encoding)
3. Confrontare con risultati training
4. Iterare su filtri se necessario