In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
"""
EV Transition Forecasting - Complete Kaggle Pipeline
=====================================================
Execution Order:
1. Load raw datasets
2. Join adoption + infrastructure (ONLY)
3. Feature engineer individual files
4. Save engineered files with (1) suffix
5. Create final ML-ready dataset
6. Data quality validation and export

NO MODEL TRAINING - Data preparation only
"""

import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# CONFIGURATION
# ============================================================================

OUTPUT_DIR = '/kaggle/working/'
RAW_DATA_PATH = None  # Will be set after kagglehub download

# File mappings
FILE_MAPPING = {
    'adoption': 'india_ev_ice_adoption_large.csv',
    'infra': 'ev_charging_infrastructure_india.csv',
    'detailed': 'vehicle_registrations_detailed.csv',
    'battery': 'ev_vehicle_battery_specs_india.csv',
    'sales': 'ev_ice_market_sales_india.csv'
}

# ============================================================================
# SAFETY PRECHECK FUNCTION (PREVENTS INFINITE/NAN BUG)
# ============================================================================

def safety_precheck(df, grain_cols, name="DataFrame"):
    """
    Enforce data quality rules to prevent infinite values and NaN explosions.
    
    Rules enforced:
    1. Deduplicate by grain
    2. Sort by time
    3. Remove infinite values
    4. Validate uniqueness
    """
    print(f"\n{'='*60}")
    print(f"SAFETY PRECHECK: {name}")
    print(f"{'='*60}")
    
    initial_rows = len(df)
    print(f"Initial rows: {initial_rows:,}")
    
    # Rule 1: Check for duplicates
    duplicates = df.duplicated(subset=grain_cols, keep=False).sum()
    if duplicates > 0:
        print(f"‚ö†Ô∏è  WARNING: {duplicates:,} duplicate rows found on grain {grain_cols}")
        print("   Aggregating to enforce grain...")
        # Will be handled by caller
    
    # Rule 2: Check sorting
    if 'year' in df.columns:
        is_sorted = df.groupby([c for c in grain_cols if c != 'year'])['year'].apply(
            lambda x: x.is_monotonic_increasing
        ).all()
        if not is_sorted:
            print("‚ö†Ô∏è  WARNING: Data not sorted by year")
    
    # Rule 3: Check for infinite values
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    inf_count = np.isinf(df[numeric_cols]).sum().sum()
    if inf_count > 0:
        print(f"‚ö†Ô∏è  WARNING: {inf_count:,} infinite values detected")
    
    # Rule 4: Check for NaN
    nan_count = df[numeric_cols].isna().sum().sum()
    print(f"NaN values: {nan_count:,}")
    
    print(f"‚úì Precheck complete")
    print(f"{'='*60}\n")
    
    return df


# ============================================================================
# STEP 0: LOAD RAW DATA
# ============================================================================

def load_raw_data(data_path):
    """Load all raw CSV files without modification."""
    print("\n" + "="*60)
    print("STEP 0: LOADING RAW DATA")
    print("="*60)
    
    dfs = {}
    
    for key, filename in FILE_MAPPING.items():
        filepath = os.path.join(data_path, filename)
        if os.path.exists(filepath):
            dfs[key] = pd.read_csv(filepath)
            print(f"‚úì Loaded {key}: {filename}")
            print(f"  Shape: {dfs[key].shape}")
            print(f"  Columns: {list(dfs[key].columns)[:5]}...")
        else:
            print(f"‚úó NOT FOUND: {filename}")
    
    return dfs


# ============================================================================
# STEP 1: JOIN ADOPTION + INFRASTRUCTURE (ONLY REQUIRED JOIN)
# ============================================================================

def join_adoption_infra(adoption_df, infra_df):
    """
    LEFT JOIN adoption with infrastructure on (state, year).
    This is the ONLY join that happens for the ML pipeline.
    """
    print("\n" + "="*60)
    print("STEP 1: JOINING ADOPTION + INFRASTRUCTURE")
    print("="*60)
    
    # Safety precheck on raw data
    adoption_df = safety_precheck(
        adoption_df, 
        ['state', 'year', 'vehicle_segment'],
        "Adoption (raw)"
    )
    
    infra_df = safety_precheck(
        infra_df,
        ['state', 'year'],
        "Infrastructure (raw)"
    )
    
    # CRITICAL: Deduplicate before join
    print("\nDeduplicating adoption data by grain...")
    adoption_clean = adoption_df.groupby(
        ['state', 'year', 'vehicle_segment'],
        as_index=False
    ).agg({
        'ice_vehicle_registrations': 'sum',
        'ev_vehicle_registrations': 'sum',
        'charging_stations': 'mean',
        'avg_ev_subsidy_rs': 'mean',
        'fuel_price_rs_per_litre': 'mean',
        'avg_income_index': 'mean'
    })
    
    print(f"After dedup: {len(adoption_clean):,} rows (from {len(adoption_df):,})")
    
    # CRITICAL ASSERTION: Guarantee grain uniqueness
    assert adoption_clean.duplicated(['state', 'year', 'vehicle_segment']).sum() == 0, \
        "FATAL: Duplicates still exist after deduplication!"
    
    print("\nDeduplicating infrastructure data by grain...")
    infra_clean = infra_df.groupby(
        ['state', 'year'],
        as_index=False
    ).agg({
        'charging_stations': 'mean',
        'fast_charger_pct': 'mean',
        'urban_coverage_pct': 'mean'
    })
    
    print(f"After dedup: {len(infra_clean):,} rows (from {len(infra_df):,})")
    
    # CRITICAL ASSERTION: Guarantee grain uniqueness
    assert infra_clean.duplicated(['state', 'year']).sum() == 0, \
        "FATAL: Duplicates still exist in infra after deduplication!"
    
    # Rename infra columns to avoid collision
    infra_clean = infra_clean.rename(columns={
        'charging_stations': 'infra_charging_stations',
        'fast_charger_pct': 'infra_fast_charger_pct',
        'urban_coverage_pct': 'infra_urban_coverage_pct'
    })
    
    # LEFT JOIN
    joined = adoption_clean.merge(
        infra_clean,
        on=['state', 'year'],
        how='left',
        validate='m:1'
    )
    
    print(f"\n‚úì Join complete: {len(joined):,} rows")
    print(f"  Null infra values: {joined['infra_charging_stations'].isna().sum()}")
    
    # Use infra_charging_stations if available, else fall back to adoption's
    joined['charging_stations_final'] = joined['infra_charging_stations'].fillna(
        joined['charging_stations']
    )
    
    # Sort by time (CRITICAL for lag features)
    joined = joined.sort_values(['state', 'vehicle_segment', 'year'])
    
    return joined


# ============================================================================
# STEP 2A: FEATURE ENGINEERING - ADOPTION TABLE
# ============================================================================

def engineer_adoption_features(df):
    """
    Create adoption-specific features with proper time-series handling.
    
    Features created:
    - EV/ICE shares
    - Transition metrics
    - YoY growth rates
    - Lag features (t-1, t-2)
    """
    print("\n" + "="*60)
    print("STEP 2A: FEATURE ENGINEERING - ADOPTION")
    print("="*60)
    
    df = df.copy()
    
    # ========== VOLUME & SHARE FEATURES ==========
    print("\n1. Computing volume & share metrics...")
    
    df['total_registrations'] = df['ev_vehicle_registrations'] + df['ice_vehicle_registrations']
    df['ev_share'] = df['ev_vehicle_registrations'] / (df['total_registrations'] + 1e-6)
    df['ice_share'] = df['ice_vehicle_registrations'] / (df['total_registrations'] + 1e-6)
    
    # Clip shares to [0, 1]
    df['ev_share'] = df['ev_share'].clip(0, 1)
    df['ice_share'] = df['ice_share'].clip(0, 1)
    
    print(f"   EV share range: [{df['ev_share'].min():.4f}, {df['ev_share'].max():.4f}]")
    
    # ========== TEMPORAL / MOMENTUM FEATURES ==========
    print("\n2. Computing YoY growth rates...")
    
    # Vectorized YoY calculations (safe, fast, correct)
    df['ev_yoy_growth'] = (
        df.groupby(['state', 'vehicle_segment'])['ev_vehicle_registrations']
          .pct_change()
    )
    
    df['ice_yoy_change'] = (
        df.groupby(['state', 'vehicle_segment'])['ice_vehicle_registrations']
          .pct_change()
    )
    
    # Clip extreme outliers
    df['ev_yoy_growth'] = df['ev_yoy_growth'].clip(-1, 5)
    df['ice_yoy_change'] = df['ice_yoy_change'].clip(-1, 5)
    
    # Transition index
    df['transition_index'] = df['ev_yoy_growth'] - df['ice_yoy_change']
    
    # Conversion pressure
    df['conversion_pressure'] = df['ev_share'] / (df['ice_share'] + 1e-6)
    
    print(f"   Transition index range: [{df['transition_index'].min():.2f}, {df['transition_index'].max():.2f}]")
    
    # ========== LAG FEATURES (CRITICAL) ==========
    print("\n3. Creating lag features...")
    
    lag_cols = ['ev_share', 'transition_index', 'ev_yoy_growth', 'ice_yoy_change']
    
    # Vectorized lag creation (correct and fast)
    for col in lag_cols:
        for lag in [1, 2]:
            new_col = f"{col}_t-{lag}"
            df[new_col] = (
                df.groupby(['state', 'vehicle_segment'])[col]
                  .shift(lag)
            )
    
    print(f"   Created {len(lag_cols) * 2} lag features")
    
    # ========== POLICY & ECONOMIC FEATURES ==========
    print("\n4. Computing policy & economic changes...")
    
    # Vectorized policy change calculations
    df['subsidy_yoy_change'] = (
        df.groupby(['state', 'vehicle_segment'])['avg_ev_subsidy_rs']
          .pct_change()
    )
    
    df['fuel_price_yoy_change'] = (
        df.groupby(['state', 'vehicle_segment'])['fuel_price_rs_per_litre']
          .pct_change()
    )
    
    # Income buckets (quantile-based for non-0-100 range)
    df['income_bucket'] = pd.qcut(
        df['avg_income_index'],
        q=3,
        labels=['Low', 'Mid', 'High'],
        duplicates='drop'
    )
    
    # ========== CLEAN BAD VALUES ==========
    print("\n5. Cleaning infinite/NaN values...")
    
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    
    initial_rows = len(df)
    df = df.dropna(subset=['ev_share', 'ev_share_t-1', 'ev_share_t-2'])
    final_rows = len(df)
    
    print(f"   Rows dropped due to insufficient history: {initial_rows - final_rows:,}")
    print(f"   Final rows: {final_rows:,}")
    
    return df


# ============================================================================
# STEP 2B: FEATURE ENGINEERING - INFRASTRUCTURE TABLE
# ============================================================================

def engineer_infra_features(df):
    """
    Create infrastructure-specific features.
    
    Features created:
    - Normalized infra metrics
    - Infra growth rates
    - Quality indices
    - Lag features
    """
    print("\n" + "="*60)
    print("STEP 2B: FEATURE ENGINEERING - INFRASTRUCTURE")
    print("="*60)
    
    df = df.copy()
    
    # Use final charging stations
    if 'charging_stations_final' in df.columns:
        df['charging_stations'] = df['charging_stations_final']
    
    # ========== NORMALIZATION ==========
    print("\n1. Computing normalized infra metrics...")
    
    # Stations per 10k vehicles
    df['stations_per_10k_vehicles'] = (
        df['charging_stations'] / (df['total_registrations'] / 10000 + 1e-6)
    )
    
    # Stations per 1k EVs
    df['stations_per_1k_ev'] = (
        df['charging_stations'] / (df['ev_vehicle_registrations'] / 1000 + 1e-6)
    )
    
    # Fast charger index
    df['fast_charger_index'] = (
        df['charging_stations'] * (df['infra_fast_charger_pct'] / 100)
    )
    
    print(f"   Stations per 10k vehicles range: [{df['stations_per_10k_vehicles'].min():.2f}, {df['stations_per_10k_vehicles'].max():.2f}]")
    
    # ========== MOMENTUM FEATURES ==========
    print("\n2. Computing infra growth rates...")
    
    # Infrastructure is STATE-level, not segment-level
    df['infra_yoy_growth'] = (
        df.groupby('state')['charging_stations']
          .pct_change()
    )
    
    df['fast_charger_pct_change'] = (
        df.groupby('state')['infra_fast_charger_pct']
          .pct_change()
    )
    
    df['urban_coverage_yoy_change'] = (
        df.groupby('state')['infra_urban_coverage_pct']
          .pct_change()
    )
    
    # ========== LAG FEATURES (CRITICAL FOR CAUSALITY) ==========
    print("\n3. Creating lagged infra features...")
    
    lag_cols = ['infra_yoy_growth', 'fast_charger_index', 'stations_per_10k_vehicles']
    
    # Vectorized lag creation - lagged by (state, segment) for alignment
    for col in lag_cols:
        for lag in [1]:
            new_col = f"{col}_t-{lag}"
            df[new_col] = (
                df.groupby(['state', 'vehicle_segment'])[col]
                  .shift(lag)
            )
    
    print(f"   Created {len(lag_cols)} lagged infra features")
    
    # ========== FLAGS (EXPLAINABILITY) ==========
    print("\n4. Creating categorical flags...")
    
    # High infra flag
    infra_median = df['stations_per_10k_vehicles'].median()
    df['high_infra_flag'] = (df['stations_per_10k_vehicles'] > infra_median).astype(int)
    
    # High subsidy flag
    subsidy_median = df['avg_ev_subsidy_rs'].median()
    df['high_subsidy_flag'] = (df['avg_ev_subsidy_rs'] > subsidy_median).astype(int)
    
    # ========== CLEAN BAD VALUES ==========
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    
    return df


# ============================================================================
# STEP 2C: FEATURE ENGINEERING - DETAILED REGISTRATIONS
# ============================================================================

def engineer_detailed_features(df):
    """
    Create city-level and OEM-level insights.
    This stays SEPARATE from ML dataset.
    """
    print("\n" + "="*60)
    print("STEP 2C: FEATURE ENGINEERING - DETAILED REGISTRATIONS")
    print("="*60)
    
    df = df.copy()
    
    # City-level EV share
    print("\n1. Computing city-level metrics...")
    
    city_metrics = df.groupby(['state', 'city', 'year', 'vehicle_category']).agg({
        'registrations': 'sum'
    }).reset_index()
    
    city_metrics['total_by_city'] = city_metrics.groupby(
        ['state', 'city', 'year', 'vehicle_category']
    )['registrations'].transform('sum')
    
    # OEM-level share
    print("2. Computing OEM-level metrics...")
    
    oem_metrics = df.groupby(['manufacturer', 'year', 'fuel_type']).agg({
        'registrations': 'sum'
    }).reset_index()
    
    print(f"   City-level records: {len(city_metrics):,}")
    print(f"   OEM-level records: {len(oem_metrics):,}")
    
    return df


# ============================================================================
# STEP 2D: FEATURE ENGINEERING - BATTERY SPECS
# ============================================================================

def engineer_battery_features(df):
    """
    Create segment-level battery context.
    Used for explanation, not prediction.
    """
    print("\n" + "="*60)
    print("STEP 2D: FEATURE ENGINEERING - BATTERY SPECS")
    print("="*60)
    
    df = df.copy()
    
    # Efficiency metrics
    print("\n1. Computing battery efficiency metrics...")
    
    df['range_per_kwh'] = df['range_km'] / (df['battery_capacity_kwh'] + 1e-6)
    df['price_per_kwh'] = df['ex_showroom_price_lakh'] / (df['battery_capacity_kwh'] + 1e-6)
    df['charging_speed_index'] = df['battery_capacity_kwh'] / (df['charging_time_hrs'] + 1e-6)
    
    # Affordability buckets
    df['affordability_bucket'] = pd.cut(
        df['ex_showroom_price_lakh'],
        bins=[0, 10, 20, 100],
        labels=['Budget', 'Mid', 'Premium']
    )
    
    # Segment-level aggregation
    segment_metrics = df.groupby('segment').agg({
        'range_per_kwh': 'mean',
        'price_per_kwh': 'mean',
        'charging_speed_index': 'mean',
        'range_km': 'mean'
    }).reset_index()
    
    print(f"   Segment-level metrics: {len(segment_metrics)} segments")
    
    return df


# ============================================================================
# STEP 3: CREATE FINAL ML DATASET
# ============================================================================

def create_final_ml_dataset(df):
    """
    Create the single, clean, prediction-ready dataset.
    
    Grain: state √ó year √ó vehicle_segment
    Purpose: Forecast ev_share for t+1, t+2, t+3
    """
    print("\n" + "="*60)
    print("STEP 3: CREATE FINAL ML DATASET")
    print("="*60)
    
    df = df.copy()
    
    # ========== TARGET CREATION (t+1, t+2, t+3) ==========
    print("\n1. Creating future targets...")
    
    # Vectorized target creation (safe and fast)
    for horizon in [1, 2, 3]:
        target_col = f'ev_share_t+{horizon}'
        df[target_col] = (
            df.groupby(['state', 'vehicle_segment'])['ev_share']
              .shift(-horizon)
        )
    
    print(f"   Created targets: ev_share_t+1, ev_share_t+2, ev_share_t+3")
    
    # ========== SELECT FINAL COLUMNS ==========
    print("\n2. Selecting final feature set...")
    
    feature_cols = [
        # Identifiers
        'state', 'year', 'vehicle_segment',
        
        # Current state
        'ev_share', 'ice_share', 'total_registrations',
        'conversion_pressure',
        
        # Temporal features
        'ev_yoy_growth', 'ice_yoy_change', 'transition_index',
        
        # Lag features (adoption)
        'ev_share_t-1', 'ev_share_t-2',
        'transition_index_t-1',
        'ev_yoy_growth_t-1', 'ice_yoy_change_t-1',
        
        # Infrastructure features (lagged)
        'infra_yoy_growth_t-1', 'fast_charger_index_t-1',
        'stations_per_10k_vehicles_t-1',
        
        # Policy & economic
        'avg_ev_subsidy_rs', 'subsidy_yoy_change',
        'fuel_price_rs_per_litre', 'fuel_price_yoy_change',
        'avg_income_index',
        
        # Flags
        'high_infra_flag', 'high_subsidy_flag',
        
        # Targets
        'ev_share_t+1', 'ev_share_t+2', 'ev_share_t+3'
    ]
    
    # Filter to existing columns
    available_cols = [col for col in feature_cols if col in df.columns]
    missing_cols = [col for col in feature_cols if col not in df.columns]
    
    if missing_cols:
        print(f"\n   ‚ö†Ô∏è  Missing columns: {missing_cols}")
    
    final_df = df[available_cols].copy()
    
    # ========== FINAL CLEANING ==========
    print("\n3. Final data cleaning...")
    
    initial_rows = len(final_df)
    
    # Drop rows with missing targets
    final_df = final_df.dropna(subset=['ev_share_t+1', 'ev_share_t+2', 'ev_share_t+3'])
    
    # Drop rows with missing critical features
    final_df = final_df.dropna(subset=['ev_share_t-1', 'ev_share_t-2'])
    
    final_rows = len(final_df)
    
    print(f"   Initial rows: {initial_rows:,}")
    print(f"   Final rows: {final_rows:,}")
    print(f"   Rows dropped: {initial_rows - final_rows:,}")
    
    # ========== VALIDATION ==========
    print("\n4. Data validation...")
    
    print(f"   Year range: {final_df['year'].min()} - {final_df['year'].max()}")
    print(f"   States: {final_df['state'].nunique()}")
    print(f"   Segments: {final_df['vehicle_segment'].nunique()}")
    print(f"   EV share range: [{final_df['ev_share'].min():.4f}, {final_df['ev_share'].max():.4f}]")
    print(f"   Target t+1 range: [{final_df['ev_share_t+1'].min():.4f}, {final_df['ev_share_t+1'].max():.4f}]")
    
    print("\n   ‚ö†Ô∏è  NOTE: Identifiers (state, year, vehicle_segment) must be")
    print("            dropped or encoded before model training.")
    
    # Check for data leakage
    future_cols = [col for col in final_df.columns if 't+' in col and col not in ['ev_share_t+1', 'ev_share_t+2', 'ev_share_t+3']]
    if future_cols:
        print(f"\n   ‚ö†Ô∏è  WARNING: Potential future leakage detected: {future_cols}")
    
    return final_df


# ============================================================================
# MAIN EXECUTION
# ============================================================================

def main():
    """Execute complete pipeline."""
    
    print("\n" + "="*60)
    print("EV TRANSITION FORECASTING - COMPLETE PIPELINE")
    print("="*60)
    print("\nExecution steps:")
    print("0. Load raw data")
    print("1. Join adoption + infrastructure")
    print("2. Feature engineer individual files")
    print("3. Create final ML dataset")
    print("4. Save all outputs")
    print("\n" + "="*60)
    
    # ========== DOWNLOAD DATA ==========
    print("\nDownloading dataset from Kaggle...")
    
    import kagglehub
    data_path = kagglehub.dataset_download("shubhamindulkar/ev-datasets-for-the-indian-market")
    print(f"Dataset path: {data_path}")
    
    global RAW_DATA_PATH
    RAW_DATA_PATH = data_path
    
    # ========== STEP 0: LOAD ==========
    raw_dfs = load_raw_data(data_path)
    
    if 'adoption' not in raw_dfs or 'infra' not in raw_dfs:
        print("\n‚ùå ERROR: Core datasets not found!")
        return
    
    # ========== STEP 1: JOIN ==========
    joined_df = join_adoption_infra(raw_dfs['adoption'], raw_dfs['infra'])
    
    # Save intermediate join
    join_path = os.path.join(OUTPUT_DIR, 'adoption_infra_joined.csv')
    joined_df.to_csv(join_path, index=False)
    print(f"\n‚úì Saved: adoption_infra_joined.csv")
    
    # ========== STEP 2: FEATURE ENGINEERING ==========
    
    # 2A: Adoption features
    adoption_eng = engineer_adoption_features(joined_df)
    adoption_eng_path = os.path.join(OUTPUT_DIR, 'india_ev_ice_adoption_large(1).csv')
    adoption_eng.to_csv(adoption_eng_path, index=False)
    print(f"\n‚úì Saved: india_ev_ice_adoption_large(1).csv ({len(adoption_eng):,} rows)")
    
    # 2B: Infrastructure features (same DF after both feature sets)
    infra_eng = engineer_infra_features(adoption_eng)
    infra_eng_path = os.path.join(OUTPUT_DIR, 'adoption_infra_features(1).csv')
    infra_eng.to_csv(infra_eng_path, index=False)
    print(f"‚úì Saved: adoption_infra_features(1).csv ({len(infra_eng):,} rows)")
    
    # 2C: Detailed registrations (if available)
    if 'detailed' in raw_dfs:
        detailed_eng = engineer_detailed_features(raw_dfs['detailed'])
        detailed_eng_path = os.path.join(OUTPUT_DIR, 'vehicle_registrations_detailed(1).csv')
        detailed_eng.to_csv(detailed_eng_path, index=False)
        print(f"‚úì Saved: vehicle_registrations_detailed(1).csv ({len(detailed_eng):,} rows)")
    
    # 2D: Battery specs (if available)
    if 'battery' in raw_dfs:
        battery_eng = engineer_battery_features(raw_dfs['battery'])
        battery_eng_path = os.path.join(OUTPUT_DIR, 'ev_vehicle_battery_specs_india(1).csv')
        battery_eng.to_csv(battery_eng_path, index=False)
        print(f"‚úì Saved: ev_vehicle_battery_specs_india(1).csv ({len(battery_eng):,} rows)")
    
    # ========== STEP 3: FINAL ML DATASET ==========
    final_ml_df = create_final_ml_dataset(infra_eng)
    
    final_ml_path = os.path.join(OUTPUT_DIR, 'ev_transition_forecast_dataset.csv')
    final_ml_df.to_csv(final_ml_path, index=False)
    print(f"\n‚úì Saved: ev_transition_forecast_dataset.csv ({len(final_ml_df):,} rows)")
    
    # ========== SUMMARY ==========
    print("\n" + "="*60)
    print("PIPELINE COMPLETE")
    print("="*60)
    print("\nOutput files created:")
    print("  1. adoption_infra_joined.csv (intermediate)")
    print("  2. india_ev_ice_adoption_large(1).csv (engineered)")
    print("  3. adoption_infra_features(1).csv (combined adoption + infra)")
    
    if 'detailed' in raw_dfs:
        print("  4. vehicle_registrations_detailed(1).csv (engineered)")
    
    if 'battery' in raw_dfs:
        print("  5. ev_vehicle_battery_specs_india(1).csv (engineered)")
    
    print(f"\n  üî• FINAL ML DATASET: ev_transition_forecast_dataset.csv")
    print(f"     - {len(final_ml_df):,} rows")
    print(f"     - {len(final_ml_df.columns)} features")
    print(f"     - Ready for model training")
    
    print("\n" + "="*60)
    print("Next steps:")
    print("  - Review engineered files")
    print("  - Validate data quality")
    print("  - Build prediction models (separate notebook)")
    print("="*60)


if __name__ == "__main__":
    main()


EV TRANSITION FORECASTING - COMPLETE PIPELINE

Execution steps:
0. Load raw data
1. Join adoption + infrastructure
2. Feature engineer individual files
3. Create final ML dataset
4. Save all outputs


Downloading dataset from Kaggle...
Dataset path: /kaggle/input/ev-datasets-for-the-indian-market

STEP 0: LOADING RAW DATA
‚úì Loaded adoption: india_ev_ice_adoption_large.csv
  Shape: (120000, 9)
  Columns: ['state', 'year', 'vehicle_segment', 'ice_vehicle_registrations', 'ev_vehicle_registrations']...
‚úì Loaded infra: ev_charging_infrastructure_india.csv
  Shape: (162, 5)
  Columns: ['state', 'year', 'charging_stations', 'fast_charger_pct', 'urban_coverage_pct']...
‚úì Loaded detailed: vehicle_registrations_detailed.csv
  Shape: (9072, 9)
  Columns: ['year', 'month', 'state', 'city', 'vehicle_category']...
‚úì Loaded battery: ev_vehicle_battery_specs_india.csv
  Shape: (6, 6)
  Columns: ['vehicle_model', 'segment', 'battery_capacity_kwh', 'range_km', 'charging_time_hrs']...
‚úì Loaded

In [3]:
"""
EV Transition Forecasting - Model Training & Evaluation
========================================================
6 Models + Ensemble for t+1, t+2, t+3 horizon predictions

Models:
1. Linear Regression (Baseline)
2. Ridge Regression (L2 regularization)
3. Lasso Regression (L1 regularization, feature selection)
4. ElasticNet (L1+L2, best of both)
5. Random Forest (Tree ensemble)
6. LightGBM (Gradient boosting)
7. ENSEMBLE (Weighted average of best 3)

Execution: python ev_model_training.py
"""

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

# Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

import pickle
import os

# ============================================================================
# CONFIGURATION
# ============================================================================

INPUT_FILE = 'ev_transition_forecast_dataset.csv'
OUTPUT_DIR = 'models/'
PREDICTIONS_DIR = 'predictions/'

# Create directories
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(PREDICTIONS_DIR, exist_ok=True)

# Random seed for reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# ============================================================================
# DATA LOADING & PREPROCESSING
# ============================================================================

def load_and_prepare_data(filepath):
    """Load dataset and prepare for modeling."""
    print("\n" + "="*60)
    print("STEP 1: LOADING & PREPROCESSING DATA")
    print("="*60)
    
    df = pd.read_csv(filepath)
    print(f"‚úì Loaded dataset: {df.shape}")
    
    # Separate features, identifiers, and targets
    identifier_cols = ['state', 'year', 'vehicle_segment']
    target_cols = ['ev_share_t+1', 'ev_share_t+2', 'ev_share_t+3']
    
    # Feature columns (everything except identifiers and targets)
    feature_cols = [col for col in df.columns 
                    if col not in identifier_cols + target_cols]
    
    print(f"\n‚úì Identifiers: {len(identifier_cols)} - {identifier_cols}")
    print(f"‚úì Features: {len(feature_cols)}")
    print(f"‚úì Targets: {len(target_cols)} - {target_cols}")
    
    return df, feature_cols, identifier_cols, target_cols


def encode_categorical(df, feature_cols):
    """Encode categorical variables."""
    print("\n" + "="*60)
    print("STEP 2: ENCODING CATEGORICAL VARIABLES")
    print("="*60)
    
    df = df.copy()
    encoders = {}
    
    # Encode state
    if 'state' in df.columns:
        le_state = LabelEncoder()
        df['state_encoded'] = le_state.fit_transform(df['state'])
        encoders['state'] = le_state
        print(f"‚úì Encoded 'state': {df['state'].nunique()} unique values")
    
    # Encode vehicle_segment
    if 'vehicle_segment' in df.columns:
        le_segment = LabelEncoder()
        df['segment_encoded'] = le_segment.fit_transform(df['vehicle_segment'])
        encoders['vehicle_segment'] = le_segment
        print(f"‚úì Encoded 'vehicle_segment': {df['vehicle_segment'].nunique()} unique values")
    
    # Update feature columns
    new_features = feature_cols + ['state_encoded', 'segment_encoded']
    
    return df, new_features, encoders


def handle_missing_values(df, feature_cols):
    """Handle missing values in features."""
    print("\n" + "="*60)
    print("STEP 3: HANDLING MISSING VALUES")
    print("="*60)
    
    df = df.copy()
    
    # Check for missing values
    missing = df[feature_cols].isna().sum()
    missing_cols = missing[missing > 0].sort_values(ascending=False)
    
    if len(missing_cols) > 0:
        print(f"\n‚ö†Ô∏è  Columns with missing values:")
        for col, count in missing_cols.items():
            pct = count / len(df) * 100
            print(f"   {col}: {count} ({pct:.1f}%)")
        
        # Strategy: Fill with median for numerical columns
        for col in missing_cols.index:
            if col in df[feature_cols].columns:
                median_val = df[col].median()
                df[col].fillna(median_val, inplace=True)
                print(f"   ‚Üí Filled '{col}' with median: {median_val:.4f}")
        
        print(f"\n‚úì All missing values handled")
    else:
        print("‚úì No missing values detected in features")
    
    return df


def create_train_test_split(df, feature_cols, target_cols):
    """Create time-based train/test split."""
    print("\n" + "="*60)
    print("STEP 4: TRAIN/TEST SPLIT (TIME-BASED)")
    print("="*60)
    
    # Time-based split: 2018-2019 train, 2020-2021 test
    train_years = [2018, 2019]
    test_years = [2020, 2021]
    
    train_mask = df['year'].isin(train_years)
    test_mask = df['year'].isin(test_years)
    
    train_df = df[train_mask].copy()
    test_df = df[test_mask].copy()
    
    print(f"‚úì Train set: {len(train_df)} records (years {train_years})")
    print(f"‚úì Test set: {len(test_df)} records (years {test_years})")
    print(f"‚úì Train/Test ratio: {len(train_df)/len(test_df):.2f}")
    
    # Extract features and targets
    X_train = train_df[feature_cols]
    X_test = test_df[feature_cols]
    
    targets = {}
    for target in target_cols:
        targets[target] = {
            'y_train': train_df[target],
            'y_test': test_df[target]
        }
    
    # Keep identifiers for later analysis
    train_ids = train_df[['state', 'year', 'vehicle_segment']]
    test_ids = test_df[['state', 'year', 'vehicle_segment']]
    
    return X_train, X_test, targets, train_ids, test_ids


def scale_features(X_train, X_test):
    """Scale features for models that benefit from normalization."""
    print("\n" + "="*60)
    print("STEP 5: FEATURE SCALING")
    print("="*60)
    
    scaler = StandardScaler()
    
    X_train_scaled = pd.DataFrame(
        scaler.fit_transform(X_train),
        columns=X_train.columns,
        index=X_train.index
    )
    
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_test),
        columns=X_test.columns,
        index=X_test.index
    )
    
    print(f"‚úì Features scaled (mean=0, std=1)")
    print(f"   Train shape: {X_train_scaled.shape}")
    print(f"   Test shape: {X_test_scaled.shape}")
    
    return X_train_scaled, X_test_scaled, scaler


# ============================================================================
# MODEL DEFINITIONS
# ============================================================================

def get_models():
    """Define all models to train."""
    models = {
        '1_LinearRegression': {
            'model': LinearRegression(),
            'needs_scaling': True,
            'description': 'Baseline - OLS regression'
        },
        '2_Ridge': {
            'model': Ridge(alpha=1.0, random_state=RANDOM_STATE),
            'needs_scaling': True,
            'description': 'L2 regularization'
        },
        '3_Lasso': {
            'model': Lasso(alpha=0.001, random_state=RANDOM_STATE, max_iter=5000),
            'needs_scaling': True,
            'description': 'L1 regularization + feature selection'
        },
        '4_ElasticNet': {
            'model': ElasticNet(alpha=0.001, l1_ratio=0.5, random_state=RANDOM_STATE, max_iter=5000),
            'needs_scaling': True,
            'description': 'L1 + L2 regularization'
        },
        '5_RandomForest': {
            'model': RandomForestRegressor(
                n_estimators=100,
                max_depth=5,
                min_samples_split=5,
                min_samples_leaf=2,
                random_state=RANDOM_STATE,
                n_jobs=-1
            ),
            'needs_scaling': False,
            'description': 'Tree ensemble - handles non-linearity'
        },
        '6_LightGBM': {
            'model': LGBMRegressor(
                n_estimators=100,
                max_depth=5,
                learning_rate=0.05,
                num_leaves=31,
                random_state=RANDOM_STATE,
                verbose=-1
            ),
            'needs_scaling': False,
            'description': 'Gradient boosting - fast & accurate'
        }
    }
    
    return models


# ============================================================================
# TRAINING & EVALUATION
# ============================================================================

def evaluate_model(y_true, y_pred, model_name, target_name):
    """Calculate evaluation metrics."""
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)  # Compatible with all scikit-learn versions
    r2 = r2_score(y_true, y_pred)
    
    # MAPE (handle division by zero)
    mape = np.mean(np.abs((y_true - y_pred) / (y_true + 1e-10))) * 100
    
    metrics = {
        'model': model_name,
        'target': target_name,
        'MAE': mae,
        'RMSE': rmse,
        'R2': r2,
        'MAPE': mape
    }
    
    return metrics


def train_all_models(X_train, X_train_scaled, X_test, X_test_scaled, targets):
    """Train all models for all target horizons."""
    print("\n" + "="*60)
    print("STEP 6: TRAINING MODELS")
    print("="*60)
    
    models = get_models()
    all_results = []
    trained_models = {}
    all_predictions = {}
    
    target_names = list(targets.keys())
    
    for target_name in target_names:
        print(f"\n{'='*60}")
        print(f"TARGET: {target_name}")
        print(f"{'='*60}")
        
        y_train = targets[target_name]['y_train']
        y_test = targets[target_name]['y_test']
        
        target_predictions = {}
        
        for model_name, model_config in models.items():
            print(f"\n{model_name}: {model_config['description']}")
            
            # Select appropriate data
            if model_config['needs_scaling']:
                X_tr = X_train_scaled
                X_te = X_test_scaled
            else:
                X_tr = X_train
                X_te = X_test
            
            # Train model
            model = model_config['model']
            model.fit(X_tr, y_train)
            
            # Predictions
            y_pred_train = model.predict(X_tr)
            y_pred_test = model.predict(X_te)
            
            # Evaluate
            train_metrics = evaluate_model(y_train, y_pred_train, model_name, target_name)
            test_metrics = evaluate_model(y_test, y_pred_test, model_name, target_name)
            
            print(f"   Train - MAE: {train_metrics['MAE']:.6f} | R¬≤: {train_metrics['R2']:.4f}")
            print(f"   Test  - MAE: {test_metrics['MAE']:.6f} | R¬≤: {test_metrics['R2']:.4f}")
            
            # Store results
            train_metrics['split'] = 'train'
            test_metrics['split'] = 'test'
            all_results.append(train_metrics)
            all_results.append(test_metrics)
            
            # Store model
            model_key = f"{model_name}_{target_name}"
            trained_models[model_key] = {
                'model': model,
                'scaler_used': model_config['needs_scaling']
            }
            
            # Store predictions
            target_predictions[model_name] = y_pred_test
        
        all_predictions[target_name] = target_predictions
    
    return trained_models, all_results, all_predictions


# ============================================================================
# ENSEMBLE MODEL
# ============================================================================

def create_ensemble(all_predictions, targets):
    """Create ensemble model from top 3 performers."""
    print("\n" + "="*60)
    print("STEP 7: CREATING ENSEMBLE MODEL")
    print("="*60)
    
    ensemble_predictions = {}
    ensemble_results = []
    
    for target_name, predictions_dict in all_predictions.items():
        print(f"\n{target_name}:")
        
        y_test = targets[target_name]['y_test']
        
        # Calculate MAE for each model
        model_scores = {}
        for model_name, y_pred in predictions_dict.items():
            mae = mean_absolute_error(y_test, y_pred)
            model_scores[model_name] = mae
        
        # Select top 3 models
        top_3 = sorted(model_scores.items(), key=lambda x: x[1])[:3]
        print(f"   Top 3 models:")
        for i, (model_name, mae) in enumerate(top_3, 1):
            print(f"      {i}. {model_name}: MAE = {mae:.6f}")
        
        # Weighted ensemble (inverse MAE)
        weights = [1/mae for _, mae in top_3]
        weight_sum = sum(weights)
        weights = [w/weight_sum for w in weights]
        
        # Create ensemble prediction
        ensemble_pred = np.zeros_like(y_test)
        for (model_name, _), weight in zip(top_3, weights):
            ensemble_pred += weight * predictions_dict[model_name]
        
        # Evaluate ensemble
        ensemble_metrics = evaluate_model(y_test, ensemble_pred, 'ENSEMBLE', target_name)
        ensemble_metrics['split'] = 'test'
        ensemble_results.append(ensemble_metrics)
        
        print(f"   ENSEMBLE - MAE: {ensemble_metrics['MAE']:.6f} | R¬≤: {ensemble_metrics['R2']:.4f}")
        
        ensemble_predictions[target_name] = ensemble_pred
    
    return ensemble_predictions, ensemble_results


# ============================================================================
# SAVE MODELS & RESULTS
# ============================================================================

def save_models(trained_models, scaler, encoders):
    """Save all trained models."""
    print("\n" + "="*60)
    print("STEP 8: SAVING MODELS")
    print("="*60)
    
    for model_name, model_data in trained_models.items():
        filepath = os.path.join(OUTPUT_DIR, f"{model_name}.pkl")
        with open(filepath, 'wb') as f:
            pickle.dump(model_data['model'], f)
        print(f"‚úì Saved: {filepath}")
    
    # Save scaler
    scaler_path = os.path.join(OUTPUT_DIR, 'scaler.pkl')
    with open(scaler_path, 'wb') as f:
        pickle.dump(scaler, f)
    print(f"‚úì Saved: {scaler_path}")
    
    # Save encoders
    encoders_path = os.path.join(OUTPUT_DIR, 'encoders.pkl')
    with open(encoders_path, 'wb') as f:
        pickle.dump(encoders, f)
    print(f"‚úì Saved: {encoders_path}")


def save_results(all_results, ensemble_results, test_ids, all_predictions, ensemble_predictions, targets):
    """Save evaluation results and predictions."""
    print("\n" + "="*60)
    print("STEP 9: SAVING RESULTS")
    print("="*60)
    
    # Combine all results
    results_df = pd.DataFrame(all_results + ensemble_results)
    results_path = os.path.join(PREDICTIONS_DIR, 'model_evaluation_results.csv')
    results_df.to_csv(results_path, index=False)
    print(f"‚úì Saved evaluation results: {results_path}")
    
    # Save predictions for each target
    for target_name in all_predictions.keys():
        pred_df = test_ids.copy()
        pred_df['y_true'] = targets[target_name]['y_test'].values
        
        # Add all model predictions
        for model_name, y_pred in all_predictions[target_name].items():
            pred_df[f'pred_{model_name}'] = y_pred
        
        # Add ensemble
        pred_df['pred_ENSEMBLE'] = ensemble_predictions[target_name]
        
        # Save
        pred_path = os.path.join(PREDICTIONS_DIR, f'predictions_{target_name}.csv')
        pred_df.to_csv(pred_path, index=False)
        print(f"‚úì Saved predictions: {pred_path}")


def print_final_summary(all_results, ensemble_results):
    """Print final performance summary."""
    print("\n" + "="*60)
    print("FINAL PERFORMANCE SUMMARY")
    print("="*60)
    
    results_df = pd.DataFrame(all_results + ensemble_results)
    
    # Test set performance only
    test_results = results_df[results_df['split'] == 'test'].copy()
    
    for target in test_results['target'].unique():
        print(f"\n{target}:")
        target_df = test_results[test_results['target'] == target].sort_values('MAE')
        
        print(f"\n{'Model':<20} {'MAE':>10} {'RMSE':>10} {'R¬≤':>10} {'MAPE':>10}")
        print("-" * 62)
        
        for _, row in target_df.iterrows():
            print(f"{row['model']:<20} {row['MAE']:>10.6f} {row['RMSE']:>10.6f} {row['R2']:>10.4f} {row['MAPE']:>9.2f}%")
        
        # Highlight best
        best = target_df.iloc[0]
        print(f"\nüèÜ BEST: {best['model']} (MAE: {best['MAE']:.6f})")


# ============================================================================
# MAIN EXECUTION
# ============================================================================

def main():
    """Execute complete model training pipeline."""
    
    print("\n" + "="*60)
    print("EV TRANSITION FORECASTING - MODEL TRAINING")
    print("="*60)
    print("\nModels to train:")
    print("  1. Linear Regression (Baseline)")
    print("  2. Ridge Regression (L2)")
    print("  3. Lasso Regression (L1)")
    print("  4. ElasticNet (L1+L2)")
    print("  5. Random Forest")
    print("  6. LightGBM")
    print("  7. ENSEMBLE (Top 3)")
    print("\nTargets: ev_share_t+1, ev_share_t+2, ev_share_t+3")
    print("="*60)
    
    # Load data
    df, feature_cols, identifier_cols, target_cols = load_and_prepare_data(INPUT_FILE)
    
    # Encode categorical
    df, feature_cols, encoders = encode_categorical(df, feature_cols)
    
    # Handle missing values
    df = handle_missing_values(df, feature_cols)
    
    # Train/test split
    X_train, X_test, targets, train_ids, test_ids = create_train_test_split(
        df, feature_cols, target_cols
    )
    
    # Scale features
    X_train_scaled, X_test_scaled, scaler = scale_features(X_train, X_test)
    
    # Train all models
    trained_models, all_results, all_predictions = train_all_models(
        X_train, X_train_scaled, X_test, X_test_scaled, targets
    )
    
    # Create ensemble
    ensemble_predictions, ensemble_results = create_ensemble(all_predictions, targets)
    
    # Save everything
    save_models(trained_models, scaler, encoders)
    save_results(all_results, ensemble_results, test_ids, all_predictions, 
                 ensemble_predictions, targets)
    
    # Final summary
    print_final_summary(all_results, ensemble_results)
    
    print("\n" + "="*60)
    print("TRAINING COMPLETE ‚úÖ")
    print("="*60)
    print(f"\nOutputs saved to:")
    print(f"  Models: {OUTPUT_DIR}")
    print(f"  Predictions: {PREDICTIONS_DIR}")
    print("\n" + "="*60)


if __name__ == "__main__":
    main()


EV TRANSITION FORECASTING - MODEL TRAINING

Models to train:
  1. Linear Regression (Baseline)
  2. Ridge Regression (L2)
  3. Lasso Regression (L1)
  4. ElasticNet (L1+L2)
  5. Random Forest
  6. LightGBM
  7. ENSEMBLE (Top 3)

Targets: ev_share_t+1, ev_share_t+2, ev_share_t+3

STEP 1: LOADING & PREPROCESSING DATA
‚úì Loaded dataset: (216, 28)

‚úì Identifiers: 3 - ['state', 'year', 'vehicle_segment']
‚úì Features: 22
‚úì Targets: 3 - ['ev_share_t+1', 'ev_share_t+2', 'ev_share_t+3']

STEP 2: ENCODING CATEGORICAL VARIABLES
‚úì Encoded 'state': 18 unique values
‚úì Encoded 'vehicle_segment': 3 unique values

STEP 3: HANDLING MISSING VALUES

‚ö†Ô∏è  Columns with missing values:
   infra_yoy_growth_t-1: 72 (33.3%)
   fast_charger_index_t-1: 54 (25.0%)
   stations_per_10k_vehicles_t-1: 54 (25.0%)
   ‚Üí Filled 'infra_yoy_growth_t-1' with median: 0.1538
   ‚Üí Filled 'fast_charger_index_t-1' with median: 2.5743
   ‚Üí Filled 'stations_per_10k_vehicles_t-1' with median: 0.2559

‚úì All miss