# In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import warnings
warnings.filterwarnings('ignore')


def engineer_climate_features(df):
    """
    Derive climate features used for fire risk prediction.

    Reads the ``tmax``, ``tmin`` and ``prec`` columns and writes the derived
    columns directly onto ``df``; the same DataFrame object is returned.
    """
    print("Engineering climate features...")

    tmax = df['tmax']
    tmin = df['tmin']
    prec = df['prec']

    # Spread and midpoint of the two temperature extremes
    temp_spread = tmax - tmin
    df['temp_range'] = temp_spread
    df['temp_avg'] = (tmax + tmin) / 2

    # Aridity grows with warmth and shrinks with rainfall; the +1 keeps the
    # denominator non-zero when no precipitation was recorded.
    df['aridity_index'] = df['temp_avg'] / (prec + 1)

    # Degrees by which tmax exceeds 30, plus a flag for tmax above 35
    df['heat_stress'] = np.maximum(0, tmax - 30)
    df['extreme_heat'] = tmax.gt(35).astype(int)

    # Four precipitation classes (right-closed intervals)
    precip_edges = [-np.inf, 100, 300, 600, np.inf]
    precip_names = ['very_low', 'low', 'moderate', 'high']
    df['precip_category'] = pd.cut(prec, bins=precip_edges, labels=precip_names)

    # Drought indicator: temperature spread scaled by aridity
    df['drought_index'] = temp_spread * df['aridity_index']

    # Simplified evapotranspiration estimate and the resulting shortfall
    # against precipitation (never negative)
    df['et_proxy'] = df['temp_avg'] * 0.5
    df['water_deficit'] = np.maximum(0, df['et_proxy'] - prec)

    return df


def engineer_soil_features(df):
    """
    Engineer soil features relevant to fire prediction.

    Every expected soil column that is present is first made numeric and
    has the ``-4`` missing-value sentinel replaced by NaN; indicator and
    categorical features are then derived from whichever columns exist.
    Columns are written onto ``df`` in place and ``df`` is returned.

    Notes
    -----
    * Threshold indicators (``> x``) evaluate to 0 where the source value
      is missing, because a comparison with NaN is False.
    * For nitrogen, CEC and electrical conductivity a reading of exactly 0
      falls into the lowest class instead of becoming NaN.
    """
    print("Engineering soil features...")

    soil_cols = ['COARSE', 'SAND', 'SILT', 'CLAY', 'BULK', 'REF_BULK',
                 'ORG_CARBON', 'PH_WATER', 'TOTAL_N', 'CN_RATIO',
                 'CEC_SOIL', 'CEC_CLAY', 'CEC_EFF', 'TEB', 'BSAT',
                 'ALUM_SAT', 'ESP', 'TCARBON_EQ', 'GYPSUM', 'ELEC_COND']

    for col in soil_cols:
        if col not in df.columns:
            continue
        # Text-typed columns must become numeric *before* the sentinel is
        # replaced; otherwise the string '-4' never matches the number -4
        # and survives as if it were a real measurement.
        if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], errors='coerce')
        df[col] = df[col].replace(-4, np.nan)

    # === TEXTURE FEATURES ===
    # Soil texture influences water retention and fire behavior
    if all(col in df.columns for col in ['SAND', 'SILT', 'CLAY']):
        # Sandy soils dry quickly (higher fire risk)
        df['sand_dominant'] = (df['SAND'] > 60).astype(int)

        # Clay soils retain moisture (lower fire risk)
        df['clay_dominant'] = (df['CLAY'] > 35).astype(int)

        # Loamy soils (balanced)
        df['loam_indicator'] = (df['SAND'].between(40, 60) &
                                df['CLAY'].between(15, 35)).astype(int)

        # Texture balance ratio (+1 avoids division by zero for clay-free soil)
        df['sand_clay_ratio'] = df['SAND'] / (df['CLAY'] + 1)

    # === ORGANIC MATTER FEATURES ===
    # Organic matter = fuel for fires
    if 'ORG_CARBON' in df.columns:
        df['high_organic_matter'] = (df['ORG_CARBON'] > 2).astype(int)
        # Median-filled copy so downstream products are defined on every row
        df['organic_fuel_load'] = df['ORG_CARBON'].fillna(df['ORG_CARBON'].median())

    # === SOIL DENSITY FEATURES ===
    # Bulk density affects soil moisture and root penetration
    if 'BULK' in df.columns:
        df['bulk_density_class'] = pd.cut(df['BULK'],
                                          bins=[0, 1.2, 1.4, 1.6, np.inf],
                                          labels=['very_low', 'low', 'moderate', 'high'])

        # Compaction indicator
        df['soil_compaction'] = (df['BULK'] > 1.5).astype(int)

    # === CHEMICAL FEATURES ===
    # pH affects vegetation and decomposition
    if 'PH_WATER' in df.columns:
        df['soil_acidity'] = pd.cut(df['PH_WATER'],
                                    bins=[0, 5.5, 6.5, 7.5, np.inf],
                                    labels=['acidic', 'slightly_acidic', 'neutral', 'alkaline'])

        df['extreme_ph'] = ((df['PH_WATER'] < 5) | (df['PH_WATER'] > 8)).astype(int)

    # === NUTRIENT FEATURES ===
    # Nutrient availability affects vegetation (fuel) growth
    if 'TOTAL_N' in df.columns:
        # include_lowest keeps a reading of exactly 0 in the first class
        df['nitrogen_level'] = pd.cut(df['TOTAL_N'],
                                      bins=[0, 0.5, 1.5, 3, np.inf],
                                      labels=['very_low', 'low', 'moderate', 'high'],
                                      include_lowest=True)

    if 'CN_RATIO' in df.columns:
        # C/N ratio affects decomposition rate
        df['slow_decomposition'] = (df['CN_RATIO'] > 25).astype(int)

    # === CEC FEATURES (Cation Exchange Capacity) ===
    # Indicates soil fertility and water holding capacity
    if 'CEC_SOIL' in df.columns:
        df['cec_category'] = pd.cut(df['CEC_SOIL'],
                                    bins=[0, 10, 20, 30, np.inf],
                                    labels=['low', 'moderate', 'high', 'very_high'],
                                    include_lowest=True)

        df['high_fertility'] = (df['CEC_SOIL'] > 25).astype(int)

    # === SALINITY FEATURES ===
    if 'ELEC_COND' in df.columns:
        df['saline_soil'] = (df['ELEC_COND'] > 4).astype(int)
        df['salinity_stress'] = pd.cut(df['ELEC_COND'],
                                       bins=[0, 2, 4, 8, np.inf],
                                       labels=['non_saline', 'slightly', 'moderate', 'high'],
                                       include_lowest=True)

    # === COARSE FRAGMENTS ===
    if 'COARSE' in df.columns:
        df['rocky_soil'] = (df['COARSE'] > 15).astype(int)

    return df


def engineer_elevation_features(df):
    """
    Engineer elevation features relevant to fire prediction.

    Requires an ``elevation`` column; when it is absent a warning is printed
    and ``df`` is returned unchanged. ``temp_avg`` and ``prec`` are optional
    and unlock the climate-adjusted features. Columns are written onto
    ``df`` in place and ``df`` is returned.

    Notes
    -----
    * ``elevation_zscore`` is 0 for every row when the elevation has no
      spread (constant column or a single row) instead of NaN/inf.
    * ``elevation_log`` is a sign-preserving ``log1p`` so that elevations
      below sea level stay finite; for elevations >= 0 it equals
      ``log1p(elevation)``.
    """
    print("Engineering elevation features...")

    if 'elevation' not in df.columns:
        print("Warning: elevation column not found")
        return df

    elevation = df['elevation']

    # === ELEVATION CATEGORIES ===
    # Different elevation zones have different fire risks
    df['elevation_category'] = pd.cut(elevation,
                                      bins=[-np.inf, 200, 500, 1000, 2000, np.inf],
                                      labels=['lowland', 'low_hills', 'hills', 'mountains', 'high_mountains'])

    # === TOPOGRAPHIC INDICATORS ===
    df['lowland_area'] = (elevation < 200).astype(int)
    df['mountain_area'] = (elevation > 1000).astype(int)
    df['high_altitude'] = (elevation > 2000).astype(int)

    # === ELEVATION-BASED CLIMATE ADJUSTMENTS ===
    df['elevation_km'] = elevation / 1000

    if 'temp_avg' in df.columns:
        # Lapse-rate correction of 6.5 degrees per 1000 m
        df['temp_adjusted'] = df['temp_avg'] - (df['elevation_km'] * 6.5)

        # Min-max normalised temperature; skipped when all values are equal.
        # NOTE(review): this normalises the unadjusted temp_avg, not
        # temp_adjusted - confirm that is the intended input.
        temp_range = df['temp_avg'].max() - df['temp_avg'].min()
        if temp_range > 0:
            df['relative_temp_elev'] = (df['temp_avg'] - df['temp_avg'].min()) / temp_range

    if 'prec' in df.columns:
        # Precipitation often increases with elevation (orographic effect)
        df['prec_elevation_interaction'] = df['prec'] * df['elevation_km']

        # Low elevation + low precipitation = high fire risk
        df['dry_lowland'] = ((elevation < 500) & (df['prec'] < 300)).astype(int)

    # === POSITION WITHIN THE DATASET ===
    df['elevation_percentile'] = elevation.rank(pct=True)

    # Standard score; guarded because std is 0 for constant data and NaN for
    # a single row, which would otherwise yield NaN/inf in every row.
    elev_mean = elevation.mean()
    elev_std = elevation.std()
    if pd.notna(elev_std) and elev_std > 0:
        df['elevation_zscore'] = (elevation - elev_mean) / elev_std
    else:
        df['elevation_zscore'] = 0.0
    df['unusual_elevation'] = (np.abs(df['elevation_zscore']) > 2).astype(int)

    # === NON-LINEAR TRANSFORMS ===
    # Quadratic term to capture non-linear effects
    df['elevation_squared'] = elevation ** 2

    # Log transform for skewed distributions. Plain log1p is undefined at or
    # below -1, so the magnitude is transformed and the sign restored.
    df['elevation_log'] = np.sign(elevation) * np.log1p(np.abs(elevation))

    return df


def engineer_landcover_features(df):
    """
    Engineer land cover features from LCCCODE.

    ``LCCCODE`` is cast to string; an entry may list several codes separated
    by ``/`` (e.g. ``'0003 / 0004'``), in which case the first code is the
    primary class used for all class flags. When the column is absent a
    warning is printed and ``df`` is returned unchanged. Columns are written
    onto ``df`` in place and ``df`` is returned.

    Added columns: ``lcc_primary``, ``lcc_mixed``, ``lcc_count``, one
    ``lcc_*`` flag per land cover class, ``flooded_area``,
    ``woody_vegetation``, ``herbaceous_vegetation``, ``fuel_load`` (0-3),
    ``high_fuel_load`` and ``fire_prone_landcover``.
    """
    print("Engineering land cover features...")

    if 'LCCCODE' not in df.columns:
        print("Warning: LCCCODE column not found")
        return df

    FOREST_CODES = {
        '21496', '21497', '21499', '20132', '20134', '20135', '20797', '20809'
    }
    SHRUBLAND_CODES = {'21450', '21517', '21518', '21520', '20056'}
    GRASSLAND_CODES = {'21454', '21465', '20038', '20058'}
    CROPLAND_CODES = {
        '11490', '11491', '11494', '11495', '11498', '11499', '11500', '30001', '0003'
    }
    URBAN_CODES = {'0010'}
    BARREN_CODES = {'0011', '6001', '6004', '6020'}
    WATER_CODES = {'7001', '8001', '0007'}
    SNOW_CODES = {'8005', '8008'}

    # Matched against the first three characters of the primary code
    FLOODED_CODES_PREFIX = {'416', '417', '415', '423'}

    df['LCCCODE'] = df['LCCCODE'].astype(str)
    code_parts = df['LCCCODE'].str.split('/')
    df['lcc_primary'] = code_parts.str[0].str.strip()

    # Mosaic: more than one code in the entry
    df['lcc_mixed'] = df['LCCCODE'].str.contains('/').astype(np.int8)
    df['lcc_count'] = code_parts.apply(len).astype(np.int8)

    # Core land cover flags, all based on the primary code
    class_codes = {
        'lcc_forest': FOREST_CODES,
        'lcc_shrubland': SHRUBLAND_CODES,
        'lcc_grassland': GRASSLAND_CODES,
        'lcc_cropland': CROPLAND_CODES,
        'lcc_urban': URBAN_CODES,
        'lcc_barren': BARREN_CODES,
        'lcc_water': WATER_CODES,
        'lcc_snow_ice': SNOW_CODES,
    }
    for flag_name, codes in class_codes.items():
        df[flag_name] = df['lcc_primary'].isin(codes).astype(np.int8)

    # Flooding / moisture
    df['flooded_area'] = df['lcc_primary'].str[:3].isin(FLOODED_CODES_PREFIX).astype(np.int8)

    # Vegetation structure
    is_woody = df['lcc_forest'] | df['lcc_shrubland']
    is_herbaceous = df['lcc_grassland'] | df['lcc_cropland']
    df['woody_vegetation'] = is_woody.astype(np.int8)
    df['herbaceous_vegetation'] = is_herbaceous.astype(np.int8)

    # Fuel load: 3 = woody, 2 = herbaceous, 1 = barren/urban/water, 0 = other
    df['fuel_load'] = np.select(
        [
            is_woody.astype(bool),
            is_herbaceous.astype(bool),
            (df['lcc_barren'] | df['lcc_urban'] | df['lcc_water']).astype(bool),
        ],
        [3, 2, 1],
        default=0
    ).astype(np.int8)

    # Binary flag for the top fuel class. The interaction and aggregation
    # steps look this column up by name and silently skip their fuel-based
    # features when it does not exist.
    df['high_fuel_load'] = (df['fuel_load'] == 3).astype(np.int8)

    # Fire-prone land cover: vegetated (fuel >= 2) and not flooded
    df['fire_prone_landcover'] = (
        (df['fuel_load'] >= 2) &
        (df['flooded_area'] == 0)
    ).astype(np.int8)

    return df


def create_interaction_features(df):
    """
    Create interaction features between climate, soil, elevation, and land cover.

    Each feature is created only when every column it needs is already
    present in ``df``; otherwise it is skipped without a message. Columns
    are written onto ``df`` in place and ``df`` is returned.

    Notes
    -----
    Ratios that divide by ``temp_avg + 1``, ``elevation + 1`` or
    ``elevation_km + 1`` clip the divided-by quantity at 0 first. Sub-zero
    temperatures and below-sea-level elevations would otherwise make the
    denominator zero (infinite result) or negative (sign flip). Results for
    non-negative inputs are unaffected.
    """
    print("Creating comprehensive interaction features...")

    def _has(*names):
        """Return True when every named column exists in df."""
        return all(name in df.columns for name in names)

    # === CLIMATE-SOIL INTERACTIONS ===
    # Soil moisture proxy (inverse relationship with fire risk)
    if _has('CLAY', 'prec', 'temp_avg'):
        df['moisture_proxy'] = (df['CLAY'] * df['prec']) / (df['temp_avg'].clip(lower=0) + 1)

    # Fuel moisture content indicator
    if _has('organic_fuel_load', 'prec', 'aridity_index'):
        df['fuel_moisture_deficit'] = (df['organic_fuel_load'] * df['aridity_index']) / (df['prec'] + 1)

    # Dryness-texture interaction
    if _has('SAND', 'drought_index'):
        df['sandy_drought_risk'] = df['SAND'] * df['drought_index'] / 100

    # Water holding capacity vs evapotranspiration
    if _has('CLAY', 'water_deficit'):
        df['water_stress_index'] = df['water_deficit'] / (df['CLAY'] + 1)

    # === ELEVATION-CLIMATE INTERACTIONS ===
    # High temperature at low elevation (extreme fire risk)
    if _has('elevation', 'temp_avg'):
        df['lowland_heat_risk'] = (df['temp_avg'] * 100) / (df['elevation'].clip(lower=0) + 1)

    # Aridity at different elevations
    if _has('aridity_index', 'elevation_km'):
        df['elevation_aridity'] = df['aridity_index'] / (df['elevation_km'].clip(lower=0) + 1)

    # Precipitation deficit by elevation
    if _has('water_deficit', 'lowland_area'):
        df['lowland_water_deficit'] = df['water_deficit'] * df['lowland_area']

    # === LAND COVER-CLIMATE INTERACTIONS ===
    # Fire-prone vegetation in dry conditions
    if _has('fire_prone_landcover', 'aridity_index'):
        df['dry_fire_prone_area'] = df['fire_prone_landcover'] * df['aridity_index']

    # High fuel load with drought (skipped unless 'high_fuel_load' exists)
    if _has('high_fuel_load', 'drought_index'):
        df['fuel_drought_risk'] = df['high_fuel_load'] * df['drought_index']

    # Woody vegetation in hot conditions
    if _has('woody_vegetation', 'heat_stress'):
        df['woody_heat_risk'] = df['woody_vegetation'] * df['heat_stress']

    # === LAND COVER-SOIL INTERACTIONS ===
    # Organic matter in forested areas (high fuel)
    if _has('lcc_forest', 'organic_fuel_load'):
        df['forest_organic_fuel'] = df['lcc_forest'] * df['organic_fuel_load']

    # Sandy soil with grassland (rapid drying)
    if _has('lcc_grassland', 'sand_dominant'):
        df['sandy_grassland'] = df['lcc_grassland'] * df['sand_dominant']

    # === ELEVATION-LAND COVER INTERACTIONS ===
    # Forest at low elevation (different fire dynamics)
    if _has('lcc_forest', 'lowland_area'):
        df['lowland_forest'] = df['lcc_forest'] * df['lowland_area']

    # Mountain grassland (alpine meadows - different fire regime)
    if _has('lcc_grassland', 'mountain_area'):
        df['mountain_grassland'] = df['lcc_grassland'] * df['mountain_area']

    # === TRIPLE INTERACTIONS ===
    # Fire-prone landcover + dry climate + low elevation
    if _has('fire_prone_landcover', 'aridity_index', 'lowland_area'):
        df['triple_risk_lowland'] = df['fire_prone_landcover'] * df['aridity_index'] * df['lowland_area']

    # Fuel load + drought + sandy soil (skipped unless 'high_fuel_load' exists)
    if _has('high_fuel_load', 'drought_index', 'sand_dominant'):
        df['triple_risk_fuel_drought_sand'] = df['high_fuel_load'] * df['drought_index'] * df['sand_dominant']

    return df


def _min_max_scale(series):
    """Scale ``series`` linearly onto [0, 1]; data without spread maps to 0."""
    low = series.min()
    span = series.max() - low
    # span is 0 for constant data and NaN for empty / all-missing data; both
    # would turn the division into NaN for every row.
    if not span > 0:
        return series * 0.0
    return (series - low) / span


def create_aggregated_features(df):
    """
    Create aggregated risk indices.

    Three indices are built, each as the unweighted row-wise mean of the
    components that are available in ``df``:

    * ``composite_fire_risk`` - aridity, heat stress, sandy soil, organic
      matter, lowland, fire-prone land cover and high fuel load.
    * ``fuel_availability_index`` - organic fuel load and high fuel load.
    * ``environmental_stress_index`` - water deficit and heat stress.

    Continuous components are min-max scaled to 0-1 first (a constant
    column contributes 0); binary flags are used as they are. An index is
    omitted when none of its components exist. A missing value in any
    component makes the index NaN for that row. Columns are written onto
    ``df`` in place and ``df`` is returned.
    """
    print("Creating aggregated risk indices...")

    def _scaled(column):
        """Min-max scaled copy of a column, or None when it is absent."""
        return _min_max_scale(df[column]) if column in df.columns else None

    def _flag(column):
        """The column itself, or None when it is absent."""
        return df[column] if column in df.columns else None

    def _assign_mean(target, candidates):
        """Store the row-wise mean of the available candidates in ``target``."""
        components = [c for c in candidates if c is not None]
        if components:
            df[target] = np.mean(components, axis=0)

    # Heat stress feeds two indices; scale it once
    heat_scaled = _scaled('heat_stress')

    _assign_mean('composite_fire_risk', [
        _scaled('aridity_index'),        # climate risk
        heat_scaled,
        _flag('sand_dominant'),          # soil risk
        _flag('high_organic_matter'),
        _flag('lowland_area'),           # elevation risk
        _flag('fire_prone_landcover'),   # land cover risk
        _flag('high_fuel_load'),
    ])

    # === DOMAIN-SPECIFIC INDICES ===
    _assign_mean('fuel_availability_index', [
        _scaled('organic_fuel_load'),
        _flag('high_fuel_load'),
    ])

    _assign_mean('environmental_stress_index', [
        _scaled('water_deficit'),
        heat_scaled,
    ])

    return df


def handle_missing_values_efficiently(df, strategy='median'):
    """
    Fill missing values in numerical columns with a per-column statistic.

    Parameters
    ----------
    df : pandas DataFrame
        Frame to impute; it is modified in place and returned.
    strategy : {'median', 'mean'}
        Statistic used as the fill value (default ``'median'``).

    Raises
    ------
    ValueError
        If ``strategy`` is not one of the supported names. Previously an
        unknown name returned the data un-imputed without any notice.

    Notes
    -----
    ``latitude``, ``longitude`` and ``class`` are never imputed. A column
    that is entirely NaN stays NaN because its fill value is NaN too.
    """
    print("Handling missing values...")

    if strategy not in ('median', 'mean'):
        raise ValueError(
            f"Unknown imputation strategy {strategy!r}; expected 'median' or 'mean'"
        )

    # Exclude non-feature columns
    exclude_cols = {'latitude', 'longitude', 'class'}
    num_cols = [col for col in df.select_dtypes(include=[np.number]).columns
                if col not in exclude_cols]

    # Nothing to impute when the frame has no numerical feature columns
    if not num_cols:
        return df

    numeric = df[num_cols]
    fill_values = numeric.median() if strategy == 'median' else numeric.mean()
    df[num_cols] = numeric.fillna(fill_values)

    return df


def encode_categorical_features(df):
    """
    One-hot encode the categorical/object columns of ``df``.

    ``LCCCODE``, ``lcc_primary``, ``TEXTURE_USDA`` and ``TEXTURE_SOTER`` are
    left as they are. The first level of every encoded column is dropped and
    the dummy columns are integer typed. When there is nothing to encode the
    input frame itself is returned.
    """
    print("Encoding categorical features...")

    # Original categorical columns and land cover codes stay un-encoded
    protected = ('LCCCODE', 'lcc_primary', 'TEXTURE_USDA', 'TEXTURE_SOTER')

    candidates = df.select_dtypes(include=['category', 'object']).columns
    to_encode = [name for name in candidates if name not in protected]

    if not to_encode:
        return df

    return pd.get_dummies(df, columns=to_encode, drop_first=True, dtype=int)


def remove_outliers_efficiently(df, contamination=0.01):
    """
    Optional: remove rows holding extreme outliers, using the IQR method.

    A row is dropped when any checked numerical column lies more than
    3 * IQR below its first quartile or above its third quartile.

    Parameters
    ----------
    df : pandas DataFrame
        Input frame; it is not modified.
    contamination : float
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    pandas DataFrame
        The retained rows with a fresh 0..n-1 index.

    Notes
    -----
    ``latitude``, ``longitude`` and ``class`` are never checked. Columns
    whose IQR is zero are skipped as well: for binary flags and one-hot
    dummies the bounds would collapse to a single value and every row
    carrying the less common value would be discarded.
    """
    print("Checking for outliers...")

    exclude_cols = {'latitude', 'longitude', 'class'}
    num_cols = [col for col in df.select_dtypes(include=[np.number]).columns
                if col not in exclude_cols]

    # Start from "no row is an outlier" so frames without usable columns
    # pass through untouched.
    has_outlier = pd.Series(np.zeros(len(df), dtype=bool), index=df.index)

    if num_cols:
        q1 = df[num_cols].quantile(0.25)
        q3 = df[num_cols].quantile(0.75)
        iqr = q3 - q1

        # Only columns with real spread can be judged by IQR bounds
        checked = list(iqr.index[iqr > 0])
        if checked:
            lower_bound = (q1 - 3 * iqr)[checked]
            upper_bound = (q3 + 3 * iqr)[checked]
            values = df[checked]
            has_outlier = ((values < lower_bound) | (values > upper_bound)).any(axis=1)

    mask = ~has_outlier

    outliers_removed = len(df) - mask.sum()
    # Guard the percentage against an empty input frame
    removed_pct = outliers_removed / len(df) * 100 if len(df) else 0.0
    print(f"Removed {outliers_removed} rows with extreme outliers ({removed_pct:.2f}%)")

    return df[mask].reset_index(drop=True)


# ============================================================================
# MAIN FEATURE ENGINEERING PIPELINE
# ============================================================================

def feature_engineering_pipeline(df, remove_outliers=False):
    """
    Run every feature engineering step in order on a copy of ``df``.

    Parameters:
    -----------
    df : pandas DataFrame
        Input dataframe with raw features; it is left unmodified.
    remove_outliers : bool
        Whether to drop rows with extreme outliers at the end (default: False)

    Returns:
    --------
    df_engineered : pandas DataFrame
        Dataframe with engineered features

    A progress banner and a summary of the resulting feature counts are
    printed along the way.
    """
    banner = "=" * 70

    print(banner)
    print("STARTING COMPREHENSIVE FEATURE ENGINEERING PIPELINE")
    print(banner)
    print(f"Initial shape: {df.shape}")

    # Work on a copy so the caller's frame is not modified
    df_eng = df.copy()

    # Base features first, then interactions and aggregates that build on them
    feature_stages = (
        engineer_climate_features,
        engineer_soil_features,
        engineer_elevation_features,
        engineer_landcover_features,
        create_interaction_features,
        create_aggregated_features,
    )
    for stage in feature_stages:
        df_eng = stage(df_eng)

    # Imputation, then one-hot encoding
    df_eng = handle_missing_values_efficiently(df_eng, strategy='median')
    df_eng = encode_categorical_features(df_eng)

    # Optional outlier removal
    if remove_outliers:
        df_eng = remove_outliers_efficiently(df_eng)

    print(banner)
    print("FEATURE ENGINEERING COMPLETE")
    print(banner)
    print(f"Final shape: {df_eng.shape}")
    print(f"New features created: {df_eng.shape[1] - df.shape[1]}")

    # Summary of feature types
    print("\nFeature Summary:")
    print(f"- Total features: {df_eng.shape[1]}")
    print(f"- Numerical features: {len(df_eng.select_dtypes(include=[np.number]).columns)}")
    print(f"- Categorical features: {len(df_eng.select_dtypes(include=['object', 'category']).columns)}")

    def _columns_mentioning(fragments):
        """Columns whose lower-cased name contains any of the fragments."""
        return [col for col in df_eng.columns
                if any(fragment in col.lower() for fragment in fragments)]

    # Feature breakdown by category (substring match on the column name)
    climate_features = _columns_mentioning(
        ['temp', 'prec', 'arid', 'drought', 'heat', 'water', 'et_'])
    soil_features = _columns_mentioning(
        ['sand', 'clay', 'silt', 'bulk', 'organic', 'ph', 'cec', 'nitrogen', 'salin'])
    elevation_features = _columns_mentioning(
        ['elevation', 'altitude', 'lowland', 'mountain'])
    landcover_features = _columns_mentioning(
        ['lcc', 'forest', 'grass', 'crop', 'fuel', 'vegetation', 'woody'])

    print("\nFeature Categories:")
    print(f"- Climate features: {len(climate_features)}")
    print(f"- Soil features: {len(soil_features)}")
    print(f"- Elevation features: {len(elevation_features)}")
    print(f"- Land cover features: {len(landcover_features)}")

    return df_eng



# Script entry point: read the merged input table, run the full feature
# engineering pipeline and write the result to a CSV file.
if __name__ == "__main__":
    # Input path is hard-coded
    df = pd.read_csv("/content/merged_data_with_soil.csv")

    print(f"Original data shape: {df.shape}")
    print(f"Original columns: {list(df.columns)}")

    # Run feature engineering (outlier removal is left disabled)
    df_engineered = feature_engineering_pipeline(df, remove_outliers=False)

    # Save engineered data to the current working directory, without the index
    print("\nSaving engineered data...")
    df_engineered.to_csv("fire_data_engineered.csv", index=False)
    print("Saved to: fire_data_engineered.csv")