# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import gc
import warnings
warnings.filterwarnings('ignore')

def reduce_memory_usage(df):
    """Downcast numeric columns of ``df`` to the smallest dtype that fits.

    The frame is modified in place and also returned. Only plain NumPy
    signed-integer, unsigned-integer and float columns are touched:

    * signed ints are narrowed to int8 / int16 / int32 when the column's
      min and max fit (bounds inclusive);
    * unsigned ints are narrowed to uint8 / uint16 / uint32 the same way;
    * floats wider than 32 bits are narrowed to float32 when min and max are
      inside the float32 range (this reduces precision).

    Columns named ``'latitude'`` or ``'longitude'`` are never changed, and
    every other dtype (bool, datetime, object/string, pandas extension
    dtypes, ...) is left as is. A column is never widened.

    Args:
        df: pandas DataFrame to optimise.

    Returns:
        The same DataFrame object, with narrowed column dtypes.

    Side effects:
        Prints the memory usage before and after, and the percentage saved.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage: {start_mem:.2f} MB')

    skip_columns = ('latitude', 'longitude')
    signed_targets = (np.int8, np.int16, np.int32)
    unsigned_targets = (np.uint8, np.uint16, np.uint32)
    float32_info = np.finfo(np.float32)

    for col in df.columns:
        if col in skip_columns:
            continue

        col_type = df[col].dtype
        # Restrict to real NumPy int/uint/float dtypes. Bool, datetime and
        # extension dtypes must not fall through to the float branch: that
        # would widen bools, corrupt large unsigned values, or raise when a
        # Timestamp/str is compared against numeric limits.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            targets = signed_targets if col_type.kind == 'i' else unsigned_targets
            for target in targets:
                # Targets are ordered by size; stop once they no longer shrink.
                if np.dtype(target).itemsize >= col_type.itemsize:
                    break
                limits = np.iinfo(target)
                # NaN min/max (empty column) fails both comparisons -> no cast.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        elif col_type.itemsize > 4:
            # Float wider than 32 bits. An all-NaN column has NaN min/max,
            # fails the comparison and keeps its dtype.
            if float32_info.min <= c_min and c_max <= float32_info.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage after optimization: {end_mem:.2f} MB')
    # Guard the ratio: a frame reporting zero bytes would divide by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print(f'Decreased by {decrease:.1f}%')
    return df

def _fill_missing_with_median(frame, columns):
    """Fill NaNs in each listed column of ``frame`` with that column's median."""
    for col in columns:
        if frame[col].isnull().any():
            # Assign the result back instead of calling
            # ``frame[col].fillna(..., inplace=True)``: the latter is chained
            # assignment, which is deprecated and is a silent no-op when
            # pandas Copy-on-Write is active.
            frame[col] = frame[col].fillna(frame[col].median())


def _add_engineered_features(chunk):
    """Add the derived temperature / aridity / soil / fire-risk columns in place."""
    columns = chunk.columns

    if 'tmax' in columns and 'tmin' in columns:
        chunk['temp_range'] = chunk['tmax'] - chunk['tmin']
        chunk['temp_mean'] = (chunk['tmax'] + chunk['tmin']) / 2

    # Ratios use precipitation only when it exceeds 0.1; otherwise a fixed
    # x10 multiplier stands in for the division.
    if 'tmax' in columns and 'precipitation' in columns:
        chunk['aridity_index'] = np.where(
            chunk['precipitation'] > 0.1,
            chunk['tmax'] / chunk['precipitation'],
            chunk['tmax'] * 10
        )

    # Re-read chunk.columns: 'temp_mean' may have just been created above.
    if 'precipitation' in chunk.columns and 'temp_mean' in chunk.columns:
        chunk['drought_stress'] = np.where(
            chunk['precipitation'] > 0.1,
            chunk['temp_mean'] / chunk['precipitation'],
            chunk['temp_mean'] * 10
        )

    if all(col in chunk.columns for col in ['SAND', 'CLAY']):
        chunk['sand_clay_ratio'] = np.where(
            chunk['CLAY'] > 1,
            chunk['SAND'] / chunk['CLAY'],
            chunk['SAND']
        )

    if all(col in chunk.columns for col in ['tmax', 'precipitation', 'ORG_CARBON']):
        chunk['fire_risk_score'] = np.where(
            chunk['precipitation'] > 0.1,
            (chunk['tmax'] * chunk['ORG_CARBON']) / chunk['precipitation'],
            chunk['tmax'] * chunk['ORG_CARBON'] * 10
        )


def _cap_extreme_values(chunk, protected=('latitude', 'longitude', 'class')):
    """Replace +/-inf, refill NaNs and clip numeric columns to the 0.1-99.9 percentile range."""
    for col in chunk.select_dtypes(include=[np.number]).columns:
        if col in protected:
            continue
        chunk[col] = chunk[col].replace([np.inf, -np.inf], np.nan)
        _fill_missing_with_median(chunk, [col])
        # Cap at the 0.1th / 99.9th percentile to remove extreme outliers
        upper_limit = chunk[col].quantile(0.999)
        lower_limit = chunk[col].quantile(0.001)
        chunk[col] = chunk[col].clip(lower_limit, upper_limit)


def preprocess_wildfire_data_efficient(csv_path, chunk_size=50000):
    """
    Memory-efficient preprocessing for large wildfire datasets.

    The CSV is read in chunks of ``chunk_size`` rows. For each chunk the
    function coerces ``ORG_CARBON`` to numeric, fills numeric NaNs with the
    chunk median, adds engineered features, caps outliers and downcasts
    dtypes via ``reduce_memory_usage``. The chunks are then concatenated,
    ``LCCCODE`` is reduced to its top 5 categories plus ``'Other'`` and
    one-hot encoded, a fixed set of other categorical columns is dropped, the
    data is split 80/20 (stratified on ``class``, ``random_state=42``) and a
    ``StandardScaler`` fitted on the training split is applied to both splits.

    Note that medians and clipping percentiles are computed per chunk, so
    results depend on ``chunk_size``. The initial median fill covers every
    numeric column, including ``class`` when it is numeric.

    Args:
        csv_path: Path (or anything ``pd.read_csv`` accepts) of the input CSV.
            It must contain a ``class`` column.
        chunk_size: Number of rows per chunk passed to ``pd.read_csv``.

    Returns:
        dict with keys ``'X_train'``, ``'X_test'``, ``'y_train'``,
        ``'y_test'``, ``'scaler'`` (the fitted ``StandardScaler``) and
        ``'feature_names'`` (list of ``X_train`` column names).

    Raises:
        ValueError: If the CSV yields no data chunks.
        KeyError: If the combined data has no ``class`` column.

    Side effects:
        Writes ``train_data_preprocessed.csv``, ``test_data_preprocessed.csv``
        and ``feature_names.csv`` to the current working directory and prints
        progress to stdout.
    """

    print("="*60)
    print("MEMORY-EFFICIENT WILDFIRE PREPROCESSING PIPELINE")
    print("="*60)

    # ============================================
    # 1. LOAD DATA IN CHUNKS AND GET INFO
    # ============================================
    print("\n[1/7] Loading data information...")

    # A small sample is enough to report the column count
    sample = pd.read_csv(csv_path, nrows=1000)
    print(f"Sample loaded. Columns: {len(sample.columns)}")
    del sample

    # ============================================
    # 2. PROCESS IN CHUNKS
    # ============================================
    print("\n[2/7] Processing data in chunks to save memory...")

    chunks_processed = []

    reader = pd.read_csv(csv_path, chunksize=chunk_size)
    for chunk_num, chunk in enumerate(reader, start=1):
        print(f"\nProcessing chunk {chunk_num} ({len(chunk)} rows)...")

        # Convert ORG_CARBON to numeric (unparseable entries become NaN)
        if 'ORG_CARBON' in chunk.columns:
            chunk['ORG_CARBON'] = pd.to_numeric(chunk['ORG_CARBON'], errors='coerce')

        # Fill missing values in every numeric column
        numeric_cols = chunk.select_dtypes(include=[np.number]).columns
        _fill_missing_with_median(chunk, numeric_cols)

        # Feature Engineering (simplified to save memory)
        _add_engineered_features(chunk)

        # Cap extreme values
        _cap_extreme_values(chunk)

        # Reduce memory
        chunk = reduce_memory_usage(chunk)

        chunks_processed.append(chunk)

        # Force garbage collection
        gc.collect()

    if not chunks_processed:
        # pd.concat would raise an opaque "No objects to concatenate".
        raise ValueError(f"No data rows found in {csv_path!r}")

    # ============================================
    # 3. COMBINE CHUNKS
    # ============================================
    print("\n[3/7] Combining processed chunks...")
    data = pd.concat(chunks_processed, ignore_index=True)
    del chunks_processed
    gc.collect()

    print(f"Total rows: {len(data)}")
    print(f"Total columns: {len(data.columns)}")

    # ============================================
    # 4. HANDLE CATEGORICAL VARIABLES (SIMPLIFIED)
    # ============================================
    print("\n[4/7] Encoding categorical variables...")

    # Only keep most important categorical features
    if 'LCCCODE' in data.columns:
        # Keep only top 5 categories; everything else becomes 'Other'
        top_cats = data['LCCCODE'].value_counts().nlargest(5).index
        data['LCCCODE'] = data['LCCCODE'].apply(lambda x: x if x in top_cats else 'Other')
        dummies = pd.get_dummies(data['LCCCODE'], prefix='LCC', drop_first=True, dtype=np.int8)
        data = pd.concat([data, dummies], axis=1)
        data.drop('LCCCODE', axis=1, inplace=True)

    # Drop other high-cardinality categorical columns to save memory
    cats_to_drop = ['TEXTURE_USDA', 'TEXTURE_SOTER', 'SMU', 'WISE30s_SMU_ID']
    present_cats = [col for col in cats_to_drop if col in data.columns]
    if present_cats:
        data.drop(present_cats, axis=1, inplace=True)

    gc.collect()

    # ============================================
    # 5. PREPARE FOR SCALING
    # ============================================
    print("\n[5/7] Preparing features for scaling...")

    protected_features = ['latitude', 'longitude', 'class']
    # Scale every integer / float column except the protected ones. Matching
    # on dtype kind covers all the sizes the memory reducer can produce.
    features_to_scale = [col for col in data.columns
                         if col not in protected_features
                         and data[col].dtype.kind in 'iuf']

    print(f"Features to scale: {len(features_to_scale)}")

    # ============================================
    # 6. SPLIT FIRST, THEN SCALE
    # ============================================
    print("\n[6/7] Splitting data...")

    X = data.drop(['class'], axis=1)
    y = data['class']

    del data
    gc.collect()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    del X, y
    gc.collect()

    print(f"Training samples: {len(X_train)}")
    print(f"Testing samples: {len(X_test)}")

    # ============================================
    # 7. FIT SCALER ON TRAINING DATA, APPLY TO BOTH SPLITS
    # ============================================
    print("\n[7/7] Scaling features (Standard Scaler only)...")

    # Fitting on the training split only keeps test statistics out of the scaler.
    scaler = StandardScaler()
    X_train[features_to_scale] = scaler.fit_transform(X_train[features_to_scale])
    X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])

    print("\n✓ Preprocessing complete!")

    # ============================================
    # 8. SAVE TO CSV
    # ============================================
    print("\n[8/8] Saving to CSV files...")

    # Work on copies so the returned X_train / X_test stay free of the target.
    train_data = X_train.copy()
    train_data['class'] = y_train.values
    test_data = X_test.copy()
    test_data['class'] = y_test.values

    train_data.to_csv('train_data_preprocessed.csv', index=False)
    print("✓ Saved: train_data_preprocessed.csv")

    test_data.to_csv('test_data_preprocessed.csv', index=False)
    print("✓ Saved: test_data_preprocessed.csv")

    # Save feature names
    feature_info = pd.DataFrame({
        'feature_name': X_train.columns.tolist()
    })
    feature_info.to_csv('feature_names.csv', index=False)
    print("✓ Saved: feature_names.csv")

    # ============================================
    # SUMMARY
    # ============================================
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    print(f"Training samples: {len(X_train)}")
    print(f"Testing samples: {len(X_test)}")
    print(f"Total features: {len(X_train.columns)}")
    print(f"\nClass distribution in training set:")
    print(y_train.value_counts())
    print(f"\nClass balance: {y_train.value_counts(normalize=True).round(3)}")

    return {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
        'scaler': scaler,
        'feature_names': X_train.columns.tolist()
    }





if __name__ == "__main__":
    # Script entry point: run the full pipeline on the merged soil dataset
    # and keep the returned splits/scaler in `results` for interactive use.
    results = preprocess_wildfire_data_efficient(
        csv_path='/content/merged_data_with_soil.csv'
    )