In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline as SklearnPipeline # <-- Original Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import sys
import os

from sklearn.ensemble import RandomForestClassifier # Our Model (Same)
from sklearn.metrics import accuracy_score, classification_report

# --- UPDATE 1: Import the "Weapons" (Imblearn) ---
# Both names imported here are required further down when the SMOTE pipeline is
# built, so a missing 'imbalanced-learn' install is fatal for this script.
# Print the install hint and re-raise (same policy as the project import
# below) instead of continuing and failing later with a confusing NameError.
try:
    from imblearn.pipeline import Pipeline as ImbPipeline # <-- SPECIAL Pipeline for SMOTE
    from imblearn.over_sampling import SMOTE
except ImportError:
    print("ERROR: 'imbalanced-learn' library not found.")
    print("Please run 'pip install imbalanced-learn' in the terminal.")
    raise
else:
    # Only reached when both imports succeeded.
    print("SUCCESS: 'imbalanced-learn' (SMOTE) library imported.")

# --- System Integration (Same) ---
# Make the parent of the current working directory importable so that the
# 'src' package can be resolved from this script's location.
_parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(_parent_dir)

try:
    from src.processing.classification import SubgroupDataPreprocessor
except ImportError:
    # Without the project preprocessor nothing below can run: report and abort.
    print("ERROR: Could not import from 'src/processing/classification.py'.")
    raise
else:
    print("SUCCESS: 'src.processing.classification.SubgroupDataPreprocessor' imported.")

# --- 1-3. Data Preparation (Same) ---
# Label column; the same name is handed to the project preprocessor.
target = 'Disaster Subgroup'

# Read the raw CSV and pass it through SubgroupDataPreprocessor.fit_transform
# (its behavior is defined in src/processing/classification.py).
file_path = "../1900_2021_DISASTERS.xlsx - emdat data.csv"
df_raw = pd.read_csv(file_path)
preprocessor = SubgroupDataPreprocessor(target_col=target)
df_clean = preprocessor.fit_transform(df_raw)

# Candidate input columns, grouped by the kind of transformer they get.
categorical_features = ['Continent', 'Disaster Group']
numerical_features = ['Total Deaths', 'No Injured', 'No Affected', 'Start Year']

# Keep only the candidates actually present in the processed frame,
# categorical ones first, then numerical, each in declared order.
_candidate_features = [*categorical_features, *numerical_features]
available_features = [name for name in _candidate_features if name in df_clean.columns]

# Feature matrix and label vector.
X = df_clean[available_features]
y = df_clean[target]

# Filter Rare Classes (Same)
# Labels that occur fewer than two times are dropped before the split below,
# which stratifies on the label column.
class_counts = y.value_counts()
rare_classes = class_counts.index[class_counts < 2]

# Keep every label that is not rare, then pull the matching feature rows
# by index label.
_is_rare = y.isin(rare_classes)
y_filtered = y[~_is_rare]
X_filtered = X.loc[y_filtered.index]
print(f"Rare classes removed. New row count: {len(y_filtered)}")

# Split the data (Same)
# CRITICAL LESSON: SMOTE is NEVER applied to the test set.
# The 'train_test_split' therefore happens FIRST, before any resampling.
# 80/20 split, fixed seed, stratified on the filtered labels.
_split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": y_filtered,
}
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered, y_filtered, **_split_options
)
print(f"Training set size: {len(X_train)}, Test set size: {len(X_test)}")

# --- 4. Advanced Model Pipeline (ColumnTransformer) (Same) ---
# Our plumbing (data processing) DOES NOT CHANGE.

# Columns each branch operates on: only those that survived the
# availability check, in their declared order.
_numeric_columns = [f for f in numerical_features if f in available_features]
_categorical_columns = [f for f in categorical_features if f in available_features]

# Numeric branch: median imputation followed by standard scaling.
numerical_transformer = SklearnPipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the literal 'Missing', then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = SklearnPipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group to its branch.
preprocessor_pipeline = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, _numeric_columns),
        ('cat', categorical_transformer, _categorical_columns),
    ]
)

# --- UPDATE 2: The "Final System" Pipeline with SMOTE ---
# We are COMBINING the "System" (ColumnTransformer) AND the "Weapon" (SMOTE).
# Order matters: preprocess -> oversample -> classify.
_pipeline_steps = [
    # Step 1: "Clean" the data (ColumnTransformer)
    ('preprocessor', preprocessor_pipeline),
    # Step 2: Apply the "Weapon" (SMOTE).
    # Resampling happens ONLY on the training data during 'fit' and is
    # skipped for 'X_test' during 'predict' ('ImbPipeline' guarantees this).
    ('smote', SMOTE(random_state=42)),
    # Step 3: Train the "Model" (RandomForest), 100 trees, all CPU cores.
    ('model', RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)),
]
model_pipeline_classification_v3_smote = ImbPipeline(steps=_pipeline_steps)

# --- 6. Training the Model ---
# Fit the whole pipeline on the training split only.
print("Training classification pipeline v3 (with RandomForest + SMOTE)...")
model_pipeline_classification_v3_smote.fit(X_train, y_train)
print("Model v3 (SMOTE) trained.")

# --- 7. Evaluating the Model ---
# Predict on the held-out split, then compute both metrics before printing.
y_pred = model_pipeline_classification_v3_smote.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
_report_text = classification_report(y_test, y_pred)

print("\n--- Classification Model v3 Evaluation Results (RandomForest + SMOTE) ---")
print(f"Overall Accuracy: {accuracy:.4f}")
print("\n--- Detailed Classification Report ---")
print(_report_text)
print("-----------------------------------")

SUCCESS: 'imbalanced-learn' (SMOTE) library imported.
SUCCESS: 'src.processing.classification.SubgroupDataPreprocessor' imported.
Rare classes removed. New row count: 16125
Training set size: 12900, Test set size: 3225
Training classification pipeline v3 (with RandomForest + SMOTE)...
Model v3 (SMOTE) trained.

--- Classification Model v3 Evaluation Results (RandomForest + SMOTE) ---
Overall Accuracy: 0.5219

--- Detailed Classification Report ---
                precision    recall  f1-score   support

    Biological       0.56      0.73      0.63       320
Climatological       0.37      0.54      0.44       249
   Geophysical       0.43      0.53      0.48       371
  Hydrological       0.60      0.53      0.56      1265
Meteorological       0.52      0.44      0.47      1020

      accuracy                           0.52      3225
     macro avg       0.49      0.55      0.52      3225
  weighted avg       0.53      0.52      0.52      3225

-----------------------------------
