In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import sys
import os

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# --- System Integration ---
# Put the parent of the current working directory on the import path so the
# project-local `src` package can be resolved from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Only add the entry once; re-running the cell must not grow sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)
try:
    from src.processing.classification import SubgroupDataPreprocessor
    print("SUCCESS: 'src.processing.classification.SubgroupDataPreprocessor' imported.")
except ImportError:
    # Print a readable hint pointing at the module file, then re-raise so the
    # cell fails loudly instead of continuing without the preprocessor.
    print("ERROR: Could not import from 'src/processing/classification.py'.")
    raise

# --- 1. Data Loading ---
# The CSV path is relative to the notebook's working directory.
file_path = "../1900_2021_DISASTERS.xlsx - emdat data.csv"
df_raw = pd.read_csv(file_path)

# --- 2. Data Preprocessing ---
# The project preprocessor is fitted and applied to the raw frame in one step.
preprocessor = SubgroupDataPreprocessor(target_col="Disaster Subgroup")
df_clean = preprocessor.fit_transform(df_raw)
processed_row_count = len(df_clean)
print(f"Classification data processed. Available rows for the model: {processed_row_count}")

# --- 3. Preparing Data for Model Training ---
target = 'Disaster Subgroup'
categorical_features = ['Continent', 'Disaster Group']
numerical_features = ['Total Deaths', 'No Injured', 'No Affected', 'Start Year']

# Keep only the requested features that are actually present after
# preprocessing (categorical first, then numerical, as in the request order).
requested_features = categorical_features + numerical_features
cleaned_columns = df_clean.columns
available_features = list(filter(lambda name: name in cleaned_columns, requested_features))

X = df_clean[available_features]
y = df_clean[target]

# --- Filtering Rare Classes ---
# The split below is stratified on the label, and scikit-learn rejects a
# stratified split when a class has a single member, so such classes are
# dropped beforehand.
MIN_SAMPLES_PER_CLASS = 2

class_counts = y.value_counts()
print("\n--- Class Counts (Before Filtering) ---")
print(class_counts)

rare_classes = class_counts[class_counts < MIN_SAMPLES_PER_CLASS].index
print(f"Rare classes found (less than {MIN_SAMPLES_PER_CLASS} members): {list(rare_classes)}")

# Filter X and y with one shared boolean mask instead of re-selecting X by
# y's index labels: a label lookup returns every row carrying a label, which
# would duplicate rows if the index ever contained repeated labels, whereas a
# mask keeps features and target aligned row for row.
keep_mask = ~y.isin(rare_classes)
y_filtered = y[keep_mask]
X_filtered = X[keep_mask]

print(f"Rare classes removed. New row count: {len(y_filtered)}")


# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions comparable between the two partitions, and the fixed seed
# makes the partition repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y_filtered}
X_train, X_test, y_train, y_test = train_test_split(X_filtered, y_filtered, **split_options)

train_size, test_size_rows = len(X_train), len(X_test)
print(f"Training set size: {train_size}, Test set size: {test_size_rows}")

# --- 4. Preprocessing Pipeline ---
# Route only the columns that are actually available to each transformer.
numeric_columns = [name for name in numerical_features if name in available_features]
categorical_columns = [name for name in categorical_features if name in available_features]

# Numeric columns: fill gaps with the median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the literal 'Missing' category, then
# one-hot encode; categories unseen during fit are ignored at predict time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

preprocessor_pipeline = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numeric_columns),
    ('cat', categorical_transformer, categorical_columns),
])

# --- 5. Classification Pipeline ---
# Preprocessing and the estimator are chained so both are fitted together.
classifier = LogisticRegression(max_iter=1000, random_state=42)
model_pipeline_classification = Pipeline(steps=[
    ('preprocessor', preprocessor_pipeline),
    ('model', classifier),
])

# --- 6. Training the Model ---
print("Training classification pipeline (LogisticRegression)...")
model_pipeline_classification.fit(X_train, y_train)
print("Model trained.")

# --- 7. Evaluating the Model ---
# Score the held-out rows: overall accuracy first, then the per-class report.
y_pred = model_pipeline_classification.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(
    "\n--- Classification Model Evaluation Results ---",
    f"Overall Accuracy: {accuracy:.4f}",
    "\n--- Detailed Classification Report ---",
    sep="\n",
)
print(classification_report(y_test, y_pred))
print("-----------------------------------")

SUCCESS: 'src.processing.classification.SubgroupDataPreprocessor' imported.
Classification data processed. Available rows for the model: 16126

--- Class Counts (Before Filtering) ---
Disaster Subgroup
Hydrological         6327
Meteorological       5100
Geophysical          1857
Biological           1598
Climatological       1243
Extra-terrestrial       1
Name: count, dtype: int64
Rare classes found (less than 2 members): ['Extra-terrestrial']
Rare classes removed. New row count: 16125
Training set size: 12900, Test set size: 3225
Training classification pipeline (LogisticRegression)...
Model trained.

--- Classification Model Evaluation Results ---
Overall Accuracy: 0.4369

--- Detailed Classification Report ---
                precision    recall  f1-score   support

    Biological       0.26      0.09      0.13       320
Climatological       0.30      0.01      0.02       249
   Geophysical       0.44      0.12      0.19       371
  Hydrological       0.45      0.76      0.56      1

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import sys
import os

# --- UPDATE 1: Import the Models ---
from sklearn.linear_model import LogisticRegression # (Kept for comparison)
from sklearn.ensemble import RandomForestClassifier # <-- NEW MODEL
from sklearn.metrics import accuracy_score, classification_report

# --- System Integration ---
# Put the parent of the current working directory on the import path so the
# project-local `src` package can be resolved from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Only add the entry once; re-running the cell must not grow sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)
try:
    from src.processing.classification import SubgroupDataPreprocessor
    print("SUCCESS: 'src.processing.classification.SubgroupDataPreprocessor' imported.")
except ImportError:
    # Print a readable hint pointing at the module file, then re-raise so the
    # cell fails loudly instead of continuing without the preprocessor.
    print("ERROR: Could not import from 'src/processing/classification.py'.")
    raise

# --- 1. Data Loading ---
# The CSV path is relative to the notebook's working directory.
file_path = "../1900_2021_DISASTERS.xlsx - emdat data.csv"
df_raw = pd.read_csv(file_path)

# --- 2. Data Preprocessing ---
# The project preprocessor is fitted and applied to the raw frame in one step.
preprocessor = SubgroupDataPreprocessor(target_col="Disaster Subgroup")
df_clean = preprocessor.fit_transform(df_raw)
processed_row_count = len(df_clean)
print(f"Classification data processed. Available rows for the model: {processed_row_count}")

# --- 3. Preparing Data for Model Training ---
target = 'Disaster Subgroup'
categorical_features = ['Continent', 'Disaster Group']
numerical_features = ['Total Deaths', 'No Injured', 'No Affected', 'Start Year']

# Keep only the requested features that exist in the cleaned frame,
# categorical ones first, preserving the order they were listed in.
requested_features = categorical_features + numerical_features
cleaned_columns = df_clean.columns
available_features = list(filter(lambda name: name in cleaned_columns, requested_features))

X = df_clean[available_features]
y = df_clean[target]

# Filter Rare Classes
# The split below is stratified on the label, and scikit-learn rejects a
# stratified split when a class has a single member, so such classes are
# dropped beforehand.
MIN_SAMPLES_PER_CLASS = 2

class_counts = y.value_counts()
rare_classes = class_counts[class_counts < MIN_SAMPLES_PER_CLASS].index

# One shared boolean mask filters both objects. Re-selecting X by y's index
# labels would duplicate rows if the index ever contained repeated labels;
# the mask keeps features and target aligned row for row.
keep_mask = ~y.isin(rare_classes)
y_filtered = y[keep_mask]
X_filtered = X[keep_mask]
print(f"Rare classes removed. New row count: {len(y_filtered)}")

# Hold out 20% of the rows for testing. The split is stratified on the label
# and seeded, so the partition is repeatable across runs.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y_filtered}
X_train, X_test, y_train, y_test = train_test_split(X_filtered, y_filtered, **split_options)

train_size, test_size_rows = len(X_train), len(X_test)
print(f"Training set size: {train_size}, Test set size: {test_size_rows}")

# --- 4. Preprocessing Pipeline ---
# Route only the columns that are actually available to each transformer.
numeric_columns = [name for name in numerical_features if name in available_features]
categorical_columns = [name for name in categorical_features if name in available_features]

# Numeric columns: fill gaps with the median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the literal 'Missing' category, then
# one-hot encode; categories unseen during fit are ignored at predict time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

preprocessor_pipeline = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numeric_columns),
    ('cat', categorical_transformer, categorical_columns),
])

# --- 5. Classification Pipeline (with RandomForest) ---
# The estimator is a random forest (100 trees, fixed seed, all CPU cores);
# the earlier variant of this pipeline used LogisticRegression instead.
forest = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model_pipeline_classification_v2 = Pipeline(steps=[
    ('preprocessor', preprocessor_pipeline),
    ('model', forest),
])

# --- 6. Training the Model ---
# Fitting the forest is expected to take longer than the logistic baseline.
print("Training classification pipeline (RandomForest)...")
model_pipeline_classification_v2.fit(X_train, y_train)
print("Model trained.")

# --- 7. Evaluating the Model ---
# Score the held-out rows: overall accuracy first, then the per-class report.
y_pred = model_pipeline_classification_v2.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(
    "\n--- Classification Model v2 Evaluation Results (RandomForest) ---",
    f"Overall Accuracy: {accuracy:.4f}",
    "\n--- Detailed Classification Report ---",
    sep="\n",
)
print(classification_report(y_test, y_pred))
print("-----------------------------------")

SUCCESS: 'src.processing.classification.SubgroupDataPreprocessor' imported.
Classification data processed. Available rows for the model: 16126
Rare classes removed. New row count: 16125
Training set size: 12900, Test set size: 3225
Training classification pipeline (RandomForest)...
Model trained.

--- Classification Model v2 Evaluation Results (RandomForest) ---
Overall Accuracy: 0.5619

--- Detailed Classification Report ---
                precision    recall  f1-score   support

    Biological       0.67      0.67      0.67       320
Climatological       0.48      0.41      0.44       249
   Geophysical       0.51      0.41      0.45       371
  Hydrological       0.58      0.66      0.62      1265
Meteorological       0.53      0.51      0.52      1020

      accuracy                           0.56      3225
     macro avg       0.56      0.53      0.54      3225
  weighted avg       0.56      0.56      0.56      3225

-----------------------------------


In [9]:
import joblib
import os

# --- 1. Save Path ---
model_directory = "../models"
model_file_path = os.path.join(model_directory, "classification_model_v1.joblib")
# Create the target directory when it is missing so the save below does not
# fail on a checkout where 'models' has not been created yet.
os.makedirs(model_directory, exist_ok=True)

# --- 2. Save the Model (The Entire Pipeline) ---
# The whole 'model_pipeline_classification_v2' object is persisted, i.e. the
# data-processing steps together with the RandomForest classifier.
print(f"Saving classification pipeline to the path: {model_file_path}")
try:
    joblib.dump(model_pipeline_classification_v2, model_file_path)
except NameError:
    # The training cell has not been run in this kernel session.
    model_saved = False
    print("ERROR: 'model_pipeline_classification_v2' object not found.")
    print("Please ensure you have successfully run the previous (training) cell.")
else:
    model_saved = True
    print("SUCCESS: Model saved as 'classification_model_v1.joblib'.")


# --- 3. (Optional) Check That the Model is Loaded Back ---
# Only run the round-trip check when this cell actually wrote the file;
# otherwise a stale file from an earlier session could be loaded and reported
# as if it were the model that was just trained.
if model_saved:
    print("\nLet's test by loading the model back...")
    try:
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code; only load files this project produced itself.
        loaded_classifier = joblib.load(model_file_path)

        # Use the first test row; slicing with 0:1 keeps it a DataFrame.
        # (Assumes X_test is in memory from the training cell.)
        test_sample = X_test.iloc[0:1]
        prediction = loaded_classifier.predict(test_sample)

        # The reloaded pipeline must agree with the in-memory one it was
        # saved from; a mismatch means the round trip is broken.
        expected = model_pipeline_classification_v2.predict(test_sample)
        if prediction[0] != expected[0]:
            raise ValueError(
                f"reloaded model predicted {prediction[0]!r} but the in-memory model predicted {expected[0]!r}"
            )

        print(f"Classification Prediction for the test data: {prediction[0]}")
    except Exception as e:
        print(f"Error during model loading or testing: {e}")
else:
    print("\nSkipping the reload test because the model was not saved.")

Saving classification pipeline to the path: ../models/classification_model_v1.joblib
SUCCESS: Model saved as 'classification_model_v1.joblib'.

Let's test by loading the model back...
Classification Prediction for the test data: Meteorological
