In [14]:
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation
import matplotlib.pyplot as plt  # For visualization
from category_encoders import OneHotEncoder, OrdinalEncoder  # For encoding categorical variables
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.utils.validation import check_is_fitted
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


## 3. Data Wrangling
Define a function to clean and preprocess the data by removing leaky, high-cardinality, and multicollinear features.


In [15]:
def wrangle(data_path):
    df = pd.read_csv(data_path)
    # Identify leaky features
    drop_col = [col for col in df.columns if 'post_eq' in col]
    drop_col.append('technical_solution_proposed')
    # Remove missing values
    df.dropna(inplace=True)
    # Create binary target
    df['severe_damage'] = df['damage_grade'].str[-1].astype('int')
    df['severe_damage'] = (df['severe_damage'] > 3).astype('int')
    drop_col.append('damage_grade')
    # Drop high cardinality and multicollinear features
    drop_col.append('building_id')
    drop_col.extend(['count_floors_pre_eq', 'ward_id', 'vdcmun_id'])
    df.drop(columns=drop_col, inplace=True)
    return df


## 4. Data Preparation
Apply the wrangling function, subset the data, and separate features and target.


In [16]:
pd.set_option('display.max_columns', None)
df = wrangle('building_structure.csv')
df = df.iloc[:3000, :]  # Use a subset for faster experimentation
print(df.shape)
df.head()


(3000, 23)


Unnamed: 0,district_id,age_building,plinth_area_sq_ft,height_ft_pre_eq,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,severe_damage
0,12,9,288,9,Flat,Other,Bamboo/Timber-Light roof,Mud,Not applicable,Not attached,Rectangular,0,1,0,0,0,0,0,1,0,0,0,0
1,12,15,364,9,Flat,Other,Bamboo/Timber-Light roof,Mud,Not applicable,Not attached,Rectangular,0,1,0,0,0,0,0,1,0,0,0,1
2,12,20,384,9,Flat,Other,Bamboo/Timber-Light roof,Mud,Not applicable,Not attached,Rectangular,0,1,0,0,0,0,0,0,0,0,0,0
3,12,20,312,9,Flat,Other,Bamboo/Timber-Light roof,Mud,Not applicable,Not attached,Rectangular,0,1,0,0,0,0,0,0,0,0,0,0
4,12,30,308,9,Flat,Other,Bamboo/Timber-Light roof,Mud,Not applicable,Not attached,Rectangular,0,1,0,0,0,0,0,0,0,0,0,0


In [17]:
target = 'severe_damage'
X = df.drop(columns=target)
y = df[target]
print('X shape:', X.shape)
print('y shape:', y.shape)


X shape: (3000, 22)
y shape: (3000,)


## 5. Train-Test Split
Split the data into training and test sets.


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('X_train shape:', X_train.shape)
print('y_train shape:', y_train.shape)
print('X_test shape:', X_test.shape)
print('y_test shape:', y_test.shape)


X_train shape: (2400, 22)
y_train shape: (2400,)
X_test shape: (600, 22)
y_test shape: (600,)


## 6. Feature Engineering
One-hot encode categorical features using sklearn's ColumnTransformer.


In [19]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
categorical_cols = ['land_surface_condition', 'foundation_type', 'roof_type',
                    'ground_floor_type', 'other_floor_type', 'position', 'plan_configuration']
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'
)
X_train_ohe = preprocessor.fit_transform(X_train)
X_test_ohe = preprocessor.transform(X_test)


# Gradient boosting Model classifier

In [26]:
gbm=XGBClassifier(subsample= 0.8, n_estimators= 200, max_depth= 5, learning_rate= 0.01, colsample_bytree= 0.8)
gbm.fit(X_train_ohe, y_train)
print('Train Accuracy:', gbm.score(X_train_ohe, y_train))
print('Test Accuracy:', gbm.score(X_test_ohe, y_test))


Train Accuracy: 0.7945833333333333
Test Accuracy: 0.7616666666666667


# Light gradient boosting

In [48]:
lgbm=LGBMClassifier(subsample= 0.2, num_leaves= 30, n_estimators= 230, max_depth= 2, learning_rate= 0.05, colsample_bytree= 0.6)
lgbm.fit(X_train_ohe, y_train)
#{'subsample': 0.8, 'num_leaves': 80, 'n_estimators': 200, 'max_depth': 2, 'learning_rate': 0.05, 'colsample_bytree': 0.4
print('Train Accuracy:', lgbm.score(X_train_ohe, y_train))
print('Test Accuracy:', lgbm.score(X_test_ohe, y_test))


[LightGBM] [Info] Number of positive: 662, number of negative: 1738
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001085 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 265
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.275833 -> initscore=-0.965225
[LightGBM] [Info] Start training from score -0.965225
Train Accuracy: 0.7925
Test Accuracy: 0.7716666666666666




## 7. Model Training and Hyperparameter Tuning
Train a Random Forest Classifier and use RandomizedSearchCV for hyperparameter tuning.


In [25]:
# Xgboost classifier
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split


# Define model
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Define hyperparameter search space
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Randomized search
random_search = RandomizedSearchCV(
    xgb,
    param_distributions=param_dist,
    n_iter=20,
    scoring='accuracy',
    cv=3,
    verbose=1,
    n_jobs=-1
)

random_search.fit(X_train_ohe, y_train)
print("Best XGB Parameters:", random_search.best_params_)


Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best XGB Parameters: {'subsample': 0.8, 'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.01, 'colsample_bytree': 0.8}


In [29]:
# lgbm classifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV

# Define model
lgbm = LGBMClassifier()

# Hyperparameter search space
param_dist = {
    'n_estimators': [50,100,150, 200,250, 300],
    'max_depth': [-1,2,3,4, 5,8, 10,12, 15],
    'learning_rate': [0.001,0.005,0.01, 0.05, 0.1, 0.2],
    'num_leaves': [5,15,23, 31,50, 63,80],
    'subsample': [0.2,0.3,0.4,0.6, 0.8, 1.0],
    'colsample_bytree': [0.2,0.3,0.4,0.6, 0.8, 1.0]
}

random_search = RandomizedSearchCV(
    lgbm,
    param_distributions=param_dist,
    n_iter=20,
    scoring='accuracy',
    cv=3,
    verbose=1,
    n_jobs=-1
)

random_search.fit(X_train_ohe, y_train)
print("Best LGBM Parameters:", random_search.best_params_)


Fitting 3 folds for each of 20 candidates, totalling 60 fits
[LightGBM] [Info] Number of positive: 662, number of negative: 1738
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001605 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 265
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.275833 -> initscore=-0.965225
[LightGBM] [Info] Start training from score -0.965225
Best LGBM Parameters: {'subsample': 0.8, 'num_leaves': 80, 'n_estimators': 200, 'max_depth': 2, 'learning_rate': 0.05, 'colsample_bytree': 0.4}
