In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score,recall_score,f1_score,roc_auc_score,classification_report,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.utils.validation import check_is_fitted
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

In [2]:
%%time
df=pd.read_csv('building_structure.csv')
df.head()

CPU times: total: 2.2 s
Wall time: 2.35 s


Unnamed: 0,building_id,district_id,vdcmun_id,ward_id,count_floors_pre_eq,count_floors_post_eq,age_building,plinth_area_sq_ft,height_ft_pre_eq,height_ft_post_eq,...,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,condition_post_eq,damage_grade,technical_solution_proposed
0,120101000011,12,1207,120703,1,1,9,288,9,9,...,0,0,0,1,0,0,0,Damaged-Used in risk,Grade 3,Major repair
1,120101000021,12,1207,120703,1,1,15,364,9,9,...,0,0,0,1,0,0,0,Damaged-Repaired and used,Grade 5,Reconstruction
2,120101000031,12,1207,120703,1,1,20,384,9,9,...,0,0,0,0,0,0,0,Damaged-Repaired and used,Grade 2,Minor repair
3,120101000041,12,1207,120703,1,1,20,312,9,9,...,0,0,0,0,0,0,0,Damaged-Repaired and used,Grade 2,Minor repair
4,120101000051,12,1207,120703,1,1,30,308,9,9,...,0,0,0,0,0,0,0,Damaged-Repaired and used,Grade 1,Minor repair


In [3]:
def wrangle(data_path):
    """Load the building-structure CSV and return a modelling-ready frame.

    Steps, in order:
      1. Read the CSV at ``data_path``.
      2. Discard every row that has a missing value in any column.
      3. Add a binary ``severe_damage`` column: 1 when the trailing digit of
         ``damage_grade`` is greater than 3, otherwise 0.
      4. Remove the columns that are not used as features (listed below).

    Parameters
    ----------
    data_path : str, path or file-like
        Anything ``pandas.read_csv`` accepts.

    Returns
    -------
    pandas.DataFrame
        The surviving rows (original index labels kept) with
        ``severe_damage`` appended as the last column.
    """
    raw = pd.read_csv(data_path)

    # Columns removed at the end, with the reason given for each group:
    #   * every column whose name contains 'post_eq', plus the proposed
    #     technical solution -> leaky (post-earthquake) information
    #   * damage_grade -> replaced by the binary target
    #   * building_id -> high-cardinality identifier
    #   * count_floors_pre_eq, ward_id, vdcmun_id -> multicollinearity
    unwanted = [name for name in raw.columns if 'post_eq' in name]
    unwanted += [
        'technical_solution_proposed',
        'damage_grade',
        'building_id',
        'count_floors_pre_eq',
        'ward_id',
        'vdcmun_id',
    ]

    # Drop incomplete rows (this removes missing values, not duplicates).
    # NOTE(review): this runs while the leaky columns are still present, so a
    # row is discarded even if its only NaN is in a column removed below —
    # confirm that is intended.
    complete = raw.dropna()

    # 'Grade N' -> N, then flag grades above 3 as severe damage.
    grade_digit = complete['damage_grade'].str[-1].astype('int')
    is_severe = (grade_digit > 3).astype('int')

    return complete.assign(severe_damage=is_severe).drop(columns=unwanted)

In [73]:
# Show every column when a frame is displayed (the cleaned table is wide).
pd.set_option('display.max_columns', None)

# Clean the file, then keep only the first 3000 rows in file order.
# NOTE(review): this is a positional slice, not a random sample — confirm the
# leading rows are representative before trusting downstream scores.
df = wrangle('building_structure.csv').head(3000)
print(df.shape)
df.head()

(3000, 23)


Unnamed: 0,district_id,age_building,plinth_area_sq_ft,height_ft_pre_eq,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,severe_damage
0,12,9,288,9,Flat,Other,Bamboo/Timber-Light roof,Mud,Not applicable,Not attached,Rectangular,0,1,0,0,0,0,0,1,0,0,0,0
1,12,15,364,9,Flat,Other,Bamboo/Timber-Light roof,Mud,Not applicable,Not attached,Rectangular,0,1,0,0,0,0,0,1,0,0,0,1
2,12,20,384,9,Flat,Other,Bamboo/Timber-Light roof,Mud,Not applicable,Not attached,Rectangular,0,1,0,0,0,0,0,0,0,0,0,0
3,12,20,312,9,Flat,Other,Bamboo/Timber-Light roof,Mud,Not applicable,Not attached,Rectangular,0,1,0,0,0,0,0,0,0,0,0,0
4,12,30,308,9,Flat,Other,Bamboo/Timber-Light roof,Mud,Not applicable,Not attached,Rectangular,0,1,0,0,0,0,0,0,0,0,0,0


In [74]:
# Separate the binary label from the predictor columns.
target = 'severe_damage'
y = df[target]
X = df.drop(columns=[target])

# Report the dimensions of both pieces.
for label, part in (("X shape:", X), ("y shape:", y)):
    print(label, part.shape)

X shape: (3000, 22)
y shape: (3000,)


In [75]:
# Hold out 20% of the rows for evaluation.
# The seed is written as the integer 1: the previous `random_state=True`
# relied on the boolean being the integer 1 in Python, so the split itself is
# unchanged while the intent is now explicit.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Labels now name the variables actually printed (were "X_" and "y_val").
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (2400, 22)
y_train shape: (2400,)
X_ shape: (600, 22)
y_val shape: (600,)


In [76]:
# One-hot encode the categorical predictors with scikit-learn.
# Note: importing OneHotEncoder here rebinds the name, replacing the
# category_encoders class of the same name imported at the top of the notebook.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Columns that get one-hot encoded.
categorical_cols = [
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
]

# Encode the listed columns and pass every other column through unchanged.
# handle_unknown='ignore' keeps transform() from raising on a category that
# was not present when the encoder was fitted.
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)],
    remainder='passthrough',
)

# Learn the categories from the training split only, then apply the fitted
# transformer to the held-out split.
X_train_ohe = preprocessor.fit_transform(X_train)
X_test_ohe = preprocessor.transform(X_test)

In [108]:
%%time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', 0.5],
    'bootstrap': [True, False]
}

rf = RandomForestClassifier(random_state=42)

search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=80,
    cv=5,
    verbose=1,
    n_jobs=-1
)

search.fit(X_train_ohe, y_train)
print(search.best_params_)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
{'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 10, 'bootstrap': False}
CPU times: total: 4.53 s
Wall time: 1min 24s


In [78]:
%%time
    
model_rf = make_pipeline(RandomForestClassifier(random_state=42))
model_rf.fit(X_train_ohe, y_train)
print(model_rf.score(X_train_ohe,y_train))
print(model_rf.score(X_test_ohe,y_test))



0.9554166666666667
0.75
CPU times: total: 484 ms
Wall time: 492 ms


In [None]:
%%time
    
model_rf = make_pipeline(RandomForestClassifier(n_estimators= 500, min_samples_split= 10, min_samples_leaf=1, max_features= 'log2', max_depth= 10, bootstrap= False,random_state=42))
model_rf.fit(X_train_ohe, y_train)
print(model_rf.score(X_train_ohe,y_train))
print(model_rf.score(X_test_ohe,y_test))



0.8145833333333333
0.7766666666666666
CPU times: total: 1.33 s
Wall time: 1.51 s


In [114]:
%%time
    
model_rf = make_pipeline(OrdinalEncoder(),RandomForestClassifier(n_estimators= 100, min_samples_split= 5, min_samples_leaf=3, max_features= 'sqrt', max_depth= 12, bootstrap= False,random_state=42))
model_rf.fit(X_train_ohe, y_train)
print(model_rf.score(X_train_ohe,y_train))
print(model_rf.score(X_test_ohe,y_test))



0.8083333333333333
0.7816666666666666
CPU times: total: 328 ms
Wall time: 320 ms
