# Sandbox to understand the Data

In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
import openpyxl as op

In [2]:
# Convert the raw CSV export into an Excel workbook, which the following cells
# read back through openpyxl. pandas is already imported in the notebook's
# first cell, so it is not imported a second time here.
df = pd.read_csv("Data/data2.csv")
# index=False keeps the DataFrame's row index out of the written sheet.
df.to_excel("Data/data2.xlsx", index=False)


In [3]:
# Materialise every row of the first worksheet as a tuple of cell values.
xl = list(op.load_workbook('Data/data2.xlsx').worksheets[0].values)
# convert xl to DataFrame for easier handling:
# the first row supplies the column names, the remaining rows are the records.
df = pd.DataFrame(xl[1:], columns=xl[0])
# Summary statistics of the dataframe.
# A notebook cell only renders its *last* expression, so a bare
# `df.describe()` in the middle of the cell is computed and thrown away.
# `display` (provided by the IPython kernel) renders it explicitly.
display(df.describe())
# First rows of the frame (rendered as the cell's output).
df.head()

Unnamed: 0,district_distance_to_earthquakecenter(mi),count_floors_pre_eq,age_building,plinth_area_sq_ft,per-height_ft_pre_eq,land_surface_condition,foundation_type,roof_type(Bamboo/Timber-Heavy roof=0; Bamboo/Timber-Light roof=1; RCC/RB/RBC=2),ground_floor_type,position,...,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,damage_grade,technical_solution_proposed,condition_post_eq,vdcmun_id,ward_id
0,118.88,1,9,288,9.0,0,0,1,1,0,...,0,1,0,0,0,3,Major repair,Damaged-Used in risk,1207,120703
1,118.88,1,15,364,9.0,0,0,1,1,0,...,0,1,0,0,0,5,Reconstruction,Damaged-Repaired and used,1207,120703
2,118.88,1,20,384,9.0,0,0,1,1,0,...,0,0,0,0,0,2,Minor repair,Damaged-Repaired and used,1207,120703
3,118.88,1,20,312,9.0,0,0,1,1,0,...,0,0,0,0,0,2,Minor repair,Damaged-Repaired and used,1207,120703
4,118.88,1,30,308,9.0,0,0,1,1,0,...,0,0,0,0,0,1,Minor repair,Damaged-Repaired and used,1207,120703


In [6]:
# NOTE(review): this cell repeats the workbook load already performed earlier
# in the notebook; it is kept so the notebook's cell layout is unchanged, but
# it can likely be removed.
# Materialise every row of the first worksheet as a tuple of cell values.
xl = list(op.load_workbook('Data/data2.xlsx').worksheets[0].values)
# convert xl to DataFrame for easier handling:
# the first row supplies the column names, the remaining rows are the records.
df = pd.DataFrame(xl[1:], columns=xl[0])
# Summary statistics of the dataframe.
# Only the last expression of a cell is rendered, so the statistics are shown
# explicitly via `display` (provided by the IPython kernel) instead of being
# silently discarded.
display(df.describe())
# First rows of the frame (rendered as the cell's output).
df.head()

InvalidFileException: openpyxl does not support .xslx file format, please check you can open it with Excel first. Supported formats are: .xlsx,.xlsm,.xltx,.xltm

In [None]:
# NOTE(review): this cell repeats the workbook load already performed earlier
# in the notebook; it is kept so the notebook's cell layout is unchanged, but
# it can likely be removed.
# Materialise every row of the first worksheet as a tuple of cell values.
xl = list(op.load_workbook('Data/data2.xlsx').worksheets[0].values)
# convert xl to DataFrame for easier handling:
# the first row supplies the column names, the remaining rows are the records.
df = pd.DataFrame(xl[1:], columns=xl[0])
# Summary statistics of the dataframe.
# Only the last expression of a cell is rendered, so the statistics are shown
# explicitly via `display` (provided by the IPython kernel) instead of being
# silently discarded.
display(df.describe())
# First rows of the frame (rendered as the cell's output).
df.head()

In [None]:
# convert categorical values to one-hot encodings 0/1
# (currently disabled: the two categorical columns are dropped below instead)
#df = pd.get_dummies(df, columns=['technical_solution_proposed', 'condition_post_eq'], dtype=int)
# replace any ' ' with '_' in column names
df.columns = [col.replace(' ', '_') for col in df.columns]
# Drop columns that must not reach the model:
#   * vdcmun_id / ward_id: identifiers, not useful for modeling
#   * technical_solution_proposed / condition_post_eq: text columns that are
#     not encoded (see the disabled get_dummies line above)
#   * 'Unnamed:_0': the unnamed first CSV column ('Unnamed: 0' before the
#     space replacement above). NOTE(review): this looks like a saved row
#     index; left in place it would be fed to the classifiers as a feature.
#     Confirm it carries no real information.
# errors='ignore' makes the cell safe to re-run: without it a second execution
# raises KeyError because the columns are already gone.
df = df.drop(
    columns=[
        'Unnamed:_0',
        'Unnamed: 0',
        'vdcmun_id',
        'ward_id',
        'technical_solution_proposed',
        'condition_post_eq',
    ],
    errors='ignore',
)

In [None]:
# Render the first five rows of the current frame as this cell's output.
df.head(n=5)

In [None]:
# pairplot to visualize relationships
full = False    # True: plot every row (slow); False: plot a random subsample
max_rows = 500  # upper bound on the subsample size

if full:
    plot_df = df
    out_file = 'pairplot_full.pdf'
else:
    # subsample for faster plotting.
    # The sample size is capped at len(df): DataFrame.sample raises ValueError
    # when asked for more rows than exist (sampling without replacement).
    plot_df = df.sample(n=min(max_rows, len(df)), random_state=42)
    out_file = 'pairplot_sampled.pdf'

# One plotting path for both modes; only the data and the file name differ.
sns.pairplot(plot_df, hue='damage_grade', diag_kind='kde')
plt.savefig(out_file)
plt.show()

In [None]:
# Goal: predict damage_grade (1,2,3,4,5) based on other features
# Target vector: the damage_grade column as a NumPy array.
y = df.loc[:, 'damage_grade'].to_numpy()
# Feature matrix: every remaining column, as a NumPy array.
x = df.drop('damage_grade', axis=1).to_numpy()

In [None]:
# Define a Class for our Classification Models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

class ClassifierModel:
    """Thin wrapper that pairs an estimator with a fit step and a printed evaluation.

    The wrapped object only needs to provide ``fit(x, y)`` and ``predict(x)``;
    those are the only methods this class calls on it.
    """

    def __init__(self, model):
        """Store the estimator that ``train`` and ``evaluate`` operate on."""
        self.model = model

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on the given features and labels."""
        estimator = self.model
        estimator.fit(x_train, y_train)

    def evaluate(self, x_test, y_test):
        """Predict on ``x_test`` and print both metrics computed against ``y_test``.

        Prints a classification report followed by a confusion matrix, each
        preceded by its heading line. Nothing is returned.
        """
        predicted = self.model.predict(x_test)
        sections = (
            ("Classification Report:", classification_report),
            ("Confusion Matrix:", confusion_matrix),
        )
        for heading, metric in sections:
            print(heading)
            print(metric(y_test, predicted))

# Split the data into training and testing sets
# NOTE(review): test_size=0.6 holds out 60% of the rows and trains on only
# 40%; this is the reverse of the usual proportions -- confirm it is intended.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.6, random_state=42)

# Initialize and train Random Forest Classifier
rf_model = ClassifierModel(RandomForestClassifier(n_estimators=100, random_state=42))
rf_model.train(x_train, y_train)
print("Random Forest Classifier Evaluation:")
rf_model.evaluate(x_test, y_test)

print("\n" + "="*50 + "\n")

# Initialize and train Gradient Boosting Classifier
gb_model = ClassifierModel(GradientBoostingClassifier(n_estimators=100, random_state=42))
gb_model.train(x_train, y_train)
print("Gradient Boosting Classifier Evaluation:")
gb_model.evaluate(x_test, y_test)

print("\n" + "="*50 + "\n")

# Ensemble Model: Voting Classifier
from sklearn.ensemble import VotingClassifier
# Soft voting (average of predicted class probabilities) is used instead of
# hard voting: with exactly two estimators every disagreement is a 1-1 tie,
# and scikit-learn breaks hard-voting ties by picking the lowest class label,
# which would systematically bias predictions towards lower damage grades.
# Both base estimators provide predict_proba, as soft voting requires.
voting_model = VotingClassifier(estimators=[
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
], voting='soft')
voting_model.fit(x_train, y_train)
y_pred = voting_model.predict(x_test)
print("Voting Classifier Evaluation:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
