In [1]:
import os
import sys

# Root of the project checkout. It has to be on sys.path so that the `src`
# package imported later in this notebook can be found.
# The default is the location this notebook was written against; set the
# EARTHQUAKE_PROJECT_ROOT environment variable to run from another checkout.
PROJECT_ROOT = os.environ.get(
    'EARTHQUAKE_PROJECT_ROOT',
    '/Users/gscerberus/Earthquake_Damage_Predicton',
)

# Guard the append so re-running this cell does not stack duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
import os

# Work from the project root so the relative paths used further down
# ('logs/...', 'datasets/...') resolve regardless of where Jupyter was started.
# The default is the location this notebook was written against; set the
# EARTHQUAKE_PROJECT_ROOT environment variable to run on another machine.
PROJECT_ROOT = os.environ.get(
    'EARTHQUAKE_PROJECT_ROOT',
    '/Users/gscerberus/Earthquake_Damage_Predicton',
)
os.chdir(PROJECT_ROOT)

In [3]:
from src.data_preprocessing import DataPreprocessor
from sklearn.model_selection import train_test_split
import warnings
# Suppress all Python warnings from here on (no category or module filter is
# given, so nothing is exempt).
warnings.filterwarnings('ignore')
import tomli
import logging
import pandas as pd
import logging

In [4]:
# Send log records to a file under logs/.
log_file = 'logs/data_preprocessing.log'

# basicConfig opens the log file immediately and raises FileNotFoundError when
# the parent directory is missing (e.g. on a fresh checkout), so create it first.
os.makedirs(os.path.dirname(log_file), exist_ok=True)

# Note: basicConfig does nothing if the root logger already has handlers
# configured, so this only takes effect the first time it runs in a kernel.
logging.basicConfig(
    filename=log_file,
    level=logging.DEBUG,  # record everything from DEBUG upwards
    format='%(asctime)s [%(levelname)s] %(message)s',  # timestamp, level, message
    datefmt='%Y-%m-%d %H:%M:%S'  # timestamp layout used for %(asctime)s
)

In [5]:
# Load the first-version dataset; the path is relative to the current working directory.
raw_csv_path = 'datasets/first_version.csv'
df = pd.read_csv(raw_csv_path)

In [6]:
# Peek at the first rows of the frame as loaded, to see the available columns.
df.head()

Unnamed: 0,building_id,district_id,count_floors_pre_eq,count_floors_post_eq,age_building,plinth_area_sq_ft,height_ft_pre_eq,height_ft_post_eq,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,condition_post_eq,damage
0,365909000431,36,2,2,50,489,14,14,Flat,Cement-Stone/Brick,Bamboo/Timber-Light roof,Mud,TImber/Bamboo-Mud,Not attached,Rectangular,Damaged-Not used,4
1,365909000441,36,3,3,50,551,18,18,Flat,Mud mortar-Stone/Brick,Bamboo/Timber-Heavy roof,Mud,TImber/Bamboo-Mud,Attached-1 side,Rectangular,Damaged-Used in risk,3
2,365909000451,36,3,3,50,432,18,18,Flat,Mud mortar-Stone/Brick,Bamboo/Timber-Heavy roof,Mud,TImber/Bamboo-Mud,Attached-1 side,Rectangular,Damaged-Not used,3
3,365909000461,36,3,3,60,560,20,20,Flat,Mud mortar-Stone/Brick,Bamboo/Timber-Heavy roof,Mud,TImber/Bamboo-Mud,Not attached,Rectangular,Damaged-Not used,3
4,365909000471,36,2,0,15,384,12,0,Flat,Cement-Stone/Brick,Bamboo/Timber-Light roof,Mud,TImber/Bamboo-Mud,Attached-1 side,Rectangular,Damaged-Rubble unclear,5


In [7]:
# Run the preprocessing / cleaning steps in order. Each entry pairs a
# DataPreprocessor step with its second argument. Return values are ignored,
# so every step is relied on to modify `df` in place.
# NOTE(review): because `df` is changed in place, re-running this cell without
# reloading `df` will presumably fail (columns already dropped) — confirm.
cleaning_steps = [
    (DataPreprocessor.drop_high_leakage_features, df.columns),
    (DataPreprocessor.drop_multi_collinearity_columns, "count_floors_pre_eq"),
    (DataPreprocessor.drop_not_significant_values, "building_id"),
    (DataPreprocessor.drop_low_high_cardinality_features, "district_id"),
    (DataPreprocessor.create_severe_damage, "damage"),
    (DataPreprocessor.drop_damage_grade_column, "damage"),
]
for apply_step, step_arg in cleaning_steps:
    apply_step(df, step_arg)

In [8]:
# Re-inspect df after the cleaning calls to see which columns remain.
df.head()

Unnamed: 0,age_building,plinth_area_sq_ft,height_ft_pre_eq,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,severe_damage
0,50,489,14,Flat,Cement-Stone/Brick,Bamboo/Timber-Light roof,Mud,TImber/Bamboo-Mud,Not attached,Rectangular,1
1,50,551,18,Flat,Mud mortar-Stone/Brick,Bamboo/Timber-Heavy roof,Mud,TImber/Bamboo-Mud,Attached-1 side,Rectangular,0
2,50,432,18,Flat,Mud mortar-Stone/Brick,Bamboo/Timber-Heavy roof,Mud,TImber/Bamboo-Mud,Attached-1 side,Rectangular,0
3,60,560,20,Flat,Mud mortar-Stone/Brick,Bamboo/Timber-Heavy roof,Mud,TImber/Bamboo-Mud,Not attached,Rectangular,0
4,15,384,12,Flat,Cement-Stone/Brick,Bamboo/Timber-Light roof,Mud,TImber/Bamboo-Mud,Attached-1 side,Rectangular,1


In [9]:
# Column groups passed to the encoding and scaling cells further down.

# Categorical features, to be one-hot encoded.
columns_to_encode = [
    'land_surface_condition', 'foundation_type', 'roof_type',
    'ground_floor_type', 'other_floor_type',
    'position', 'plan_configuration',
]

# Numeric features, to be scaled.
columns_to_scale = ['age_building', 'plinth_area_sq_ft', 'height_ft_pre_eq']

In [10]:
# One-hot encode the categorical columns listed in columns_to_encode; the
# frame returned by the encoder is bound to updated_df.
updated_df  = DataPreprocessor.one_hot_encoder(df, columns_to_encode)

In [11]:
# Preview the frame returned by the encoder.
updated_df.head()

Unnamed: 0,age_building,plinth_area_sq_ft,height_ft_pre_eq,severe_damage,land_surface_condition_Flat,land_surface_condition_Moderate slope,land_surface_condition_Steep slope,foundation_type_Bamboo/Timber,foundation_type_Cement-Stone/Brick,foundation_type_Mud mortar-Stone/Brick,...,plan_configuration_E-shape,plan_configuration_H-shape,plan_configuration_L-shape,plan_configuration_Multi-projected,plan_configuration_Others,plan_configuration_Rectangular,plan_configuration_Square,plan_configuration_T-shape,plan_configuration_U-shape,plan_configuration_nan
0,50,489,14,1,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,50,551,18,0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,50,432,18,0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,60,560,20,0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,15,384,12,1,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [12]:
# Scale the numeric columns listed in columns_to_scale on the encoded frame.
# NOTE(review): this runs on the full dataset and no train/test split is
# visible before it in this notebook — if the scaler is fitted here, consider
# fitting on the training portion only to avoid leakage; confirm against
# DataPreprocessor.feature_scaler.
scaled_df = DataPreprocessor.feature_scaler(updated_df, columns_to_scale)

In [13]:
# Summarise the scaled frame: column names, non-null counts and dtypes.
scaled_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 762106 entries, 0 to 762105
Data columns (total 40 columns):
 #   Column                                              Non-Null Count   Dtype  
---  ------                                              --------------   -----  
 0   severe_damage                                       762106 non-null  int64  
 1   land_surface_condition_Flat                         762106 non-null  float64
 2   land_surface_condition_Moderate slope               762106 non-null  float64
 3   land_surface_condition_Steep slope                  762106 non-null  float64
 4   foundation_type_Bamboo/Timber                       762106 non-null  float64
 5   foundation_type_Cement-Stone/Brick                  762106 non-null  float64
 6   foundation_type_Mud mortar-Stone/Brick              762106 non-null  float64
 7   foundation_type_Other                               762106 non-null  float64
 8   foundation_type_RC                                  762106 non-n

In [14]:
# Preview the scaled frame before it is written to disk.
scaled_df.head()

Unnamed: 0,severe_damage,land_surface_condition_Flat,land_surface_condition_Moderate slope,land_surface_condition_Steep slope,foundation_type_Bamboo/Timber,foundation_type_Cement-Stone/Brick,foundation_type_Mud mortar-Stone/Brick,foundation_type_Other,foundation_type_RC,roof_type_Bamboo/Timber-Heavy roof,...,plan_configuration_Multi-projected,plan_configuration_Others,plan_configuration_Rectangular,plan_configuration_Square,plan_configuration_T-shape,plan_configuration_U-shape,plan_configuration_nan,age_building,plinth_area_sq_ft,height_ft_pre_eq
0,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.394713,0.363029,-0.373029
1,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.394713,0.636423,0.355055
2,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.394713,0.111683,0.355055
3,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.548456,0.676109,0.719097
4,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.143387,-0.099976,-0.73707


In [15]:
# Persist the preprocessed frame; index=False keeps the row index out of the file.
final_csv_path = 'datasets/final_version.csv'
scaled_df.to_csv(final_csv_path, index=False)