In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Display all the columns of the dataframe
pd.pandas.set_option('display.max_columns',None)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
earthquake_data = pd.read_csv('../datasets/final_dataset/cleaned_earthquake_data.csv')
earthquake_data.head()

Unnamed: 0,latitude,longitude,depth,magnitude,magnitude_type,distance,gravity,force,year,month,hour,minutes,day_name
0,38.2484,38.1064,4.75,4.1,least damage,403639.392,9.8001,1.79766e+20,2024,7,21,41,Tuesday
1,18.085167,-66.650833,20.18,2.5,least damage,395082.122,9.7854,1.876376e+20,2024,7,21,10,Tuesday
2,60.5382,-151.8092,71.1,3.6,least damage,399158.905,9.8196,1.838243e+20,2024,7,19,53,Tuesday
3,34.1134,86.1206,10.0,4.0,least damage,404343.124,9.7966,1.791408e+20,2024,7,19,18,Tuesday
4,31.499167,-115.628667,8.38,2.72,least damage,397055.439,9.7944,1.857772e+20,2024,7,19,8,Tuesday


In [3]:
earthquake_data['magnitude_type'].unique()

array(['least damage', 'moderate damage', 'strong damage'], dtype=object)

## Apply Feature Engineering

### Convert Categorical Feature into Numerical Feature

In [4]:
# 1. Ordinal encoding for magnitude_type because it is in order from least to major damage
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

In [5]:
scaled_df = earthquake_data

ordinal_encoder = OrdinalEncoder()

In [6]:
# Ordinal Encoding
scaled_df['magnitude_type'] = ordinal_encoder.fit_transform(scaled_df[['magnitude_type']])

In [7]:
# Ordinal Encoding
scaled_df['day_name'] = ordinal_encoder.fit_transform(scaled_df[['day_name']])

In [8]:
scaled_df.head(5)

Unnamed: 0,latitude,longitude,depth,magnitude,magnitude_type,distance,gravity,force,year,month,hour,minutes,day_name
0,38.2484,38.1064,4.75,4.1,0.0,403639.392,9.8001,1.79766e+20,2024,7,21,41,5.0
1,18.085167,-66.650833,20.18,2.5,0.0,395082.122,9.7854,1.876376e+20,2024,7,21,10,5.0
2,60.5382,-151.8092,71.1,3.6,0.0,399158.905,9.8196,1.838243e+20,2024,7,19,53,5.0
3,34.1134,86.1206,10.0,4.0,0.0,404343.124,9.7966,1.791408e+20,2024,7,19,18,5.0
4,31.499167,-115.628667,8.38,2.72,0.0,397055.439,9.7944,1.857772e+20,2024,7,19,8,5.0


In [9]:
scaled_df['magnitude_type'].unique()

array([0., 1., 2.])

In [10]:
# Scaling continuous features
scaler = StandardScaler()

In [11]:
continious_feature_scale = ['latitude','longitude','depth', 'distance', 'gravity', 'force']
# Apply scaling
scaled_df[continious_feature_scale] = scaler.fit_transform(scaled_df[continious_feature_scale])

#### Cyclic encoding for time-related features

Time features often benefit more from encoding techniques that capture periodicity (like sine and cosine transformations) rather than standard scaling.

In [12]:
# Cyclic encoding for time-related features
scaled_df['month_sin'] = np.sin(2 * np.pi * scaled_df['month'] / 12)
scaled_df['month_cos'] = np.cos(2 * np.pi * scaled_df['month'] / 12)
scaled_df['hour_sin'] = np.sin(2 * np.pi * scaled_df['hour'] / 24)
scaled_df['hour_cos'] = np.cos(2 * np.pi * scaled_df['hour'] / 24)
scaled_df['minutes_sin'] = np.sin(2 * np.pi * scaled_df['minutes'] / 60)
scaled_df['minutes_cos'] = np.cos(2 * np.pi * scaled_df['minutes'] / 60)

In [13]:
scaled_df.head()

Unnamed: 0,latitude,longitude,depth,magnitude,magnitude_type,distance,gravity,force,year,month,hour,minutes,day_name,month_sin,month_cos,hour_sin,hour_cos,minutes_sin,minutes_cos
0,0.578374,0.576498,-0.804832,4.1,0.0,1.205836,0.254693,-1.166843,2024,7,21,41,5.0,-0.5,-0.866025,-0.707107,0.707107,-0.913545,-0.406737
1,-0.092464,-0.276457,-0.242839,2.5,0.0,0.654375,-0.849625,-0.68027,2024,7,21,10,5.0,-0.5,-0.866025,-0.707107,0.707107,0.866025,0.5
2,1.319964,-0.969834,1.611773,3.6,0.0,0.917097,1.719603,-0.915982,2024,7,19,53,5.0,-0.5,-0.866025,-0.965926,0.258819,-0.669131,0.743145
3,0.440801,0.967439,-0.613616,4.0,0.0,1.251187,-0.00824,-1.205488,2024,7,19,18,5.0,-0.5,-0.866025,-0.965926,0.258819,0.951057,-0.309017
4,0.353824,-0.675245,-0.67262,2.72,0.0,0.781542,-0.173512,-0.79527,2024,7,19,8,5.0,-0.5,-0.866025,-0.965926,0.258819,0.743145,0.669131


In [14]:
scaled_df['magnitude_type'].unique()

array([0., 1., 2.])

In [15]:
# Drop original month and hour columns after encoding
scaled_df = scaled_df.drop(columns=['month', 'hour','minutes'])

In [18]:
scaled_df.head(5)

Unnamed: 0,latitude,longitude,depth,magnitude,magnitude_type,distance,gravity,force,year,day_name,month_sin,month_cos,hour_sin,hour_cos,minutes_sin,minutes_cos
0,0.578374,0.576498,-0.804832,4.1,0.0,1.205836,0.254693,-1.166843,2024,5.0,-0.5,-0.866025,-0.707107,0.707107,-0.913545,-0.406737
1,-0.092464,-0.276457,-0.242839,2.5,0.0,0.654375,-0.849625,-0.68027,2024,5.0,-0.5,-0.866025,-0.707107,0.707107,0.866025,0.5
2,1.319964,-0.969834,1.611773,3.6,0.0,0.917097,1.719603,-0.915982,2024,5.0,-0.5,-0.866025,-0.965926,0.258819,-0.669131,0.743145
3,0.440801,0.967439,-0.613616,4.0,0.0,1.251187,-0.00824,-1.205488,2024,5.0,-0.5,-0.866025,-0.965926,0.258819,0.951057,-0.309017
4,0.353824,-0.675245,-0.67262,2.72,0.0,0.781542,-0.173512,-0.79527,2024,5.0,-0.5,-0.866025,-0.965926,0.258819,0.743145,0.669131


In [17]:
# scaled_df.to_csv("../datasets/final_dataset/cyclic_scaled_data.csv", index=False)