# 1. Time-Based Features

In [18]:
# Import libraries
import pandas as pd

# Load the datasets
jma_df = pd.read_csv('JMA_cleaned.csv')
bmkg_df = pd.read_csv('BMKG_cleaned.csv')

# show the first 5 rows of the JMA dataset
jma_df.head()

# show the first 5 rows of the BMKG dataset
#bmkg_df.head()

Unnamed: 0,Latitude,Longitude,Depth,Magnitude,Datetime
0,35.338333,133.211667,12,3.6,1985-12-31 02:26:48
1,36.406667,140.696667,55,3.3,1985-12-30 19:11:46
2,37.21,139.936667,6,3.5,1985-12-30 15:56:17
3,27.968333,129.665,0,4.2,1985-12-30 15:20:15
4,42.888333,145.445,35,3.7,1985-12-29 09:22:20


In [19]:
# Convert 'Datetime' column to datetime format
jma_df['Datetime'] = pd.to_datetime(jma_df['Datetime'], errors='coerce')
bmkg_df['Datetime'] = pd.to_datetime(bmkg_df['Datetime'], errors='coerce')

In [20]:
print(jma_df['Datetime'].head())
print(bmkg_df['Datetime'].head())

0   1985-12-31 02:26:48
1   1985-12-30 19:11:46
2   1985-12-30 15:56:17
3   1985-12-30 15:20:15
4   1985-12-29 09:22:20
Name: Datetime, dtype: datetime64[ns]
0   2008-11-01 21:02:43.058
1   2008-11-01 20:58:50.248
2   2008-11-01 17:43:12.941
3   2008-11-01 16:24:14.755
4   2008-11-01 16:20:37.327
Name: Datetime, dtype: datetime64[ns]


In [21]:
# Extract Year, Month, Day, Day of the Week, and Hour from the Datetime column.
jma_df['Year'] = jma_df['Datetime'].dt.year
jma_df['Month'] = jma_df['Datetime'].dt.month
jma_df['Day'] = jma_df['Datetime'].dt.day
jma_df['DayOfWeek'] = jma_df['Datetime'].dt.dayofweek
jma_df['Hour'] = jma_df['Datetime'].dt.hour
jma_df.head()

Unnamed: 0,Latitude,Longitude,Depth,Magnitude,Datetime,Year,Month,Day,DayOfWeek,Hour
0,35.338333,133.211667,12,3.6,1985-12-31 02:26:48,1985,12,31,1,2
1,36.406667,140.696667,55,3.3,1985-12-30 19:11:46,1985,12,30,0,19
2,37.21,139.936667,6,3.5,1985-12-30 15:56:17,1985,12,30,0,15
3,27.968333,129.665,0,4.2,1985-12-30 15:20:15,1985,12,30,0,15
4,42.888333,145.445,35,3.7,1985-12-29 09:22:20,1985,12,29,6,9


In [22]:
# Repeat for the BMKG dataset
bmkg_df['Year'] = bmkg_df['Datetime'].dt.year
bmkg_df['Month'] = bmkg_df['Datetime'].dt.month
bmkg_df['Day'] = bmkg_df['Datetime'].dt.day
bmkg_df['DayOfWeek'] = bmkg_df['Datetime'].dt.dayofweek
bmkg_df['Hour'] = bmkg_df['Datetime'].dt.hour
bmkg_df.head()

Unnamed: 0,Latitude,Longitude,Depth,Magnitude,Datetime,Year,Month,Day,DayOfWeek,Hour
0,-9.18,119.06,10,4.9,2008-11-01 21:02:43.058,2008,11,1,5,21
1,-6.55,129.64,10,4.6,2008-11-01 20:58:50.248,2008,11,1,5,20
2,-7.01,106.63,121,3.7,2008-11-01 17:43:12.941,2008,11,1,5,17
3,-3.3,127.85,10,3.2,2008-11-01 16:24:14.755,2008,11,1,5,16
4,-6.41,129.54,70,4.3,2008-11-01 16:20:37.327,2008,11,1,5,16


# 2. Magnitude Category

In [23]:
# Define bins and labels
bins = [0, 3, 5, 7, float('inf')]
labels = ['Low', 'Moderate', 'High', 'Very High']

# Apply to both datasets
jma_df['MagnitudeCategory'] = pd.cut(jma_df['Magnitude'], bins=bins, labels=labels)
bmkg_df['MagnitudeCategory'] = pd.cut(bmkg_df['Magnitude'], bins=bins, labels=labels)

# 3. Time Difference Between Events

In [24]:
# Calculate time difference between consecutive events
jma_df['TimeDiff'] = jma_df['Datetime'].diff().dt.total_seconds() / 3600  # in hours
bmkg_df['TimeDiff'] = bmkg_df['Datetime'].diff().dt.total_seconds() / 3600  # in hours

# Fill NaN in the first row with 0 (no previous event to compare)
jma_df['TimeDiff'].fillna(0, inplace=True)
bmkg_df['TimeDiff'].fillna(0, inplace=True)

In [25]:
jma_df.head()
bmkg_df.head()

Unnamed: 0,Latitude,Longitude,Depth,Magnitude,Datetime,Year,Month,Day,DayOfWeek,Hour,MagnitudeCategory,TimeDiff
0,-9.18,119.06,10,4.9,2008-11-01 21:02:43.058,2008,11,1,5,21,Moderate,0.0
1,-6.55,129.64,10,4.6,2008-11-01 20:58:50.248,2008,11,1,5,20,Moderate,-0.064669
2,-7.01,106.63,121,3.7,2008-11-01 17:43:12.941,2008,11,1,5,17,Moderate,-3.260363
3,-3.3,127.85,10,3.2,2008-11-01 16:24:14.755,2008,11,1,5,16,Moderate,-1.316163
4,-6.41,129.54,70,4.3,2008-11-01 16:20:37.327,2008,11,1,5,16,Moderate,-0.060397


In [26]:
# 7-day rolling mean for magnitude and depth
jma_df['RollingMag'] = jma_df['Magnitude'].rolling(window=7).mean()
bmkg_df['RollingMag'] = bmkg_df['Magnitude'].rolling(window=7).mean()
jma_df['RollingDepth'] = jma_df['Depth'].rolling(window=7).mean()
bmkg_df['RollingDepth'] = bmkg_df['Depth'].rolling(window=7).mean()

In [27]:
jma_df.head()
bmkg_df.head()

Unnamed: 0,Latitude,Longitude,Depth,Magnitude,Datetime,Year,Month,Day,DayOfWeek,Hour,MagnitudeCategory,TimeDiff,RollingMag,RollingDepth
0,-9.18,119.06,10,4.9,2008-11-01 21:02:43.058,2008,11,1,5,21,Moderate,0.0,,
1,-6.55,129.64,10,4.6,2008-11-01 20:58:50.248,2008,11,1,5,20,Moderate,-0.064669,,
2,-7.01,106.63,121,3.7,2008-11-01 17:43:12.941,2008,11,1,5,17,Moderate,-3.260363,,
3,-3.3,127.85,10,3.2,2008-11-01 16:24:14.755,2008,11,1,5,16,Moderate,-1.316163,,
4,-6.41,129.54,70,4.3,2008-11-01 16:20:37.327,2008,11,1,5,16,Moderate,-0.060397,,


In [28]:
# drop the 'RollingMag' and 'RollingDepth' columns
jma_df.drop(['RollingMag', 'RollingDepth'], axis=1, inplace=True)
bmkg_df.drop(['RollingMag', 'RollingDepth'], axis=1, inplace=True)


In [29]:
jma_df.head()
bmkg_df.head()

Unnamed: 0,Latitude,Longitude,Depth,Magnitude,Datetime,Year,Month,Day,DayOfWeek,Hour,MagnitudeCategory,TimeDiff
0,-9.18,119.06,10,4.9,2008-11-01 21:02:43.058,2008,11,1,5,21,Moderate,0.0
1,-6.55,129.64,10,4.6,2008-11-01 20:58:50.248,2008,11,1,5,20,Moderate,-0.064669
2,-7.01,106.63,121,3.7,2008-11-01 17:43:12.941,2008,11,1,5,17,Moderate,-3.260363
3,-3.3,127.85,10,3.2,2008-11-01 16:24:14.755,2008,11,1,5,16,Moderate,-1.316163
4,-6.41,129.54,70,4.3,2008-11-01 16:20:37.327,2008,11,1,5,16,Moderate,-0.060397


In [30]:
# Save the datasets
jma_df.to_csv('JMA_features.csv', index=False)
#bmkg_df.to_csv('BMKG_features.csv', index=False)


In [2]:
bmkg_df.fillna(bmkg_df.mean(), inplace=True)  # Replace missing values with the column mean

In [3]:
jma_df.fillna(jma_df.mean(), inplace=True)  # Replace missing values with the column mean

In [4]:
# Normalize Numerical Features
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
jma_df[['Magnitude', 'Depth', 'Latitude', 'Longitude']] = scaler.fit_transform(jma_df[['Magnitude', 'Depth', 'Latitude', 'Longitude']])
bmkg_df[['Magnitude', 'Depth', 'Latitude', 'Longitude']] = scaler.fit_transform(bmkg_df[['Magnitude', 'Depth', 'Latitude', 'Longitude']])


In [5]:
jma_df.head()
bmkg_df.head()

Unnamed: 0,Latitude,Longitude,Depth,Magnitude,Datetime
0,0.107059,0.521884,0.010695,0.565217,2008-11-01 21:02:43.058
1,0.261765,0.742393,0.010695,0.521739,2008-11-01 20:58:50.248
2,0.234706,0.262818,0.159091,0.391304,2008-11-01 17:43:12.941
3,0.452941,0.705085,0.010695,0.318841,2008-11-01 16:24:14.755
4,0.27,0.740308,0.090909,0.478261,2008-11-01 16:20:37.327


In [8]:
# add the 'MagnitudeCategory' column to jma_df
jma_df['MagnitudeCategory'] = jma_df['MagnitudeCategory'].astype('category')

KeyError: 'MagnitudeCategory'

In [9]:
# Encode Categorical Features
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
#jma_encoded = encoder.fit_transform(jma_df[['MagnitudeCategory']]).toarray()
bmkg_encoded = encoder.fit_transform(bmkg_df[['MagnitudeCategory']]).toarray()


KeyError: "None of [Index(['MagnitudeCategory'], dtype='object')] are in the [columns]"

In [10]:
# Add Temporal Context
jma_df['Year'] = jma_df['Year'].astype('category')
jma_df['Month'] = jma_df['Month'].astype('category')
jma_df['Day'] = jma_df['Day'].astype('category')
jma_df['DayOfWeek'] = jma_df['DayOfWeek'].astype('category')
jma_df['Hour'] = jma_df['Hour'].astype('category')

bmkg_df['Year'] = bmkg_df['Year'].astype('category')
bmkg_df['Month'] = bmkg_df['Month'].astype('category')
bmkg_df['Day'] = bmkg_df['Day'].astype('category')
bmkg_df['DayOfWeek'] = bmkg_df['DayOfWeek'].astype('category')
bmkg_df['Hour'] = bmkg_df['Hour'].astype('category')


KeyError: 'Year'

In [11]:
# One-hot encode the temporal features
jma_encoded = encoder.fit_transform(jma_df[['Year', 'Month', 'Day', 'DayOfWeek', 'Hour']]).toarray()
bmkg_encoded = encoder.fit_transform(bmkg_df[['Year', 'Month', 'Day', 'DayOfWeek', 'Hour']]).toarray()


KeyError: "None of [Index(['Year', 'Month', 'Day', 'DayOfWeek', 'Hour'], dtype='object')] are in the [columns]"

In [12]:
correlation_matrix = bmkg_df.corr()
target_corr = correlation_matrix['Magnitude']
selected_features = target_corr[target_corr.abs() > 0.1].index
X_selected = bmkg_df[selected_features]

In [13]:
X_selected.head()

Unnamed: 0,Latitude,Longitude,Depth,Magnitude
0,0.107059,0.521884,0.010695,0.565217
1,0.261765,0.742393,0.010695,0.521739
2,0.234706,0.262818,0.159091,0.391304
3,0.452941,0.705085,0.010695,0.318841
4,0.27,0.740308,0.090909,0.478261


In [14]:
# Split the Data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_selected, bmkg_df['Magnitude'], test_size=0.2, random_state=42)

In [15]:
# save the selected features to a new CSV file for bmkg dataset as numpy array
X_selected.to_csv('BMKG_selected_features.csv', index=False)