In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the dataset
bmkg_data = pd.read_csv('BMKG_dataset.csv')  # Replace with the correct path
print(bmkg_data.head())

          tgl            ot   lat     lon  depth  mag  \
0  2008/11/01  21:02:43.058 -9.18  119.06     10  4.9   
1  2008/11/01  20:58:50.248 -6.55  129.64     10  4.6   
2  2008/11/01  17:43:12.941 -7.01  106.63    121  3.7   
3  2008/11/01  16:24:14.755 -3.30  127.85     10  3.2   
4  2008/11/01  16:20:37.327 -6.41  129.54     70  4.3   

                     remark  strike1  dip1  rake1  strike2  dip2  rake2  
0  Sumba Region - Indonesia      NaN   NaN    NaN      NaN   NaN    NaN  
1                 Banda Sea      NaN   NaN    NaN      NaN   NaN    NaN  
2          Java - Indonesia      NaN   NaN    NaN      NaN   NaN    NaN  
3         Seram - Indonesia      NaN   NaN    NaN      NaN   NaN    NaN  
4                 Banda Sea      NaN   NaN    NaN      NaN   NaN    NaN  


In [16]:
# Combine date and time into a single Datetime column
bmkg_data['Datetime'] = pd.to_datetime(bmkg_data['tgl'] + ' ' + bmkg_data['ot'])

# Show the data
print(bmkg_data.head())

          tgl            ot   lat     lon  depth  mag  \
0  2008/11/01  21:02:43.058 -9.18  119.06     10  4.9   
1  2008/11/01  20:58:50.248 -6.55  129.64     10  4.6   
2  2008/11/01  17:43:12.941 -7.01  106.63    121  3.7   
3  2008/11/01  16:24:14.755 -3.30  127.85     10  3.2   
4  2008/11/01  16:20:37.327 -6.41  129.54     70  4.3   

                     remark  strike1  dip1  rake1  strike2  dip2  rake2  \
0  Sumba Region - Indonesia      NaN   NaN    NaN      NaN   NaN    NaN   
1                 Banda Sea      NaN   NaN    NaN      NaN   NaN    NaN   
2          Java - Indonesia      NaN   NaN    NaN      NaN   NaN    NaN   
3         Seram - Indonesia      NaN   NaN    NaN      NaN   NaN    NaN   
4                 Banda Sea      NaN   NaN    NaN      NaN   NaN    NaN   

                 Datetime  
0 2008-11-01 21:02:43.058  
1 2008-11-01 20:58:50.248  
2 2008-11-01 17:43:12.941  
3 2008-11-01 16:24:14.755  
4 2008-11-01 16:20:37.327  


In [17]:
# Feature Engineering
bmkg_data['Year'] = bmkg_data['Datetime'].dt.year
bmkg_data['Month'] = bmkg_data['Datetime'].dt.month
bmkg_data['Day'] = bmkg_data['Datetime'].dt.day
bmkg_data['Hour'] = bmkg_data['Datetime'].dt.hour
bmkg_data['Minutes'] = bmkg_data['Datetime'].dt.minute
bmkg_data['DayOfWeek'] = bmkg_data['Datetime'].dt.dayofweek

In [18]:
# Time difference (in seconds) from the previous event
bmkg_data['TimeDiff'] = bmkg_data['Datetime'].diff().dt.total_seconds().fillna(0)

# Show the data
print(bmkg_data.head())

          tgl            ot   lat     lon  depth  mag  \
0  2008/11/01  21:02:43.058 -9.18  119.06     10  4.9   
1  2008/11/01  20:58:50.248 -6.55  129.64     10  4.6   
2  2008/11/01  17:43:12.941 -7.01  106.63    121  3.7   
3  2008/11/01  16:24:14.755 -3.30  127.85     10  3.2   
4  2008/11/01  16:20:37.327 -6.41  129.54     70  4.3   

                     remark  strike1  dip1  rake1  ...  dip2  rake2  \
0  Sumba Region - Indonesia      NaN   NaN    NaN  ...   NaN    NaN   
1                 Banda Sea      NaN   NaN    NaN  ...   NaN    NaN   
2          Java - Indonesia      NaN   NaN    NaN  ...   NaN    NaN   
3         Seram - Indonesia      NaN   NaN    NaN  ...   NaN    NaN   
4                 Banda Sea      NaN   NaN    NaN  ...   NaN    NaN   

                 Datetime  Year  Month  Day  Hour  Minutes  DayOfWeek  \
0 2008-11-01 21:02:43.058  2008     11    1    21        2          5   
1 2008-11-01 20:58:50.248  2008     11    1    20       58          5   
2 2008-11-0

In [19]:
# Drop unnecessary columns
bmkg_data = bmkg_data.drop(columns=['tgl', 'ot', 'remark', 'strike1', 'dip1', 'rake1', 'strike2', 'dip2', 'rake2'])

# Show the data
print(bmkg_data.head())

    lat     lon  depth  mag                Datetime  Year  Month  Day  Hour  \
0 -9.18  119.06     10  4.9 2008-11-01 21:02:43.058  2008     11    1    21   
1 -6.55  129.64     10  4.6 2008-11-01 20:58:50.248  2008     11    1    20   
2 -7.01  106.63    121  3.7 2008-11-01 17:43:12.941  2008     11    1    17   
3 -3.30  127.85     10  3.2 2008-11-01 16:24:14.755  2008     11    1    16   
4 -6.41  129.54     70  4.3 2008-11-01 16:20:37.327  2008     11    1    16   

   Minutes  DayOfWeek   TimeDiff  
0        2          5      0.000  
1       58          5   -232.810  
2       43          5 -11737.307  
3       24          5  -4738.186  
4       20          5   -217.428  


In [20]:
# Standardize column names
bmkg_data.rename(columns={'mag': 'Magnitude', 'depth': 'Depth', 'lat': 'Latitude', 'lon': 'Longitude'}, inplace=True)

# Show the data
print(bmkg_data.head())

   Latitude  Longitude  Depth  Magnitude                Datetime  Year  Month  \
0     -9.18     119.06     10        4.9 2008-11-01 21:02:43.058  2008     11   
1     -6.55     129.64     10        4.6 2008-11-01 20:58:50.248  2008     11   
2     -7.01     106.63    121        3.7 2008-11-01 17:43:12.941  2008     11   
3     -3.30     127.85     10        3.2 2008-11-01 16:24:14.755  2008     11   
4     -6.41     129.54     70        4.3 2008-11-01 16:20:37.327  2008     11   

   Day  Hour  Minutes  DayOfWeek   TimeDiff  
0    1    21        2          5      0.000  
1    1    20       58          5   -232.810  
2    1    17       43          5 -11737.307  
3    1    16       24          5  -4738.186  
4    1    16       20          5   -217.428  


In [21]:
# Drop the `Datetime` column (optional, as time features are extracted)
bmkg_data = bmkg_data.drop(columns=['Datetime'])

# Show the data
print(bmkg_data.head())

   Latitude  Longitude  Depth  Magnitude  Year  Month  Day  Hour  Minutes  \
0     -9.18     119.06     10        4.9  2008     11    1    21        2   
1     -6.55     129.64     10        4.6  2008     11    1    20       58   
2     -7.01     106.63    121        3.7  2008     11    1    17       43   
3     -3.30     127.85     10        3.2  2008     11    1    16       24   
4     -6.41     129.54     70        4.3  2008     11    1    16       20   

   DayOfWeek   TimeDiff  
0          5      0.000  
1          5   -232.810  
2          5 -11737.307  
3          5  -4738.186  
4          5   -217.428  


In [22]:
print(bmkg_data.head())
# Drop rows with missing values
bmkg_data = bmkg_data.dropna()

# Show the data
print(bmkg_data.head())

   Latitude  Longitude  Depth  Magnitude  Year  Month  Day  Hour  Minutes  \
0     -9.18     119.06     10        4.9  2008     11    1    21        2   
1     -6.55     129.64     10        4.6  2008     11    1    20       58   
2     -7.01     106.63    121        3.7  2008     11    1    17       43   
3     -3.30     127.85     10        3.2  2008     11    1    16       24   
4     -6.41     129.54     70        4.3  2008     11    1    16       20   

   DayOfWeek   TimeDiff  
0          5      0.000  
1          5   -232.810  
2          5 -11737.307  
3          5  -4738.186  
4          5   -217.428  
   Latitude  Longitude  Depth  Magnitude  Year  Month  Day  Hour  Minutes  \
0     -9.18     119.06     10        4.9  2008     11    1    21        2   
1     -6.55     129.64     10        4.6  2008     11    1    20       58   
2     -7.01     106.63    121        3.7  2008     11    1    17       43   
3     -3.30     127.85     10        3.2  2008     11    1    16       24

In [23]:
# Select features (X) and target (y)
X_bmkg = bmkg_data.drop(columns=['Magnitude'])
y_bmkg = bmkg_data['Magnitude']

# Show the data
print(X_bmkg.head())
print(y_bmkg.head())

   Latitude  Longitude  Depth  Year  Month  Day  Hour  Minutes  DayOfWeek  \
0     -9.18     119.06     10  2008     11    1    21        2          5   
1     -6.55     129.64     10  2008     11    1    20       58          5   
2     -7.01     106.63    121  2008     11    1    17       43          5   
3     -3.30     127.85     10  2008     11    1    16       24          5   
4     -6.41     129.54     70  2008     11    1    16       20          5   

    TimeDiff  
0      0.000  
1   -232.810  
2 -11737.307  
3  -4738.186  
4   -217.428  
0    4.9
1    4.6
2    3.7
3    3.2
4    4.3
Name: Magnitude, dtype: float64


In [24]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Initialize scalers
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

# Normalize features (X)
X_bmkg_scaled = scaler_X.fit_transform(X_bmkg)

# Normalize target (y)
y_bmkg_scaled = scaler_y.fit_transform(y_bmkg.values.reshape(-1, 1))  # Convert Series to NumPy array and reshape

In [25]:
# Display the normalized features and target
print("Normalized Features:")
print(X_bmkg_scaled)
print("\nNormalized Target:")
print(y_bmkg_scaled)

# Save the scalers
np.save("scaler_X.npy", scaler_X)
np.save("scaler_y.npy", scaler_y)


Normalized Features:
[[0.10705882 0.52188412 0.01069519 ... 0.03389831 0.83333333 0.33210184]
 [0.26176471 0.74239266 0.01069519 ... 0.98305085 0.83333333 0.3320872 ]
 [0.23470588 0.26281784 0.15909091 ... 0.72881356 0.83333333 0.3313637 ]
 ...
 [0.18647059 0.56377657 0.01069519 ... 0.96610169 0.5        0.33203423]
 [0.82352941 0.69070446 0.01069519 ... 0.77966102 0.5        0.33206109]
 [0.12529412 0.5195915  0.01069519 ... 0.         0.5        0.33170276]]

Normalized Target:
[[0.56521739]
 [0.52173913]
 [0.39130435]
 ...
 [0.4057971 ]
 [0.44927536]
 [0.20289855]]


In [26]:
# Reshape for CNN-GRU (if needed, based on time steps)
time_steps = 5
samples = X_bmkg_scaled.shape[0] - time_steps + 1  # Calculate the number of samples
features = X_bmkg_scaled.shape[1]


In [27]:
#show the data
print(samples)
print(features)

92883
10


In [28]:
# show the data shape
print(X_bmkg_scaled.shape)
print(y_bmkg_scaled.shape)

(92887, 10)
(92887, 1)


In [29]:
# Create sequences of time steps
X_bmkg_reshaped = np.array([X_bmkg_scaled[i:i+time_steps] for i in range(samples)])
y_bmkg_reshaped = y_bmkg_scaled[time_steps-1:]  # Align the target with the sequences

# Verify the shape of the reshaped data
print("\nReshaped Data:")
print(f"X_bmkg shape: {X_bmkg_reshaped.shape}")
print(f"y_bmkg shape: {y_bmkg_reshaped.shape}")



Reshaped Data:
X_bmkg shape: (92883, 5, 10)
y_bmkg shape: (92883, 1)


In [30]:
# Save preprocessed data
np.save("X_bmkg_cnn_bilstm.npy", X_bmkg_reshaped)
np.save("y_bmkg_cnn_bilstm.npy", y_bmkg_reshaped)

print("Preprocessing complete. Data saved.")

Preprocessing complete. Data saved.


In [31]:
# Check the saved data shape
X_bmkg = np.load("X_bmkg_cnn_bilstm.npy")  # Loading the first array
y_bmkg = np.load("y_bmkg_cnn_bilstm.npy")  # Loading the second array

# Print results
print(f"X_bmkg shape: {X_bmkg.shape}")
print(f"y_bmkg shape: {y_bmkg.shape}")


X_bmkg shape: (92883, 5, 10)
y_bmkg shape: (92883, 1)
