In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the dataset
bmkg_data = pd.read_csv('BMKG_dataset.csv')  # Replace with the correct path
print(bmkg_data.head())

          tgl            ot   lat     lon  depth  mag  \
0  2008/11/01  21:02:43.058 -9.18  119.06     10  4.9   
1  2008/11/01  20:58:50.248 -6.55  129.64     10  4.6   
2  2008/11/01  17:43:12.941 -7.01  106.63    121  3.7   
3  2008/11/01  16:24:14.755 -3.30  127.85     10  3.2   
4  2008/11/01  16:20:37.327 -6.41  129.54     70  4.3   

                     remark  strike1  dip1  rake1  strike2  dip2  rake2  
0  Sumba Region - Indonesia      NaN   NaN    NaN      NaN   NaN    NaN  
1                 Banda Sea      NaN   NaN    NaN      NaN   NaN    NaN  
2          Java - Indonesia      NaN   NaN    NaN      NaN   NaN    NaN  
3         Seram - Indonesia      NaN   NaN    NaN      NaN   NaN    NaN  
4                 Banda Sea      NaN   NaN    NaN      NaN   NaN    NaN  


In [2]:
# Combine date and time into a single Datetime column
bmkg_data['Datetime'] = pd.to_datetime(bmkg_data['tgl'] + ' ' + bmkg_data['ot'])

# Show the data
print(bmkg_data.head())

          tgl            ot   lat     lon  depth  mag  \
0  2008/11/01  21:02:43.058 -9.18  119.06     10  4.9   
1  2008/11/01  20:58:50.248 -6.55  129.64     10  4.6   
2  2008/11/01  17:43:12.941 -7.01  106.63    121  3.7   
3  2008/11/01  16:24:14.755 -3.30  127.85     10  3.2   
4  2008/11/01  16:20:37.327 -6.41  129.54     70  4.3   

                     remark  strike1  dip1  rake1  strike2  dip2  rake2  \
0  Sumba Region - Indonesia      NaN   NaN    NaN      NaN   NaN    NaN   
1                 Banda Sea      NaN   NaN    NaN      NaN   NaN    NaN   
2          Java - Indonesia      NaN   NaN    NaN      NaN   NaN    NaN   
3         Seram - Indonesia      NaN   NaN    NaN      NaN   NaN    NaN   
4                 Banda Sea      NaN   NaN    NaN      NaN   NaN    NaN   

                 Datetime  
0 2008-11-01 21:02:43.058  
1 2008-11-01 20:58:50.248  
2 2008-11-01 17:43:12.941  
3 2008-11-01 16:24:14.755  
4 2008-11-01 16:20:37.327  


In [3]:
# Feature Engineering
bmkg_data['Year'] = bmkg_data['Datetime'].dt.year
bmkg_data['Month'] = bmkg_data['Datetime'].dt.month
bmkg_data['Day'] = bmkg_data['Datetime'].dt.day
bmkg_data['Hour'] = bmkg_data['Datetime'].dt.hour
bmkg_data['DayOfWeek'] = bmkg_data['Datetime'].dt.dayofweek
bmkg_data['Minutes'] = bmkg_data['Datetime'].dt.minute

In [4]:
# Time difference (in seconds) from the previous event
bmkg_data['TimeDiff'] = bmkg_data['Datetime'].diff().dt.total_seconds().fillna(0)

# Show the data
print(bmkg_data.head())

          tgl            ot   lat     lon  depth  mag  \
0  2008/11/01  21:02:43.058 -9.18  119.06     10  4.9   
1  2008/11/01  20:58:50.248 -6.55  129.64     10  4.6   
2  2008/11/01  17:43:12.941 -7.01  106.63    121  3.7   
3  2008/11/01  16:24:14.755 -3.30  127.85     10  3.2   
4  2008/11/01  16:20:37.327 -6.41  129.54     70  4.3   

                     remark  strike1  dip1  rake1  ...  dip2  rake2  \
0  Sumba Region - Indonesia      NaN   NaN    NaN  ...   NaN    NaN   
1                 Banda Sea      NaN   NaN    NaN  ...   NaN    NaN   
2          Java - Indonesia      NaN   NaN    NaN  ...   NaN    NaN   
3         Seram - Indonesia      NaN   NaN    NaN  ...   NaN    NaN   
4                 Banda Sea      NaN   NaN    NaN  ...   NaN    NaN   

                 Datetime  Year  Month  Day  Hour  DayOfWeek  Minutes  \
0 2008-11-01 21:02:43.058  2008     11    1    21          5        2   
1 2008-11-01 20:58:50.248  2008     11    1    20          5       58   
2 2008-11-0

In [5]:
# Drop unnecessary columns
bmkg_data = bmkg_data.drop(columns=['tgl', 'ot', 'remark', 'strike1', 'dip1', 'rake1', 'strike2', 'dip2', 'rake2'])

# Show the data
print(bmkg_data.head())

    lat     lon  depth  mag                Datetime  Year  Month  Day  Hour  \
0 -9.18  119.06     10  4.9 2008-11-01 21:02:43.058  2008     11    1    21   
1 -6.55  129.64     10  4.6 2008-11-01 20:58:50.248  2008     11    1    20   
2 -7.01  106.63    121  3.7 2008-11-01 17:43:12.941  2008     11    1    17   
3 -3.30  127.85     10  3.2 2008-11-01 16:24:14.755  2008     11    1    16   
4 -6.41  129.54     70  4.3 2008-11-01 16:20:37.327  2008     11    1    16   

   DayOfWeek  Minutes   TimeDiff  
0          5        2      0.000  
1          5       58   -232.810  
2          5       43 -11737.307  
3          5       24  -4738.186  
4          5       20   -217.428  


In [6]:
# Standardize column names
bmkg_data.rename(columns={'mag': 'Magnitude', 'depth': 'Depth', 'lat': 'Latitude', 'lon': 'Longitude'}, inplace=True)

# Show the data
print(bmkg_data.head())

   Latitude  Longitude  Depth  Magnitude                Datetime  Year  Month  \
0     -9.18     119.06     10        4.9 2008-11-01 21:02:43.058  2008     11   
1     -6.55     129.64     10        4.6 2008-11-01 20:58:50.248  2008     11   
2     -7.01     106.63    121        3.7 2008-11-01 17:43:12.941  2008     11   
3     -3.30     127.85     10        3.2 2008-11-01 16:24:14.755  2008     11   
4     -6.41     129.54     70        4.3 2008-11-01 16:20:37.327  2008     11   

   Day  Hour  DayOfWeek  Minutes   TimeDiff  
0    1    21          5        2      0.000  
1    1    20          5       58   -232.810  
2    1    17          5       43 -11737.307  
3    1    16          5       24  -4738.186  
4    1    16          5       20   -217.428  


In [7]:
# Drop the `Datetime` column (optional, as time features are extracted)
bmkg_data = bmkg_data.drop(columns=['Datetime'])

# Show the data
print(bmkg_data.head())

   Latitude  Longitude  Depth  Magnitude  Year  Month  Day  Hour  DayOfWeek  \
0     -9.18     119.06     10        4.9  2008     11    1    21          5   
1     -6.55     129.64     10        4.6  2008     11    1    20          5   
2     -7.01     106.63    121        3.7  2008     11    1    17          5   
3     -3.30     127.85     10        3.2  2008     11    1    16          5   
4     -6.41     129.54     70        4.3  2008     11    1    16          5   

   Minutes   TimeDiff  
0        2      0.000  
1       58   -232.810  
2       43 -11737.307  
3       24  -4738.186  
4       20   -217.428  


In [8]:
# Drop rows with missing values
bmkg_data = bmkg_data.dropna()

# Show the data
print(bmkg_data.head())

   Latitude  Longitude  Depth  Magnitude  Year  Month  Day  Hour  DayOfWeek  \
0     -9.18     119.06     10        4.9  2008     11    1    21          5   
1     -6.55     129.64     10        4.6  2008     11    1    20          5   
2     -7.01     106.63    121        3.7  2008     11    1    17          5   
3     -3.30     127.85     10        3.2  2008     11    1    16          5   
4     -6.41     129.54     70        4.3  2008     11    1    16          5   

   Minutes   TimeDiff  
0        2      0.000  
1       58   -232.810  
2       43 -11737.307  
3       24  -4738.186  
4       20   -217.428  


In [9]:
# column length
print(len(bmkg_data.columns))
# row length
print(len(bmkg_data))

11
92887


In [10]:
# Select features (X) and target (y)
X_bmkg = bmkg_data.drop(columns=['Magnitude'])
y_bmkg = bmkg_data['Magnitude']

# Show the data
print(X_bmkg.head())
print(y_bmkg.head())

   Latitude  Longitude  Depth  Year  Month  Day  Hour  DayOfWeek  Minutes  \
0     -9.18     119.06     10  2008     11    1    21          5        2   
1     -6.55     129.64     10  2008     11    1    20          5       58   
2     -7.01     106.63    121  2008     11    1    17          5       43   
3     -3.30     127.85     10  2008     11    1    16          5       24   
4     -6.41     129.54     70  2008     11    1    16          5       20   

    TimeDiff  
0      0.000  
1   -232.810  
2 -11737.307  
3  -4738.186  
4   -217.428  
0    4.9
1    4.6
2    3.7
3    3.2
4    4.3
Name: Magnitude, dtype: float64


In [11]:
# Ensure y_bmkg_transformed is in a numeric format
y_bmkg = y_bmkg.astype(float)

# Verify the new data type
print(f"Type of y_bmkg: {y_bmkg.dtype}")

Type of y_bmkg: float64


In [12]:
print("Original shape of x:", X_bmkg.shape)
print("Original shape of y:", y_bmkg.shape)

Original shape of x: (92887, 10)
Original shape of y: (92887,)


In [13]:
# Step 7: Reshape for CNN-GRU (3D)
import numpy as np
import pandas as pd

def create_sliding_window(data, labels, window_size):
    """
    Create sliding windows for data and corresponding labels.

    Args:
        data (numpy.ndarray or pandas.DataFrame): Input features with shape (n_samples, n_features).
        labels (pandas.Series): Corresponding labels as a pandas.Series.
        window_size (int): The number of time steps for each sliding window.

    Returns:
        tuple: Transformed data and labels in the shape (n_windows, window_size, n_features) and (n_windows,).
    """
    # Reset indices for labels to ensure numeric indexing
    labels = labels.reset_index(drop=True)

    x_seq = []
    y_seq = []

    # Iterate to create sliding windows
    for i in range(len(data) - window_size + 1):
        x_seq.append(data[i:i + window_size])  # Append sliding window for features
        y_seq.append(labels[i + window_size - 1])  # Append corresponding label (last step in window)

    return np.array(x_seq), np.array(y_seq)

In [14]:
# Example usage for debugging
# Assuming X_bmkg is a numpy array or pandas DataFrame and y_bmkg a pandas Series
window_size = 10  # Number of time steps
X_bmkg, y_bmkg= create_sliding_window(X_bmkg, y_bmkg, window_size)

print(f"New X_bmkg shape: {X_bmkg.shape}")  # Outputs new shape
print(f"New y_bmkg shape: {y_bmkg.shape}")  # Outputs new shape

New X_bmkg shape: (92878, 10, 10)
New y_bmkg shape: (92878,)


In [15]:
# Save preprocessed data
np.save("X_bmkg_cnn_gru.npy", X_bmkg)
np.save("y_bmkg_cnn_gru.npy", y_bmkg)
# Verify the new shape
print("Reshaped for CNN-GRU:")
print(f"X_bmkg shape: {X_bmkg.shape}")
print(f"y_bmkg shape: {y_bmkg.shape}")

Reshaped for CNN-GRU:
X_bmkg shape: (92878, 10, 10)
y_bmkg shape: (92878,)


In [16]:
import numpy as np

# Check the saved data shape
X_bmkg = np.load('X_bmkg_cnn_gru.npy')  # Loading the first array

# Enable allow_pickle=True to properly load object arrays
y_bmkg = np.load('y_bmkg_cnn_gru.npy', allow_pickle=True)

print("Loaded X_bmkg shape:", X_bmkg.shape)
print("Loaded y_bmkg shape:", y_bmkg.shape)


Loaded X_bmkg shape: (92878, 10, 10)
Loaded y_bmkg shape: (92878,)
