In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the dataset
jma_df = pd.read_csv('JMA_dataset.csv')  # Replace with the correct path
print(jma_df.head())

         Date      Time    震央地名        Lat        Long  Depth    M 最大震度  \
0  12/31/1985   2:26:48   島根県東部  35°20.3'N  133°12.7'E     12  3.6  震度１   
1  12/30/1985  19:11:46    茨城県沖  36°24.4'N  140°41.8'E     55  3.3  震度１   
2  12/30/1985  15:56:17   福島県会津  37°12.6'N  139°56.2'E      6  3.5  震度１   
3  12/30/1985  15:20:15  奄美大島近海  27°58.1'N  129°39.9'E      0  4.2  震度１   
4  12/29/1985   9:22:20     釧路沖  42°53.3'N  145°26.7'E     35  3.7  震度２   

  Source.Name  
0         NaN  
1         NaN  
2         NaN  
3         NaN  
4         NaN  


In [7]:
# Step 1: Combine Date and Time into a single `Datetime` column
jma_df['Datetime'] = pd.to_datetime(jma_df['Date'] + ' ' + jma_df['Time'], format='%m/%d/%Y %H:%M:%S')

In [8]:
#show the data
print(jma_df.head())

         Date      Time    震央地名        Lat        Long  Depth    M 最大震度  \
0  12/31/1985   2:26:48   島根県東部  35°20.3'N  133°12.7'E     12  3.6  震度１   
1  12/30/1985  19:11:46    茨城県沖  36°24.4'N  140°41.8'E     55  3.3  震度１   
2  12/30/1985  15:56:17   福島県会津  37°12.6'N  139°56.2'E      6  3.5  震度１   
3  12/30/1985  15:20:15  奄美大島近海  27°58.1'N  129°39.9'E      0  4.2  震度１   
4  12/29/1985   9:22:20     釧路沖  42°53.3'N  145°26.7'E     35  3.7  震度２   

  Source.Name            Datetime  
0         NaN 1985-12-31 02:26:48  
1         NaN 1985-12-30 19:11:46  
2         NaN 1985-12-30 15:56:17  
3         NaN 1985-12-30 15:20:15  
4         NaN 1985-12-29 09:22:20  


In [9]:
#change the column names
jma_df.rename(columns={'Lat':'Latitude', 'Long':'Longitude', 'Depth':'Depth', 'M':'Magnitude'}, inplace=True)

# Display the first few rows of the updated dataset
print("\nJMA Dataset:")
print(jma_df.head())


JMA Dataset:
         Date      Time    震央地名   Latitude   Longitude  Depth Magnitude 最大震度  \
0  12/31/1985   2:26:48   島根県東部  35°20.3'N  133°12.7'E     12       3.6  震度１   
1  12/30/1985  19:11:46    茨城県沖  36°24.4'N  140°41.8'E     55       3.3  震度１   
2  12/30/1985  15:56:17   福島県会津  37°12.6'N  139°56.2'E      6       3.5  震度１   
3  12/30/1985  15:20:15  奄美大島近海  27°58.1'N  129°39.9'E      0       4.2  震度１   
4  12/29/1985   9:22:20     釧路沖  42°53.3'N  145°26.7'E     35       3.7  震度２   

  Source.Name            Datetime  
0         NaN 1985-12-31 02:26:48  
1         NaN 1985-12-30 19:11:46  
2         NaN 1985-12-30 15:56:17  
3         NaN 1985-12-30 15:20:15  
4         NaN 1985-12-29 09:22:20  


In [10]:
#change the column names
jma_df.rename(columns={'Lat':'Latitude', 'Long':'Longitude', 'Depth':'Depth', 'M':'Magnitude'}, inplace=True)

# Display the first few rows of the updated dataset
print("\nJMA Dataset:")
print(jma_df.head())


JMA Dataset:
         Date      Time    震央地名   Latitude   Longitude  Depth Magnitude 最大震度  \
0  12/31/1985   2:26:48   島根県東部  35°20.3'N  133°12.7'E     12       3.6  震度１   
1  12/30/1985  19:11:46    茨城県沖  36°24.4'N  140°41.8'E     55       3.3  震度１   
2  12/30/1985  15:56:17   福島県会津  37°12.6'N  139°56.2'E      6       3.5  震度１   
3  12/30/1985  15:20:15  奄美大島近海  27°58.1'N  129°39.9'E      0       4.2  震度１   
4  12/29/1985   9:22:20     釧路沖  42°53.3'N  145°26.7'E     35       3.7  震度２   

  Source.Name            Datetime  
0         NaN 1985-12-31 02:26:48  
1         NaN 1985-12-30 19:11:46  
2         NaN 1985-12-30 15:56:17  
3         NaN 1985-12-30 15:20:15  
4         NaN 1985-12-29 09:22:20  


In [11]:
import numpy as np
import re
import pandas as pd

# Updated function to handle both DMS and decimal formats
def dms_to_decimal(coord):
    # Check if the value is NaN
    if pd.isna(coord):
        return np.nan

    # If the coordinate is already in decimal format, return it as a float
    try:
        # Try converting directly to float; if it works, we have a decimal value
        return float(coord)
    except ValueError:
        # If direct conversion fails, it's likely in DMS format
        pass

    # For DMS format, extract degrees, minutes, and direction
    match = re.match(r"(\d+)°(\d+\.\d+)'([NSEW])", str(coord))
    if match:
        degrees = float(match.group(1))
        minutes = float(match.group(2))
        direction = match.group(3)

        # Convert DMS to decimal degrees
        decimal = degrees + (minutes / 60)

        # Make coordinates negative for South and West
        if direction in ['S', 'W']:
            decimal = -decimal

        return decimal
    else:
        raise ValueError(f"Invalid coordinate format: {coord}")

# Replace '不明' with NaN in the dataset
jma_df.replace('不明', np.nan, inplace=True)

# Apply the function to the Latitude and Longitude columns
jma_df['Latitude'] = jma_df['Latitude'].apply(dms_to_decimal)
jma_df['Longitude'] = jma_df['Longitude'].apply(dms_to_decimal)

# Display the updated dataset
print("\nUpdated JMA Dataset with Decimal Coordinates:")
print(jma_df.head())


Updated JMA Dataset with Decimal Coordinates:
         Date      Time    震央地名   Latitude   Longitude  Depth Magnitude 最大震度  \
0  12/31/1985   2:26:48   島根県東部  35.338333  133.211667     12       3.6  震度１   
1  12/30/1985  19:11:46    茨城県沖  36.406667  140.696667     55       3.3  震度１   
2  12/30/1985  15:56:17   福島県会津  37.210000  139.936667      6       3.5  震度１   
3  12/30/1985  15:20:15  奄美大島近海  27.968333  129.665000      0       4.2  震度１   
4  12/29/1985   9:22:20     釧路沖  42.888333  145.445000     35       3.7  震度２   

  Source.Name            Datetime  
0         NaN 1985-12-31 02:26:48  
1         NaN 1985-12-30 19:11:46  
2         NaN 1985-12-30 15:56:17  
3         NaN 1985-12-30 15:20:15  
4         NaN 1985-12-29 09:22:20  


In [12]:
# Step 3: Clean and remove unnecessary columns
jma_df = jma_df.drop(columns=['Date', 'Time', '震央地名', '最大震度', 'Source.Name'])
print(jma_df.head())


    Latitude   Longitude  Depth Magnitude            Datetime
0  35.338333  133.211667     12       3.6 1985-12-31 02:26:48
1  36.406667  140.696667     55       3.3 1985-12-30 19:11:46
2  37.210000  139.936667      6       3.5 1985-12-30 15:56:17
3  27.968333  129.665000      0       4.2 1985-12-30 15:20:15
4  42.888333  145.445000     35       3.7 1985-12-29 09:22:20


In [13]:
# Step 4: Drop rows with missing values
jma_df = jma_df.dropna()
print(jma_df.head())

    Latitude   Longitude  Depth Magnitude            Datetime
0  35.338333  133.211667     12       3.6 1985-12-31 02:26:48
1  36.406667  140.696667     55       3.3 1985-12-30 19:11:46
2  37.210000  139.936667      6       3.5 1985-12-30 15:56:17
3  27.968333  129.665000      0       4.2 1985-12-30 15:20:15
4  42.888333  145.445000     35       3.7 1985-12-29 09:22:20


In [14]:
# Step 5: Feature engineering
jma_df['Year'] = jma_df['Datetime'].dt.year
jma_df['Month'] = jma_df['Datetime'].dt.month
jma_df['Day'] = jma_df['Datetime'].dt.day
jma_df['Hour'] = jma_df['Datetime'].dt.hour
jma_df['Minutes'] = jma_df['Datetime'].dt.minute
jma_df['DayOfWeek'] = jma_df['Datetime'].dt.dayofweek
jma_df['TimeDiff'] = (jma_df['Datetime'] - jma_df['Datetime'].shift(1)).dt.total_seconds().fillna(0)

In [15]:
#show the data
print(jma_df.head())

    Latitude   Longitude  Depth Magnitude            Datetime  Year  Month  \
0  35.338333  133.211667     12       3.6 1985-12-31 02:26:48  1985     12   
1  36.406667  140.696667     55       3.3 1985-12-30 19:11:46  1985     12   
2  37.210000  139.936667      6       3.5 1985-12-30 15:56:17  1985     12   
3  27.968333  129.665000      0       4.2 1985-12-30 15:20:15  1985     12   
4  42.888333  145.445000     35       3.7 1985-12-29 09:22:20  1985     12   

   Day  Hour  Minutes  DayOfWeek  TimeDiff  
0   31     2       26          1       0.0  
1   30    19       11          0  -26102.0  
2   30    15       56          0  -11729.0  
3   30    15       20          0   -2162.0  
4   29     9       22          6 -107875.0  


In [16]:
# Drop the `Datetime` column (optional, as time features are extracted)
jma_df = jma_df.drop(columns=['Datetime'])
#show the data
print(jma_df.head())

    Latitude   Longitude  Depth Magnitude  Year  Month  Day  Hour  Minutes  \
0  35.338333  133.211667     12       3.6  1985     12   31     2       26   
1  36.406667  140.696667     55       3.3  1985     12   30    19       11   
2  37.210000  139.936667      6       3.5  1985     12   30    15       56   
3  27.968333  129.665000      0       4.2  1985     12   30    15       20   
4  42.888333  145.445000     35       3.7  1985     12   29     9       22   

   DayOfWeek  TimeDiff  
0          1       0.0  
1          0  -26102.0  
2          0  -11729.0  
3          0   -2162.0  
4          6 -107875.0  


In [17]:
# Step 6: Prepare target and features
X_jma = jma_df.drop(columns=['Magnitude'])
y_jma = jma_df['Magnitude']

# Display the features and target
print("\nFeatures:")
print(X_jma.head())
print("\nTarget:")
print(y_jma.head())


Features:
    Latitude   Longitude  Depth  Year  Month  Day  Hour  Minutes  DayOfWeek  \
0  35.338333  133.211667     12  1985     12   31     2       26          1   
1  36.406667  140.696667     55  1985     12   30    19       11          0   
2  37.210000  139.936667      6  1985     12   30    15       56          0   
3  27.968333  129.665000      0  1985     12   30    15       20          0   
4  42.888333  145.445000     35  1985     12   29     9       22          6   

   TimeDiff  
0       0.0  
1  -26102.0  
2  -11729.0  
3   -2162.0  
4 -107875.0  

Target:
0    3.6
1    3.3
2    3.5
3    4.2
4    3.7
Name: Magnitude, dtype: object


In [20]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Initialize scalers
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

# Normalize features (X)
X_jma_scaled = scaler_X.fit_transform(X_jma)

# Normalize target (y)
y_jma_scaled = scaler_y.fit_transform(y_jma.values.reshape(-1, 1))  # Convert Series to NumPy array and reshape

In [21]:
# Display the normalized features and target
print("\nNormalized Features:")
print(X_jma_scaled[:5])
print("\nNormalized Target:")
print(y_jma_scaled[:5])


Normalized Features:
[[0.59948847 0.33121805 0.01726619 0.         1.         1.
  0.08695652 0.44067797 0.16666667 0.25212269]
 [0.62024087 0.52912921 0.07913669 0.         1.         0.96666667
  0.82608696 0.18644068 0.         0.2518133 ]
 [0.63584564 0.50903402 0.00863309 0.         1.         0.96666667
  0.65217391 0.94915254 0.         0.25198366]
 [0.45632608 0.23744051 0.         0.         1.         0.96666667
  0.65217391 0.33898305 0.         0.25209706]
 [0.74614737 0.65468006 0.05035971 0.         1.         0.93333333
  0.39130435 0.37288136 1.         0.25084401]]

Normalized Target:
[[0.38636364]
 [0.35227273]
 [0.375     ]
 [0.45454545]
 [0.39772727]]


In [22]:
# Reshape for CNN-BiLSTM (if needed, based on time steps)
time_steps = 5
samples = X_jma_scaled.shape[0] - time_steps + 1  # Calculate the number of samples
features = X_jma_scaled.shape[1]

In [23]:
#show the data
print(samples)
print(features)

85330
10


In [24]:
# show the data shape
print(X_jma_scaled.shape)
print(y_jma_scaled.shape)

(85334, 10)
(85334, 1)


In [25]:
# Create sequences of time steps
X_jma_reshaped = np.array([X_jma_scaled[i:i+time_steps] for i in range(samples)])
y_jma_reshaped = y_jma_scaled[time_steps-1:]  # Align the target with the sequences

# Verify the shape of the reshaped data
print("\nReshaped Data:")
print(f"X_jma shape: {X_jma_reshaped.shape}")
print(f"y_jma shape: {y_jma_reshaped.shape}")



Reshaped Data:
X_jma shape: (85330, 5, 10)
y_jma shape: (85330, 1)


In [26]:
# Save preprocessed data
np.save("X_jma_cnn_bilstm.npy", X_jma_reshaped)
np.save("y_jma_cnn_bilstm.npy", y_jma_reshaped)

# Print results
print("Preprocessing complete. Data saved.")

Preprocessing complete. Data saved.


In [27]:
print(f"Preprocessed JMA dataset for CNN-BiLSTM:")
print(f"X_jma shape: {X_jma_reshaped.shape}")
print(f"y_jma shape: {y_jma_reshaped.shape}")

Preprocessed JMA dataset for CNN-BiLSTM:
X_jma shape: (85330, 5, 10)
y_jma shape: (85330, 1)
