In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout , BatchNormalization
from keras.callbacks import EarlyStopping , ModelCheckpoint
from scipy.signal import butter, lfilter


In [2]:
import os


training_data = pd.read_csv('C:/Users/nm_ma/Desktop/space_apps_2024_seismic_detection/data/lunar/training/data/S12_GradeA/xa.s12.00.mhz.1970-01-19HR00_evid00002.csv')
# Display the first few rows of the merged DataFrame
print(training_data.head())


  time_abs(%Y-%m-%dT%H:%M:%S.%f)  time_rel(sec)  velocity(m/s)
0     1970-01-19T00:00:00.665000       0.000000  -6.153279e-14
1     1970-01-19T00:00:00.815943       0.150943  -7.701288e-14
2     1970-01-19T00:00:00.966887       0.301887  -8.396187e-14
3     1970-01-19T00:00:01.117830       0.452830  -8.096155e-14
4     1970-01-19T00:00:01.268774       0.603774  -7.097599e-14


In [3]:
training_data.tail()

Unnamed: 0,time_abs(%Y-%m-%dT%H:%M:%S.%f),time_rel(sec),velocity(m/s)
572410,1970-01-20T00:00:02.174434,86401.509434,-1.472713e-14
572411,1970-01-20T00:00:02.325377,86401.660377,-1.956104e-14
572412,1970-01-20T00:00:02.476321,86401.811321,-2.240307e-14
572413,1970-01-20T00:00:02.627264,86401.962264,-2.998405e-14
572414,1970-01-20T00:00:02.778208,86402.113208,-4.465515e-14


In [4]:
def label_matching_rows(training_data_df, catalog_file):
    # Read the catalog file
    catalog_df = pd.read_csv(catalog_file)

    # Ensure the timestamp in the catalog is in the correct format
    catalog_df['time_abs'] = pd.to_datetime(catalog_df['time_abs(%Y-%m-%dT%H:%M:%S.%f)'], format='%Y-%m-%dT%H:%M:%S.%f')

    # Convert time_abs in the training DataFrame to datetime format
    training_data_df['time_abs'] = pd.to_datetime(training_data_df['time_abs(%Y-%m-%dT%H:%M:%S.%f)'], format='%Y-%m-%dT%H:%M:%S.%f')

    # Initialize target column with 0
    training_data_df['target'] = 0

    # Function to find the first matching row in training data
    def find_first_match(cat_time, train_times):
        mask = (train_times.dt.year == cat_time.year) & \
               (train_times.dt.month == cat_time.month) & \
               (train_times.dt.day == cat_time.day) & \
               (train_times.dt.hour == cat_time.hour)
        matching_indices = mask[mask].index
        return matching_indices[0] if len(matching_indices) > 0 else -1

    # Set target to 1 for the first matching timestamp for each catalog entry
    for catalog_time in catalog_df['time_abs']:
        match_idx = find_first_match(catalog_time, training_data_df['time_abs'])
        if match_idx != -1:
            training_data_df.loc[match_idx, 'target'] = 1

    # Drop unnecessary columns, keep only relevant ones
    columns_to_keep = ['time_abs', 'time_rel(sec)', 'velocity(m/s)', 'target']
    training_data_df = training_data_df[columns_to_keep]

    # Drop duplicate entries if necessary
    training_data_df.drop_duplicates(inplace=True)

    return training_data_df

# Example usage:
catalog_file = 'C:/Users/nm_ma/Desktop/space_apps_2024_seismic_detection/data/lunar/training/catalogs/apollo12_catalog_GradeA_final.csv'
labeled_training_data = label_matching_rows(training_data, catalog_file)

# Display the first few rows of the labeled DataFrame
print("Labeled Training Data:")
print(labeled_training_data.head())

# Check counts of target values
print("Target variable counts:")
print(labeled_training_data['target'].value_counts())


Labeled Training Data:
                    time_abs  time_rel(sec)  velocity(m/s)  target
0 1970-01-19 00:00:00.665000       0.000000  -6.153279e-14       0
1 1970-01-19 00:00:00.815943       0.150943  -7.701288e-14       0
2 1970-01-19 00:00:00.966887       0.301887  -8.396187e-14       0
3 1970-01-19 00:00:01.117830       0.452830  -8.096155e-14       0
4 1970-01-19 00:00:01.268774       0.603774  -7.097599e-14       0
Target variable counts:
target
0    572414
1         1
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data_df.drop_duplicates(inplace=True)


In [5]:
labeled_training_data.tail()

Unnamed: 0,time_abs,time_rel(sec),velocity(m/s),target
572410,1970-01-20 00:00:02.174434,86401.509434,-1.472713e-14,0
572411,1970-01-20 00:00:02.325377,86401.660377,-1.956104e-14,0
572412,1970-01-20 00:00:02.476321,86401.811321,-2.240307e-14,0
572413,1970-01-20 00:00:02.627264,86401.962264,-2.998405e-14,0
572414,1970-01-20 00:00:02.778208,86402.113208,-4.465515e-14,0


In [6]:
labeled_training_data['target'].value_counts()

target
0    572414
1         1
Name: count, dtype: int64

In [7]:
def butter_bandpass(lowcut, highcut, fs, order=5):
    nyq = 0.5 * fs
    low = lowcut / nyq
    high = highcut / nyq
    b, a = butter(order, [low, high], btype='band')
    return b, a

def bandpass_filter(data, lowcut, highcut, fs, order=5):
    b, a = butter_bandpass(lowcut, highcut, fs, order=order)
    y = lfilter(b, a, data)
    return y


In [8]:
# Set filter parameters
lowcut = 0.1  # Set low cutoff frequency
highcut = 10.0  # Set high cutoff frequency
fs = 100.0  # Sample rate (Hz)

# Apply filter to the velocity data
labeled_training_data['filtered_velocity'] = bandpass_filter(labeled_training_data['velocity(m/s)'].values, lowcut, highcut, fs)


In [9]:
labeled_training_data

Unnamed: 0,time_abs,time_rel(sec),velocity(m/s),target,filtered_velocity
0,1970-01-19 00:00:00.665000,0.000000,-6.153279e-14,0,-7.560920e-17
1,1970-01-19 00:00:00.815943,0.150943,-7.701288e-14,0,-6.979321e-16
2,1970-01-19 00:00:00.966887,0.301887,-8.396187e-14,0,-3.122392e-15
3,1970-01-19 00:00:01.117830,0.452830,-8.096155e-14,0,-9.143711e-15
4,1970-01-19 00:00:01.268774,0.603774,-7.097599e-14,0,-1.995983e-14
...,...,...,...,...,...
572410,1970-01-20 00:00:02.174434,86401.509434,-1.472713e-14,0,8.358008e-14
572411,1970-01-20 00:00:02.325377,86401.660377,-1.956104e-14,0,7.389166e-14
572412,1970-01-20 00:00:02.476321,86401.811321,-2.240307e-14,0,5.711657e-14
572413,1970-01-20 00:00:02.627264,86401.962264,-2.998405e-14,0,3.520667e-14


In [10]:
scaler = MinMaxScaler(feature_range=(0, 1))
labeled_training_data[['filtered_velocity']] = scaler.fit_transform(labeled_training_data[['filtered_velocity']])


In [11]:
def create_sequences(data, target, time_steps):
    X, y = [], []
    for i in range(len(data) - time_steps):
        X.append(data[i:(i + time_steps)])
        y.append(target[i + time_steps])
    return np.array(X), np.array(y)

# Set time steps
time_steps = 10  # Number of time steps to look back
X, y = create_sequences(labeled_training_data['filtered_velocity'].values, labeled_training_data['target'].values, time_steps)


In [12]:
X = X.reshape((X.shape[0], X.shape[1], 1))  # Reshape to [samples, time steps, features]


In [13]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:
# Define the LSTM model
model = Sequential()

# First LSTM layer
model.add(LSTM(64, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True))
model.add(BatchNormalization())
model.add(Dropout(0.3))  # Dropout to prevent overfitting

# Second LSTM layer
model.add(LSTM(32))
model.add(BatchNormalization())
model.add(Dropout(0.3))  # Dropout to prevent overfitting

# Output layer for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


  super().__init__(**kwargs)


In [15]:
# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)


In [16]:
# Train the model
history = model.fit(X_train, y_train, 
                    epochs=5,  # Set a higher number of epochs
                    batch_size=32, 
                    validation_data=(X_val, y_val), 
                    callbacks=[early_stopping])


Epoch 1/5
[1m14311/14311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 7ms/step - accuracy: 0.9849 - loss: 0.0439 - val_accuracy: 1.0000 - val_loss: 1.5641e-07
Epoch 2/5
[1m14311/14311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 7ms/step - accuracy: 1.0000 - loss: 4.9107e-05 - val_accuracy: 1.0000 - val_loss: 1.2615e-11
Epoch 3/5
[1m14311/14311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 7ms/step - accuracy: 1.0000 - loss: 3.7791e-05 - val_accuracy: 1.0000 - val_loss: 3.9854e-13
Epoch 4/5
[1m14311/14311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 7ms/step - accuracy: 1.0000 - loss: 1.3910e-05 - val_accuracy: 1.0000 - val_loss: 2.0887e-13
Epoch 5/5
[1m14311/14311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 8ms/step - accuracy: 1.0000 - loss: 6.7483e-05 - val_accuracy: 1.0000 - val_loss: 3.9189e-14


In [17]:
# Save the model to a file
model.save('lunarmodel2.keras')