In [1]:
import os
import numpy as np
import pandas as pd
import json
from obspy import read
from scipy import signal
from obspy.signal.trigger import classic_sta_lta
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Check if a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Function to apply padding and create a mask
def pad_sequences(sequences, max_len=None, padding_value=0):
    """
    Pad each sequence to the maximum length with a specified padding value and create a mask.
    """
    if max_len is None:
        max_len = max(len(seq) for seq in sequences)

    padded_seqs = np.full((len(sequences), max_len), padding_value, dtype=np.float32)
    masks = np.zeros((len(sequences), max_len), dtype=np.float32)

    for i, seq in enumerate(sequences):
        seq_len = len(seq)
        padded_seqs[i, :seq_len] = seq
        masks[i, :seq_len] = 1  # Valid data points

    return padded_seqs, masks

In [3]:
# Bandpass filter for seismic data
def apply_bandpass_filter(trace, lowcut=0.5, highcut=1.0, sampling_rate=6.625, order=4):
    sos = signal.butter(order, [lowcut, highcut], btype='bandpass', fs=sampling_rate, output='sos')
    filtered_trace = signal.sosfilt(sos, trace)
    return filtered_trace

In [4]:
# STA/LTA feature extraction
def extract_sta_lta_features(trace, sampling_rate, sta_window=1.0, lta_window=5.0, fixed_length=500):
    sta_samples = int(sta_window * sampling_rate)
    lta_samples = int(lta_window * sampling_rate)
    cft = classic_sta_lta(trace, sta_samples, lta_samples)
    
    if len(cft) > fixed_length:
        features = cft[:fixed_length]  # Truncate if longer
    else:
        features = np.pad(cft, (0, fixed_length - len(cft)), 'constant')  # Pad with zeros if shorter
    
    return features

In [5]:
# Complete preprocessing function
def preprocess_seismic_data(filepath, filetype, sampling_rate=6.625):
    if filetype == 'csv':
        seismic_data = pd.read_csv(filepath)
        trace = seismic_data['velocity(m/s)'].values
    elif filetype == 'mseed':
        st = read(filepath)
        trace = st[0].data
    
    filtered_trace = apply_bandpass_filter(trace, sampling_rate=sampling_rate)
    features = extract_sta_lta_features(filtered_trace, sampling_rate)
    return features

In [6]:
def load_seismic_data(data_dir, catalog_df=None, include_catalog=False):
    seismic_data = []
    labels = []
    
    for root, _, files in os.walk(data_dir):
        for file in files:
            filepath = os.path.join(root, file)
            if file.endswith('.mseed'):
                filetype = 'mseed'
            elif file.endswith('.csv'):
                filetype = 'csv'
            else:
                continue  # Skip unsupported file types
            
            # Preprocess seismic data (bandpass filtering and STA/LTA)
            features = preprocess_seismic_data(filepath, filetype)
            seismic_data.append(features)
            
            if include_catalog and catalog_df is not None:
                event_id = os.path.splitext(file)[0]
                if event_id in catalog_df['filename'].values:
                    label_row = catalog_df.loc[catalog_df['filename'] == event_id]
                    labels.append(label_row['mq_type'].values[0])  # Extract the string label
    
    # Convert seismic data to NumPy array
    padded_data, masks = pad_sequences(seismic_data)

    if include_catalog:
        # Encode labels to numeric values
        label_encoder = LabelEncoder()
        labels_encoded = label_encoder.fit_transform(labels)  # Convert labels to integers
        return padded_data, labels_encoded, masks
    else:
        return padded_data, masks


In [7]:
class SeismicCNN(nn.Module):
    def __init__(self):
        super(SeismicCNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        
        # Placeholder for fully connected layer input size (to be calculated dynamically)
        self.fc1 = None
        self.fc2 = None

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))  # After conv1 + pool
        x = self.pool(torch.relu(self.conv2(x)))  # After conv2 + pool

        # Flatten the output of the conv layers to pass to fully connected layers
        x = x.view(x.size(0), -1)  # Flatten the output dynamically based on batch size
        
        if self.fc1 is None:
            # Dynamically initialize the fully connected layers based on the input size
            self.fc1 = nn.Linear(x.size(1), 100)  # Use the computed flattened size
            self.fc2 = nn.Linear(100, 3)  # For 3 classes

        x = torch.relu(self.fc1(x))  # Pass through fully connected layer
        x = self.fc2(x)  # Output layer (logits for 3 classes)
        return x

In [8]:
# Prepare data for PyTorch (ensure correct shape)
def prepare_data_for_pytorch(X, y):
    X_tensor = torch.tensor(X, dtype=torch.float32).unsqueeze(1)  # Add channel dimension (batch_size, channels, features)
    y_tensor = torch.tensor(y, dtype=torch.long)  # Long tensor for labels
    dataset = TensorDataset(X_tensor, y_tensor)
    train_loader = DataLoader(dataset, batch_size=32, shuffle=True)
    return train_loader

In [9]:
# Train CNN model for multi-class classification
def train_cnn_model(train_loader, num_epochs=100):
    model = SeismicCNN()
    criterion = nn.CrossEntropyLoss()  # For multi-class classification
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0

        for i, (X_batch, y_batch) in enumerate(train_loader):
            optimizer.zero_grad()
            outputs = model(X_batch)  # Logits output

            # Calculate loss (CrossEntropyLoss expects raw logits and integer labels)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

            print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}")

        print(f"Epoch [{epoch+1}/{num_epochs}] complete, Total Loss: {running_loss:.4f}")

    return model  # Return the trained model


In [10]:
# Save model and weights
def save_model_artifacts(model, model_name='seismic_cnn_model'):
    # 1. Save the model architecture (artifact) in a JSON or similar format
    model_architecture = {
        'input_size': model.conv1.in_channels,
        'conv_layers': [
            {'in_channels': model.conv1.in_channels, 'out_channels': model.conv1.out_channels, 'kernel_size': model.conv1.kernel_size, 'stride': model.conv1.stride, 'padding': model.conv1.padding},
            {'in_channels': model.conv2.in_channels, 'out_channels': model.conv2.out_channels, 'kernel_size': model.conv2.kernel_size, 'stride': model.conv2.stride, 'padding': model.conv2.padding}
        ],
        'fc_layers': [
            {'in_features': model.fc1.in_features, 'out_features': model.fc1.out_features},
            {'in_features': model.fc2.in_features, 'out_features': model.fc2.out_features}
        ]
    }
    
    with open(f'{model_name}_architecture.json', 'w') as f:
        json.dump(model_architecture, f)
    print(f"Model architecture saved to {model_name}_architecture.json")

    # 2. Save the model weights
    torch.save(model.state_dict(), f'{model_name}_weights.pth')
    print(f"Model weights saved to {model_name}_weights.pth")

    # 3. Save the full model (model + weights)
    torch.save(model, f'{model_name}_full.pth')
    print(f"Full model saved to {model_name}_full.pth")

In [11]:
def main():
    catalog_path = '../../data/lunar_data/training/catalogs/apollo12_catalog_GradeA_final.csv'
    data_directory = '../../data/lunar_data/training/data/'

    # Load and preprocess data
    print("Preprocessing data...")
    catalog = pd.read_csv(catalog_path)
    X, y, _ = load_seismic_data(data_directory, catalog_df=catalog, include_catalog=True)

    print(f"Shape of features (X): {X.shape}")
    print(f"Shape of labels (y): {y.shape}")
    print(f"Unique labels: {np.unique(y)}")

    if X.shape[0] == 0 or y.shape[0] == 0:
        print("No data to train on!")
        return

    # Train CNN
    print("Training CNN...")
    train_loader = prepare_data_for_pytorch(X, y)
    cnn_model = train_cnn_model(train_loader)  # Get the trained model
    print("CNN training complete.")

    # Save model and weights
    save_model_artifacts(cnn_model, 'seismic_cnn_model')  # Pass the model to the save function
    print("CNN model and weights saved.")

In [12]:
# Call the main function
if __name__ == "__main__":
    main()

Preprocessing data...
Shape of features (X): (152, 500)
Shape of labels (y): (152,)
Unique labels: [0 1 2]
Training CNN...
Epoch [1/100], Batch [1/5], Loss: 1.0302
Epoch [1/100], Batch [2/5], Loss: 0.9859
Epoch [1/100], Batch [3/5], Loss: 0.9427
Epoch [1/100], Batch [4/5], Loss: 0.9016
Epoch [1/100], Batch [5/5], Loss: 0.8304
Epoch [1/100] complete, Total Loss: 4.6908
Epoch [2/100], Batch [1/5], Loss: 0.8788
Epoch [2/100], Batch [2/5], Loss: 0.8180
Epoch [2/100], Batch [3/5], Loss: 0.7749
Epoch [2/100], Batch [4/5], Loss: 0.7062
Epoch [2/100], Batch [5/5], Loss: 0.7194
Epoch [2/100] complete, Total Loss: 3.8972
Epoch [3/100], Batch [1/5], Loss: 0.7102
Epoch [3/100], Batch [2/5], Loss: 0.6879
Epoch [3/100], Batch [3/5], Loss: 0.7090
Epoch [3/100], Batch [4/5], Loss: 0.6688
Epoch [3/100], Batch [5/5], Loss: 0.5862
Epoch [3/100] complete, Total Loss: 3.3620
Epoch [4/100], Batch [1/5], Loss: 0.5825
Epoch [4/100], Batch [2/5], Loss: 0.6672
Epoch [4/100], Batch [3/5], Loss: 0.5887
Epoch [4/1

Analysis of the Code and Results
1. Data Preprocessing

The code handles seismic data preprocessing using time series data in either .mseed or .csv format. The preprocessing steps are as follows:

    Bandpass Filtering: The seismic signal is filtered to retain frequencies between 0.5 Hz and 1.0 Hz to remove noise outside this range.
    STA/LTA Feature Extraction: The code computes the Short-Term Average/Long-Term Average (STA/LTA) ratio for each time series. This feature extraction technique helps detect anomalies or abrupt changes in seismic data, such as earthquakes.

The STA/LTA ratios are then truncated or padded to a fixed length of 500 samples for each signal, ensuring a uniform input size for the CNN model.
2. Data Loading and Labeling

The load_seismic_data function reads seismic data from either .mseed or .csv files and matches them with their corresponding event labels from a catalog dataframe (catalog_df). The labels (mq_type) are converted to numeric values using LabelEncoder, which maps event types (likely impact, deep, and shallow seismic events) to integers [0, 1, 2].
3. CNN Model Architecture

The CNN model, defined as SeismicCNN, is a basic two-layer 1D convolutional neural network designed to classify seismic events into three categories:

    Layer 1: A 1D convolutional layer with 64 filters followed by max pooling.
    Layer 2: A second 1D convolutional layer with 128 filters, also followed by max pooling.
    Fully Connected Layers: Two fully connected (dense) layers are dynamically initialized based on the flattened output size after the convolutional layers. The final output has 3 units, corresponding to the 3 seismic event categories.

The model uses ReLU activation functions and CrossEntropyLoss as the loss function, appropriate for multi-class classification problems.
4. Model Training and Performance

The model is trained for 100 epochs, with 5 batches per epoch, on a dataset of 152 samples with 500 features each. The training process outputs the loss after each batch and computes the total loss at the end of each epoch. Here is an analysis of the training results:

    Initial Loss: The loss starts at a relatively high value (4.6908 at Epoch 1) and gradually decreases as the model learns, indicating that the model is improving its predictions over time.
    Convergence: The total loss consistently decreases with each epoch, reaching a final loss of 1.1257 after 100 epochs. This suggests that the model is learning and converging as expected.
    Learning Rate Stability: The use of the Adam optimizer (with a learning rate of 0.001) helps maintain a steady improvement in loss reduction without sharp fluctuations, showing that the learning process is stable.

The training loss decreases steadily, which is a good sign. However, without a validation set, it is hard to say whether the model is overfitting to the training data or if it generalizes well to new data. Introducing a validation set could help track the model's performance on unseen data and ensure it isn't overfitting.
5. Model Saving

The model architecture and weights are saved using three different methods:

    Model architecture in JSON format: The architecture details (convolutional layers, filter sizes, etc.) are saved in a JSON file.
    Model weights in a .pth file: The trained model's weights are saved in PyTorch’s binary format.
    Full model: Both the architecture and weights are saved together for later use.

This modular saving approach ensures that you can reload the model and its weights for further testing or deployment.
6. Key Observations from the Training Process

    Initial Learning: The first few epochs show rapid improvements in the loss (Epoch 1 total loss is 4.6908, and Epoch 2 drops to 3.8972). This is typical in early-stage training, where the model quickly adjusts its weights to reduce the error.
    Gradual Improvement: After initial epochs, the improvements become more gradual, which is expected as the model fine-tunes the weights and learns more subtle patterns in the data.
    Epoch 100 Results: By the end of Epoch 100, the model has reduced the total loss to 1.1257, which suggests the model is well-trained on the dataset.

7. Limitations and Suggestions

    Absence of a Validation Set: There is no validation set in the training process to check for overfitting. Adding a validation dataset and evaluating metrics like accuracy, precision, recall, and F1-score would help assess model performance more thoroughly.
    Class Imbalance Check: The unique labels are [0, 1, 2], which correspond to the different seismic event types. While the labels appear balanced, verifying the distribution of these labels would ensure no class imbalance, which could bias the model towards the majority class.
    Potential Overfitting: Since the dataset is small (152 samples), the model may overfit. Techniques like regularization (e.g., dropout) or data augmentation could help mitigate overfitting risks.
    Testing on Unlabeled Data: Once the model is trained, it can be tested on unlabeled .mseed files to see if it can accurately detect seismic anomalies in new data.

8. Final Recommendations

    Add a validation set: Split the dataset into training and validation subsets to better understand the model's generalization ability.
    Evaluate on test data: After training, evaluate the model on new, unseen data to verify its real-world applicability.
    Monitor evaluation metrics: Instead of relying only on the loss, track metrics like accuracy, precision, recall, and F1-score to understand how well the model performs on each class.
    Test for generalization: Apply the model to unlabeled seismic data to check how well it can classify new events and potentially identify seismic anomalies in real-world scenarios.

Conclusion

The model shows strong learning capacity with a steady reduction in loss over 100 epochs. The architecture and training approach are sound, and the saved model can be used for further testing. However, to ensure the model generalizes well to unseen data, the inclusion of a validation set and comprehensive performance metrics (accuracy, precision, recall) is highly recommended.