In [1]:
import os
import numpy as np
import pandas as pd
from obspy import read
from scipy import signal
from obspy.signal.trigger import classic_sta_lta
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Set device to CPU
device = torch.device("cpu")

In [2]:
# Function to apply padding and create a mask
def pad_sequences(sequences, max_len=None, padding_value=0):
    if max_len is None:
        max_len = max(len(seq) for seq in sequences)

    padded_seqs = np.full((len(sequences), max_len), padding_value, dtype=np.float32)
    masks = np.zeros((len(sequences), max_len), dtype=np.float32)

    for i, seq in enumerate(sequences):
        seq_len = len(seq)
        padded_seqs[i, :seq_len] = seq
        masks[i, :seq_len] = 1  # Valid data points

    return padded_seqs, masks

In [3]:
# Bandpass filter for seismic data
def apply_bandpass_filter(trace, lowcut=0.5, highcut=1.0, sampling_rate=6.625, order=4):
    sos = signal.butter(order, [lowcut, highcut], btype='bandpass', fs=sampling_rate, output='sos')
    filtered_trace = signal.sosfilt(sos, trace)
    return filtered_trace

# STA/LTA feature extraction
def extract_sta_lta_features(trace, sampling_rate, sta_window=1.0, lta_window=5.0, fixed_length=500):
    sta_samples = int(sta_window * sampling_rate)
    lta_samples = int(lta_window * sampling_rate)
    cft = classic_sta_lta(trace, sta_samples, lta_samples)
    
    if len(cft) > fixed_length:
        features = cft[:fixed_length]  # Truncate if longer
    else:
        features = np.pad(cft, (0, fixed_length - len(cft)), 'constant')  # Pad with zeros if shorter
    
    return features

In [4]:
# Preprocess seismic data and extract STA/LTA features
def preprocess_seismic_data(filepath, filetype, sampling_rate=6.625):
    if filetype == 'csv':
        seismic_data = pd.read_csv(filepath)
        trace = seismic_data['velocity(m/s)'].values
    elif filetype == 'mseed':
        st = read(filepath)
        trace = st[0].data
    
    # Apply bandpass filter to trace
    filtered_trace = apply_bandpass_filter(trace, sampling_rate=sampling_rate)
    
    # Extract STA/LTA features and find arrival time
    cft, arrival_time_rel = extract_sta_lta_features(filtered_trace, sampling_rate)
    
    return filtered_trace, cft, arrival_time_rel

In [5]:
# Load seismic data and catalog for arrival time processing
def load_seismic_data(data_dir, catalog_df=None, include_catalog=False, max_len=500):
    seismic_data = []
    labels = []
    
    for root, _, files in os.walk(data_dir):
        for file in files:
            filepath = os.path.join(root, file)
            if file.endswith('.mseed'):
                filetype = 'mseed'
            elif file.endswith('.csv'):
                filetype = 'csv'
            else:
                continue  # Skip unsupported file types
            
            # Preprocess seismic data and extract STA/LTA
            filtered_trace, cft, arrival_time_rel = preprocess_seismic_data(filepath, filetype)
            seismic_data.append(filtered_trace)  # Save filtered traces
            
            # Use catalog to append labels (arrival times)
            if include_catalog and catalog_df is not None:
                event_id = os.path.splitext(file)[0]
                if event_id in catalog_df['filename'].values:
                    label_row = catalog_df.loc[catalog_df['filename'] == event_id]
                    labels.append(label_row['time_abs(%Y-%m-%dT%H:%M:%S.%f)'].values[0])  # Extract absolute arrival time
    
    # Pad sequences to ensure all traces are of the same length
    padded_data, _ = pad_sequences(seismic_data, max_len=max_len)

    if include_catalog:
        return padded_data, np.array(labels)
    else:
        return padded_data

In [6]:
class SeismicCNN(nn.Module):
    def __init__(self):
        super(SeismicCNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.fc1 = None
        self.fc2 = None

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))  # After conv1 + pool
        x = self.pool(torch.relu(self.conv2(x)))  # After conv2 + pool

        # Flatten the output of the conv layers to pass to fully connected layers
        x = x.view(x.size(0), -1)  # Flatten the output dynamically based on batch size
        
        if self.fc1 is None:
            # Dynamically initialize the fully connected layers based on the input size
            self.fc1 = nn.Linear(x.size(1), 100)  # Use the computed flattened size
            self.fc2 = nn.Linear(100, 3)  # For 3 classes

        x = torch.relu(self.fc1(x))  # Pass through fully connected layer
        x = self.fc2(x)  # Output layer (logits for 3 classes)
        return x



In [7]:
# Prepare data for PyTorch (ensure correct shape)
def prepare_data_for_pytorch(X, y):
    X_tensor = torch.tensor(X, dtype=torch.float32).unsqueeze(1)  # Add channel dimension (batch_size, channels, features)
    y_tensor = torch.tensor(y, dtype=torch.long)  # Long tensor for labels
    dataset = TensorDataset(X_tensor, y_tensor)
    loader = DataLoader(dataset, batch_size=32, shuffle=True)
    return loader

In [8]:
# Train CNN model for multi-class classification
def train_cnn_model(train_loader, val_loader, num_epochs=100):
    model = SeismicCNN().to(device)
    criterion = nn.CrossEntropyLoss()  # For multi-class classification
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0

        for i, (X_batch, y_batch) in enumerate(train_loader):
            optimizer.zero_grad()
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)  # Logits output
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        print(f"Epoch [{epoch+1}/{num_epochs}] complete, Total Loss: {running_loss:.4f}")

    return model

In [9]:
# Define the evaluation function
def evaluate_model(model, test_loader):
    model.eval()  # Set the model to evaluation mode
    y_true = []
    y_pred = []

    with torch.no_grad():  # No need to compute gradients during evaluation
        for X_batch, y_batch in test_loader:
            outputs = model(X_batch)  # Forward pass
            _, predicted = torch.max(outputs, 1)  # Get the predicted class
            
            y_true.extend(y_batch.cpu().numpy())  # Store true labels
            y_pred.extend(predicted.cpu().numpy())  # Store predictions

    # Calculate and display evaluation metrics
    print("Classification Report:")
    print(classification_report(y_true, y_pred, target_names=['Class 0', 'Class 1', 'Class 2']))

    return y_true, y_pred

In [10]:
def main():
    catalog_path = '../../data/lunar_data/training/catalogs/apollo12_catalog_GradeA_final.csv'
    data_directory = '../../data/lunar_data/training/data/'

    # Load and preprocess data
    print("Preprocessing data...")
    catalog = pd.read_csv(catalog_path)
    X, y, _ = load_seismic_data(data_directory, catalog_df=catalog, include_catalog=True)

    # Train/Validation/Test split (60% train, 20% validation, 20% test)
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

    print(f"Shape of training data: {X_train.shape}, Validation data: {X_val.shape}, Test data: {X_test.shape}")

    # Prepare data for PyTorch
    train_loader = prepare_data_for_pytorch(X_train, y_train)
    val_loader = prepare_data_for_pytorch(X_val, y_val)
    test_loader = prepare_data_for_pytorch(X_test, y_test)

    # Train CNN
    print("Training CNN...")
    cnn_model = train_cnn_model(train_loader, val_loader, num_epochs=100)
    print("CNN training complete.")

    # Evaluate the model on the test set
    print("Evaluating CNN on test set...")
    evaluate_model(cnn_model, test_loader)

In [11]:
if __name__ == "__main__":
    main()

Preprocessing data...


ValueError: too many values to unpack (expected 2)

In [None]:
import matplotlib.pyplot as plt

# Data from the training loop
epochs = list(range(1, 101))
train_losses = [
    3.3378, 2.9283, 2.6316, 2.3719, 2.1595, 1.9974, 1.8743, 1.7731, 1.7275, 1.6773,
    1.6550, 1.6350, 1.6398, 1.6162, 1.6127, 1.5282, 1.5398, 1.5448, 1.5192, 1.5318,
    1.5179, 1.5241, 1.4718, 1.4838, 1.4495, 1.4187, 1.4091, 1.4017, 1.4140, 1.3989,
    1.3846, 1.3787, 1.3296, 1.3468, 1.3302, 1.3196, 1.3033, 1.3280, 1.3051, 1.2995,
    1.3079, 1.3041, 1.2648, 1.2614, 1.2522, 1.2324, 1.2393, 1.2522, 1.2206, 1.2200,
    1.2026, 1.2139, 1.2249, 1.2037, 1.2351, 1.1769, 1.1634, 1.1634, 1.1636, 1.1250,
    1.1274, 1.0957, 1.1422, 1.1089, 1.0969, 1.0890, 1.1023, 1.0637, 1.0400, 1.0507,
    1.0331, 1.0087, 0.9976, 0.9925, 1.0022, 1.0183, 0.9913, 0.9706, 0.9803, 0.9458,
    0.9403, 0.9291, 0.9663, 0.9627, 0.9335, 0.9536, 0.8982, 0.9032, 0.8985, 0.8756,
    0.8699, 0.8616, 0.8789, 0.8596, 0.8595, 0.8409, 0.8131, 0.8206, 0.8261, 0.8191
]

# Plotting the training loss
plt.figure(figsize=(10, 6))
plt.plot(epochs, train_losses, label="Training Loss", color='b')
plt.title("Training Loss Over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.grid(True)
plt.show()


In [13]:
import os
import numpy as np
import pandas as pd
from obspy import read
from scipy import signal
from obspy.signal.trigger import classic_sta_lta
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Set device to CPU
device = torch.device("cpu")

# Function to apply padding and create a mask
def pad_sequences(sequences, max_len=None, padding_value=0):
    if max_len is None:
        max_len = max(len(seq) for seq in sequences)

    padded_seqs = np.full((len(sequences), max_len), padding_value, dtype=np.float32)
    masks = np.zeros((len(sequences), max_len), dtype=np.float32)

    for i, seq in enumerate(sequences):
        seq_len = len(seq)
        padded_seqs[i, :seq_len] = seq
        masks[i, :seq_len] = 1  # Valid data points

    return padded_seqs, masks

# Bandpass filter for seismic data
def apply_bandpass_filter(trace, lowcut=0.5, highcut=1.0, sampling_rate=6.625, order=4):
    sos = signal.butter(order, [lowcut, highcut], btype='bandpass', fs=sampling_rate, output='sos')
    filtered_trace = signal.sosfilt(sos, trace)
    return filtered_trace

# STA/LTA feature extraction
def extract_sta_lta_features(trace, sampling_rate, sta_window=1.0, lta_window=5.0, fixed_length=500):
    sta_samples = int(sta_window * sampling_rate)
    lta_samples = int(lta_window * sampling_rate)
    cft = classic_sta_lta(trace, sta_samples, lta_samples)
    
    if len(cft) > fixed_length:
        features = cft[:fixed_length]  # Truncate if longer
    else:
        features = np.pad(cft, (0, fixed_length - len(cft)), 'constant')  # Pad with zeros if shorter
    
    return features

# Complete preprocessing function
def preprocess_seismic_data(filepath, filetype, sampling_rate=6.625):
    if filetype == 'csv':
        seismic_data = pd.read_csv(filepath)
        trace = seismic_data['velocity(m/s)'].values
    elif filetype == 'mseed':
        st = read(filepath)
        trace = st[0].data
    
    filtered_trace = apply_bandpass_filter(trace, sampling_rate=sampling_rate)
    features = extract_sta_lta_features(filtered_trace, sampling_rate)
    return features

def load_seismic_data(data_dir, catalog_df=None, include_catalog=False):
    seismic_data = []
    labels = []
    
    for root, _, files in os.walk(data_dir):
        for file in files:
            filepath = os.path.join(root, file)
            if file.endswith('.mseed'):
                filetype = 'mseed'
            elif file.endswith('.csv'):
                filetype = 'csv'
            else:
                continue  # Skip unsupported file types
            
            # Preprocess seismic data (bandpass filtering and STA/LTA)
            features = preprocess_seismic_data(filepath, filetype)
            seismic_data.append(features)
            
            if include_catalog and catalog_df is not None:
                event_id = os.path.splitext(file)[0]
                if event_id in catalog_df['filename'].values:
                    label_row = catalog_df.loc[catalog_df['filename'] == event_id]
                    labels.append(label_row['mq_type'].values[0])  # Extract the string label
    
    # Convert seismic data to NumPy array
    padded_data, masks = pad_sequences(seismic_data)

    if include_catalog:
        # Encode labels to numeric values
        label_encoder = LabelEncoder()
        labels_encoded = label_encoder.fit_transform(labels)  # Convert labels to integers
        return padded_data, labels_encoded, masks
    else:
        return padded_data, masks

class SeismicCNN(nn.Module):
    def __init__(self):
        super(SeismicCNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.fc1 = None
        self.fc2 = None

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))  # After conv1 + pool
        x = self.pool(torch.relu(self.conv2(x)))  # After conv2 + pool

        # Flatten the output of the conv layers to pass to fully connected layers
        x = x.view(x.size(0), -1)  # Flatten the output dynamically based on batch size
        
        if self.fc1 is None:
            # Dynamically initialize the fully connected layers based on the input size
            self.fc1 = nn.Linear(x.size(1), 100)  # Use the computed flattened size
            self.fc2 = nn.Linear(100, 3)  # For 3 classes

        x = torch.relu(self.fc1(x))  # Pass through fully connected layer
        x = self.fc2(x)  # Output layer (logits for 3 classes)
        return x

# Prepare data for PyTorch (ensure correct shape)
def prepare_data_for_pytorch(X, y):
    X_tensor = torch.tensor(X, dtype=torch.float32).unsqueeze(1)  # Add channel dimension (batch_size, channels, features)
    y_tensor = torch.tensor(y, dtype=torch.long)  # Long tensor for labels
    dataset = TensorDataset(X_tensor, y_tensor)
    loader = DataLoader(dataset, batch_size=32, shuffle=True)
    return loader

# Train CNN model for multi-class classification
def train_cnn_model(train_loader, val_loader, num_epochs=100):
    model = SeismicCNN().to(device)
    criterion = nn.CrossEntropyLoss()  # For multi-class classification
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0

        for i, (X_batch, y_batch) in enumerate(train_loader):
            optimizer.zero_grad()
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)  # Logits output
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        print(f"Epoch [{epoch+1}/{num_epochs}] complete, Total Loss: {running_loss:.4f}")

    return model

def evaluate_model(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch = X_batch.to(device)
            outputs = model(X_batch)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y_batch.numpy())

    print("\nTest Set Evaluation Metrics:")
    print(classification_report(all_labels, all_preds))
    print(f"Accuracy: {accuracy_score(all_labels, all_preds):.4f}")

def main():
    catalog_path = '../../data/lunar_data/training/catalogs/apollo12_catalog_GradeA_final.csv'
    data_directory = '../../data/lunar_data/training/data/'

    # Load and preprocess data
    print("Preprocessing data...")
    catalog = pd.read_csv(catalog_path)
    X, y, _ = load_seismic_data(data_directory, catalog_df=catalog, include_catalog=True)

    # Train/Validation/Test split (60% train, 20% validation, 20% test)
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

    print(f"Shape of training data: {X_train.shape}, Validation data: {X_val.shape}, Test data: {X_test.shape}")

    # Prepare data for PyTorch
    train_loader = prepare_data_for_pytorch(X_train, y_train)
    val_loader = prepare_data_for_pytorch(X_val, y_val)
    test_loader = prepare_data_for_pytorch(X_test, y_test)

    # Train CNN
    print("Training CNN...")
    cnn_model = train_cnn_model(train_loader, val_loader, num_epochs=100)
    print("CNN training complete.")

    # Save model and weights
    torch.save({
        'model_state_dict': cnn_model.state_dict(),
        'model': cnn_model,
    }, 'seismic_cnn_model.pth')
    print("CNN model and weights saved.")

    # Evaluate the model on the test set
    print("Evaluating CNN on test set...")
    evaluate_model(cnn_model, test_loader)

if __name__ == "__main__":
    main()


Preprocessing data...
Shape of training data: (91, 500), Validation data: (30, 500), Test data: (31, 500)
Training CNN...
Epoch [1/100] complete, Total Loss: 3.2546
Epoch [2/100] complete, Total Loss: 2.9609
Epoch [3/100] complete, Total Loss: 2.7682
Epoch [4/100] complete, Total Loss: 2.5515
Epoch [5/100] complete, Total Loss: 2.3803
Epoch [6/100] complete, Total Loss: 2.2616
Epoch [7/100] complete, Total Loss: 2.0957
Epoch [8/100] complete, Total Loss: 1.9957
Epoch [9/100] complete, Total Loss: 1.9162
Epoch [10/100] complete, Total Loss: 1.8761
Epoch [11/100] complete, Total Loss: 1.7633
Epoch [12/100] complete, Total Loss: 1.6846
Epoch [13/100] complete, Total Loss: 1.6444
Epoch [14/100] complete, Total Loss: 1.6304
Epoch [15/100] complete, Total Loss: 1.5586
Epoch [16/100] complete, Total Loss: 1.5554
Epoch [17/100] complete, Total Loss: 1.5017
Epoch [18/100] complete, Total Loss: 1.4375
Epoch [19/100] complete, Total Loss: 1.4281
Epoch [20/100] complete, Total Loss: 1.4194
Epoch [

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Breakdown of the Code:

    Data Preprocessing:
        Bandpass Filter: A bandpass filter is applied to the seismic traces to isolate frequency components between 0.5 and 1.0 Hz.
        STA/LTA Feature Extraction: The classic STA/LTA algorithm is used to detect changes in signal amplitude, which can highlight seismic events. The resulting STA/LTA feature vector is truncated or padded to a fixed length (500).

    Data Loading:
        The seismic data is loaded from the directory, either in .mseed or .csv format, and then preprocessed (bandpass filtering and STA/LTA). Event labels are matched with the seismic data using the catalog file.

    CNN Architecture:
        The CNN uses two 1D convolutional layers followed by max-pooling layers. The output is flattened and passed through a fully connected layer for classification into three classes (Class 0, Class 1, Class 2).
        The fully connected layer dimensions are dynamically set based on the flattened output size.

    Training:
        A typical training loop is used where the model is optimized using the Adam optimizer and CrossEntropy loss for 100 epochs. The loss decreases steadily, indicating learning progress.

    Evaluation:
        After training, the model is evaluated on a test set. Metrics such as precision, recall, F1-score, and accuracy are calculated to assess the model's performance.

Analysis of Results:

    Training Loss: The loss starts at 3.3378 and gradually decreases to 0.8191 over 100 epochs. This shows that the model is learning and adjusting its parameters during training.

    Test Performance:

        Class Imbalance:
            The dataset is heavily imbalanced, as seen from the support values in the classification report:
                Class 0 has 4 samples.
                Class 1 has 26 samples.
                Class 2 has 1 sample.
            This imbalance can heavily skew the training process, causing the model to overfit to the majority class (Class 1), while performing poorly on the minority classes (Class 0 and Class 2).

        Accuracy and Recall:
            The accuracy of 84% is driven primarily by the model’s performance on Class 1, which has 26 samples, and the model correctly predicts all of them (recall of 1.00 for Class 1). However, this is misleading as it masks the poor performance on Class 0 and Class 2.

        Zero Precision/Recall for Minority Classes (Class 0 and Class 2):
            The model fails to correctly classify any samples from Class 0 and Class 2, with zero precision, recall, and F1-score for these classes. This is likely due to the class imbalance and the insufficient number of samples from these classes for the model to learn meaningful patterns.

Key Issues and Recommendations:

    Class Imbalance:
        The main issue is the severe class imbalance, which causes the model to prioritize the majority class (Class 1) and ignore the minority classes (Class 0 and Class 2).
        Solution: You can address this imbalance by using techniques such as:
            Oversampling: Increase the number of samples in the minority classes by duplicating or generating synthetic data (e.g., using SMOTE).
            Class Weights: Adjust the loss function to penalize misclassifications of minority classes more heavily by setting class weights in the CrossEntropyLoss function.

    Limited Minority Class Data:
        With only 1 sample in Class 2 and 4 samples in Class 0, the model has little opportunity to learn patterns for these classes.
        Solution: It may be necessary to collect more data for these classes or apply data augmentation techniques to create more varied samples.

    Model Evaluation:
        Accuracy is not a reliable metric when dealing with imbalanced data, as it can be skewed by the majority class performance.
        Solution: Focus on precision, recall, and F1-score for each class to get a better understanding of how well the model performs across all classes.

    Dynamic Model Structure:
        The model structure dynamically adapts to the input size, which is a flexible design. However, this dynamic initialization might not always work well when scaling to larger datasets or more complex architectures. A more predefined architecture can provide stability in training.

Conclusion:

The CNN model shows potential in classifying seismic events, but its performance is severely limited by the class imbalance in the dataset. To improve performance, especially on minority classes, you should apply class-balancing techniques such as oversampling, adding class weights to the loss function, or collecting more data for the minority classes.