In [14]:
import os
import numpy as np
import pandas as pd
from obspy import read
from scipy import signal
from obspy.signal.trigger import classic_sta_lta
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Set device (CPU or GPU)
device = torch.device("cpu")

In [16]:
# Function to apply padding and create a mask
def pad_sequences(sequences, max_len=None, padding_value=0):
    if max_len is None:
        max_len = max(len(seq) for seq in sequences)

    padded_seqs = np.full((len(sequences), max_len), padding_value, dtype=np.float32)
    masks = np.zeros((len(sequences), max_len), dtype=np.float32)

    for i, seq in enumerate(sequences):
        seq_len = len(seq)
        padded_seqs[i, :seq_len] = seq
        masks[i, :seq_len] = 1  # Valid data points

    return padded_seqs, masks

# Bandpass filter for seismic data
def apply_bandpass_filter(trace, lowcut=0.5, highcut=1.0, sampling_rate=6.625, order=4):
    sos = signal.butter(order, [lowcut, highcut], btype='bandpass', fs=sampling_rate, output='sos')
    filtered_trace = signal.sosfilt(sos, trace)
    return filtered_trace

# STA/LTA feature extraction
def extract_sta_lta_features(trace, sampling_rate, sta_window=1.0, lta_window=5.0, fixed_length=500):
    sta_samples = int(sta_window * sampling_rate)
    lta_samples = int(lta_window * sampling_rate)
    cft = classic_sta_lta(trace, sta_samples, lta_samples)
    
    if len(cft) > fixed_length:
        features = cft[:fixed_length]  # Truncate if longer
    else:
        features = np.pad(cft, (0, fixed_length - len(cft)), 'constant')  # Pad with zeros if shorter
    
    return features

# FFT feature extraction
def extract_spectral_features(trace, sampling_rate, n_fft=256):
    """
    Extract spectral features using FFT.
    
    Parameters:
    - trace: Seismic data trace
    - sampling_rate: Sampling rate of the trace
    - n_fft: Number of FFT points
    
    Returns:
    - spectral_features: Extracted spectral features
    """
    # Perform FFT on the trace
    f, t, Sxx = signal.spectrogram(trace, fs=sampling_rate, nfft=n_fft)
    spectral_features = Sxx.mean(axis=1)  # Mean over time to reduce dimensionality
    return spectral_features

# Data augmentation (add random noise)
def augment_data(trace):
    noise = np.random.normal(0, 0.01, len(trace))
    augmented_trace = trace + noise
    return augmented_trace

# Complete preprocessing function
def preprocess_seismic_data(filepath, filetype, sampling_rate=6.625):
    if filetype == 'csv':
        seismic_data = pd.read_csv(filepath)
        trace = seismic_data['velocity(m/s)'].values
    elif filetype == 'mseed':
        st = read(filepath)
        trace = st[0].data
    
    # Apply bandpass filter
    filtered_trace = apply_bandpass_filter(trace, sampling_rate=sampling_rate)
    
    # Extract STA/LTA and FFT features
    sta_lta_features = extract_sta_lta_features(filtered_trace, sampling_rate)
    fft_features = extract_spectral_features(filtered_trace, sampling_rate)
    
    # Combine both features into a single feature vector
    combined_features = np.concatenate((sta_lta_features, fft_features))
    
    return combined_features

# Load seismic data and optionally augment minority class data
def load_seismic_data(data_dir, catalog_df=None, include_catalog=False, augment_minority_class=False):
    seismic_data = []
    labels = []
    
    for root, _, files in os.walk(data_dir):
        for file in files:
            filepath = os.path.join(root, file)
            if file.endswith('.mseed'):
                filetype = 'mseed'
            elif file.endswith('.csv'):
                filetype = 'csv'
            else:
                continue  # Skip unsupported file types
            
            # Preprocess seismic data (bandpass filtering and STA/LTA + FFT features)
            features = preprocess_seismic_data(filepath, filetype)
            seismic_data.append(features)
            
            # Add labels from catalog if included
            if include_catalog and catalog_df is not None:
                event_id = os.path.splitext(file)[0]
                if event_id in catalog_df['filename'].values:
                    label_row = catalog_df.loc[catalog_df['filename'] == event_id]
                    labels.append(label_row['mq_type'].values[0])  # Extract the string label
    
    # Pad sequences
    padded_data, masks = pad_sequences(seismic_data)

    if include_catalog:
        # Encode labels to numeric values
        label_encoder = LabelEncoder()
        labels_encoded = label_encoder.fit_transform(labels)  # Convert labels to integers
        
        # Augment minority class data if specified
        if augment_minority_class:
            minority_class_indices = np.where(labels_encoded == np.min(labels_encoded))[0]
            augmented_seismic_data = [augment_data(seismic_data[idx]) for idx in minority_class_indices]
            padded_data = np.concatenate((padded_data, augmented_seismic_data), axis=0)
            labels_encoded = np.concatenate((labels_encoded, labels_encoded[minority_class_indices]), axis=0)
        
        return padded_data, labels_encoded, masks
    else:
        return padded_data, masks

# CNN architecture with dropout and global average pooling
class SeismicCNN(nn.Module):
    def __init__(self):
        super(SeismicCNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(2)
        self.dropout = nn.Dropout(p=0.5)  # Dropout for regularization
        self.global_pool = nn.AdaptiveAvgPool1d(1)  # Global average pooling
        self.fc1 = None  # Dynamically initialized later
        self.fc2 = None  # Dynamically initialized later

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = self.pool(x)
        x = self.dropout(x)
        x = torch.relu(self.conv2(x))
        x = self.pool(x)
        x = self.global_pool(x)  # Global pooling instead of flattening
        x = x.view(x.size(0), -1)  # Flatten after pooling
        
        if self.fc1 is None:
            self.fc1 = nn.Linear(x.size(1), 100)  # Dynamically initialize fully connected layer
            self.fc2 = nn.Linear(100, 3)  # For 3 classes
        
        x = torch.relu(self.fc1(x))  # Fully connected layer
        x = self.fc2(x)  # Output layer (logits for 3 classes)
        return x

# Prepare data for PyTorch (ensure correct shape)
def prepare_data_for_pytorch(X, y):
    X_tensor = torch.tensor(X, dtype=torch.float32).unsqueeze(1)  # Add channel dimension (batch_size, channels, features)
    y_tensor = torch.tensor(y, dtype=torch.long)  # Long tensor for labels
    dataset = TensorDataset(X_tensor, y_tensor)
    loader = DataLoader(dataset, batch_size=32, shuffle=True)
    return loader

# Train CNN model for multi-class classification
def train_cnn_model(train_loader, y_train, num_epochs=50, learning_rate=0.0001):  # Reduced LR and epochs
    model = SeismicCNN().to(device)
    
    # Calculate class weights for imbalanced data
    class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
    class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)
    
    # Using class weights in the loss function
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f'Epoch {epoch+1}/{num_epochs} - Loss: {running_loss:.4f}')
    
    return model


# Evaluate the model
def evaluate_model(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch = X_batch.to(device)
            outputs = model(X_batch)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y_batch.numpy())

    print("\nTest Set Evaluation Metrics:")
    print(classification_report(all_labels, all_preds))
    print(f"Accuracy: {accuracy_score(all_labels, all_preds):.4f}")

from imblearn.over_sampling import SMOTE


# Oversampling the minority classes after preprocessing
def oversample_data(X_train, y_train):
    sm = SMOTE(n_neighbors=1, random_state=42)  # Set n_neighbors to 1
    X_res, y_res = sm.fit_resample(X_train, y_train)
    return X_res, y_res



# Modify main() to use the oversample_data function
def main():
    catalog_path = '../../data/lunar_data/training/catalogs/apollo12_catalog_GradeA_final.csv'
    data_directory = '../../data/lunar_data/training/data/'

    # Load and preprocess data
    print("Preprocessing data...")
    catalog = pd.read_csv(catalog_path)
    X, y, _ = load_seismic_data(data_directory, catalog_df=catalog, include_catalog=True, augment_minority_class=False)

    # Train/Test split (80% train, 20% test)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Oversample minority classes in the training set
    X_train_res, y_train_res = oversample_data(X_train, y_train)

    print(f"Shape of training data: {X_train_res.shape}, Test data: {X_test.shape}")

    # Prepare data for PyTorch
    train_loader = prepare_data_for_pytorch(X_train_res, y_train_res)
    test_loader = prepare_data_for_pytorch(X_test, y_test)

    # Train CNN
    print("Training CNN...")
    cnn_model = train_cnn_model(train_loader, y_train_res, num_epochs=100)
    print("CNN training complete.")

    # Evaluate the model on the test set
    print("Evaluating CNN on test set...")
    evaluate_model(cnn_model, test_loader)

if __name__ == "__main__":
    main()



Preprocessing data...


TypeError: SMOTE.__init__() got an unexpected keyword argument 'n_neighbors'