In [1]:
import os
import numpy as np
import pandas as pd
from obspy import read
from scipy import signal
from obspy.signal.trigger import classic_sta_lta
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
# Function to apply padding and create a mask
def pad_sequences(sequences, max_len=None, padding_value=0):
    """
    Pad each sequence to the maximum length with a specified padding value and create a mask.
    """
    if max_len is None:
        max_len = max(len(seq) for seq in sequences)

    padded_seqs = np.full((len(sequences), max_len), padding_value, dtype=np.float32)
    masks = np.zeros((len(sequences), max_len), dtype=np.float32)

    for i, seq in enumerate(sequences):
        seq_len = len(seq)
        padded_seqs[i, :seq_len] = seq
        masks[i, :seq_len] = 1  # Valid data points

    return padded_seqs, masks

In [3]:
# Bandpass filter for seismic data
def apply_bandpass_filter(trace, lowcut=0.5, highcut=3.0, sampling_rate=6.625, order=4):
    sos = signal.butter(order, [lowcut, highcut], btype='bandpass', fs=sampling_rate, output='sos')
    filtered_trace = signal.sosfilt(sos, trace)
    return filtered_trace

In [4]:
# STA/LTA feature extraction
def extract_sta_lta_features(trace, sampling_rate, sta_window=1.0, lta_window=5.0, fixed_length=500):
    sta_samples = int(sta_window * sampling_rate)
    lta_samples = int(lta_window * sampling_rate)
    cft = classic_sta_lta(trace, sta_samples, lta_samples)
    
    if len(cft) > fixed_length:
        features = cft[:fixed_length]  # Truncate if longer
    else:
        features = np.pad(cft, (0, fixed_length - len(cft)), 'constant')  # Pad with zeros if shorter
    
    return features

In [5]:
# Complete preprocessing function
def preprocess_seismic_data(filepath, filetype, sampling_rate=6.625):
    if filetype == 'csv':
        seismic_data = pd.read_csv(filepath)
        trace = seismic_data['velocity(m/s)'].values
    elif filetype == 'mseed':
        st = read(filepath)
        trace = st[0].data
    
    filtered_trace = apply_bandpass_filter(trace, sampling_rate=sampling_rate)
    features = extract_sta_lta_features(filtered_trace, sampling_rate)
    return features

In [6]:
def load_seismic_data(data_dir, catalog_df=None, include_catalog=False):
    seismic_data = []
    labels = []
    
    for root, _, files in os.walk(data_dir):
        for file in files:
            filepath = os.path.join(root, file)
            if file.endswith('.csv'):
                filetype = 'csv'
            elif file.endswith('.mseed'):
                filetype = 'mseed'
            else:
                continue  # Skip unsupported file types
            
            # Preprocess seismic data (bandpass filtering and STA/LTA)
            features = preprocess_seismic_data(filepath, filetype)
            seismic_data.append(features)
            
            if include_catalog and catalog_df is not None:
                event_id = os.path.splitext(file)[0]
                if event_id in catalog_df['filename'].values:
                    label_row = catalog_df.loc[catalog_df['filename'] == event_id]
                    labels.append(label_row['mq_type'].values[0])  # Extract the string label
    
    # Convert seismic data to NumPy array
    padded_data, masks = pad_sequences(seismic_data)

    if include_catalog:
        # Encode labels to numeric values
        label_encoder = LabelEncoder()
        labels_encoded = label_encoder.fit_transform(labels)  # Convert labels to integers
        return padded_data, labels_encoded, masks
    else:
        return padded_data, masks


In [7]:
def train_and_evaluate_decision_tree(X, y, test_size=0.2, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    dt_model = DecisionTreeClassifier()
    dt_model.fit(X_train, y_train)

    y_pred = dt_model.predict(X_test)

    print("Classification Report for Decision Tree:")
    print(classification_report(y_test, y_pred))

    return dt_model


In [8]:
def main():
    catalog_path = '../../data/lunar_data/training/catalogs/apollo12_catalog_GradeA_final.csv'
    data_directory = '../../data/lunar_data/training/data/'

    # Load and preprocess data
    print("Preprocessing data...")
    catalog = pd.read_csv(catalog_path)
    X, y, _ = load_seismic_data(data_directory, catalog_df=catalog, include_catalog=True)

    # Now `y` is a NumPy array of encoded labels, so we can check its shape
    print(f"Shape of features (X): {X.shape}")
    print(f"Shape of labels (y): {y.shape}")
    print(f"Unique labels: {np.unique(y)}")  # Optional: Check the unique label values

    if X.shape[0] == 0 or y.shape[0] == 0:
        print("No data to train on!")
        return

    # Train and evaluate Decision Tree
    print("Training Decision Tree...")
    dt_model = train_and_evaluate_decision_tree(X, y)

In [9]:
# Call the main function
if __name__ == "__main__":
    main()

Preprocessing data...
Shape of features (X): (152, 500)
Shape of labels (y): (152,)
Unique labels: [0 1 2]
Training Decision Tree...
Classification Report for Decision Tree:
              precision    recall  f1-score   support

           0       0.50      0.67      0.57         3
           1       0.96      0.93      0.94        27
           2       1.00      1.00      1.00         1

    accuracy                           0.90        31
   macro avg       0.82      0.86      0.84        31
weighted avg       0.92      0.90      0.91        31

