In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from torch.utils.data import DataLoader, TensorDataset
from obspy import read
from obspy.signal.trigger import classic_sta_lta
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Function to apply padding and create a mask
def pad_sequences(sequences, max_len=None, padding_value=0):
    if max_len is None:
        max_len = max(len(seq) for seq in sequences)

    padded_seqs = np.full((len(sequences), max_len), padding_value, dtype=np.float32)
    masks = np.zeros((len(sequences), max_len), dtype=np.float32)

    for i, seq in enumerate(sequences):
        seq_len = len(seq)
        padded_seqs[i, :seq_len] = seq
        masks[i, :seq_len] = 1  # Valid data points

    return padded_seqs, masks

In [3]:
# Preprocessing function for both .csv and .mseed
def preprocess_seismic_data(filepath, filetype, sampling_rate=6.625):
    if filetype == 'csv':
        seismic_data = pd.read_csv(filepath)
        trace = seismic_data['velocity(m/s)'].values
    elif filetype == 'mseed':
        st = read(filepath)
        trace = st[0].data
    
    sos = signal.butter(4, [0.5, 1.0], btype='bandpass', fs=sampling_rate, output='sos')
    filtered_trace = signal.sosfilt(sos, trace)

    return filtered_trace

In [4]:
# Load the catalog for training
def load_catalog(catalog_file):
    catalog_df = pd.read_csv(catalog_file)
    
    # One-hot encoding of the 'mq_type' column
    encoder = OneHotEncoder(sparse_output=False)
    one_hot_labels = encoder.fit_transform(catalog_df[['mq_type']])
    
    # Add the one-hot encoded columns to the catalog
    one_hot_label_columns = encoder.categories_[0]
    catalog_df = catalog_df.join(pd.DataFrame(one_hot_labels, columns=one_hot_label_columns))
    
    return catalog_df, encoder

In [5]:
# Load all data files (csv or mseed) from the directory
def load_seismic_data(data_dir, catalog_df=None, include_catalog=False):
    seismic_data = []
    labels = []
    
    for root, _, files in os.walk(data_dir):
        for file in files:
            filepath = os.path.join(root, file)
            if file.endswith('.csv'):
                filetype = 'csv'
            elif file.endswith('.mseed'):
                filetype = 'mseed'
            else:
                continue  # Skip unsupported file types
            
            # Preprocess seismic data
            filtered_trace = preprocess_seismic_data(filepath, filetype)
            seismic_data.append(filtered_trace)
            
            if include_catalog and catalog_df is not None:
                # Extract label from catalog based on file name (if it exists)
                event_id = os.path.splitext(file)[0]
                if event_id in catalog_df['filename'].values:
                    label_row = catalog_df.loc[catalog_df['filename'] == event_id]
                    labels.append(label_row.iloc[0, -len(catalog_df.columns[-3:]):].values)  # One-hot encoded labels
    
    # Padding and masking
    seismic_data_padded, masks = pad_sequences(seismic_data)
    
    seismic_data_padded = np.array(seismic_data_padded)
    masks = np.array(masks)

    if include_catalog:
        return seismic_data_padded, labels, masks
    else:
        return seismic_data_padded, masks

In [6]:
# Function to extract STA/LTA features with fixed length
def extract_sta_lta_features(tr_data, tr_times, sta_len=120, lta_len=600, fixed_length=500):
    df = len(tr_times) / (tr_times[-1] - tr_times[0])  # Estimate sampling rate
    cft = classic_sta_lta(tr_data, int(sta_len * df), int(lta_len * df))

    if len(cft) > fixed_length:
        features = cft[:fixed_length]  # Truncate if longer
    else:
        # Pad with zeros if shorter
        features = np.pad(cft, (0, fixed_length - len(cft)), 'constant')

    return features

In [7]:
# Prepare data from traces and extract features
def prepare_data_from_traces(catalog, data_directory, use_mseed=True):
    features_list = []
    labels_list = []

    for idx, row in catalog.iterrows():
        test_filename = row['filename']
        arrival_time = row['time_rel(sec)']

        if use_mseed:
            mseed_file = os.path.join(data_directory, f"S12_GradeA/{test_filename}.mseed")
            if not os.path.exists(mseed_file):
                continue
            st = read(mseed_file)
            tr = st.traces[0]
            tr_data = tr.data
            tr_times = tr.times()
        else:
            csv_file = os.path.join(data_directory, f"S12_GradeA/{test_filename}.csv")
            if not os.path.exists(csv_file):
                continue
            data_cat = pd.read_csv(csv_file)
            tr_data = np.array(data_cat['velocity(m/s)'])
            tr_times = np.array(data_cat['time_rel(sec)'])

        # Extract features using STA/LTA with a fixed length
        features = extract_sta_lta_features(tr_data, tr_times)
        features_list.append(features)
        labels_list.append(1)  # All catalog events are considered 1 for simplicity

    X = np.array(features_list)
    y = np.array(labels_list)

    return X, y

In [8]:
import pandas as pd
import numpy as np

# Function to save processed data into a catalog with added debugging
def save_processed_data(X, y, save_path='processed_data_catalog.csv'):
    """
    Save the processed features (X) and labels (y) into a CSV file for future use.
    
    Parameters:
    X (numpy array): The features data, expected to be a 2D array or reshaped as needed.
    y (numpy array): The labels.
    save_path (str): The file path where the data will be saved.
    """
    # Check shapes of X and y for debugging
    print(f"Shape of X: {X.shape}")
    print(f"Shape of y: {y.shape}")
    
    # Reshape X if necessary to ensure it's 2D (samples, features)
    if len(X.shape) > 2:
        # Flatten each trace if it's multi-dimensional (e.g., 3D or more)
        X_flattened = X.reshape(X.shape[0], -1)
        print(f"X was reshaped to: {X_flattened.shape}")
    else:
        X_flattened = X

    # Ensure y is flattened for single-label output
    y_flattened = y.flatten()
    print(f"y was flattened to: {y_flattened.shape}")

    # Convert features and labels into a DataFrame for easy storage
    try:
        data = pd.DataFrame(X_flattened)
        data['label'] = y_flattened

        # Save to CSV
        data.to_csv(save_path, index=False)
        print(f"Processed data saved to {save_path}")
    except Exception as e:
        print(f"Error while saving data: {str(e)}")

# Example usage (mock data for demonstration purposes)
if __name__ == "__main__":
    # Example mock data to test saving functionality
    X_example = np.random.rand(100, 50)  # 100 samples, 50 features
    y_example = np.random.randint(0, 2, 100)  # 100 labels (binary classification)
    
    save_processed_data(X_example, y_example, 'processed_data_catalog.csv')


Shape of X: (100, 50)
Shape of y: (100,)
y was flattened to: (100,)
Processed data saved to processed_data_catalog.csv


In [9]:
# Function to load processed data from a saved catalog
def load_processed_data(load_path='processed_data_catalog.csv'):
    """
    Load the processed features and labels from a CSV file.
    
    Parameters:
    load_path (str): The file path from which the data will be loaded.
    
    Returns:
    X (numpy array): The loaded features.
    y (numpy array): The loaded labels.
    """
    # Load data from CSV
    data = pd.read_csv(load_path)
    
    # Separate features and labels
    X = data.iloc[:, :-1].values  # All columns except the last
    y = data['label'].values  # The last column is the label

    print(f"Processed data loaded from {load_path}")
    return X, y

In [10]:
# Train and evaluate Decision Tree
def train_decision_tree(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train a Decision Tree Classifier
    dt_model = DecisionTreeClassifier()
    dt_model.fit(X_train, y_train)

    # Make predictions
    y_pred = dt_model.predict(X_test)

    # Evaluate the model
    print("Classification Report for Decision Tree:")
    print(classification_report(y_test, y_pred))

    return dt_model


In [11]:
# Modified main function to include saving and loading of processed data
def main():
    catalog_path = '../../data/lunar_data/training/catalogs/apollo12_catalog_GradeA_final.csv'
    data_directory = '../../data/lunar_data/training/data/'
    processed_data_path = 'processed_data_catalog.csv'

    # Check if preprocessed data exists
    if os.path.exists(processed_data_path):
        print("Loading processed data from catalog...")
        X, y = load_processed_data(processed_data_path)
    else:
        # If no processed data is found, preprocess and save
        print("Preprocessing data...")
        catalog = pd.read_csv(catalog_path)
        X, y = prepare_data_from_traces(catalog, data_directory)

        # Save the processed data for future use
        save_processed_data(X, y, processed_data_path)

    # Train and evaluate Decision Tree
    dt_model = train_decision_tree(X, y)

# %%
# Run the main function
if __name__ == "__main__":
    main()

Loading processed data from catalog...
Processed data loaded from processed_data_catalog.csv
Classification Report for Decision Tree:
              precision    recall  f1-score   support

           0       0.50      0.40      0.44        10
           1       0.50      0.60      0.55        10

    accuracy                           0.50        20
   macro avg       0.50      0.50      0.49        20
weighted avg       0.50      0.50      0.49        20

