# Train and evaluate CNN models



Author: Akash Kharita

Date: 02/28/2024

Modified by Marine Denolle on 06/20/24


### Import modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import h5py
import obspy
# from tqdm import tqdm
from glob import glob
# import time
import random
import sys
from datetime import datetime

from scipy import stats,signal


import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import random_split
import torchvision.transforms as transforms
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
# from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import Dataset



# Check if a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Importing all the models

In [2]:

# from design_CNN_models import Archtime
# from design_CNN_models import Archtime_do
# from design_CNN_models import WaveDecompNet
# from design_CNN_models import WaveDecompNet_do
# from design_CNN_models import SeismicCNN_batch
# from design_CNN_models import SeismicCNN_batch_do
# from design_CNN_models import SeismicNet
# from design_CNN_models import SeismicNet_do


# from neural_network_processing_functions import extract_datasets
# from neural_network_processing_functions import train_model
# from neural_network_processing_functions import plot_train_val_loss
# from neural_network_processing_functions import plot_accuracy
# from neural_network_processing_functions import extract_datasets_for_test
# from neural_network_processing_functions import train_model_for_test
# from neural_network_processing_functions import test_model

## Parameters

### Waveform data

In [3]:
#data files
file_noise="/data/whd01/yiyu_data/PNWML/noise_waveforms.hdf5";
file_comcat=  "/data/whd01/yiyu_data/PNWML/comcat_waveforms.hdf5";
file_exotic="/data/whd01/yiyu_data/PNWML/exotic_waveforms.hdf5";

### Waveform Metadata

In [4]:
# metadata
# accessing the comcat metadata
comcat_metadata = pd.read_csv("/data/whd01/yiyu_data/PNWML/comcat_metadata.csv")

# accessing the exotic metadata
exotic_metadata = pd.read_csv("/data/whd01/yiyu_data/PNWML/exotic_metadata.csv")

# accessing the data files
metadata_noise = pd.read_csv("/data/whd01/yiyu_data/PNWML/noise_metadata.csv")

# creating individual data frames for each class
cat_exp = comcat_metadata[comcat_metadata['source_type'] == 'explosion']
cat_eq = comcat_metadata[comcat_metadata['source_type'] == 'earthquake']
cat_su = exotic_metadata[exotic_metadata['source_type'] == 'surface event']
cat_noise = metadata_noise
cat_noise['event_id'] = [cat_noise['trace_start_time'][i]+'_noise' for i in range(len(cat_noise))]


### Data Prep

In [5]:
start=-30
input_window_length=100 # in seconds
fs=50 # target sampling rate

number_data_per_class=100 # number of data samples per class
num_channels=3  # number of components to check

all_data=False
shifting=True


# training parameters
train_split = 80
val_split=10
test_split = 10
learning_rate=0.001
n_epochs=10

## Additional functions

In [6]:
# defining a very simple CNN
        
class SeismicCNN(nn.Module):
    def __init__(self, num_classes=4, num_channels = 3):
        super(SeismicCNN, self).__init__()
        # Define the layers of the CNN architecture
        self.conv1 = nn.Conv1d(in_channels= num_channels, out_channels=8, kernel_size=9,stride=1,padding='same')
#         self.conv2 = nn.Conv1d(in_channels= 8, out_channels=8, kernel_size=9,stride=2,padding=4)       
#         self.conv3 = nn.Conv1d(in_channels= 8, out_channels=16, kernel_size=7,stride=1,padding='same')
#         self.conv4 = nn.Conv1d(in_channels= 16, out_channels=16, kernel_size=7,stride=2,padding=3)        
#         self.conv5 = nn.Conv1d(in_channels= 16, out_channels=32, kernel_size=5,stride=1,padding='same')
#         self.conv6 = nn.Conv1d(in_channels= 32, out_channels=32, kernel_size=5,stride=2,padding=2)                
#         self.conv7 = nn.Conv1d(in_channels= 32, out_channels=64, kernel_size=3,stride=1,padding='same')
        
        self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)
        
        # batch-normalization layers
        self.bn1 = nn.BatchNorm1d(8)#, dtype=torch.float64)
#         self.bn2 = nn.BatchNorm1d(8)#, dtype=torch.float64)
#         self.bn3 = nn.BatchNorm1d(16)#, dtype=torch.float64)
#         self.bn4 = nn.BatchNorm1d(16)#, dtype=torch.float64)
#         self.bn5 = nn.BatchNorm1d(32)#, dtype=torch.float64)
#         self.bn6 = nn.BatchNorm1d(32)#, dtype=torch.float64)
#         self.bn7 = nn.BatchNorm1d(64)#, dtype=torch.float64)
        
        self.fc1 = nn.Linear(5000, 128)  # Adjust input size based on your data
#         self.fc1 = nn.Linear(4992, 128)  # Adjust input size based on your data
        self.fc2 = nn.Linear(128,4)  # Adjust input size based on your data
        self.fc1_bn = nn.BatchNorm1d(128)
        self.fc2_bn = nn.BatchNorm1d(num_classes)
        
        
        # Calculate the input size for the first fully connected layer
        fc_input_size = self._get_conv_output_size(num_channels, 5000)
        
        # define dropout
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x))) # feature extraction, output size of 8,5000
#         x = self.pool1(F.relu(self.bn2(self.conv2(x)))) # feature extraction, output size of 8,1250 
        # max pooling is done at this point, but after the 2 convolutions to avoid aliasing
#         x = F.relu(self.bn3(self.conv3(x))) # feature extraction, output size of 16,1250
#         x = self.pool1(F.relu(self.bn4(self.conv4(x)))) # feature extraction, output size of 16, 312
#         x =  F.relu(self.bn5(self.conv5(x))) # feature extraction, output size of 32, 312
#         x = self.pool1(F.relu(self.bn6(self.conv6(x)))) # feature extraction, output size of 32, 78
#         x = F.relu(self.bn7(self.conv7(x))) # feature extraction, output size of 64, 78            
        x = x.view(x.size(0), -1) # Flatten before fully connected layer, 4992 features!
        x = F.relu(self.fc1_bn(self.fc1(x)))  # classifier
        x = self.fc2_bn(self.fc2(x)) # classifier
        
        # Apply softmax for probabilities
        x = torch.softmax(x, dim=1)
        
        return x

### test CNN

In [7]:
# Assuming the input shape is (batch_size, num_channels, num_features)
batch_size = 1  # You can adjust the batch size as needed
num_channels = 3
num_features = 5000

# Create a random input tensor with the specified shape
random_input = torch.randn(batch_size, num_channels, num_features)

In [8]:
# Initialize your model
model = SeismicCNN(num_classes=4, num_channels=num_channels).to(device)  # Use 'cuda' if you have a GPU available

# Move the random input to the same device as your model
random_input = random_input.to(device).float()  # Use 'cuda' if you have a GPU available

# Set the model to evaluation mode
model.eval()

# Pass the random input through the model
with torch.no_grad():  # Disable gradient computation
    output = model(random_input)
print(output)

AttributeError: 'SeismicCNN' object has no attribute '_get_conv_output_size'

In [None]:
model=SeismicCNN().to(device)
print(model)

In [None]:
# adding some more comments here
from torch.utils.data import Dataset
class PNWDataSet(Dataset): # create custom dataset
    def __init__(self, data,labels,num_classes): # initialize
        self.data = data 
        self.labels = labels
        self.num_classes = num_classes

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sample_data = self.data[index]
        sample_labels = self.labels[index]
        
        # Convert labels to one-hot encoded vectors
        sample_labels = torch.nn.functional.one_hot(torch.tensor(sample_labels), num_classes=self.num_classes)
        print(sample_labels)
        
        return torch.Tensor(sample_data), sample_labels.float()  # return data as a tensor
    

In [None]:
def extract_waveforms(cat, file_name, start=-20, input_window_length=100, fs=50, number_data=1000, num_channels=3, all_data=False, shifting=True):
    
    """
    This is a function defined to extract the waveforms from file of waveforms and a dataframe of metadata. 
    The functions will also filter and resample the data if the data sampling rate is different from the target sampling rate
    The data is shuffled in order it is called from the file it was stored in.
    The data is shuffled in time by allowing a shift in selecting the waveform window with some of the pre-P data.
    The data us normalized to its max(abs) on either component.
    
    Inputs:
    cat -  Catalog containing metadata of the events, so we can extract the data using the bucket information
    file_name - path of the h5py file containing the data
    start - origin or first arrival time
    num_features - window length to extract
    before - number of samples to take before the arrival time
    after - number of samples to take after the arrival time.
    num_samples - no. of events per class to extract
    
    input_window_length: desired window length in seconds
    fs: desired sampling rate.
    num_channels - no. of channels per event to extract, if set 1, will extract Z component, if set any other number, will extract - ZNE component. 
    all_samples - if true, will extract all the samples corresponding of a given class
    shifting - if true, will extract windows randomly starting between P-5, P-20. The random numbers follow a gaussian distribution. 
    Outputs:
    
    """   
    cat = cat.sample(frac=1).reset_index(drop=True)
    if all_data:number_data = len(cat) # how many data to include
    # open the file
    f = h5py.File(file_name, 'r')
    x=np.zeros(shape=(number_data,3,int(fs*input_window_length)))
    event_ids = cat['event_id'].values
    if not all_data:event_ids=event_ids[:number_data]
        
    for index in range(number_data):
        # read data
        bucket, narray = cat.loc[index]['trace_name'].split('$')
        xx, _, _ = iter([int(i) for i in narray.split(',:')])
        data = f['/data/%s' % bucket][xx, :, : ] # get all of the data
        if fs != cat.loc[index,'trace_sampling_rate_hz']: #resample the data
            nyquist = 0.5 * cat.loc[index,'trace_sampling_rate_hz']
            low = 0.05 / nyquist;  high = 20 / nyquist
            b, a = signal.butter(4, [low, high], btype='band')

            # Apply the taper+filter to the signal
            taper = signal.windows.tukey(data.shape[-1],alpha=0.1)
            data = np.array([np.multiply(taper,row) for row in data])
            filtered_signal = np.array([signal.filtfilt(b, a, row) for row in data])

            # resample
            number_of_samples = int(filtered_signal.shape[1] * fs / cat.loc[index,'trace_sampling_rate_hz'])
            data = np.array([signal.resample(row, number_of_samples) for row in filtered_signal])

            
        if event_ids[index].split("_")[-1]!="noise":
            #random start between P-20 and P-5 (upper bound is exclusive in numpy.random.randint)        
            ii = int(np.random.randint(start,-4)*fs)
            
            if np.isnan(cat.loc[index, 'trace_P_arrival_sample']):continue
            
            istart = int(cat.loc[index, 'trace_P_arrival_sample']*fs/cat.loc[index,'trace_sampling_rate_hz']) + ii # start around the P
            iend  = istart + int(fs*input_window_length)
            if iend>data.shape[-1]:
                istart = istart - (iend-data.shape[-1])
                iend = data.shape[-1]
        else:
            istart=0
            iend=istart+int(fs*input_window_length)

        
        # normalize the data
        mmax = np.max(np.abs(data[:,istart:iend]))
        # store data in big index
        x[index,:,:iend-istart] = data[:,istart:iend]/mmax
        
        if num_channels==1:
            x2 = x[:,2,:]
            del x
            x = x2
            
    # remove rows with zeros if there are any
    idx=np.where(np.mean(np.abs(x[:,2,0:10]),axis=-1)>0)[0]
                     
    
    f.close()
    return x[idx,:,:], event_ids[idx]


In [None]:
# def train_model(model, train_loader, val_loader,  n_epochs=100, batch_size=32,learning_rate=0.001,criterion=nn.CrossEntropyLoss()):
#     """
#     Function to train and evaluate the defined model.

#     Parameters:
#         model (torch.nn.Module): The neural network model.
#         train_loader (torch.utils.data.DataLoader): DataLoader for training data.
#         val_dataset (torch.utils.data.Dataset): Validation dataset.
#         val_loader (torch.utils.data.DataLoader): DataLoader for validation data.
#         optimizer (torch.optim.Optimizer): Optimizer for training the model.
#         n_epochs (int): Number of training epochs.
#         batch_size (int): Batch size for training.
#         number_input (int): Number of points in the input data.
#         num_channels (int): Number of channels in the input data.

#     Returns:
#         accuracy_list (list): List of accuracies computed from each epoch.
#         train_loss_list (list): List of training losses from each epoch.
#         val_loss_list (list): List of validation losses from each epoch.
#         y_pred (list): List of predicted values.
#         y_true (list): List of true values.
#     """
    
#     optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

#     N_test = len(val_dataset)

#     # to store the accuracies computed from each epoch.
# #     accuracy_list = []

#     # # Save loss and error for plotting
#     loss_time = np.zeros(n_epochs)
#     accuracy_time = np.zeros(n_epochs)

#     # to store the predicted values
# #     y_pred = []
# #     y_true = []

#     for epoch in range(n_epochs):
#         running_loss = 0
#         for data in train_loader:
#             inputs, labels = data[0].to(device), data[1].to(device)
#             inputs = inputs.float()
#             labels = labels.long()
            
            
#             # Set the parameter gradients to zero
#             optimizer.zero_grad()
            
#             outputs = model(inputs)
            
#             loss = criterion(outputs, labels)
            
#             # computing the gradients
#             loss.backward()

#             # updating the parameters
#             optimizer.step()

#             running_loss += loss.item()

#         # updating the training loss list
#         loss_time[epoch] = running_loss/len(trainloader)

#              # We evaluate the model, so we do not need the gradient
#         with torch.no_grad(): # Context-manager that disabled gradient calculation.
#             # Loop on samples in test set
#             for data in trainloader:
#                 # Get the sample and modify the format for PyTorch
#                 inputs, labels = data[0].to(device), data[1].to(device)
#                 inputs = inputs.float() 
#                 labels = labels.long()
#                 # Use model for sample in the test set
#                 outputs = model(inputs)
#                 # Compare predicted label and true label
#                 _, predicted = torch.max(outputs.data, 1)
#                 total += labels.size(0)
#                 correct += (predicted == labels).sum().item()
#         # Save error at the end of each epochs
#         accuracy_time[epoch] = 100 * correct / total

#     # Print intermediate results on screen
#     if testloader is not None:
#         print('[Epoch %d] loss: %.3f - accuracy: %.3f' %
#           (epoch + 1, running_loss/len(trainloader), 100 * correct / total))
#     else:
#         print('[Epoch %d] loss: %.3f' %
#           (epoch + 1, running_loss/len(trainloader)))

#     return loss_time, accuracy_time



## Select only high SNR data

based on the SNR value in the Z component, then store the reduced panda dataframe for each class.

In [None]:
# explosions
trace_snr_db_values = np.array([float(cat_exp.loc[idx, 'trace_snr_db'].split("|")[-1]) for idx in cat_exp.index.values.tolist()])
ii2= np.where(trace_snr_db_values>18)[0].astype(int) 
df_exp = cat_exp.iloc[ii2]

# earthquake
trace_snr_db_values = np.array([float(cat_eq.loc[idx, 'trace_snr_db'].split("|")[-1]) for idx in cat_eq.index.values.tolist()])
ii2= np.where(trace_snr_db_values>18)[0].astype(int) 
df_eq = cat_eq.iloc[ii2]

# surface events
trace_snr_db_values = np.array([float(cat_su.loc[idx, 'trace_snr_db'].split("|")[-1]) for idx in cat_su.index.values.tolist()])
ii2= np.where(trace_snr_db_values>18)[0].astype(int) 
df_su = cat_su.iloc[ii2]

# noise
# does not change
df_noise = cat_noise

In [None]:
df_noise.head()

In [None]:
# surface events
d_su, id_su = extract_waveforms(df_su, file_exotic, input_window_length = input_window_length, fs=fs,
                                start =start, number_data = number_data_per_class, num_channels = num_channels,
                                shifting = shifting, all_data = False)
print(d_su.shape)

In [None]:
# noise
d_noise, id_noise = extract_waveforms(df_noise, file_noise, input_window_length = input_window_length, fs=fs,
                                      start = start, number_data = number_data_per_class,
                                      num_channels = num_channels, shifting = shifting, all_data = all_data)
print(d_noise.shape)

In [None]:
# explosions
d_exp, id_exp = extract_waveforms(df_exp, file_comcat, input_window_length = input_window_length, fs=fs,
                                  start = start,  number_data = number_data_per_class, num_channels = num_channels,
                                  shifting = shifting, all_data = all_data)

print(d_exp.shape)

In [None]:
# earthquakes
d_eq, id_eq = extract_waveforms(df_eq, file_comcat, input_window_length = input_window_length,  fs=fs,
                                start =start,  number_data = number_data_per_class, num_channels = num_channels,
                                shifting = shifting, all_data = all_data)
print(d_eq.shape)

In [None]:
# concatenate all data into one input
X = np.vstack([d_noise, d_exp, d_eq, d_su])
print(X.shape)
plt.plot(X[21,:,:].T)
plt.show()


## Prepare labels
labels to encode: here we understand that the classes are labeled as integers 

* 0: noise
* 1: explosion
* 2: earthquake
* 3: surface event

In [None]:
# 
# event_ids = ['noise']*len(d_noise)+['explosion']*len(d_exp)+['earthquake']*len(d_eq)+['surface']*len(d_su)
event_ids = [0]*len(d_noise)+[1]*len(d_exp)+[2]*len(d_eq)+[3]*len(d_su)
# y = np.hstack([id_noise, id_exp, id_eq, id_su])
y = event_ids
print(y)
# y_encoded = label_encoder.fit_transform(y)
# print(y_encoded)

In [None]:
custom_dataset = PNWDataSet(X,y,4)
for data,yy in custom_dataset:
    print(data)
    break

## Shuffle and split data

In [None]:
# Make the data a PNWDataSet
custom_dataset = PNWDataSet(X,y,4)
# first split train+val
# Determine the size of the training set
train_size = 317 #int(train_split/100 * len(custom_dataset)) # 80% of the data set
val_size = 39#int(val_split/100 * len(custom_dataset)) # 10% of the data set
test_size = 41#len(custom_dataset) - train_size - test_size # the rest is test
print(train_size,val_size,test_size)
print(len(custom_dataset))
print([train_size, test_size+val_size])
print(train_size+test_size+val_size)
train_dataset, val_dataset = random_split(custom_dataset, [train_size, test_size+val_size])
# then split val into val+test
test_dataset, val_dataset = random_split(val_dataset, [test_size,val_size])

train_loader = DataLoader(train_dataset)
val_loader = DataLoader(val_dataset)
test_loader = DataLoader(test_dataset)

## Defining some common parameters for all models

In [None]:

# Define the loss function (e.g., Cross-Entropy)
criterion = nn.CrossEntropyLoss()


## Training and Testing all the models

In [None]:
train_loader = train_dataset
val_loader = val_dataset
# def train_model(model, train_loader, val_loader,  n_epochs=100, batch_size=32,learning_rate=0.001,criterion=nn.CrossEntropyLoss()):
# """
# Function to train and evaluate the defined model.

# Parameters:
#     model (torch.nn.Module): The neural network model.
#     train_loader (torch.utils.data.DataLoader): DataLoader for training data.
#     val_dataset (torch.utils.data.Dataset): Validation dataset.
#     val_loader (torch.utils.data.DataLoader): DataLoader for validation data.
#     optimizer (torch.optim.Optimizer): Optimizer for training the model.
#     n_epochs (int): Number of training epochs.
#     batch_size (int): Batch size for training.
#     number_input (int): Number of points in the input data.
#     num_channels (int): Number of channels in the input data.

# Returns:
#     accuracy_list (list): List of accuracies computed from each epoch.
#     train_loss_list (list): List of training losses from each epoch.
#     val_loss_list (list): List of validation losses from each epoch.
#     y_pred (list): List of predicted values.
#     y_true (list): List of true values.
# """

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

N_test = len(val_dataset)

# to store the accuracies computed from each epoch.
#     accuracy_list = []

# # Save loss and error for plotting
loss_time = np.zeros(n_epochs)
accuracy_time = np.zeros(n_epochs)

# to store the predicted values
#     y_pred = []
#     y_true = []

for epoch in range(n_epochs):
    running_loss = 0
    for data in train_loader:
        inputs, labels = data[0].to(device), data[1].to(device)
#         print(data)
        print(data[0],inputs)
        inputs = inputs.float()
        labels = labels.long()

        

        # Set the parameter gradients to zero
        optimizer.zero_grad()
        print("ready to estimate first")
        print(inputs.shape)
        outputs = model(inputs)
        print(outputs)
        loss = criterion(outputs, labels)
        print(loss)
        # computing the gradients
        loss.backward()

        # updating the parameters
        optimizer.step()

        running_loss += loss.item()

    # updating the training loss list
    loss_time[epoch] = running_loss/len(trainloader)

         # We evaluate the model, so we do not need the gradient
    with torch.no_grad(): # Context-manager that disabled gradient calculation.
        # Loop on samples in test set
        for data in trainloader:
            # Get the sample and modify the format for PyTorch
            inputs, labels = data[0].to(device), data[1].to(device)
            inputs = inputs.float() 
            labels = labels.long()
            # Use model for sample in the test set
            outputs = model(inputs)
            # Compare predicted label and true label
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    # Save error at the end of each epochs
    accuracy_time[epoch] = 100 * correct / total

# Print intermediate results on screen
if testloader is not None:
    print('[Epoch %d] loss: %.3f - accuracy: %.3f' %
      (epoch + 1, running_loss/len(trainloader), 100 * correct / total))
else:
    print('[Epoch %d] loss: %.3f' %
      (epoch + 1, running_loss/len(trainloader)))



In [None]:
model = SeismicCNN(num_classes=4, num_channels=3)
dummy_input = torch.randn(1, 3, 5000)
output = model(dummy_input)

In [None]:
(loss, accuracy) = train_model(model,train_dataset,val_dataset)

In [None]:
fig, ax1 = plt.subplots()

color = 'tab:red'
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss', color=color)
ax1.plot(np.arange(1, 101), loss, color=color)
ax1.tick_params(axis='y', labelcolor=color)

ax2 = ax1.twinx()

color = 'tab:blue'
ax2.set_ylabel('Correct predictions', color=color)
ax2.plot(np.arange(1, 101), accuracy, color=color)
ax2.tick_params(axis='y', labelcolor=color)

fig.tight_layout()
plt.show()

## Archtime (Original)

In [None]:
# Archtime_normal
num_channels = 1
number_features = 5000

#train_dataset, train_loader, test_dataset, test_loader, val_dataset, val_loader = extract_datasets(num_channels = 1, num_samples = 5000)
#train_dataset, train_loader, y_train, test_dataset, test_loader, y_test,  val_dataset, val_loader, y_val = extract_datasets(before = 1000, after = 40000, num_samples = 5000, batch_size = 32, num_channels = 1, train_size = 4000, test_size = 0, num_features = 5000, shifting = True)


data_loader_train,data_loader_val,data_loader_test = prepare_datasets()


# train_dataset, train_loader, y_train, test_dataset, test_loader, y_test,  val_dataset, val_loader, y_val, event_ids_normal = extract_datasets(before = 1000, after = 4000, num_samples = 5500, batch_size = 32, num_channels = 1, train_size = 5000, test_size = 1, num_features = 5000, shifting = True, all_samples = False)


model_archtime = Archtime(num_channels = 3, num_input = 5000)


optimizer = torch.optim.Adam(model_archtime.parameters(), lr=0.001)
accuracy_archtime, train_loss_archtime, val_loss_archtime, y_pred, y_true  = train_model(model_archtime, train_loader, val_dataset, val_loader, optimizer, n_epochs = number_epochs, num_channels = num_channels, num_features = 5000)
