In [1]:
# ───────────────────────────────
# 📦 Standard Libraries
# ───────────────────────────────
import os
import sys
import math
import time
import csv
import pickle
from datetime import datetime, timedelta
from collections import Counter

# ───────────────────────────────
# 📚 Data Handling & Utilities
# ───────────────────────────────
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
import h5py
from joblib import dump, load

# ───────────────────────────────
# 📊 Visualization
# ───────────────────────────────
import matplotlib.pyplot as plt
import seaborn as sns

# ───────────────────────────────
# 📈 Machine Learning
# ───────────────────────────────
from sklearn.model_selection import (
    train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, roc_curve, roc_auc_score, auc,
    classification_report, confusion_matrix
)
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.under_sampling import RandomUnderSampler

# ───────────────────────────────
# 🌍 Seismology & Signal Processing
# ───────────────────────────────
import obspy
from obspy import UTCDateTime
from obspy.geodetics.base import gps2dist_azimuth
from obspy.clients.fdsn import Client
from obspy.signal.filter import envelope
from scipy import stats, signal

# ───────────────────────────────
# 🔗 External Tools
# ───────────────────────────────
from zenodo_get import zenodo_get

# ───────────────────────────────
# 🤖 Deep Learning
# ───────────────────────────────
import torch
import torch.nn.functional as F


# ───────────────────────────────
# 🛠 Custom Utilities
# ───────────────────────────────
sys.path.insert(0, '/home/ak287/PNW_Seismic_Event_Classification/deep_learning/scripts')
from neural_network_architectures import SeismicCNN_2d

from utils import (
    extract_waveforms,
    compute_spectrogram,
    normalize_spectrogram_minmax,
    return_train_val_loaders,
    plot_confusion_matrix_and_cr,
    train_model,
    WaveformPreprocessor
)

# Pandas display options
pd.set_option('display.max_columns', None)


sys.path.append('/home/ak287/seisbench/seisbench/models')
import seisbench.models as sbm



# for extracting unique stations
import re
from pathlib import Path

cuda


In [2]:
# Function to extract hour-of-day / day-of-week / month from a catalog timestamp
def extract_datetime_info(date_str, utc_offset_hours=-8):
    """Return ``(hour_of_day, day_of_week, month_of_year)`` for a timestamp.

    The timestamp is shifted by ``utc_offset_hours`` *before* the calendar
    fields are read, so the hour always lies in 0-23 and the weekday/month
    roll over together with the hour when the shift crosses midnight.

    Parameters
    ----------
    date_str : str
        Timestamp formatted as ``'%Y_%m_%d %H%M%S'`` (e.g. ``'2017_06_23 213916'``).
        Presumably UTC -- TODO confirm against the catalog documentation.
    utc_offset_hours : int or float, optional
        Hours added to the parsed time. The default of -8 reproduces the
        fixed shift that was hard-coded here (presumably Pacific Standard
        Time; no daylight-saving handling).

    Returns
    -------
    tuple of int
        ``(hour_of_day, day_of_week, month_of_year)`` with hour in 0-23,
        weekday in 0-6 (Monday == 0) and month in 1-12.

    Raises
    ------
    ValueError
        If ``date_str`` does not match the expected format.

    Notes
    -----
    NOTE(review): the previous implementation returned ``hour - 8`` without
    wrapping (negative for hours 0-7) and took weekday/month from the
    unshifted time. Confirm that any model consuming these features was
    trained with values in the corrected ranges.
    """
    date_obj = datetime.strptime(date_str, '%Y_%m_%d %H%M%S')
    # Shift the full datetime, not just the hour, so every field stays consistent.
    shifted = date_obj + timedelta(hours=utc_offset_hours)
    hour_of_day = shifted.hour
    day_of_week = shifted.weekday()  # integer, Monday == 0
    month_of_year = shifted.month  # integer, 1-12
    return hour_of_day, day_of_week, month_of_year



def conv_to_datetime(df):
    """Convert catalog timestamp strings to an array of ``obspy.UTCDateTime``.

    Parameters
    ----------
    df : iterable of str
        Timestamps formatted as ``'YYYY_MM_DD HHMMSS'`` (a pandas Series,
        list or array). Values are consumed in iteration order, so a Series
        with a filtered / non-contiguous index works without a prior
        ``reset_index``.

    Returns
    -------
    numpy.ndarray
        One ``obspy.UTCDateTime`` per input string, in input order.
    """
    new_df = []
    # Iterate over the values themselves rather than looking up df[i]:
    # on a pandas Series df[i] is a label lookup and raises KeyError when
    # the index is not 0..n-1.
    for stamp in df:
        # 'YYYY_MM_DD HHMMSS' -> ['YYYY', 'MM', 'DD HHMMSS']
        fields = stamp.split('_')
        day_and_clock = fields[2].split(' ')
        clock = day_and_clock[1]

        year = int(fields[0])
        month = int(fields[1])
        day = int(day_and_clock[0])
        hour = int(clock[0:2])
        minute = int(clock[2:4])
        second = int(clock[4:])
        new_df.append(obspy.UTCDateTime(year, month, day, hour, minute, second))

    return np.array(new_df)

## Loading the older catalog. 

In [3]:
# Render every column when a frame is displayed (the catalog table is wide).
pd.set_option('display.max_columns', None)
# Read the pipe-delimited older catalog; `cat_all_old` is kept as the raw
# table and `cat_old` is an independent working copy.
cat_all_old = pd.read_csv('../../data/IRISExoticEventCatalog.txt', sep='|')
cat_old = cat_all_old.copy()
# Last expression of the cell: display the frame.
cat_old

Unnamed: 0,areaSource,areaSourceHigh,areaSourceLow,areaTotal,associationId,datlocation,depth,endtime,eventid,h,hHigh,hLow,hfall,hfall_high,hfall_low,l,lHigh,lLow,latitude,locuncertKm,longitude,lppotential,mass,massHigh,massLow,maxdistvhfKm,maxdistvhfReached,maxdisthfKm,maxdisthfReached,maxdistipKm,maxdistipReached,maxdistlpKm,maxdistlpReached,maxdistinfraKm,maxdistinfraReached,infraDetected,name,otherdataquality1to5,peakDischarge,peakDischargeLow,peakDischargeHigh,peakFlowheight,peakFlowheightLow,peakFlowheightHigh,starttime,sources,toeLat,toeLon,topLat,topLon,upstreamDrainarea,type,subtype,volume,volumeHigh,volumeLow
0,,,,,22,IRIS,,2007_07_25 010251,22,1860.0,,,,,,5760.0,,,61.10249,0.00,-140.30550,1,1.080000e+11,,,,,1079.0,True,,,1100.0,False,,,,Mount Steele 2,5,,,,,,,2007_07_25 005835,"Allstadt, Kate E., McVey, Brennah G., and Malo...",61.14054,-140.25324,61.10249,-140.30550,,rock and ice avalanche,Rock/ice/debris avalanches and slides,54000000.0,80500000.0,27500000.0
1,,,,,218,"RESIF, ETH, ORFEUS, LMU, BGR, IRIS, INGV",,2017_08_23 093828,218,,,,,,,,,,46.29552,,9.60190,0,,,,,,246.0,False,,,0.0,,,,,"Piz Cengalo 4, Switzerland",1,,,,,,,2017_08_23 093617,"Collins, E.A., Allstadt, K.E., Groult, C., Hib...",,,,,,landslide,Rock/ice/debris avalanches and slides,,,
2,111000.0,,,1130000.0,127,IRIS,,2017_07_22 205957,127,931.0,,,,,,3730.0,,,60.07385,0.00,-139.84532,1,,,,,,351.0,False,,,386.0,False,0.0,,False,Lucia Glacier,4,,,,,,,2017_07_22 205630,"Collins, E.A., Allstadt, K.E., Groult, C., Hib...",60.06616,-139.90755,60.07385,-139.84532,,rock avalanche,Rock/ice/debris avalanches and slides,3000000.0,4000000.0,2300000.0
3,100000.0,150000.0,65000.0,820000.0,186,"IRIS, NCEDC",,2013_07_25 101727,186,690.0,760.0,640.0,,,,2000.0,2100.0,1600.0,61.98450,0.00,-143.16830,1,4.000000e+10,,,,,500.0,False,,,500.0,False,,,,Wrangell Mountains,4,,,,,,,2013_07_25 101456,"Collins, E.A., Allstadt, K.E., Groult, C., Hib...",61.97220,-143.14950,61.98450,-143.16830,,rock and debris avalanche,Rock/ice/debris avalanches and slides,2600000.0,3900000.0,1800000.0
4,23400.0,,,,147,IRIS,,2019_05_13 144245,147,1240.0,,,,,,4320.0,,,50.34480,0.00,-122.45110,1,,,,,,391.0,True,,,301.0,True,,,,Mount Joffre 1,5,,,,,,,2019_05_13 143955,"Collins, E.A., Allstadt, K.E., Groult, C., Hib...",50.36720,-122.41540,50.34480,-122.45110,,"rock avalanche, debris flow",Rock/ice/debris avalanches and slides,1900000.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240,,,,,185,Attachments below,,2015_06_09 160558,185,,,,,,,,,,46.67199,,10.72268,0,,,,3.0,,3.0,True,,,0.0,,,,,Gadria 3,5,27.0,,,1.7,,,2015_06_09 151558,"Coviello, V., Arattano, M., Comiti, F., Maccon...",,,,,6.3,debris flow,Lahar/debris flow/outburst flood,12600.0,,
241,310000.0,520000.0,200000.0,7900000.0,19,IRIS,,2012_06_11 222652,19,2400.0,2500.0,2300.0,,,,9300.0,9500.0,9100.0,58.79367,0.00,-137.44247,1,2.000000e+10,,,,,890.0,True,,,1000.0,False,,,,Mount Lituya main,4,,,,,,,2012_06_11 222400,"Allstadt, Kate E., McVey, Brennah G., and Malo...",58.81626,-137.30032,58.79367,-137.44247,,rock and ice avalanche,Rock/ice/debris avalanches and slides,13000000.0,60000000.0,2800000.0
242,130000.0,160000.0,84000.0,3000000.0,188,"IRIS, NCEDC",,2010_07_09 073820,188,1800.0,1900.0,1600.0,,,,7600.0,7900.0,7300.0,51.87130,0.00,-125.94340,1,,,,,,500.0,False,,,500.0,False,,,,Sheemahant Glacier,4,,,,,,,2010_07_09 073435,"Collins, E.A., Allstadt, K.E., Groult, C., Hib...",51.84780,-126.02910,51.87130,-125.94340,,rock slide,Rock/ice/debris avalanches and slides,3800000.0,5400000.0,2700000.0
243,,,,,159,IRIS,,2017_03_06 234541,159,,,,,,,,,,60.03300,1.00,-153.09870,1,,,,,,319.0,True,,,136.0,True,,,True,Iliamna,2,,,,,,,2017_03_06 232627,"Collins, E.A., Allstadt, K.E., Groult, C., Hib...",,,,,,ice avalanche,Rock/ice/debris avalanches and slides,,,


## Loading the newer catalog. 

In [4]:
# Render every column when a frame is displayed (the catalog table is wide).
pd.set_option('display.max_columns', None)
# Read the pipe-delimited updated catalog; `cat_all_new` is kept as the raw
# table and `cat_new` is an independent working copy.
cat_all_new = pd.read_csv('../../src/IRIS_DMC_esecEventsDb_updated.txt', sep='|')
cat_new = cat_all_new.copy()
# Last expression of the cell: display the frame.
cat_new

Unnamed: 0,areaSource,areaSourceHigh,areaSourceLow,areaTotal,associationId,datlocation,depth,endtime,eventid,h,hHigh,hLow,hfall,hfall_high,hfall_low,l,lHigh,lLow,latitude,locuncertKm,longitude,lppotential,mass,massHigh,massLow,maxdistvhfKm,maxdistvhfReached,maxdisthfKm,maxdisthfReached,maxdistipKm,maxdistipReached,maxdistlpKm,maxdistlpReached,maxdistinfraKm,maxdistinfraReached,infraDetected,name,otherdataquality1to5,peakDischarge,peakDischargeLow,peakDischargeHigh,peakFlowheight,peakFlowheightLow,peakFlowheightHigh,starttime,sources,toeLat,toeLon,topLat,topLon,upstreamDrainarea,type,subtype,volume,volumeHigh,volumeLow
0,,,,,22,IRIS,,2007_07_25 010251,22,1860.0,,,,,,5760.0,,,61.10249,0.00,-140.30550,1,1.080000e+11,,,,,1079.0,True,,,1100.0,False,,,,Mount Steele 2,5,,,,,,,2007_07_25 005835,"Allstadt, Kate E., McVey, Brennah G., and Malo...",61.14054,-140.25324,61.10249,-140.30550,,rock and ice avalanche,Rock/ice/debris avalanches and slides,54000000.0,80500000.0,27500000.0
1,100000.0,,,1600000.0,273,IRIS,,2017_06_23 214249,273,1200.0,,,,,,2600.0,,,32.07970,0.00,103.66280,1,,,,,,591.0,False,,,591.0,False,,,,"Xinmo Village, China",5,,,,,,,2017_06_23 213916,"Collins, E.A., Allstadt, K.E., Groult, C., Hib...",,,,,,"rock slide, rock avalanche",Rock/ice/debris avalanches and slides,13000000.0,14000000.0,12000000.0
2,,,,,218,"RESIF, ETH, ORFEUS, LMU, BGR, IRIS, INGV",,2017_08_23 093828,218,,,,,,,,,,46.29552,,9.60190,0,,,,,,246.0,False,,,0.0,,,,,"Piz Cengalo 4, Switzerland",1,,,,,,,2017_08_23 093617,"Collins, E.A., Allstadt, K.E., Groult, C., Hib...",,,,,,landslide,Rock/ice/debris avalanches and slides,,,
3,111000.0,,,1130000.0,127,IRIS,,2017_07_22 205957,127,931.0,,,,,,3730.0,,,60.07385,0.00,-139.84532,1,,,,,,351.0,False,,,386.0,False,0.0,,False,Lucia Glacier,4,,,,,,,2017_07_22 205630,"Collins, E.A., Allstadt, K.E., Groult, C., Hib...",60.06616,-139.90755,60.07385,-139.84532,,rock avalanche,Rock/ice/debris avalanches and slides,3000000.0,4000000.0,2300000.0
4,,,,10000000.0,285,IRIS,,1997_12_26 070719,285,,,,,,,4000.0,,,16.70900,0.10,-62.17620,1,,,,,,0.0,,,,450.0,False,,,,Montserrat,5,,,,,,,1997_12_26 070240,"Collins, E.A., Allstadt, K.E., Groult, C., Hib...",16.67530,-62.17620,16.70900,-62.17620,,"debris avalanche, pyroclastic density current",Rock/ice/debris avalanches and slides,46000000.0,50000000.0,40000000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314,3500000.0,,,,275,IRIS,,2009_08_10 110853,275,1500.0,,,,,,7200.0,7500.0,7000.0,23.22810,0.00,120.75170,1,,,,,,208.0,True,,,277.0,False,,,,"Butangbunasi, Taiwan",5,,,,,,,2009_08_10 110556,"Collins, E.A., Allstadt, K.E., Groult, C., Hib...",23.18310,120.77820,23.22810,120.75170,,landslide,Rock/ice/debris avalanches and slides,83800000.0,,
315,,,,,319,"NCEDC, IRIS",,2022_08_03 032337,319,,,,,,,,,,41.39070,1.00,-122.18550,0,,,,0.0,,14.0,False,,,0.0,,,,False,"Mud Creek, Mount Shasta",3,,,,,,,2022_08_03 012901,"Collins, E.A., Allstadt, K.E., Groult, C., Hib...",,,,,,debris flow,Lahar/debris flow/outburst flood,,,
316,130000.0,160000.0,84000.0,3000000.0,188,"IRIS, NCEDC",,2010_07_09 073820,188,1800.0,1900.0,1600.0,,,,7600.0,7900.0,7300.0,51.87130,0.00,-125.94340,1,,,,,,500.0,False,,,500.0,False,,,,Sheemahant Glacier,4,,,,,,,2010_07_09 073435,"Collins, E.A., Allstadt, K.E., Groult, C., Hib...",51.84780,-126.02910,51.87130,-125.94340,,rock slide,Rock/ice/debris avalanches and slides,3800000.0,5400000.0,2700000.0
317,,,,,159,IRIS,,2017_03_06 234541,159,,,,,,,,,,60.03300,1.00,-153.09870,1,,,,,,319.0,True,,,136.0,True,,,True,Iliamna,2,,,,,,,2017_03_06 232627,"Collins, E.A., Allstadt, K.E., Groult, C., Hib...",,,,,,ice avalanche,Rock/ice/debris avalanches and slides,,,


## Testing the retrained models on new events

In [5]:
# Step 0 - anti-join: keep the rows of cat_new that have no exact match
# (on all shared columns) in cat_old. cat_old is de-duplicated first, just
# in case it contains repeated rows; `_merge` exists only for the filter.
_merged = cat_new.merge(cat_old.drop_duplicates(), how="left", indicator=True)
cat_test = _merged.loc[_merged['_merge'] == "left_only"].drop(columns='_merge')

# Step 1 - event-id sets for both catalogs
ids_test = set(cat_test['eventid'])
ids_old  = set(cat_old['eventid'])

# Step 2 - ids that survived the row-level anti-join but still exist in
# cat_old (their rows differ in at least one column between the catalogs)
overlap = ids_test.intersection(ids_old)
print(f"🔍  Overlap count: {len(overlap)}")
if len(overlap) > 0:
    print("Overlapping eventid(s):", sorted(overlap))

# Step 3 - option A: drop the overlapping events from cat_test.
# (Alternative: regenerate the split here instead, e.g. re-sample from
# cat_new after excluding everything in cat_old.)
_is_overlap = cat_test['eventid'].isin(overlap)
cat_test_clean = cat_test.loc[~_is_overlap].copy()

# Step 4 - final safety check: no event id may be shared with cat_old
assert not cat_test_clean['eventid'].isin(cat_old['eventid']).any(), \
       "Overlap still present!"

print("✅  cat_test is now disjoint from cat_old.")

# Working catalog used by the cells below
cat = cat_test_clean.copy()

🔍  Overlap count: 6
Overlapping eventid(s): [4, 23, 59, 81, 166, 192]
✅  cat_test is now disjoint from cat_old.


In [6]:
# Pull the per-event id and label columns out of the working catalog as
# arrays, in row order.
event_ids, source_types, source_subtypes = (
    cat[column].values for column in ('eventid', 'type', 'subtype')
)

In [7]:
# Parse every start time once, then fan the (hour, weekday, month) tuples
# out into three parallel lists aligned with the rows of `cat`.
_datetime_info = [extract_datetime_info(stamp) for stamp in cat['starttime'].values]
hod = [info[0] for info in _datetime_info]  # hour of day
dow = [info[1] for info in _datetime_info]  # day of week
moy = [info[2] for info in _datetime_info]  # month of year

In [11]:
# Inspect the start-time strings of the held-out events ('YYYY_MM_DD HHMMSS').
cat['starttime']

1      2017_06_23 213916
4      1997_12_26 070240
6      2013_01_21 010828
18     2020_07_24 214125
28     2021_07_27 172215
             ...        
296    2022_09_01 204553
306    2023_09_16 123545
312    2022_09_01 205421
314    2009_08_10 110556
315    2022_08_03 012901
Name: starttime, Length: 74, dtype: object

In [14]:
# Discard the filtered index so rows are labelled 0..n-1 again.
cat = cat.reset_index(drop = True)
## computing the durations
sttime = cat['starttime']
ettime = cat['endtime']

starttimes = conv_to_datetime(sttime)
endtimes = conv_to_datetime(ettime)
# Reuse the arrays converted above instead of parsing both columns a second
# time. Element-wise UTCDateTime difference (seconds, per obspy's
# UTCDateTime subtraction -- TODO confirm units wherever `dur` is consumed).
dur = endtimes - starttimes

In [16]:
# Per-trace accumulators: one entry is appended to each list (except st_z,
# which this cell never fills) for every waveform window that passes the
# SNR test, so the lists stay index-aligned.
st_z = []
st_data = []
evids = []
hod_trace = []
dow_trace = []
moy_trace = []
event_types = []
event_subtypes = []
event_duration = []
trids = []

# Processing parameters (previously inline magic numbers).
WAVEFORM_ROOT = "../../data/iris_esec_waveforms/waveforms/"
TARGET_SAMPLING_RATE = 100   # passed to Trace.resample
WINDOW_START = 8500          # first sample of the analysis window
WINDOW_END = 12500           # one past the last sample of the window
SNR_THRESHOLD = 5            # minimum peak / mean absolute amplitude

# (path, reason) for every file that could not be used, so failures are
# visible instead of being silently swallowed.
skipped_files = []

for i in tqdm(range(len(event_ids))):
    # Sort so the trace order does not depend on filesystem listing order.
    files = sorted(glob(WAVEFORM_ROOT+str(event_ids[i])+'/*HZ*'))
    for file in files:
        # Guard each file separately: one unreadable file must not discard
        # the remaining stations of the same event.
        try:
            tr = obspy.read(file)
            tr = tr.detrend(type = 'linear')
            d = tr[0].resample(TARGET_SAMPLING_RATE).data
        except Exception as exc:
            skipped_files.append((file, repr(exc)))
            continue

        ## We are going to test the ml_40 model (so we will use P-10, P+30)
        # NOTE(review): the window indices assume a fixed pick position in
        # the resampled trace -- confirm against the waveform download script.
        data = d[WINDOW_START:WINDOW_END]
        if len(data) != WINDOW_END - WINDOW_START:
            # Trace too short: a truncated window cannot be stacked with the rest.
            skipped_files.append((file, f"window has {len(data)} samples"))
            continue

        snr = np.nanmax(abs(data))/np.nanmean(abs(data))

        if snr > SNR_THRESHOLD:
            st_data.append(data)
            evids.append(event_ids[i])
            trids.append(tr[0].id)
            hod_trace.append(hod[i])
            dow_trace.append(dow[i])
            moy_trace.append(moy[i])
            event_types.append(source_types[i])
            event_subtypes.append(source_subtypes[i])
            event_duration.append(dur[i])

if skipped_files:
    print(f"Skipped {len(skipped_files)} file(s):")
    for skipped_path, skipped_reason in skipped_files:
        print(f"  {skipped_path}: {skipped_reason}")
    

100%|██████████| 74/74 [00:07<00:00, 10.23it/s]
