# Detection and Phase Picking in Cascadia using ELEP

This notebook runs a detection and phase-picking job on Cascadia Initiative stations using ELEP, an ensemble deep-learning phase picker (Yuan et al., 2023). The work is split into one task per station and day, and the tasks are scheduled with Dask.
Written by Hiroto Bito.
Modified by CoolTeam (2/14/24)

In [2]:
import logging
import os

from obspy.clients.fdsn import Client
import numpy as np
import obspy
import matplotlib.pyplot as plt
from obspy.clients.fdsn import Client
import datetime
from datetime import timedelta
import pandas as pd
import dask
from dask.diagnostics import ProgressBar

from obspy.clients.fdsn.client import Client
from obspy.core.utcdatetime import UTCDateTime
from obspy import Stream

from pnwstore.mseed import WaveformClient
import torch
import numpy as np
from tqdm import tqdm
import time 
import pandas as pd
import gc
import seisbench.models as sbm
from ELEP.elep.ensemble_statistics import ensemble_statistics
from ELEP.elep.ensemble_coherence import ensemble_semblance 
from ELEP.elep.trigger_func import picks_summary_simple

In [3]:
device = torch.device("cpu")

In [4]:
# Define clients
client_inventory = Client('IRIS')
client_waveform = WaveformClient()
client_ncedc = Client('NCEDC')

## Make an array of networks and stations for CI year 2 in Morton's Catalog (not necessary to run anymore)

In [None]:
# Read Morton's catalog
cat_ds03 = pd.read_excel("../data/ds03.xlsx")

In [None]:
# Show the catalog
cat_ds03

In [None]:
# Get the names of the networks
cat_ds03_CI2 = cat_ds03[cat_ds03['CI YEAR']==2]
cat_ds03_CI2

In [None]:
# Assign the network codes
networks= cat_ds03_CI2['NETWORK CODE'].unique()

In [None]:
# Make a list of [network, station] pairs for CI year 2
networks_stas = []
for network in networks:
    # Station codes listed for this network in the catalog
    sta_codes = cat_ds03_CI2.loc[cat_ds03_CI2['NETWORK CODE'] == network, 'STATION']

    # Strip embedded blanks and drop duplicates. The codes are sorted so that the
    # row order is the same on every run: iterating a plain set of strings is not
    # reproducible between kernel sessions, and a later cell removes a row by position.
    unique_stas = sorted({code.replace(" ", "") for code in sta_codes})

    networks_stas.extend([network, sta] for sta in unique_stas)

networks_stas = np.array(networks_stas)
networks_stas

In [None]:
print(len(networks_stas))

In [None]:
# Request the station metadata for every [network, station] pair, one pair at a time.
# Each index is appended to `index` BEFORE its request is made, so if a request raises,
# the loop stops and index[-1] identifies the pair that failed.
index =[]
client= client_inventory
for i, ii in enumerate(networks_stas):
    print(networks_stas[i])
    index.append(i)
    # Only the success or failure of this call matters here; `inventory` is overwritten on each pass.
    inventory = client.get_stations(network=networks_stas[i][0], station=networks_stas[i][1])
    

In [None]:
index

In [None]:
# The ['UW' 'I02D'] pair raises an error when its metadata is requested from the inventory client, so remove it (index[-1] is the pair at which the loop above stopped).
networks_stas = np.delete(networks_stas, index[-1],axis=0)
networks_stas

In [None]:
# Check that every remaining pair can be fetched from the inventory client now that ['UW' 'I02D'] has been removed. It works.
index =[]
client= client_inventory
for i, ii in enumerate(networks_stas):
    print(networks_stas[i])
    index.append(i)
    inventory = client.get_stations(network=networks_stas[i][0], station=networks_stas[i][1])

In [None]:
len(networks_stas)

## Make an array of networks and stations with more stations 

In [5]:
# Get as many stations as possible: every station of the listed networks inside the
# latitude/longitude box that was operating between the two dates below.
index = []
client = client_inventory
starttime = datetime.datetime(2010, 10, 1)
endtime = datetime.datetime(2015, 10, 31)
inventory = client.get_stations(
    network="C8,7D,7A,CN,NV,UW,UO,NC,BK,TA,OO,PB,X6,Z5,X9",
    station="*",
    minlatitude=40,
    minlongitude=-127,
    maxlatitude=50,
    maxlongitude=-125,
    starttime=starttime.strftime('%Y%m%d'),
    endtime=endtime.strftime('%Y%m%d'),
)

In [6]:
# Print the inventory
inventory

Inventory created at 2024-04-16T19:42:18.836400Z
	Created by: IRIS WEB SERVICE: fdsnws-station | version: 1.1.52
		    http://service.iris.edu/fdsnws/station/1/query?starttime=2010-10-01...
	Sending institution: IRIS-DMC (IRIS-DMC)
	Contains:
		Networks (10):
			7A, 7D, C8, CN, NV, OO, PB, X6, X9, Z5
		Stations (192):
			7A.W01 (WHOI OBS ID S85)
			7A.W02 (WHOI OBS ID S83)
			7A.W03 (WHOI OBS ID S86)
			7A.W04 (WHOI OBS ID S89)
			7A.W06 (WHOI OBS ID S80)
			7A.W07 (WHOI OBS ID S82)
			7A.W08 (WHOI OBS ID S81)
			7A.W09 (WHOI OBS ID S84)
			7A.W10 (WHOI OBS ID S88)
			7D.FN11C (LDEO OBS TRAWL-RESISTANT)
			7D.FN12A (LDEO Trawl-Resistant OBS Site FN12A)
			7D.FN12C (LDEO OBS TRAWL-RESISTANT)
			7D.FN13C (LDEO OBS STANDARD DEEP)
			7D.FN16A (LDEO OBS STANDARD DEEP)
			7D.FN16C (LDEO OBS STANDARD DEEP)
			7D.FN17C (LDEO OBS TRAWL-RESISTANT)
			7D.FS20B (LDEO OBS STANDARD DEEP)
			7D.G02B (SIO OBS ID G02B)
			7D.G02D (LDEO OBS STANDARD DEEP)
			7D.G03A (WHOI OBS ID S87)
			7D.G03B (WHOI OB

In [7]:
# Flatten the inventory into an array with one [network code, station code] row per station
networks_stas = np.array(
    [[net.code, sta.code] for net in inventory for sta in net.stations]
)
networks_stas
    

array([['7A', 'W01'],
       ['7A', 'W02'],
       ['7A', 'W03'],
       ['7A', 'W04'],
       ['7A', 'W06'],
       ['7A', 'W07'],
       ['7A', 'W08'],
       ['7A', 'W09'],
       ['7A', 'W10'],
       ['7D', 'FN11C'],
       ['7D', 'FN12A'],
       ['7D', 'FN12C'],
       ['7D', 'FN13C'],
       ['7D', 'FN16A'],
       ['7D', 'FN16C'],
       ['7D', 'FN17C'],
       ['7D', 'FS20B'],
       ['7D', 'G02B'],
       ['7D', 'G02D'],
       ['7D', 'G03A'],
       ['7D', 'G03B'],
       ['7D', 'G03D'],
       ['7D', 'G04B'],
       ['7D', 'G04D'],
       ['7D', 'G10B'],
       ['7D', 'G10D'],
       ['7D', 'G11B'],
       ['7D', 'G11D'],
       ['7D', 'G18D'],
       ['7D', 'G19B'],
       ['7D', 'G19D'],
       ['7D', 'G20B'],
       ['7D', 'G20D'],
       ['7D', 'G26B'],
       ['7D', 'G26D'],
       ['7D', 'G27B'],
       ['7D', 'G27D'],
       ['7D', 'G28B'],
       ['7D', 'G34B'],
       ['7D', 'G34D'],
       ['7D', 'G35B'],
       ['7D', 'G35D'],
       ['7D', 'G36B2'],
       ['7D

In [8]:
len(networks_stas)

192

## Actual Picking

In [9]:
# List of days to loop over: one timestamp per day, from t1 through t2 inclusive.
t1 = datetime.datetime(2012, 10, 1)
t2 = datetime.datetime(2012, 10, 31)
time_bins = pd.to_datetime(
    np.arange(t1, t2 + datetime.timedelta(days=1), pd.Timedelta(1, 'days'))
)

In [None]:
# filename = "../data/catalogs_elep_all_stations_20121001_to_20121031"
# os.makedirs(filename,exist_ok=True)

In [11]:
# Parameters shared by the stacking helper and the detection function below.
# They are used as array indices and slice lengths, so they are expressed in samples.
# If loading the pretrained models turns out to be slow, consider loading them once outside
# the detection function and passing them in, rather than loading them for every task.
twin = 6000     # length of each prediction window (samples)
step = 3000     # offset between the starts of consecutive windows (samples)
l_blnd, r_blnd = 500, 500   # samples blanked (set to NaN) at the left/right edge of each window before stacking

# Output location. One CSV per station and day is written as
#     filepath + station + '_' + YYYYMMDD + '.csv'
# so `filepath` must end with a path separator.
# NOTE(review): the cell that creates this directory is commented out above — confirm the
# directory exists before running the detection.
filepath = "../data/catalogs_elep_all_stations_20121001_to_20121031/"

# Example: build and print the file name produced for one station on the first day (t1).
station = "M14B"
tstring = t1.strftime('%Y%m%d')
file_name = filepath+station+'_'+tstring+'.csv'
print(file_name)

../data/catalogs_elep_all_stations_20121001_to_20121031/M14B_20121001.csv


In [12]:
# Download the pretrained EQTransformer models from SeisBench, one per training dataset.
# NOTE(review): neither `pretrain_list` nor the pn_*_model objects are referenced again in the
# visible part of this notebook. run_detection defines its own pretrain_list (which starts with
# 'original' and has no 'pnw'/'geofon' entry) and reloads every model by name on each call —
# confirm whether these preloaded models were meant to be passed to it instead.
pretrain_list = ["pnw","ethz","instance","scedc","stead","geofon"]
pn_pnw_model = sbm.EQTransformer.from_pretrained('pnw')
pn_ethz_model = sbm.EQTransformer.from_pretrained("ethz")
pn_instance_model = sbm.EQTransformer.from_pretrained("instance")
pn_scedc_model = sbm.EQTransformer.from_pretrained("scedc")
pn_stead_model = sbm.EQTransformer.from_pretrained("stead")
pn_geofon_model = sbm.EQTransformer.from_pretrained("geofon")

In [13]:
# Stack the overlapping per-window predictions back into one continuous trace
def stacking(data, npts, l_blnd, r_blnd, nseg, win_len=None, hop=None):
    """Merge overlapping window traces into a single trace of length ``npts``.

    Window ``i`` is placed at sample offset ``i * hop``. Where windows overlap, the
    element-wise maximum is kept and NaN is ignored whenever the other value is finite.
    Samples that no (unblanked) window covers stay NaN.

    Parameters
    ----------
    data : array-like, shape (nseg, win_len)
        One row per window. The input is not modified.
    npts : int
        Length of the output trace, in samples.
    l_blnd, r_blnd : int
        Number of samples blanked (set to NaN) at the left / right edge of every
        window before stacking. Zero disables blanking on that side.
    nseg : int
        Number of windows (rows of ``data``) to stack.
    win_len : int, optional
        Window length in samples. Defaults to ``data.shape[1]``.
    hop : int, optional
        Offset in samples between consecutive windows. Defaults to the module-level
        ``step`` so existing five-argument calls keep working.

    Returns
    -------
    numpy.ndarray of float32, shape (npts,)
    """
    _data = np.array(data, dtype=np.float32)  # private float copy that can hold NaN
    if win_len is None:
        win_len = _data.shape[1]
    if hop is None:
        hop = step  # notebook-level default

    # Guard the zero case explicitly: the slice `-0:` would select (and blank) every sample.
    if l_blnd > 0:
        _data[:, :l_blnd] = np.nan
    if r_blnd > 0:
        _data[:, -r_blnd:] = np.nan

    stack = np.full(npts, np.nan, dtype=np.float32)
    stack[:win_len] = _data[0]
    for iseg in range(1, nseg):
        idx = hop * iseg
        target = stack[idx:idx + win_len]
        # fmax is a NaN-ignoring element-wise maximum (NaN only where both inputs are NaN),
        # and unlike nanmax on a list it emits no "All-NaN slice" warning.
        np.fmax(target, _data[iseg], out=target)
    return stack

In [14]:
# Debugging the waveform retrieval: fetch one day of 7D.J25B data from the local WaveformClient (the NCEDC client is exercised in the cells below)
sdata = client_waveform.get_waveforms(network='7D', station="J25B", channel="?H?", 
                                          year=t1.strftime('%Y'), month=t1.strftime('%m'), 
                                          day=t1.strftime('%d'))
sdata

3 Trace(s) in Stream:
7D.J25B..HH1 | 2012-10-01T00:00:00.006900Z - 2012-10-01T23:59:59.990900Z | 125.0 Hz, 10799999 samples
7D.J25B..HH2 | 2012-10-01T00:00:00.006900Z - 2012-10-01T23:59:59.990900Z | 125.0 Hz, 10799999 samples
7D.J25B..HHZ | 2012-10-01T00:00:00.006900Z - 2012-10-01T23:59:59.990900Z | 125.0 Hz, 10799999 samples

## Test whether we can retrieve streams from stations KBO, KRMB, and KEB for any time before mid-October 2012
These stations do not seem to have any waveform data before mid-October 2012.

In [None]:
client_ncedc = Client('NCEDC')

sdata = client_ncedc.get_waveforms(network='NC', station='KBO', location="*", channel='?H?',
                                   starttime=UTCDateTime(t1), endtime=UTCDateTime(t1 + timedelta(days=1)))
sdata

In [None]:
client_ncedc = Client('NCEDC')

sdata = client_ncedc.get_waveforms(network='NC', station='KRMB', location="*", channel='?H?',
                                   starttime=UTCDateTime(t1), endtime=UTCDateTime(t1 + timedelta(days=1)))
sdata

In [None]:
client_ncedc = Client('NCEDC')

sdata = client_ncedc.get_waveforms(network='NC', station='KEB', location="*", channel='?H?',
                                   starttime=UTCDateTime(t1), endtime=UTCDateTime(t1 + timedelta(days=1)))
sdata

## Test if we can retrieve streams from station KOM


In [None]:
# client_waveform = WaveformClient()
sdata = client_ncedc.get_waveforms(network='NC', station='KOM', location="*", channel='?H?',
                                   starttime=UTCDateTime(t1), endtime=UTCDateTime(t1 + timedelta(days=1)))
sdata

## Test if we can retrieve streams from station JCC


In [None]:
# Fetch one day of BK.JCC data (all ?H? channels) from NCEDC
_sdata = client_ncedc.get_waveforms(
    network='BK', station='JCC', location="*", channel='?H?',
    starttime=UTCDateTime(t1), endtime=UTCDateTime(t1 + timedelta(days=1)),
)

# Check which channel families are present
hh_traces = _sdata.select(channel="HH?")
bh_traces = _sdata.select(channel="BH?")
has_HH = bool(hh_traces)
has_BH = bool(bh_traces)

# Keep a single channel family: HH? whenever it is present, otherwise BH?
sdata = Stream()
if has_HH:
    sdata += hh_traces
elif has_BH:
    sdata += bh_traces

In [None]:
sdata

In [None]:
# Inspect how the JCC stream behaves, since this station raised an error in the parallel run.
# JCC has two channel types whose traces have different lengths, which breaks the conversion
# of the stream to a 2-D array.
sdata = sdata.select(channel="[HB]H?")

if len(sdata) == 0:
    # Nothing to process: stop here instead of indexing into an empty stream below.
    logging.warning("No stream returned. Skipping.")
else:
    sdata.filter(type='bandpass', freqmin=4, freqmax=15)
    sdata.merge(fill_value='interpolate')  # fill gaps if there are any

    # Make all the traces in the stream cover the same time span
    max_starttime = max(tr.stats.starttime for tr in sdata)
    min_endtime = min(tr.stats.endtime for tr in sdata)
    for tr in sdata:
        tr.trim(starttime=max_starttime, endtime=min_endtime, nearest_sample=True)

    # Read the trace metadata AFTER trimming so that `starttime` refers to the
    # first sample of the trimmed data.
    delta = sdata[0].stats.delta
    starttime = sdata[0].stats.starttime
    fs = sdata[0].stats.sampling_rate
    dt = 1/fs


## Define the detection function

In [15]:
# run_detection performs the entire workflow for ONE station and ONE day and writes one CSV
# catalog for that station/day. It is the unit of work that the Dask task list runs.

# Column order of the pick catalog written by run_detection.
_PICK_COLUMNS = [
    'event_id', 'source_type', 'station_network_code', 'station_channel_code',
    'station_code', 'station_location_code', 'station_latitude_deg',
    'station_longitude_deg', 'station_elevation_m', 'trace_name',
    'trace_sampling_rate_hz', 'trace_start_time', 'trace_S_arrival_sample',
    'trace_P_arrival_sample', 'trace_S_onset', 'trace_P_onset', 'trace_snr_db',
    'trace_s_arrival', 'trace_p_arrival',
]


def _select_preferred_channels(stream):
    """Return a new Stream with the HH? traces of `stream`, or its BH? traces if there are no HH?."""
    selected = Stream()
    for pattern in ("HH?", "BH?"):
        candidates = stream.select(channel=pattern)
        if candidates:
            selected += candidates
            break
    return selected


def _normalized_windows(arr_sdata, nseg, twin, step):
    """Cut `arr_sdata` into `nseg` overlapping windows; return (std-normalised, max-normalised) arrays."""
    windows_std = np.zeros(shape=(nseg, 3, twin), dtype=np.float32)
    windows_max = np.zeros(shape=(nseg, 3, twin), dtype=np.float32)

    for iseg in range(nseg):
        idx = iseg * step
        window = arr_sdata[:, idx:idx + twin].astype(np.float32)
        window -= np.mean(window, axis=-1, keepdims=True)
        # 'original' weights use std normalisation; the epsilon belongs in the denominator
        # so that a flat window cannot divide by zero.
        windows_std[iseg] = window / (np.std(window) + 1e-10)
        # the other weights use per-channel max normalisation; a dead (all-zero) channel
        # stays zero instead of becoming NaN.
        peak = np.max(np.abs(window), axis=-1, keepdims=True)
        peak[peak == 0] = 1
        windows_max[iseg] = window / peak

    # cosine taper over the first and last 6 samples of every window
    tap = 0.5 * (1 + np.cos(np.linspace(np.pi, 2 * np.pi, 6)))
    windows_std[:, :, :6] *= tap; windows_std[:, :, -6:] *= tap[::-1]
    windows_max[:, :, :6] *= tap; windows_max[:, :, -6:] *= tap[::-1]
    return windows_std, windows_max


def run_detection(network,station,t1,t2,filepath,twin,step,l_blnd,r_blnd):
    """Detect and pick P and S phases for one station and one day, and write them to a CSV file.

    Steps: fetch one day of '?H?' waveforms (NCEDC for networks NC/BK, the local
    WaveformClient otherwise), keep HH? (or BH? if no HH?), band-pass 4-15 Hz, merge
    gaps, trim all traces to a common time span, cut into overlapping windows, predict
    with several pretrained EQTransformer models, combine the predictions with
    ensemble_semblance, stack the windows, pick, and write
    ``filepath + station + '_' + YYYYMMDD + '.csv'``.

    Parameters
    ----------
    network, station : str
        Network and station codes.
    t1 : obspy.UTCDateTime or datetime-like
        Start of the day to process (must support ``strftime`` and ``+ timedelta``).
    t2 :
        Unused; kept so existing callers do not break.
    filepath : str
        Prefix of the output file name (normally a directory ending with '/').
    twin, step : int
        Window length and window offset, in samples.
    l_blnd, r_blnd : int
        Samples blanked at each window edge when stacking.

    Returns
    -------
    None. The function returns early, without writing a file, when the output file
    already exists, when no data is available, or when the data cannot be windowed.

    Notes
    -----
    Relies on notebook-level names: client_inventory, client_waveform, client_ncedc,
    device and stacking. stacking() is called without the window step, so the `step`
    passed here should match the notebook-level `step` it reads.
    NOTE(review): the traces are not resampled before prediction — confirm that the
    pretrained models are meant to be applied at the native sampling rate.
    """
    tstring = t1.strftime('%Y%m%d')
    file_name = filepath+station+'_'+tstring+'.csv'

    # Skip station-days that already have a catalog
    if os.path.exists(file_name):
        return

    channels = '?H?'

    # Get waveforms
    try:
        if network in ['NC', 'BK']:
            _sdata = client_ncedc.get_waveforms(network=network, station=station, location="*", channel=channels,
                                                starttime=UTCDateTime(t1), endtime=UTCDateTime(t1 + timedelta(days=1)))
        else:
            _sdata = client_waveform.get_waveforms(network=network, station=station, channel=channels,
                                                   year=t1.strftime('%Y'), month=t1.strftime('%m'),
                                                   day=t1.strftime('%d'))
    except obspy.clients.fdsn.header.FDSNNoDataException:
        print(f"WARNING: No data for {network}.{station}.{channels} on {t1}.")
        return

    # Keep a single channel family so that all traces have comparable lengths
    sdata = _select_preferred_channels(_sdata)
    if len(sdata) == 0:
        logging.warning("No stream returned. Skipping.")
        return

    sdata.filter(type='bandpass',freqmin=4,freqmax=15)
    sdata.merge(fill_value='interpolate') # fill gaps if there are any.

    # Make all the traces in the stream cover the same time span
    max_starttime = max(tr.stats.starttime for tr in sdata)
    min_endtime = min(tr.stats.endtime for tr in sdata)
    for tr in sdata:
        tr.trim(starttime=max_starttime,endtime=min_endtime, nearest_sample=True)

    # The windows are (3, twin) arrays: three traces fill them directly and a single
    # trace is broadcast to all three rows; any other count cannot be assigned.
    ntraces = len(sdata)
    if ntraces not in (1, 3):
        logging.warning("%s.%s on %s: %d traces after merging, expected 1 or 3. Skipping.",
                        network, station, tstring, ntraces)
        return

    # Trimming to the nearest sample can leave the traces one sample apart in length
    npts = min(tr.stats.npts for tr in sdata)
    if npts < twin:
        logging.warning("%s.%s on %s: %d samples is shorter than one window (%d). Skipping.",
                        network, station, tstring, npts, twin)
        return

    # Read the timing AFTER trimming: pick sample indices refer to the trimmed data
    first_stats = sdata[0].stats
    delta = first_stats.delta
    starttime = first_stats.starttime
    dt = 1/first_stats.sampling_rate

    # Station metadata for the catalog (requested only once we know there is data)
    inventory = client_inventory.get_stations(network=network, station=station)

    # Reshaping data
    arr_sdata = np.stack([tr.data[:npts] for tr in sdata])
    # floor (not ceil) so that the last window never runs past the end of the stream
    nseg = int(np.floor((npts - twin) / step)) + 1
    windows_std, windows_max = _normalized_windows(arr_sdata, nseg, twin, step)
    del arr_sdata

    # Define the parameters for semblance
    paras_semblance = {'dt':dt, 'semblance_order':2, 'window_flag':True,
                       'semblance_win':0.5, 'weight_flag':'max'}
    p_thrd, s_thrd = 0.05, 0.05

    # Predict on base models
    pretrain_list = ['original', 'ethz', 'instance', 'scedc', 'stead']
    windows_std_tt = torch.Tensor(windows_std)
    windows_max_tt = torch.Tensor(windows_max)
    del windows_std, windows_max

    # dim 0: 0 = P, 1 = S
    batch_pred = np.zeros([2, len(pretrain_list), nseg, twin], dtype = np.float32)
    for ipre, pretrain in enumerate(pretrain_list):
        eqt = sbm.EQTransformer.from_pretrained(pretrain)
        eqt.to(device)
        eqt._annotate_args['overlap'] = ('Overlap between prediction windows in samples \
                                        (only for window prediction models)', step)
        eqt._annotate_args['blinding'] = ('Number of prediction samples to discard on \
                                         each side of each window prediction', (l_blnd, r_blnd))
        eqt.eval()
        model_input = windows_std_tt if pretrain == 'original' else windows_max_tt
        # Inference only: no_grad avoids building the autograd graph for the whole day of windows
        with torch.no_grad():
            _torch_pred = eqt(model_input.to(device))
        batch_pred[0, ipre, :] = _torch_pred[1].detach().cpu().numpy()
        batch_pred[1, ipre, :] = _torch_pred[2].detach().cpu().numpy()
        del eqt, _torch_pred, model_input

    # clean up memory
    del windows_max_tt, windows_std_tt
    gc.collect()
    torch.cuda.empty_cache()

    print(f"All prediction shape: {batch_pred.shape}")

    # calculate the semblance (this may take a while)
    smb_pred = np.zeros([2, nseg, twin], dtype = np.float32)
    for iseg in range(nseg):
        # 0 for P-wave
        smb_pred[0, iseg, :] = ensemble_semblance(batch_pred[0, :, iseg, :], paras_semblance)
        # 1 for S-wave
        smb_pred[1, iseg, :] = ensemble_semblance(batch_pred[1, :, iseg, :], paras_semblance)

    # ... and stack: 0 for P-wave, 1 for S-wave
    smb_p = stacking(smb_pred[0, :], npts, l_blnd, r_blnd, nseg)
    smb_s = stacking(smb_pred[1, :], npts, l_blnd, r_blnd, nseg)
    # clean-up RAM
    del smb_pred, batch_pred

    p_index = picks_summary_simple(smb_p, p_thrd)
    s_index = picks_summary_simple(smb_s, s_thrd)
    print(f"{len(p_index)} P picks\n{len(s_index)} S picks")

    # Build the catalog: one row per pick, P picks first, then S picks.
    # Every row shares the station/trace metadata; only the arrival columns differ.
    station_meta = inventory[0][0]
    common = {
        'event_id': ' ',
        'source_type': ' ',
        'station_network_code': network,
        'station_channel_code': ' ',
        'station_code': station,
        'station_location_code': sdata[0].stats.location,
        'station_latitude_deg': station_meta.latitude,
        'station_longitude_deg': station_meta.longitude,
        'station_elevation_m': station_meta.elevation,
        'trace_name': ' ',
        'trace_sampling_rate_hz': sdata[0].stats.sampling_rate,
        'trace_start_time': sdata[0].stats.starttime,
        'trace_S_arrival_sample': ' ',
        'trace_P_arrival_sample': ' ',
        'trace_S_onset': ' ',
        'trace_P_onset': ' ',
        'trace_snr_db': ' ',
    }
    rows = []
    for arrival_column, pick_samples in (('trace_p_arrival', p_index), ('trace_s_arrival', s_index)):
        for idx in pick_samples:
            row = dict(common, trace_s_arrival=np.nan, trace_p_arrival=np.nan)
            row[arrival_column] = str(starttime + idx * delta)
            rows.append(row)

    # Passing the columns explicitly keeps the header even when there are no picks
    df = pd.DataFrame(rows, columns=_PICK_COLUMNS)

    # Make sure the output directory exists, then write the catalog
    out_dir = os.path.dirname(file_name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_csv(file_name)

In [16]:
# Cross every [network, station] pair with every day: each entry of task_list holds the
# information that is unique to one task, ordered station by station and then day by day.
task_list = [
    [network_code, station_code, day]
    for network_code, station_code in networks_stas
    for day in time_bins
]

In [17]:
task_list

[['7A', 'W01', Timestamp('2012-10-01 00:00:00')],
 ['7A', 'W01', Timestamp('2012-10-02 00:00:00')],
 ['7A', 'W01', Timestamp('2012-10-03 00:00:00')],
 ['7A', 'W01', Timestamp('2012-10-04 00:00:00')],
 ['7A', 'W01', Timestamp('2012-10-05 00:00:00')],
 ['7A', 'W01', Timestamp('2012-10-06 00:00:00')],
 ['7A', 'W01', Timestamp('2012-10-07 00:00:00')],
 ['7A', 'W01', Timestamp('2012-10-08 00:00:00')],
 ['7A', 'W01', Timestamp('2012-10-09 00:00:00')],
 ['7A', 'W01', Timestamp('2012-10-10 00:00:00')],
 ['7A', 'W01', Timestamp('2012-10-11 00:00:00')],
 ['7A', 'W01', Timestamp('2012-10-12 00:00:00')],
 ['7A', 'W01', Timestamp('2012-10-13 00:00:00')],
 ['7A', 'W01', Timestamp('2012-10-14 00:00:00')],
 ['7A', 'W01', Timestamp('2012-10-15 00:00:00')],
 ['7A', 'W01', Timestamp('2012-10-16 00:00:00')],
 ['7A', 'W01', Timestamp('2012-10-17 00:00:00')],
 ['7A', 'W01', Timestamp('2012-10-18 00:00:00')],
 ['7A', 'W01', Timestamp('2012-10-19 00:00:00')],
 ['7A', 'W01', Timestamp('2012-10-20 00:00:00')],


In [19]:
len(task_list)

5952

In [None]:
# Set up the job with Dask.
#
# loop_days is a thin wrapper that unpacks one task and calls run_detection for it.
# Decorating it with @dask.delayed turns each call into a lazy task that Dask schedules
# when dask.compute is invoked.

@dask.delayed
def loop_days(task,filepath,twin,step,l_blnd,r_blnd):
    """Run the detection for one task, where task = [network, station, day].

    Note: a later cell defines loop_days again (with error handling); whichever
    cell ran last is the definition in effect.
    """
    network, station, day = task[0], task[1], task[2]

    # One-day window starting at the task's timestamp
    t1 = obspy.UTCDateTime(day)
    t2 = obspy.UTCDateTime(t1 + pd.Timedelta(1,'days'))

    # Show which task is being processed
    print([network,station,t1])

    # Perform the detection and write the results to file
    run_detection(network,station,t1,t2,filepath,twin,step,l_blnd,r_blnd)
	

# Now we set up the parallel operation
# The below builds a framework for the computer to run in parallel. This doesn't actually execute anything.
lazy_results = [loop_days(task,filepath,twin,step,l_blnd,r_blnd) for task in task_list]
    

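# The line below actually executes the tasks. Note that scheduler='single-threaded' makes Dask run them one at a time in this process; choose another scheduler (e.g. 'threads' or 'processes') to run them in parallel.
# Wrapping the call in ProgressBar shows how long things are taking.
# Each completed task should also write a file, which is another way to check on progress.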
with ProgressBar():
    #################################
    # Add scheduler = 'single-threaded'
	dask.compute(lazy_results, scheduler='single-threaded') 
    

In [None]:
# Set up the job with Dask (fault-tolerant version).
#
# loop_days is a thin wrapper that unpacks one task and calls run_detection for it.
# Decorating it with @dask.delayed turns each call into a lazy task that Dask schedules
# when dask.compute is invoked.

@dask.delayed
def loop_days(task,filepath,twin,step,l_blnd,r_blnd):
    """Run the detection for one task, where task = [network, station, day].

    This definition replaces the loop_days defined in the previous cell. A failure in
    one station/day is logged with its traceback and the task returns None, so a
    single bad station/day does not abort the whole run.
    """
    # Define the parameters that are specific to each task
    t1 = obspy.UTCDateTime(task[2])
    t2 = obspy.UTCDateTime(t1 + pd.Timedelta(1,'days'))
    network = task[0]
    station = task[1]

    # print network and station
    print([network,station,t1])

    # Call the function that performs the detection and writes the results to file.
    # Catch Exception rather than using a bare `except`, so that KeyboardInterrupt still
    # stops the run, and log the error instead of discarding it silently.
    try:
        run_detection(network,station,t1,t2,filepath,twin,step,l_blnd,r_blnd)
    except Exception:
        logging.exception("run_detection failed for %s.%s on %s", network, station, t1)
        return


# Now we set up the parallel operation
# The below builds a framework for the computer to run in parallel. This doesn't actually execute anything.
lazy_results = [loop_days(task,filepath,twin,step,l_blnd,r_blnd) for task in task_list]
    

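# The line below actually executes the tasks. Note that scheduler='single-threaded' makes Dask run them one at a time in this process; choose another scheduler (e.g. 'threads' or 'processes') to run them in parallel.
# Wrapping the call in ProgressBar shows how long things are taking.
# Each completed task should also write a file, which is another way to check on progress.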
with ProgressBar():
    #################################
    # Add scheduler = 'single-threaded'
	dask.compute(lazy_results, scheduler='single-threaded') 
    

## Checking if the catalogs for all the stations were produced

In [None]:
pwd

In [None]:
cd /home/hbito/cascadia_obs_ensemble/data

In [None]:
cd catalogs_elep_all_stations_20121001_to_20121031_temp

In [None]:
ls I02D*

In [None]:
# Station codes whose catalogs we expect to find (one string per station).
# A comma is required after every code: adjacent string literals are silently
# concatenated, which previously fused 'J01E' and 'FS16B' into a single entry.
ls_sta = ['I02D', 'J01E', 'FS16B', 'J33B', 'M14B', 'FS20B', 'G26B',
          'M09B', 'FS17B', 'G27B', 'M12B', 'FS13B', 'J09B', 'G17B', 'J25B',
          'G19B', 'FS09B', 'G10B', 'G25B', 'BABR', 'KBO', 'KOM', 'KRMB',
          'KHMB', 'KHBB', 'KSXB', 'KEB', 'KMPB', 'JCC']

In [None]:
import glob

ls_sta = ['I02D', 'J01E', 'FS16B', 'J33B', 'M14B', 'FS20B', 'G26B',
          'M09B', 'FS17B', 'G27B', 'M12B', 'FS13B', 'J09B', 'G17B', 'J25B',
          'G19B', 'FS09B', 'G10B', 'G25B', 'BABR', 'KBO', 'KOM', 'KRMB',
          'KHMB', 'KHBB', 'KSXB', 'KEB', 'KMPB', 'JCC']

for station in ls_sta:
    file_pattern = f"{station}_*.csv"
    matching_files = glob.glob(file_pattern)
    if matching_files:
        print(f"Catalogs for station {station}:")
        for file in matching_files:
            print(file)
    else:
        print(f"No catalogs found for station {station}")

In [None]:
pwd

In [None]:
import glob

# Stations whose catalogs should exist in the current working directory
ls_sta = [
    'I02D', 'J01E', 'FS16B', 'J33B', 'M14B', 'FS20B', 'G26B',
    'M09B', 'FS17B', 'G27B', 'M12B', 'FS13B', 'J09B', 'G17B', 'J25B',
    'G19B', 'FS09B', 'G10B', 'G25B', 'BABR', 'KBO', 'KOM', 'KRMB',
    'KHMB', 'KHBB', 'KSXB', 'KEB', 'KMPB', 'JCC',
]

for station in ls_sta:
    # Catalog files are named <station>_<YYYYMMDD>.csv; order them by the date part
    catalogs = sorted(glob.glob(f"{station}_*.csv"), key=lambda name: name.split('_')[1])
    if not catalogs:
        print(f"No catalogs found for station {station}")
        continue
    print(f"Catalogs for station {station}:")
    for catalog in catalogs:
        print(catalog)