# Surface Event Phase Picking

This is a modified version of the surface-event location and directivity analysis originally created on 7/22/22 by Francesca Skene (fskene@uw.edu), who started as an undergraduate student at UW. This is Marine Denolle's version. It includes:
* Waveform download for each event on each volcano given the PNSN pick times of "su" events.
* Data pre-processing to band-pass filter the data (1-15 Hz with the current `low_cut`/`high_cut` parameters) and remove outliers.
* phase picking using transfer-learned model (Ni et al, 2023)
* Centroid time picking using envelope measurements
* Frequency measurements for doppler analysis
* gathering of the data into a CSV data frame.

Updated 03/21/2024
Marine Denolle
(mdenolle@uw.edu)

Import Modules

In [None]:
# import sys
# sys.path.append('/data/wsd01/pnwstore/')
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import obspy
from obspy.core import UTCDateTime
from obspy.clients.fdsn.client import Client

import scipy
from scipy import optimize
from scipy.optimize import curve_fit
from datetime import datetime
from utils import *
from mbf_elep_func import *
import torch
plt.rcParams.update({'font.size': 10})


import seisbench.models as sbm
device = torch.device("cpu")

# from ELEP.elep.ensemble_statistics import ensemble_statistics
from ELEP.elep.ensemble_coherence import ensemble_semblance 
# from ELEP.elep.ensemble_learners import ensemble_regressor_cnn
from ELEP.elep import mbf, mbf_utils
from ELEP.elep import trigger_func

from ELEP.elep.mbf_utils import make_LogFq, make_LinFq, rec_filter_coeff, create_obspy_trace
from ELEP.elep.mbf import MB_filter as MBF

from joblib import Parallel, delayed
from matplotlib.backends.backend_pdf import PdfPages
import pyproj

Parameters

In [None]:
# Analysis parameters shared by the cells below.

# define clients to download the station data
# client = WaveformClient() # we ignore PNWdatastore for now
client2 = Client('IRIS') # FDSN client pointed at IRIS; used for the bulk waveform requests

t_before = 15 # seconds kept before the PNSN pick time in the signal window
# t_after = 15 #number of seconds after pick time
t_before_raw = 1200 # seconds requested on each side of the pick time when downloading waveforms
# t_after_raw = 1200 #number of seconds after pick time before removing instrumental response
fs = 40 # sampling rate (Hz) that all waveforms are resampled to
window = 150 # length (s) of the signal window and of the noise window (this will help with phase picking with EqT next).
# Use 150 seconds @ 40 Hz gives 6001 points. 
pr = 98 # percentile of the absolute amplitude used in the SNR estimate
thr = 7 # SNR threshold (dB, since the SNR is computed as 20*log10 of an amplitude ratio)
station_distance_threshold = 25 # distance threshold in km (not referenced in the visible cells)
pi = np.pi
v_s = 1000 # shear wave velocity at the surface (presumably m/s - TODO confirm units)

# range of dates that we are looking at
# NOTE(review): the visible cells compare against hard-coded datetimes rather than these two values.
t_beginning = UTCDateTime(2001,1,1,0,0,0) 
t_end = UTCDateTime(2024,1,1,23,59)

smooth_length = 20 # smoothing degree passed to obspy.signal.util.smooth for the waveform envelopes
low_cut = 1 # low corner (Hz) of the band-pass filter
high_cut = 15 # high corner (Hz) of the band-pass filter
az_thr = 1000 # threshold of distance in meters from source location
step = 100 # step every 100 m
t_step = 1 # step every second
ratio = 5.6915196 # used to define the grid
# colors = list(plt.cm.tab10(np.arange(10)))*3
radius = 6371e3 # radius of the earth (presumably in meters - TODO confirm)

## Volcano - Station Information

In [None]:
# Station metadata: all stations within 50 km of each volcano, with the lat, lon, elev of each station.
# The measurement loop reads the columns Volcano_Name, Station, Network,
# Latitude, Longitude and Elevation from this table.
df = pd.read_csv('../data/station/Volcano_Metadata_50km.csv')

## PNSN SU Pick information

In [None]:
# Read the pipe-separated PNSN "su" pick table; the cells below use its
# "date", "net", "sta" and "orid" columns.
f1 = pd.read_csv("../data/events/su_picks.txt",sep="|") 
f1.head()  # not the last expression of the cell, so nothing is displayed here
print(f1.keys())  # list the column names

In [None]:
# Clean up the PNSN pick table: the text columns are padded with spaces, so
# every string is stripped before use.
date_format = '%Y/%m/%d %H:%M:%S'
start_time_temp = [datetime.strptime(x.strip(), date_format) for x in f1["date"].tolist()]

# Ignore events prior to the start date. Everything from the first pick that
# is later than the start date onward is kept, i.e. the table is assumed to
# be in chronological order.
first_kept_date = datetime(2001, 1, 1)
later_than_start = np.flatnonzero(np.array(start_time_temp) > first_kept_date)
if later_than_start.size == 0:
    # Fail with an explicit message instead of an opaque IndexError.
    raise ValueError(f"no pick in su_picks.txt is later than {first_kept_date}")
ik = int(later_than_start[0])

# Select only net, sta, evid, start time for events past the start date.
start_time = start_time_temp[ik:]
net = [x.strip() for x in f1["net"].tolist()][ik:]
sta = [x.strip() for x in f1["sta"].tolist()][ik:]
evt_id = f1["orid"].tolist()[ik:]
all_stas = set(sta)

In [None]:
# Display the full pick table as the cell output.
f1

## ML Models

In [None]:
# import os
# os.makedirs("/Users/marinedenolle/.seisbench/models/v3/eqtransformer",exist_ok=True)

In [None]:
# !wget https://github.com/congcy/ELEP/raw/main/docs/tutorials/data/pnw.pt.v1 -O ~/.seisbench/models/v3/eqtransformer/pnw.pt.v1
# !wget https://github.com/congcy/ELEP/raw/main/docs/tutorials/data/pnw.json.v1 -O ~/.seisbench/models/v3/eqtransformer/pnw.json.v1

In [None]:
# Load one pretrained EQTransformer per training data set named in
# list_models_name, keeping the list order identical to the name order.
list_models_name = ["pnw","ethz","instance","scedc","stead","geofon"]
list_models = [sbm.EQTransformer.from_pretrained(name) for name in list_models_name]

# Individual handles, in the same order as list_models_name.
# (The "neic" model is intentionally left out.)
(pn_pnw_model, pn_ethz_model, pn_instance_model,
 pn_scedc_model, pn_stead_model, pn_geofon_model) = list_models

# Move every model to the selected torch device.
for _model in list_models:
    _model.to(device)

In [None]:
# Settings handed to the ensemble-semblance step of ELEP.
paras_semblance = dict(
    dt=0.025,
    semblance_order=4,
    window_flag=True,
    semblance_win=0.5,
    weight_flag='max',
)
# P and S thresholds (not referenced in the visible cells).
p_thrd = 0.01
s_thrd = 0.05

# Multi-band filter bank: log-spaced frequencies inside the analysis band.
fqmin, fqmax = low_cut, high_cut
dt = 0.025
fs = 40
nfqs = 10
nt = 6000
nc = 3
fq_list = make_LogFq(fqmin, fqmax, dt, nfqs)
coeff_HP, coeff_LP = rec_filter_coeff(fq_list, dt)

# Everything the multi-band filter needs, bundled for apply_elep.
MBF_paras = dict(
    f_min=fqmin,
    f_max=fqmax,
    nfqs=nfqs,
    frequencies=fq_list,
    CN_HP=coeff_HP,
    CN_LP=coeff_LP,
    dt=dt,
    fs=fs,
    nt=nt,
    nc=nc,
    npoles=2,
)

# Measurements

* download waveforms
* phase pick onset
* estimate SNR
* measure centroid, max envelope, duration
* measure Fmax for doppler analysis


In [None]:

def _snr_db(signal, noise, percentile):
    """Return the signal-to-noise ratio in dB from amplitude percentiles.

    The ratio of the requested percentile of the absolute amplitudes of
    ``signal`` and ``noise`` is converted with 20*log10. A zero noise level
    yields inf/nan rather than raising.
    """
    signal_level = np.percentile(np.abs(signal), percentile)
    noise_level = np.percentile(np.abs(noise), percentile)
    return 20 * np.log10(signal_level / noise_level)


def _energy_duration(envelope_after_onset, sampling_rate):
    """Return the duration (s) over which envelope**4 accumulates.

    Counts the samples whose cumulative sum of envelope**4 is at most 99.9 %
    of the maximum of that cumulative sum, and converts the count to seconds.
    """
    energy = np.cumsum(envelope_after_onset ** 4)
    return np.count_nonzero(energy <= 0.999 * np.max(energy)) / sampling_rate


# Per-event measurements for one volcano: download the vertical channels,
# keep the traces above the SNR threshold, measure envelope-based centroid,
# maximum and duration, pick the onset with ELEP, re-measure relative to that
# pick, estimate a characteristic frequency, and append everything to a CSV.
associated_volcano = "Mt_Rainier"
picks_csv = "../data/events/MLPicks_MtRainier.csv"

# Station metadata does not depend on the event, so it is looked up once.
volcano_rows = df[df['Volcano_Name'] == associated_volcano]
stations = volcano_rows['Station'].values.tolist()
networks = volcano_rows['Network'].values.tolist()
latitudes = volcano_rows['Latitude'].values.tolist()
longitudes = volcano_rows['Longitude'].values.tolist()
elevations = volcano_rows['Elevation'].values.tolist()

event_frames = []      # one DataFrame per successfully processed event
dff = pd.DataFrame()   # concatenation of event_frames, rewritten to disk after each event
nplot = 0

# The context manager closes the PDF even if the loop is interrupted.
with PdfPages('../plots/Mt_RAinier_plot.pdf') as pdf:
    for n in range(len(evt_id)):
        if start_time[n] < datetime(2022, 1, 1):
            continue
        event_ID = str(evt_id[n])
        # Consecutive picks of the same event are processed only once.
        if n > 0 and event_ID == str(evt_id[n - 1]):
            continue
        otime = UTCDateTime(start_time[n])

        #################### WAVEFORM DOWNLOAD #######################
        # Vertical channels of every station of this volcano, any location code.
        bulk = [
            [net_code, sta_code, '*', '*Z', otime - t_before_raw, otime + t_before_raw]
            for net_code, sta_code in zip(networks, stations)
        ]
        try:
            st = client2.get_waveforms_bulk(bulk)
        except Exception as err:
            print("No data for event", event_ID, "-", err)
            continue

        # Processing stays best-effort per event, but the actual error is
        # reported and KeyboardInterrupt/SystemExit are no longer swallowed.
        try:
            st = resample(st, fs)  # resampling the data to the common rate for each trace
            evt_data = obspy.Stream()
            snr, stas, nets = [], [], []
            lats, lons, els = [], [], []
            centroid_time, duration, max_time = [], [], []
            data_env_dict = {}

            # Keep the traces whose SNR exceeds `thr` after band-passing
            # between low_cut and high_cut.
            for tr in st:
                tr.detrend(type='demean')
                tr.filter('bandpass', freqmin=low_cut, freqmax=high_cut, corners=2, zerophase=True)

                # Signal window starts t_before seconds before the PNSN pick;
                # the noise window has the same length and ends where the
                # signal window starts.
                signal_window = tr.copy()
                noise_window = tr.copy()
                signal_window.trim(otime - t_before, otime - t_before + window)
                noise_window.trim(otime - window - t_before, otime - t_before)
                if len(signal_window.data) <= 10 or len(noise_window.data) <= 10:
                    continue  # skip if no data

                # snr2 uses the full windows (reported only); snr1 uses the
                # first half of each window and drives the selection.
                snr2 = _snr_db(signal_window.data, noise_window.data, pr)
                snr1 = _snr_db(signal_window.data[:signal_window.stats.npts // 2],
                               noise_window.data[:noise_window.stats.npts // 2], pr)

                # Rejected traces are skipped rather than removed from `st`:
                # removing while iterating made the loop skip the next trace.
                # A non-finite SNR (zero amplitude level) is not a usable
                # measurement either.
                if not np.isfinite(snr1) or snr1 < thr:
                    continue

                ################# ENVELOPE, CENTROID, DURATION #######################
                data_envelope = obspy.signal.filter.envelope(signal_window.data)
                data_envelope = obspy.signal.util.smooth(data_envelope, smooth_length)

                station_key = tr.stats.network + '.' + tr.stats.station
                data_env_dict[station_key] = data_envelope / np.max(np.abs(data_envelope))
                print("snr", snr1, snr2)
                print(station_key)

                # Time axis relative to the PNSN pick time.
                t_rel = signal_window.times() - t_before
                after_pick = np.where(t_rel > 0)[0]

                # Remove the mean pre-pick level, then measure the duration
                # from the pick time onward.
                data_envelope -= np.mean(data_envelope[:t_before * fs])
                duration.append(_energy_duration(data_envelope[after_pick], fs))

                # Centroid and maximum are measured between the pick time and
                # the end of the estimated duration.
                in_event = np.where((t_rel > 0) & (t_rel < duration[-1]))[0]
                centroid_time.append(np.sum(data_envelope[in_event] * t_rel[in_event])
                                     / np.sum(data_envelope[in_event]))
                max_time.append(t_rel[in_event[np.argmax(data_envelope[in_event])]])

                stas.append(tr.stats.station)
                nets.append(tr.stats.network)
                ista = stations.index(tr.stats.station)
                lats.append(latitudes[ista])
                lons.append(longitudes[ista])
                els.append(elevations[ista])
                snr.append(snr1)
                evt_data.append(signal_window)

            if len(stas) < 3:
                continue
            centroid_time = np.asarray(centroid_time)
            max_time = np.asarray(max_time)
            duration = np.asarray(duration)

            ################### ELEP #######################
            smb_peak = apply_elep(evt_data, stas,
                                  list_models, MBF_paras, paras_semblance, t_before)
            smb_peak -= t_before

            ############### RECALCULATE CENTROID & DURATION ################
            new_centroid_time = np.zeros(len(stas))
            new_duration = np.zeros(len(stas))
            for ista, sta_code in enumerate(stas):
                trace = evt_data.select(station=sta_code)[0]
                data_envelope = obspy.signal.filter.envelope(trace.data)
                data_envelope = obspy.signal.util.smooth(data_envelope, smooth_length)
                t = trace.times()
                data_envelope = data_envelope[:len(t)]
                data_envelope = data_envelope - np.mean(data_envelope[0:int(t_before * fs)])  # remove the mean of the noise
                data_envelope = data_envelope / np.max(np.abs(data_envelope))  # normalize the envelope
                t = t - t_before - smb_peak[ista]  # shift the time to the pick time
                after_pick = np.where(t > 0)[0]
                new_duration[ista] = _energy_duration(data_envelope[after_pick], fs)

                in_event = np.where((t > 0) & (t < new_duration[ista]))[0]
                new_centroid_time[ista] = (np.sum(data_envelope[in_event] * t[in_event])
                                           / np.sum(data_envelope[in_event]) + smb_peak[ista])

                print(sta_code, "old centroid time", centroid_time[ista], "new centroid time", new_centroid_time[ista])
                print(sta_code, "old duration", duration[ista], "new duration time", new_duration[ista])

            ############## PEAK FREQUENCY MEASUREMENTS ############
            # Given the approximate measurement of duration, window the signal
            # around it, then measure the PSD-weighted mean frequency so that
            # there is less noise in it. Z component only.
            char_freq, sharp_weight = [], []
            for ii, tr in enumerate(evt_data):
                padded = np.zeros(200 * fs)
                windowed = tr.copy()
                # NOTE(review): smb_peak was shifted by -t_before above, while
                # the trace presumably starts at otime - t_before (from the
                # trim); confirm which time origin apply_elep uses, otherwise
                # this window starts t_before seconds too early.
                pick_utc = windowed.stats.starttime + smb_peak[ii]
                windowed.trim(pick_utc - 10, pick_utc + 2 * new_duration[ii] + 10)
                windowed.taper(max_percentage=0.01, max_length=20)

                padded[:len(windowed.data)] = windowed.data
                f, psd = scipy.signal.welch(padded, fs=fs, nperseg=81, noverlap=4)
                # just get the frequencies within the filter band
                in_band = (f > low_cut) & (f < high_cut)
                f = f[in_band]
                psd = psd[in_band]

                char_freq.append(np.sum(psd * f) / np.sum(psd))

                # Weight by the spikiness of the PSD. A dedicated name is used
                # so that the grid parameter `ratio` is not overwritten.
                psd_ratio = np.mean(psd) / np.max(psd)
                sharp_weight.append(int(1 / (psd_ratio ** 2) * 20))

            ############# KEEP DATA #######################
            ddict = {'otime':otime, 'nets':nets, 'stas':stas,  'snr':snr, 'smb_peak': smb_peak, 'max_time':max_time, 'centroid_time': centroid_time,
                     'lats':lats, 'lons':lons, 'elevs':els, 'char_freq':char_freq, 'duration':duration, 'new_duration':new_duration,
                     'new_centroid':new_centroid_time, 'sharp_weight':sharp_weight, 'volcano':associated_volcano, 'event_ID':event_ID}
            event_frames.append(pd.DataFrame.from_dict(ddict))
            dff = pd.concat(event_frames, ignore_index=True)
            dff.to_csv(picks_csv)  # saved after every event so partial runs are kept

            print(dff)
            if nplot < 100:
                nplot += 1
                fig = plt.figure(figsize=(11, 8), dpi=400)
                fig.suptitle(str(otime) + " " + associated_volcano)
                ax = plt.subplot(1, 1, 1)
                for i, sta_code in enumerate(stas):
                    offset = i * 1.5  # vertical position of this station's trace
                    trace = evt_data.select(station=sta_code)[0]
                    data = trace.data
                    t = trace.times()
                    ax.plot(t - t_before, data / np.max(np.abs(data)) + offset, linewidth=0.5)  # plot the data
                    envelope_key = nets[i] + '.' + sta_code
                    if np.any(data_env_dict[envelope_key]):  # plot the envelope
                        ax.plot(t - t_before, data_env_dict[envelope_key] + offset, 'k', linewidth=1)
                    ax.plot(centroid_time[i], offset, 'rp', markersize=5)      # old centroid time
                    ax.plot(new_centroid_time[i], offset, 'kp', markersize=5)  # new centroid time
                    ax.plot(max_time[i], offset, 'r*', markersize=5)           # time of maximum envelope
                    ax.set_yticks([])
                    ax.text(-t_before, offset + 0.5, sta_code)
                    ax.vlines(smb_peak[i], offset - 1., offset + 1., 'r')                    # pick time
                    ax.vlines(smb_peak[i] + duration[i], offset - 1., offset + 1., 'k')      # old duration
                    ax.vlines(smb_peak[i] + new_duration[i], offset - 1., offset + 1., 'b')  # new duration
                    print(sta_code, duration[i], char_freq[i])
                ax.grid(True)
                ax.set_xlim([-t_before, 90])
                ax.set_xlabel('time (seconds) relative to PNSN picks')
                # Save before showing, then close the figure so that the
                # figures do not accumulate in memory.
                pdf.savefig(fig)
                plt.show()
                plt.close(fig)
        except Exception as err:
            print("Skipping event", event_ID, "after a processing error:", repr(err))

if event_frames:
    dff.to_csv(picks_csv)
else:
    print("No event was processed; nothing written to", picks_csv)

In [None]:
# Inspect the sample count of the signal window left over from the last
# trace handled by the measurement loop.
signal_window.stats.npts
