##########################################################################

GOAL:

    Read the per-year events CSVs into a single DataFrame, find the rows that overlap with REDpy catalog detections, delete those rows, then write the DataFrame back out to CSV without the overlaps.
    
    After the REDpy overlap check, add the template locations (latitude and longitude) as columns in the DataFrame.
    
##########################################################################

Import Everything

In [None]:
import pandas as pd
import numpy as np
import yaml
import obspy
from obspy import UTCDateTime
from obspy.clients.fdsn import Client
import matplotlib.pyplot as plt
from time import time
from glob import glob
from obspy.signal.trigger import classic_sta_lta, plot_trigger, trigger_onset
import csv
import re

from obspy.core.utcdatetime import UTCDateTime

Set Parameters

In [None]:
# Load the run configuration from YAML; the settings unpacked below are
# plain module-level names read straight out of that mapping.
config_path = '/home/smocz/expand_redpy/scripts/config.yaml'
with open(config_path) as file:
    config = yaml.load(file, Loader=yaml.FullLoader)

# Directories
homedir = config['homedir']
readdir = config['readdir']

# Which volcano / which years to process
vv = config['vv']
years = config['years']
volc_list_names = config['volc_list_names']

# Half-width (seconds) of the exclusion window around each REDpy catalog
# datetime: detections inside it are excluded, so window length = 2*rpwi.
rpwi = config['rpwi']

# datadir = '/data/wsd01/HOOD_data/UW/'+str(year)+'/' #directory to get data from

# Show which volcano the configured index points at.
print(volc_list_names[vv])

Read the REDpy Catalogs and Volcano Metadata

In [None]:
# --- Volcanoes with a single REDpy catalog -------------------------------
Baker = pd.read_csv(readdir+'Baker_catalog.csv')
Hood = pd.read_csv(readdir+'Hood_catalog.csv')
Newberry = pd.read_csv(readdir+'Newberry_catalog.csv')
Rainier = pd.read_csv(readdir+'Rainier_catalog.csv')

# --- St Helens: main + borehole + local catalogs merged into one ---------
# The cluster numbers of the two extra catalogs are shifted before merging:
#   borehole cluster 0 -> cluster 2000 in St_Helens
#   local    cluster 0 -> cluster 3000 in St_Helens
Helens_Borehole = pd.read_csv(readdir+'MSHborehole_catalog.csv')
Helens_Borehole['Clustered'] = Helens_Borehole['Clustered'] + 2000
Helens_Local = pd.read_csv(readdir+'MSHlocal_catalog.csv')
Helens_Local['Clustered'] = Helens_Local['Clustered'] + 3000

# Use St_Helens to access all three St Helens catalogs
St_Helens = pd.concat([
    pd.read_csv(readdir+'MountStHelens_catalog.csv'),
    Helens_Borehole,
    Helens_Local,
])

# np.unique returns the cluster IDs sorted, so the last entry is the largest
clid = np.unique(St_Helens['Clustered'].values.tolist())
print(clid[-1])

# --- Volcano metadata (one row per station; used to build station labels) -
volc_md = pd.read_csv(readdir+'Volcano_Metadata.csv')
# read metadata file to create dataframe of labels

Use Volcano Metadata to Create Lists of Stations for Each Volcano

In [None]:
# Label every metadata row as "<Network>.<Station>"
network = volc_md['Network'].astype(str)
station = volc_md['Station'].astype(str)
volc_md['netsta'] = network + '.' + station

# Collect the netsta labels per volcano; a volcano with no rows in the
# metadata simply gets an empty list.
netsta_by_volcano = {
    name: volc_md.loc[volc_md['Volcano_Name'] == name, 'netsta'].values.tolist()
    for name in ('Baker', 'Hood', 'St_Helens', 'Newberry', 'Rainier')
}

Baker_sta = netsta_by_volcano['Baker']
Hood_sta = netsta_by_volcano['Hood']
St_Helens_sta = netsta_by_volcano['St_Helens']
Newberry_sta = netsta_by_volcano['Newberry']
Rainier_sta = netsta_by_volcano['Rainier']

Create Lists of Volcano Information

In [None]:
# One row per volcano: (name, REDpy catalog dataframe, station list).
# The row position is the volcano's index (0..4) in the three parallel lists
# built below, so all three lists always stay in the same order.
volcano_table = [
    ('Baker', Baker, Baker_sta),
    ('Hood', Hood, Hood_sta),
    ('Newberry', Newberry, Newberry_sta),
    ('Rainier', Rainier, Rainier_sta),
    ('St_Helens', St_Helens, St_Helens_sta),
]

volc_list = [row[1] for row in volcano_table] # list of dataframes for each volcano
volc_list_names = [row[0] for row in volcano_table] # list of names of each volcano
volc_sta = [row[2] for row in volcano_table] # lists of stations connected to respective volcanoes

Updated Sorting - December 4, 2022

- removes detections that overlap with REDpy catalog events
- adds a location to each detection (the template location of its cluster)

In [None]:
# Build the updated catalog for the volcano selected by `vv`:
#   1. read every per-year events CSV and stack them into one dataframe,
#   2. drop detections that fall strictly within +/- rpwi seconds of a REDpy
#      catalog event of the same cluster,
#   3. fill in Latitude/Longitude from the template-locations CSV, per cluster,
#   4. save the result to final_catalogs/<volcano>_updated_catalog.csv.
v = volc_sta[vv]
catalog = volc_list[vv]
pd_sta = {} #will become a dictionary of event dataframes
for year in years: #for each year
    events_path = homedir+f'events/{volc_list_names[vv]}_{year}_events.csv'
    try: #read the events csv once; only a missing/empty file means "no detections"
        year_events = pd.read_csv(events_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        print(f'No detections on {volc_list_names[vv]} for {year}') #if no events csv, say so
        continue
    pd_sta[f'{volc_list_names[vv]}_{year}'] = year_events #record the events csv as a dataframe

if not pd_sta:
    #pd.concat would fail with an opaque "No objects to concatenate"; explain why instead
    raise ValueError(f'No events CSVs found for {volc_list_names[vv]} in years {years}')

read = pd.concat(pd_sta, axis=0,ignore_index=True) #combine the different year dataframes into one dataframe for the whole volcano

display(read)


#go by cluster to speed things up:
cl_list = np.unique(read['Cluster_ID'].values.tolist())

for cl in cl_list:
    times = read[read['Cluster_ID']==cl]['Earliest_Detection_Time'].values.tolist() #get every time for this cluster from new detections
    rpdatetimes = catalog[catalog['Clustered'] == cl]['datetime'].values.tolist() #get every time for this cluster from REDPy
    #parse each REDpy time once per cluster (not once per detection) and keep its exclusion window
    rp_windows = [(UTCDateTime(r)-rpwi, UTCDateTime(r)+rpwi) for r in rpdatetimes]
    for i in times: #for each detection time
        overlap = False
        if rp_windows: #nothing to compare against if REDpy has no events for this cluster
            det_time = UTCDateTime(i)
            overlap = any(det_time > rs and det_time < rend for rs, rend in rp_windows)
        if overlap:
            print(f'Overlap with REDpy detections, {i}') #say so
            #drop the row from the csv
            # NOTE(review): this matches on the time only, so rows of OTHER clusters
            # sharing this exact Earliest_Detection_Time are dropped too - confirm
            # whether the match should also require Cluster_ID == cl.
            read = read.drop(read[read['Earliest_Detection_Time']==i].index)
        else: #if NO overlap has occured
            print(f'no overlap for {i} cluster {cl}')
read = read.reset_index(drop=True) #reset index

display(read)
# add cluster locations
loc_csv = pd.read_csv(homedir+f'locations/{volc_list_names[vv]}_Template_Locations.csv') #read locations csv

read['Latitude'] = '' #make new columns in the dataframe
read['Longitude'] = ''
cl_list_updated = np.unique(read['Cluster_ID'].values.tolist()) #get a new cl_list
#(without any clusters that might have been dropped entirely from overlap)
for cl in cl_list_updated:
    print('---')
    cl_rows = read['Cluster_ID']==cl #boolean mask of the rows with this cl
    cl_indx = read[cl_rows].index.tolist() #index numbers for the rows with this cl
    print('cl_indx',cl_indx)
    cl_loc = loc_csv[loc_csv['Cluster_ID']==cl] #location rows for this cl
    if cl_loc.empty:
        #no template location for this cluster: leave the columns blank instead of crashing on lat[0]
        print(f'No template location for cluster {cl}; leaving Latitude/Longitude blank')
        continue
    lat = cl_loc['Latitude'].values.tolist() #find lat for this cl
    lon = cl_loc['Longitude'].values.tolist() #find lon for this cl
    print(lat[0],lon[0])
    #input the first listed lat and lon for every row of this cl in one assignment
    read.loc[cl_rows, 'Latitude'] = lat[0]
    read.loc[cl_rows, 'Longitude'] = lon[0]

read.to_csv(homedir+f'final_catalogs/{volc_list_names[vv]}_updated_catalog.csv',index=False) #save as csv

In [None]:
# Re-read the saved catalog and show it in full: the row and column display
# limits are lifted only inside the `with` block, which is easier to read
# than opening the csv itself.
updated_catalog_path = homedir+f'final_catalogs/{volc_list_names[vv]}_updated_catalog.csv'
df = pd.read_csv(updated_catalog_path)

show_everything = ('display.max_rows', None, 'display.max_columns', None)  # more options can be specified also
with pd.option_context(*show_everything):
    display(df)