## Notebook for formatting data frames for phase association
Source:
1. https://chatgpt.com/


In [1]:
import pandas as pd
import numpy as np
import datetime
import glob
import os 

### 1.1 Load the station data frame

In [2]:
# Region tag that appears in every input/output file name of this notebook.
region = 'pnsn_sor'

# Read one station table per year; the first CSV column is used as the index.
years = (2011, 2012, 2013, 2014, 2015)
stas_by_year = {
    year: pd.read_csv(f'../data/datasets_{year}/stas_{year}_{region}.csv', index_col=0)
    for year in years
}

# Keep the per-year names that the concatenation cell refers to.
stas_2011, stas_2012, stas_2013, stas_2014, stas_2015 = (
    stas_by_year[year] for year in years
)

# Print every yearly table in chronological order.
for year in years:
    print(stas_by_year[year])

         id   longitude   latitude  elevation
0  TA.J01D. -123.931396  43.161400      131.0
1  TA.K02D. -123.665398  42.695499      989.0
2   NC.KBO. -124.225983  42.212357     1010.0
3  TA.L02D. -123.603104  42.157799      458.0
4  NC.KSXB. -123.876884  41.830379     1136.0
5   UO.DBO. -123.244423  43.118721      957.0
6   NC.KEB. -124.334251  42.872211      818.0
         id   longitude   latitude  elevation
0  NC.KSXB. -123.876884  41.830379     1136.0
1  TA.J01D. -123.931396  43.161400      131.0
2   NC.KBO. -124.225983  42.212357     1010.0
3   UO.DBO. -123.244423  43.118721      957.0
4  TA.L02E. -123.602898  42.158001      454.0
5   NC.KEB. -124.334251  42.872211      818.0
6  TA.K02D. -123.665398  42.695499      989.0
7  TA.L02D. -123.603104  42.157799      458.0
8  TA.J01E. -123.931396  43.161499      128.0
         id   longitude   latitude  elevation
0   UO.DBO. -123.244423  43.118721      957.0
1   NC.KEB. -124.334251  42.872211      818.0
2  TA.K02D. -123.665398  42.695499

In [3]:
# Stack the yearly station tables, keep the first row seen for each station id,
# and renumber the rows from 0.
yearly_tables = [stas_2011, stas_2012, stas_2013, stas_2014, stas_2015]
stas = (
    pd.concat(yearly_tables, ignore_index=True)
    .drop_duplicates(subset='id')
    .reset_index(drop=True)
)
stas

Unnamed: 0,id,longitude,latitude,elevation
0,TA.J01D.,-123.931396,43.1614,131.0
1,TA.K02D.,-123.665398,42.695499,989.0
2,NC.KBO.,-124.225983,42.212357,1010.0
3,TA.L02D.,-123.603104,42.157799,458.0
4,NC.KSXB.,-123.876884,41.830379,1136.0
5,UO.DBO.,-123.244423,43.118721,957.0
6,NC.KEB.,-124.334251,42.872211,818.0
7,TA.L02E.,-123.602898,42.158001,454.0
8,TA.J01E.,-123.931396,43.161499,128.0


In [4]:
# Write the merged station table for this region.
# DataFrame.to_csv does not create missing parent folders, so make sure the
# region's output directory exists before writing (matters for a new `region`).
region_dir = f'../data/datasets_{region}'
os.makedirs(region_dir, exist_ok=True)
stas.to_csv(f'{region_dir}/all_stations_{region}.csv')

### 1.2 Load the picks from all regions, add a station id, and keep the picks from this region's stations

In [5]:
# Load the picks from every region and year; the first CSV column becomes the index.
all_picks_path = '../data/datasets_all_regions/all_picks_all_years_for_assoc.csv'
df = pd.read_csv(all_picks_path, index_col=0)

In [6]:
# Build a "NETWORK.STATION." identifier for every pick, matching the format of
# the `id` column in the station table.
df['station_id'] = df['station_network_code'] + '.' + df['station_code'] + '.'

# Keep only the picks whose station appears in this region's station table.
region_station_ids = list(stas['id'])
df = df[df['station_id'].isin(region_station_ids)]

In [7]:
# Number of distinct stations that still have picks after the region filter.
len(df['station_id'].unique())

9

In [8]:
# Number of picks kept for this region.
df.shape[0]

533703

In [9]:
# df.to_csv('../data/datasets_all_years/all_picks_all_years_pnsn_jdf_for_picking.csv')

In [10]:
# # Remove picks from the following noisy stations (e.g. 'FN05A', 'YOUB' and 'MGB')
# df = df[~df['station_code'].isin(['FN05A', 'YOUB', 'MGB','FN14A','FN07A','J41A','J49A',
# 'J25B','J33B','FN18A','FN08A','M09B','G17B'])]

In [11]:
# # Remove picks with the following stations due to the noise
# df = df[~df['station_code'].isin(['FN05A', 'YOUB', 'MGB',
#                                   'J41A','J33B','M09B','FN01A','FN08A',
#                                   'FN12A','FN18A','J26A','J34A', 'J41A',
#                                   'M02A','FS03B','FS05B','FS08B','G33B',
#                                   'J17B','M10B','M13B','M18B'
#                                  ])]

In [12]:
# Pick count again; unchanged here because the station-removal cells above are commented out.
df.shape[0]

533703

In [13]:
# Persist the region-filtered picks so the next section can start from this file.
region_picks_path = f'../data/datasets_{region}/all_picks_all_years_for_assoc_{region}.csv'
df.to_csv(region_picks_path)

### 1.3 Create a CSV file for picks for the association

In [14]:
# Reload the region-filtered picks; the first CSV column becomes the index.
region_picks_path = f'../data/datasets_{region}/all_picks_all_years_for_assoc_{region}.csv'
df = pd.read_csv(region_picks_path, index_col=0)

In [15]:
# Label every row that has a P arrival time with phase "P".
# Note: p_phase holds only the label string, not the selected rows.
p_phase = "P"
has_p_arrival = df['trace_p_arrival'].notna()
df.loc[has_p_arrival, 'phase'] = p_phase

In [16]:
# Pull the station, arrival time, phase label and pick id of every P pick.
p_columns = ['station_id', 'trace_p_arrival', 'phase', 'pick_id']
p_rows = df['trace_p_arrival'].notna()
p_stas_picks_phase = df.loc[p_rows, p_columns]
p_stas_picks_phase

Unnamed: 0,station_id,trace_p_arrival,phase,pick_id
257,NC.KBO.,2011-01-01T00:04:56.400000Z,P,257
289,NC.KBO.,2011-01-01T00:35:49.600000Z,P,289
290,NC.KBO.,2011-01-01T00:55:55.930000Z,P,290
291,NC.KBO.,2011-01-01T03:08:21.550000Z,P,291
292,NC.KBO.,2011-01-01T08:20:27.320000Z,P,292
...,...,...,...,...
23570709,NC.KSXB.,2015-12-31T06:53:53.427700Z,P,23570709
23570710,NC.KSXB.,2015-12-31T08:16:56.207700Z,P,23570710
23570711,NC.KSXB.,2015-12-31T08:48:23.407700Z,P,23570711
23570712,NC.KSXB.,2015-12-31T09:46:44.477700Z,P,23570712


In [17]:
# Rename the P-pick columns to the generic names `station` and `time`.
p_column_names = {"station_id": "station", "trace_p_arrival": "time"}
_p_stas_picks_phase = p_stas_picks_phase.rename(columns=p_column_names)
_p_stas_picks_phase

Unnamed: 0,station,time,phase,pick_id
257,NC.KBO.,2011-01-01T00:04:56.400000Z,P,257
289,NC.KBO.,2011-01-01T00:35:49.600000Z,P,289
290,NC.KBO.,2011-01-01T00:55:55.930000Z,P,290
291,NC.KBO.,2011-01-01T03:08:21.550000Z,P,291
292,NC.KBO.,2011-01-01T08:20:27.320000Z,P,292
...,...,...,...,...
23570709,NC.KSXB.,2015-12-31T06:53:53.427700Z,P,23570709
23570710,NC.KSXB.,2015-12-31T08:16:56.207700Z,P,23570710
23570711,NC.KSXB.,2015-12-31T08:48:23.407700Z,P,23570711
23570712,NC.KSXB.,2015-12-31T09:46:44.477700Z,P,23570712


In [18]:
# Label every row that has an S arrival time with phase "S".
# Note: s_phase holds only the label string, not the selected rows.
s_phase = "S"
has_s_arrival = df['trace_s_arrival'].notna()
df.loc[has_s_arrival, 'phase'] = s_phase

In [19]:
# Pull the station, arrival time, phase label and pick id of every S pick.
s_columns = ['station_id', 'trace_s_arrival', 'phase', 'pick_id']
s_rows = df['trace_s_arrival'].notna()
s_stas_picks_phase = df.loc[s_rows, s_columns]
s_stas_picks_phase

Unnamed: 0,station_id,trace_s_arrival,phase,pick_id
337,NC.KBO.,2011-01-01T03:32:29.330000Z,S,337
338,NC.KBO.,2011-01-01T17:48:18.870000Z,S,338
554,NC.KEB.,2011-01-01T01:07:32.173700Z,S,554
555,NC.KEB.,2011-01-01T00:48:36.213700Z,S,555
556,NC.KEB.,2011-01-01T00:16:13.373700Z,S,556
...,...,...,...,...
23570695,NC.KSXB.,2015-12-31T09:46:49.217700Z,S,23570695
23570696,NC.KSXB.,2015-12-31T08:53:14.187700Z,S,23570696
23570697,NC.KSXB.,2015-12-31T05:19:50.267700Z,S,23570697
23570698,NC.KSXB.,2015-12-31T17:59:06.237700Z,S,23570698


In [20]:
# Rename the S-pick columns to the generic names `station` and `time`.
s_column_names = {"station_id": "station", "trace_s_arrival": "time"}
_s_stas_picks_phase = s_stas_picks_phase.rename(columns=s_column_names)
_s_stas_picks_phase

Unnamed: 0,station,time,phase,pick_id
337,NC.KBO.,2011-01-01T03:32:29.330000Z,S,337
338,NC.KBO.,2011-01-01T17:48:18.870000Z,S,338
554,NC.KEB.,2011-01-01T01:07:32.173700Z,S,554
555,NC.KEB.,2011-01-01T00:48:36.213700Z,S,555
556,NC.KEB.,2011-01-01T00:16:13.373700Z,S,556
...,...,...,...,...
23570695,NC.KSXB.,2015-12-31T09:46:49.217700Z,S,23570695
23570696,NC.KSXB.,2015-12-31T08:53:14.187700Z,S,23570696
23570697,NC.KSXB.,2015-12-31T05:19:50.267700Z,S,23570697
23570698,NC.KSXB.,2015-12-31T17:59:06.237700Z,S,23570698


In [21]:
# Stack the P picks on top of the S picks (row-wise, original index labels kept).
picks = pd.concat([_p_stas_picks_phase, _s_stas_picks_phase])
picks

Unnamed: 0,station,time,phase,pick_id
257,NC.KBO.,2011-01-01T00:04:56.400000Z,P,257
289,NC.KBO.,2011-01-01T00:35:49.600000Z,P,289
290,NC.KBO.,2011-01-01T00:55:55.930000Z,P,290
291,NC.KBO.,2011-01-01T03:08:21.550000Z,P,291
292,NC.KBO.,2011-01-01T08:20:27.320000Z,P,292
...,...,...,...,...
23570695,NC.KSXB.,2015-12-31T09:46:49.217700Z,S,23570695
23570696,NC.KSXB.,2015-12-31T08:53:14.187700Z,S,23570696
23570697,NC.KSXB.,2015-12-31T05:19:50.267700Z,S,23570697
23570698,NC.KSXB.,2015-12-31T17:59:06.237700Z,S,23570698


In [22]:
# Put the columns in the order station, phase, time, pick_id and renumber rows from 0.
# The columns are selected by name rather than by position: a positional swap
# undoes itself when the cell is executed a second time, whereas this form
# yields the same column order on every run.
picks = picks[['station', 'phase', 'time', 'pick_id']].reset_index(drop=True)
picks

Unnamed: 0,station,phase,time,pick_id
0,NC.KBO.,P,2011-01-01T00:04:56.400000Z,257
1,NC.KBO.,P,2011-01-01T00:35:49.600000Z,289
2,NC.KBO.,P,2011-01-01T00:55:55.930000Z,290
3,NC.KBO.,P,2011-01-01T03:08:21.550000Z,291
4,NC.KBO.,P,2011-01-01T08:20:27.320000Z,292
...,...,...,...,...
533698,NC.KSXB.,S,2015-12-31T09:46:49.217700Z,23570695
533699,NC.KSXB.,S,2015-12-31T08:53:14.187700Z,23570696
533700,NC.KSXB.,S,2015-12-31T05:19:50.267700Z,23570697
533701,NC.KSXB.,S,2015-12-31T17:59:06.237700Z,23570698


In [23]:
# Preview the first 20 picks.
picks.head(20)

Unnamed: 0,station,phase,time,pick_id
0,NC.KBO.,P,2011-01-01T00:04:56.400000Z,257
1,NC.KBO.,P,2011-01-01T00:35:49.600000Z,289
2,NC.KBO.,P,2011-01-01T00:55:55.930000Z,290
3,NC.KBO.,P,2011-01-01T03:08:21.550000Z,291
4,NC.KBO.,P,2011-01-01T08:20:27.320000Z,292
5,NC.KBO.,P,2011-01-01T09:52:17.800000Z,324
6,NC.KBO.,P,2011-01-01T12:26:28.580000Z,325
7,NC.KBO.,P,2011-01-01T13:25:25.820000Z,326
8,NC.KBO.,P,2011-01-01T13:29:27.390000Z,327
9,NC.KBO.,P,2011-01-01T14:00:38.590000Z,328


In [24]:
# Save the combined P and S picks to this region's data folder.
picks_path = f"../data/datasets_{region}/picks_{region}.csv"
picks.to_csv(picks_path)