## Notebook for formatting data frames for phase association
Source:
1. https://chatgpt.com/


In [1]:
import pandas as pd
import numpy as np
import datetime
import glob
import os 

### 1.1 Load the station data frame

In [2]:
# Region tag used in every input and output file name below.
region = 'pnsn_nor'


def _read_station_table(year):
    """Read the station table CSV for one year of the selected region.

    The first CSV column is used as the row index.
    """
    return pd.read_csv(f'../data/datasets_{year}/stas_{year}_{region}.csv', index_col=0)


# One table per year; the individual names are kept because the next cell
# concatenates them explicitly.
stas_2011 = _read_station_table(2011)
stas_2012 = _read_station_table(2012)
stas_2013 = _read_station_table(2013)
stas_2014 = _read_station_table(2014)
stas_2015 = _read_station_table(2015)

# Print each yearly table in chronological order.
for yearly_stations in (stas_2011, stas_2012, stas_2013, stas_2014, stas_2015):
    print(yearly_stations)

         id   longitude   latitude  elevation
0  TA.G03D. -123.264099  45.211498      222.0
1  TA.J01D. -123.931396  43.161400      131.0
2  TA.K02D. -123.665398  42.695499      989.0
3  TA.I02D. -123.846901  44.105900      109.0
4  UW.TAKO. -124.083370  43.743130       12.0
5  UW.JEDS. -124.049050  43.751570      159.5
6   UO.DBO. -123.244423  43.118721      957.0
7  TA.I03D. -123.348701  43.697201      140.0
8  UW.HEBO. -123.755386  45.213501      875.0
9  UW.BABR. -123.789240  44.621320      438.7
          id   longitude   latitude  elevation
0   TA.J01D. -123.931396  43.161400      131.0
1    UO.DBO. -123.244423  43.118721      957.0
2   UW.HEBO. -123.755386  45.213501      875.0
3   TA.I02D. -123.846901  44.105900      109.0
4   UW.TAKO. -124.083370  43.743130       12.0
5   TA.G03D. -123.264099  45.211498      222.0
6   TA.K02D. -123.665398  42.695499      989.0
7   UW.BABR. -123.789240  44.621320      438.7
8   TA.I03D. -123.348701  43.697201      140.0
9   TA.J01E. -123.931396

In [3]:
# Stack the yearly station tables, keep the first row seen for each station
# id, and renumber the rows from zero.
yearly_station_tables = [stas_2011, stas_2012, stas_2013, stas_2014, stas_2015]
stas = (
    pd.concat(yearly_station_tables, ignore_index=True)
    .drop_duplicates(subset='id')
    .reset_index(drop=True)
)
stas

Unnamed: 0,id,longitude,latitude,elevation
0,TA.G03D.,-123.264099,45.211498,222.0
1,TA.J01D.,-123.931396,43.1614,131.0
2,TA.K02D.,-123.665398,42.695499,989.0
3,TA.I02D.,-123.846901,44.1059,109.0
4,UW.TAKO.,-124.08337,43.74313,12.0
5,UW.JEDS.,-124.04905,43.75157,159.5
6,UO.DBO.,-123.244423,43.118721,957.0
7,TA.I03D.,-123.348701,43.697201,140.0
8,UW.HEBO.,-123.755386,45.213501,875.0
9,UW.BABR.,-123.78924,44.62132,438.7


In [4]:
# Write the merged station table to the region-specific dataset folder.
# The folder is created first: DataFrame.to_csv raises OSError when the parent
# directory does not exist, which would stop a Restart & Run All on a fresh
# checkout where only the per-year dataset folders are present.
stations_dir = f'../data/datasets_{region}'
os.makedirs(stations_dir, exist_ok=True)
stas.to_csv(f'{stations_dir}/all_stations_{region}.csv')

### 1.2 Concatenate data frames from several stations and format 

In [5]:
# Load the combined picks table; the first CSV column is used as the row index.
df = pd.read_csv(
    '../data/datasets_all_regions/all_picks_all_years_for_assoc.csv',
    index_col=0,
)

In [6]:
# Build a "NETWORK.STATION." identifier for every pick, matching the format of
# the `id` column in the station table.
network_codes = df['station_network_code']
station_codes = df['station_code']
df['station_id'] = network_codes + '.' + station_codes + '.'

# Keep only the picks whose station appears in the merged station table.
known_station_ids = stas['id'].tolist()
df = df[df['station_id'].isin(known_station_ids)]

In [7]:
# Number of distinct station ids left after filtering (a missing id, if any,
# counts once, exactly as it would after drop_duplicates()).
df['station_id'].nunique(dropna=False)

12

In [8]:
# Number of picks remaining after the station filter.
df.shape[0]

500698

In [9]:
# df.to_csv('../data/datasets_all_years/all_picks_all_years_pnsn_jdf_for_picking.csv')

In [10]:
# # Remove picks from the following stations because of noise (including 'FN05A', 'YOUB' and 'MGB')
# df = df[~df['station_code'].isin(['FN05A', 'YOUB', 'MGB','FN14A','FN07A','J41A','J49A',
# 'J25B','J33B','FN18A','FN08A','M09B','G17B'])]

In [11]:
# # Remove picks from the following stations because of noise
# df = df[~df['station_code'].isin(['FN05A', 'YOUB', 'MGB',
#                                   'J41A','J33B','M09B','FN01A','FN08A',
#                                   'FN12A','FN18A','J26A','J34A', 'J41A',
#                                   'M02A','FS03B','FS05B','FS08B','G33B',
#                                   'J17B','M10B','M13B','M18B'
#                                  ])]

In [12]:
# Row count just before saving; the station-removal cells above are commented
# out, so nothing has been dropped since the previous count.
len(df.index)

500698

In [13]:
# Save the station-filtered picks for this region (the row index is written
# as the first CSV column).
df.to_csv(
    path_or_buf=f'../data/datasets_{region}/all_picks_all_years_for_assoc_{region}.csv',
)

### 1.3 Create a CSV file for picks for the association

In [14]:
# Reload the region's picks from disk; the first CSV column is used as the row
# index.
df = pd.read_csv(
    f'../data/datasets_{region}/all_picks_all_years_for_assoc_{region}.csv',
    index_col=0,
)

In [15]:
# Label every row that has a P arrival time with phase "P".
# NOTE: `p_phase` only holds the literal "P" (it is not a selection of rows);
# the binding is kept so the notebook namespace is unchanged.
p_phase = "P"
has_p_arrival = df['trace_p_arrival'].notna()
df.loc[has_p_arrival, 'phase'] = p_phase

In [16]:
# Extract the rows with a P arrival, keeping only the columns used below.
p_pick_columns = ['station_id', 'trace_p_arrival', 'phase', 'pick_id']
p_rows = df['trace_p_arrival'].notna()
p_stas_picks_phase = df.loc[p_rows, p_pick_columns]
p_stas_picks_phase

Unnamed: 0,station_id,trace_p_arrival,phase,pick_id
242,TA.I02D.,2011-01-01T22:53:37.025000Z,P,242
243,TA.I02D.,2011-01-01T21:04:13.025000Z,P,243
245,TA.I02D.,2011-01-01T18:29:10.650000Z,P,245
250,TA.I02D.,2011-01-01T15:53:23.700000Z,P,250
251,TA.I02D.,2011-01-01T14:41:57.425000Z,P,251
...,...,...,...,...
23571042,UW.JEDS.,2015-12-31T17:10:16.015000Z,P,23571042
23571050,UW.BABR.,2015-12-31T22:02:55.090000Z,P,23571050
23571051,UW.BABR.,2015-12-31T17:44:49.490000Z,P,23571051
23571052,UW.BABR.,2015-12-31T04:49:44.140000Z,P,23571052


In [17]:
# Rename the P-pick columns to the generic names `station` and `time`.
p_column_names = {"station_id": "station", "trace_p_arrival": "time"}
_p_stas_picks_phase = p_stas_picks_phase.rename(columns=p_column_names)
_p_stas_picks_phase

Unnamed: 0,station,time,phase,pick_id
242,TA.I02D.,2011-01-01T22:53:37.025000Z,P,242
243,TA.I02D.,2011-01-01T21:04:13.025000Z,P,243
245,TA.I02D.,2011-01-01T18:29:10.650000Z,P,245
250,TA.I02D.,2011-01-01T15:53:23.700000Z,P,250
251,TA.I02D.,2011-01-01T14:41:57.425000Z,P,251
...,...,...,...,...
23571042,UW.JEDS.,2015-12-31T17:10:16.015000Z,P,23571042
23571050,UW.BABR.,2015-12-31T22:02:55.090000Z,P,23571050
23571051,UW.BABR.,2015-12-31T17:44:49.490000Z,P,23571051
23571052,UW.BABR.,2015-12-31T04:49:44.140000Z,P,23571052


In [18]:
# Label every row that has an S arrival time with phase "S".
# NOTE: `s_phase` only holds the literal "S" (it is not a selection of rows);
# the binding is kept so the notebook namespace is unchanged.
s_phase = "S"
has_s_arrival = df['trace_s_arrival'].notna()
df.loc[has_s_arrival, 'phase'] = s_phase

In [19]:
# Extract the rows with an S arrival, keeping only the columns used below.
s_pick_columns = ['station_id', 'trace_s_arrival', 'phase', 'pick_id']
s_rows = df['trace_s_arrival'].notna()
s_stas_picks_phase = df.loc[s_rows, s_pick_columns]
s_stas_picks_phase

Unnamed: 0,station_id,trace_s_arrival,phase,pick_id
237,TA.I02D.,2011-01-01T14:30:03.850000Z,S,237
238,TA.I02D.,2011-01-01T12:46:14.425000Z,S,238
239,TA.I02D.,2011-01-01T10:11:33.950000Z,S,239
240,TA.I02D.,2011-01-01T10:08:57.875000Z,S,240
241,TA.I02D.,2011-01-01T04:47:11.175000Z,S,241
...,...,...,...,...
23571057,UW.JEDS.,2015-12-31T20:52:31.615000Z,S,23571057
23571058,UW.JEDS.,2015-12-31T21:04:20.990000Z,S,23571058
23571059,UW.JEDS.,2015-12-31T20:48:38.240000Z,S,23571059
23571060,UW.JEDS.,2015-12-31T23:00:02.715000Z,S,23571060


In [20]:
# Rename the S-pick columns to the generic names `station` and `time`.
s_column_names = {"station_id": "station", "trace_s_arrival": "time"}
_s_stas_picks_phase = s_stas_picks_phase.rename(columns=s_column_names)
_s_stas_picks_phase

Unnamed: 0,station,time,phase,pick_id
237,TA.I02D.,2011-01-01T14:30:03.850000Z,S,237
238,TA.I02D.,2011-01-01T12:46:14.425000Z,S,238
239,TA.I02D.,2011-01-01T10:11:33.950000Z,S,239
240,TA.I02D.,2011-01-01T10:08:57.875000Z,S,240
241,TA.I02D.,2011-01-01T04:47:11.175000Z,S,241
...,...,...,...,...
23571057,UW.JEDS.,2015-12-31T20:52:31.615000Z,S,23571057
23571058,UW.JEDS.,2015-12-31T21:04:20.990000Z,S,23571058
23571059,UW.JEDS.,2015-12-31T20:48:38.240000Z,S,23571059
23571060,UW.JEDS.,2015-12-31T23:00:02.715000Z,S,23571060


In [21]:
# Stack the P picks on top of the S picks (row-wise); the original row labels
# are kept at this stage.
p_then_s = [_p_stas_picks_phase, _s_stas_picks_phase]
picks = pd.concat(p_then_s, axis='index')
picks

Unnamed: 0,station,time,phase,pick_id
242,TA.I02D.,2011-01-01T22:53:37.025000Z,P,242
243,TA.I02D.,2011-01-01T21:04:13.025000Z,P,243
245,TA.I02D.,2011-01-01T18:29:10.650000Z,P,245
250,TA.I02D.,2011-01-01T15:53:23.700000Z,P,250
251,TA.I02D.,2011-01-01T14:41:57.425000Z,P,251
...,...,...,...,...
23571057,UW.JEDS.,2015-12-31T20:52:31.615000Z,S,23571057
23571058,UW.JEDS.,2015-12-31T21:04:20.990000Z,S,23571058
23571059,UW.JEDS.,2015-12-31T20:48:38.240000Z,S,23571059
23571060,UW.JEDS.,2015-12-31T23:00:02.715000Z,S,23571060


In [22]:
# Put the columns in the order station, phase, time, pick_id and renumber the
# rows from zero.
# The columns are selected by label instead of by position: the positional
# swap `iloc[:, [0, 2, 1, 3]]` undoes itself when the cell is re-run (time and
# phase flip back) and silently picks the wrong columns if the upstream order
# ever changes. Selecting by label is idempotent and fails loudly on a missing
# column.
picks = picks[['station', 'phase', 'time', 'pick_id']].reset_index(drop=True)
picks

Unnamed: 0,station,phase,time,pick_id
0,TA.I02D.,P,2011-01-01T22:53:37.025000Z,242
1,TA.I02D.,P,2011-01-01T21:04:13.025000Z,243
2,TA.I02D.,P,2011-01-01T18:29:10.650000Z,245
3,TA.I02D.,P,2011-01-01T15:53:23.700000Z,250
4,TA.I02D.,P,2011-01-01T14:41:57.425000Z,251
...,...,...,...,...
500693,UW.JEDS.,S,2015-12-31T20:52:31.615000Z,23571057
500694,UW.JEDS.,S,2015-12-31T21:04:20.990000Z,23571058
500695,UW.JEDS.,S,2015-12-31T20:48:38.240000Z,23571059
500696,UW.JEDS.,S,2015-12-31T23:00:02.715000Z,23571060


In [23]:
# Preview the first 20 picks.
picks.head(20)

Unnamed: 0,station,phase,time,pick_id
0,TA.I02D.,P,2011-01-01T22:53:37.025000Z,242
1,TA.I02D.,P,2011-01-01T21:04:13.025000Z,243
2,TA.I02D.,P,2011-01-01T18:29:10.650000Z,245
3,TA.I02D.,P,2011-01-01T15:53:23.700000Z,250
4,TA.I02D.,P,2011-01-01T14:41:57.425000Z,251
5,TA.I02D.,P,2011-01-01T14:29:40.425000Z,252
6,TA.I02D.,P,2011-01-01T13:20:03.850000Z,253
7,TA.I02D.,P,2011-01-01T10:08:56.625000Z,254
8,TA.I02D.,P,2011-01-01T05:09:05.175000Z,255
9,TA.I02D.,P,2011-01-01T01:52:48.425000Z,256


In [24]:
# Save these picks to the region's dataset folder (the row index is written as
# the first CSV column).
picks.to_csv(
    path_or_buf=f"../data/datasets_{region}/picks_{region}.csv",
)