## Notebook for formatting data frames for phase association
Source:
1. https://chatgpt.com/


In [1]:
import pandas as pd
import numpy as np
import datetime
import glob
import os 

### 1.1 Load the station data frame

In [2]:
region = 'or_shore'

# Read one station table per deployment year (2011-2015) for this region.
years = range(2011, 2016)
station_tables = {
    year: pd.read_csv(f'../data/datasets_{year}/stas_{year}_{region}.csv', index_col=0)
    for year in years
}

# Keep the per-year names that the following cells refer to.
stas_2011, stas_2012, stas_2013, stas_2014, stas_2015 = (
    station_tables[year] for year in years
)

# Show every yearly table, oldest first.
for yearly_table in station_tables.values():
    print(yearly_table)

         id   longitude   latitude  elevation
0  7D.J33A. -124.570801  45.106602     -348.7
1  UW.TAKO. -124.083370  43.743130       12.0
2  7D.J41A. -124.537201  45.811901     -175.0
3  7D.J25A. -124.621597  44.472900     -142.8
         id   longitude   latitude  elevation
0  7D.J25A. -124.621597  44.472900     -142.8
1  UW.TAKO. -124.083370  43.743130       12.0
2  7D.J41A. -124.537201  45.811901     -175.0
3  7D.J25B. -124.621696  44.471298     -147.0
4  7D.J33B. -124.570602  45.106602     -350.0
5  7D.J33A. -124.570801  45.106602     -348.7
6  7D.J17B. -124.614799  43.790001     -286.0
         id   longitude   latitude  elevation
0  7D.J41C. -124.537598  45.811699     -171.0
1  7D.J17B. -124.614799  43.790001     -286.0
2  7D.J25B. -124.621696  44.471298     -147.0
3  7D.J33B. -124.570602  45.106602     -350.0
4  7D.J33C. -124.570801  45.106800     -354.0
5  7D.J25C. -124.621696  44.473000     -144.0
         id   longitude   latitude  elevation
0  7D.J33C. -124.570801  45.106800

In [3]:
# Stack the yearly station tables, drop repeated station ids, and renumber
# the remaining rows from zero.
yearly_tables = [stas_2011, stas_2012, stas_2013, stas_2014, stas_2015]
stas = (
    pd.concat(yearly_tables, ignore_index=True)
    .drop_duplicates(subset='id')
    .reset_index(drop=True)
)
stas

Unnamed: 0,id,longitude,latitude,elevation
0,7D.J33A.,-124.570801,45.106602,-348.7
1,UW.TAKO.,-124.08337,43.74313,12.0
2,7D.J41A.,-124.537201,45.811901,-175.0
3,7D.J25A.,-124.621597,44.4729,-142.8
4,7D.J25B.,-124.621696,44.471298,-147.0
5,7D.J33B.,-124.570602,45.106602,-350.0
6,7D.J17B.,-124.614799,43.790001,-286.0
7,7D.J41C.,-124.537598,45.811699,-171.0
8,7D.J33C.,-124.570801,45.1068,-354.0
9,7D.J25C.,-124.621696,44.473,-144.0


In [4]:
# Write the combined station table for this region.
# This is the first write into the region folder, so create it if it is
# missing; otherwise a fresh "Restart & Run All" fails here with an OSError.
region_dir = f'../data/datasets_{region}'
os.makedirs(region_dir, exist_ok=True)
stas.to_csv(f'{region_dir}/all_stations_{region}.csv')

### 1.2 Concatenate data frames from several stations and format 

In [5]:
# Load the pick table covering all regions and years; the first CSV column
# is used as the row index.
all_picks_csv = '../data/datasets_all_regions/all_picks_all_years_for_assoc.csv'
df = pd.read_csv(all_picks_csv, index_col=0)

In [6]:
# Build "<network>.<station>." identifiers so picks can be matched against
# the `id` column of the station table.
network_prefix = df['station_network_code'] + '.'
df['station_id'] = network_prefix + df['station_code'] + '.'

# Keep only picks recorded at stations listed for this region.
at_region_station = df['station_id'].isin(stas['id'])
df = df[at_region_station]

In [7]:
# Number of distinct stations that contribute picks after filtering.
unique_station_ids = df['station_id'].drop_duplicates()
unique_station_ids.size

12

In [8]:
# Number of pick rows left after the station filter.
df.shape[0]

2100824

In [9]:
# df.to_csv('../data/datasets_all_years/all_picks_all_years_pnsn_jdf_for_picking.csv')

In [10]:
# # Remove picks from the noisy stations listed below (filter currently disabled)
# df = df[~df['station_code'].isin(['FN05A', 'YOUB', 'MGB','FN14A','FN07A','J41A','J49A',
# 'J25B','J33B','FN18A','FN08A','M09B','G17B'])]

In [11]:
# # Remove picks with the following stations due to the noise
# df = df[~df['station_code'].isin(['FN05A', 'YOUB', 'MGB',
#                                   'J41A','J33B','M09B','FN01A','FN08A',
#                                   'FN12A','FN18A','J26A','J34A', 'J41A',
#                                   'M02A','FS03B','FS05B','FS08B','G33B',
#                                   'J17B','M10B','M13B','M18B'
#                                  ])]

In [12]:
# Row count again; unchanged here because the noise filters above are
# commented out.
len(df.index)

2100824

In [13]:
# Store the region-filtered picks so the next section can start from this file.
region_picks_csv = f'../data/datasets_{region}/all_picks_all_years_for_assoc_{region}.csv'
df.to_csv(region_picks_csv)

### 1.3 Create a CSV file for picks for the association

In [14]:
# Reload the region-filtered picks written in section 1.2; the first CSV
# column is used as the row index.
region_picks_csv = f'../data/datasets_{region}/all_picks_all_years_for_assoc_{region}.csv'
df = pd.read_csv(region_picks_csv, index_col=0)

In [15]:
# Label every row that has a P arrival time as phase "P".
# The previous chained assignment (`p_phase = df.loc[...] = "P"`) also bound
# `p_phase` to the plain string "P"; that name was unused and misleading, so
# it is dropped.
# NOTE: the S-labelling cell further down writes to the same `phase` column,
# so the P subset has to be extracted before that cell runs.
has_p_arrival = df['trace_p_arrival'].notna()
df.loc[has_p_arrival, 'phase'] = "P"

In [16]:
# Keep the rows with a P arrival and only the columns needed for association.
p_rows = df['trace_p_arrival'].notna()
p_columns = ['station_id', 'trace_p_arrival', 'phase', 'pick_id']
p_stas_picks_phase = df.loc[p_rows, p_columns]
p_stas_picks_phase

Unnamed: 0,station_id,trace_p_arrival,phase,pick_id
1333,UW.TAKO.,2011-01-01T00:58:17.626000Z,P,1333
1334,UW.TAKO.,2011-01-01T02:49:13.526000Z,P,1334
1335,UW.TAKO.,2011-01-01T10:59:16.606000Z,P,1335
1336,UW.TAKO.,2011-01-01T12:31:07.946000Z,P,1336
1337,UW.TAKO.,2011-01-01T14:05:36.586000Z,P,1337
...,...,...,...,...
23250797,7D.J25D.,2015-09-17T00:14:06.106100Z,P,23250797
23250798,7D.J25D.,2015-09-17T01:05:18.322100Z,P,23250798
23250799,7D.J25D.,2015-09-17T00:18:08.682100Z,P,23250799
23250800,7D.J25D.,2015-09-17T08:38:17.890100Z,P,23250800


In [17]:
# Rename the P-pick columns to the generic names `station` and `time`.
p_column_names = {"station_id": "station", "trace_p_arrival": "time"}
_p_stas_picks_phase = p_stas_picks_phase.rename(columns=p_column_names)
_p_stas_picks_phase

Unnamed: 0,station,time,phase,pick_id
1333,UW.TAKO.,2011-01-01T00:58:17.626000Z,P,1333
1334,UW.TAKO.,2011-01-01T02:49:13.526000Z,P,1334
1335,UW.TAKO.,2011-01-01T10:59:16.606000Z,P,1335
1336,UW.TAKO.,2011-01-01T12:31:07.946000Z,P,1336
1337,UW.TAKO.,2011-01-01T14:05:36.586000Z,P,1337
...,...,...,...,...
23250797,7D.J25D.,2015-09-17T00:14:06.106100Z,P,23250797
23250798,7D.J25D.,2015-09-17T01:05:18.322100Z,P,23250798
23250799,7D.J25D.,2015-09-17T00:18:08.682100Z,P,23250799
23250800,7D.J25D.,2015-09-17T08:38:17.890100Z,P,23250800


In [18]:
# Label every row that has an S arrival time as phase "S".
# The previous chained assignment (`s_phase = df.loc[...] = "S"`) also bound
# `s_phase` to the plain string "S"; that name was unused and misleading, so
# it is dropped, along with a stray trailing comment.
# NOTE: this overwrites the `phase` column for any row that was labelled "P"
# earlier and also has an S arrival.
has_s_arrival = df['trace_s_arrival'].notna()
df.loc[has_s_arrival, 'phase'] = "S"

In [19]:
# Keep the rows with an S arrival and only the columns needed for association.
s_rows = df['trace_s_arrival'].notna()
s_columns = ['station_id', 'trace_s_arrival', 'phase', 'pick_id']
s_stas_picks_phase = df.loc[s_rows, s_columns]
s_stas_picks_phase

Unnamed: 0,station_id,trace_s_arrival,phase,pick_id
1381,UW.TAKO.,2011-01-01T16:43:55.306000Z,S,1381
1382,UW.TAKO.,2011-01-01T16:50:45.366000Z,S,1382
1383,UW.TAKO.,2011-01-01T17:14:04.486000Z,S,1383
5651,UW.TAKO.,2011-01-02T02:24:13.166000Z,S,5651
5654,UW.TAKO.,2011-01-02T06:01:58.966000Z,S,5654
...,...,...,...,...
23250778,7D.J25D.,2015-09-17T05:28:27.626100Z,S,23250778
23250779,7D.J25D.,2015-09-17T05:21:40.010100Z,S,23250779
23250786,7D.J25D.,2015-09-17T05:09:31.818100Z,S,23250786
23250787,7D.J25D.,2015-09-17T05:14:23.938100Z,S,23250787


In [20]:
# Rename the S-pick columns to the generic names `station` and `time`.
s_column_names = {"station_id": "station", "trace_s_arrival": "time"}
_s_stas_picks_phase = s_stas_picks_phase.rename(columns=s_column_names)
_s_stas_picks_phase

Unnamed: 0,station,time,phase,pick_id
1381,UW.TAKO.,2011-01-01T16:43:55.306000Z,S,1381
1382,UW.TAKO.,2011-01-01T16:50:45.366000Z,S,1382
1383,UW.TAKO.,2011-01-01T17:14:04.486000Z,S,1383
5651,UW.TAKO.,2011-01-02T02:24:13.166000Z,S,5651
5654,UW.TAKO.,2011-01-02T06:01:58.966000Z,S,5654
...,...,...,...,...
23250778,7D.J25D.,2015-09-17T05:28:27.626100Z,S,23250778
23250779,7D.J25D.,2015-09-17T05:21:40.010100Z,S,23250779
23250786,7D.J25D.,2015-09-17T05:09:31.818100Z,S,23250786
23250787,7D.J25D.,2015-09-17T05:14:23.938100Z,S,23250787


In [21]:
# Stack the P picks on top of the S picks (row-wise) into one table.
phase_tables = [_p_stas_picks_phase, _s_stas_picks_phase]
picks = pd.concat(phase_tables, axis=0)
picks

Unnamed: 0,station,time,phase,pick_id
1333,UW.TAKO.,2011-01-01T00:58:17.626000Z,P,1333
1334,UW.TAKO.,2011-01-01T02:49:13.526000Z,P,1334
1335,UW.TAKO.,2011-01-01T10:59:16.606000Z,P,1335
1336,UW.TAKO.,2011-01-01T12:31:07.946000Z,P,1336
1337,UW.TAKO.,2011-01-01T14:05:36.586000Z,P,1337
...,...,...,...,...
23250778,7D.J25D.,2015-09-17T05:28:27.626100Z,S,23250778
23250779,7D.J25D.,2015-09-17T05:21:40.010100Z,S,23250779
23250786,7D.J25D.,2015-09-17T05:09:31.818100Z,S,23250786
23250787,7D.J25D.,2015-09-17T05:14:23.938100Z,S,23250787


In [22]:
# Put the columns in the order station, phase, time, pick_id and renumber the
# rows from zero.
# Columns are selected by label rather than by position: the previous
# `iloc[:, [0, 2, 1, 3]]` swapped `time` and `phase` back every time the cell
# was re-run, whereas this form gives the same result on every run.
picks = picks[['station', 'phase', 'time', 'pick_id']]
picks = picks.reset_index(drop=True)
picks

Unnamed: 0,station,phase,time,pick_id
0,UW.TAKO.,P,2011-01-01T00:58:17.626000Z,1333
1,UW.TAKO.,P,2011-01-01T02:49:13.526000Z,1334
2,UW.TAKO.,P,2011-01-01T10:59:16.606000Z,1335
3,UW.TAKO.,P,2011-01-01T12:31:07.946000Z,1336
4,UW.TAKO.,P,2011-01-01T14:05:36.586000Z,1337
...,...,...,...,...
2100819,7D.J25D.,S,2015-09-17T05:28:27.626100Z,23250778
2100820,7D.J25D.,S,2015-09-17T05:21:40.010100Z,23250779
2100821,7D.J25D.,S,2015-09-17T05:09:31.818100Z,23250786
2100822,7D.J25D.,S,2015-09-17T05:14:23.938100Z,23250787


In [23]:
# Preview the first 20 picks.
picks.iloc[:20]

Unnamed: 0,station,phase,time,pick_id
0,UW.TAKO.,P,2011-01-01T00:58:17.626000Z,1333
1,UW.TAKO.,P,2011-01-01T02:49:13.526000Z,1334
2,UW.TAKO.,P,2011-01-01T10:59:16.606000Z,1335
3,UW.TAKO.,P,2011-01-01T12:31:07.946000Z,1336
4,UW.TAKO.,P,2011-01-01T14:05:36.586000Z,1337
5,UW.TAKO.,P,2011-01-01T15:40:03.026000Z,1338
6,UW.TAKO.,P,2011-01-01T15:41:36.006000Z,1358
7,UW.TAKO.,P,2011-01-01T15:42:51.426000Z,1360
8,UW.TAKO.,P,2011-01-01T17:27:06.646000Z,1361
9,UW.TAKO.,P,2011-01-01T19:14:26.306000Z,1380


In [24]:
# Save these picks to the region's data folder.
picks_csv = f"../data/datasets_{region}/picks_{region}.csv"
picks.to_csv(picks_csv)