### Historical Data Platform QA/QC Event Evaluation Procedure
**Event**: Santa Ana Wind Event<br>
Start date: 2/16/1988<br> 
End date: 2/19/1988<br>
Location: Los Angeles, Orange counties<br>
Variables: wind speed, wind direction, air temperature, humidity<br>

In [1]:
# import libraries
import pandas as pd
import numpy as np
import xarray as xr

In [5]:
# read in stations
train_stns = pd.read_csv('../qaqc_training_station_list_events.csv')
train_stns

Unnamed: 0,network,era-id,elevation,latitude,longitude,start_date,end_date,event_type,notes
0,ASOSAWOS,ASOSAWOS_72681024131,860.4504,43.567000,-116.24100,1931-01-01 00:00:00+00:00,2022-12-31 00:00:00+00:00,all,
1,OtherISD,OtherISD_72286023119,468.2000,33.900000,-117.25000,1933-01-01 00:00:00+00:00,2023-03-14 00:00:00+00:00,all,
2,ASOSAWOS,ASOSAWOS_72698024229,6.7056,45.596000,-122.60900,1936-05-01 00:00:00+00:00,2022-12-31 00:00:00+00:00,all,
3,ASOSAWOS,ASOSAWOS_72492023237,8.2296,37.890000,-121.22600,1941-01-03 00:00:00+00:00,2022-12-31 00:00:00+00:00,all,
4,ASOSAWOS,ASOSAWOS_72384023155,149.3520,35.434000,-119.05500,1941-10-01 00:00:00+00:00,2022-12-31 00:00:00+00:00,all,
...,...,...,...,...,...,...,...,...,...
933,CWOP,CWOP_C0931,2959.0000,-114.135800,47.69420,,,,
934,RAWS,RAWS_TR343,1280.0000,47.956900,-124.26250,,,,not cleaned
935,RAWS,RAWS_TR612,500.0000,57.883333,-135.58333,,,,not cleaned
936,RAWS,RAWS_TT726,5650.0000,47.982767,-114.35590,,,,not cleaned


In [11]:
# identify stations with coverage of event
event_flags = ['all', 'santa_ana_wind']
event_stns = train_stns[train_stns['event_type'].isin(event_flags)]

# exclude "manual check on end date" for the time being -- SNOTEL stations all have 2100 as their end date regardless of when the data actually ends
mask = event_stns['notes'] == 'manual check on end date'
event_stns = event_stns[~mask]
event_stns

Unnamed: 0,network,era-id,elevation,latitude,longitude,start_date,end_date,event_type,notes
0,ASOSAWOS,ASOSAWOS_72681024131,860.4504,43.567000,-116.24100,1931-01-01 00:00:00+00:00,2022-12-31 00:00:00+00:00,all,
1,OtherISD,OtherISD_72286023119,468.2000,33.900000,-117.25000,1933-01-01 00:00:00+00:00,2023-03-14 00:00:00+00:00,all,
2,ASOSAWOS,ASOSAWOS_72698024229,6.7056,45.596000,-122.60900,1936-05-01 00:00:00+00:00,2022-12-31 00:00:00+00:00,all,
3,ASOSAWOS,ASOSAWOS_72492023237,8.2296,37.890000,-121.22600,1941-01-03 00:00:00+00:00,2022-12-31 00:00:00+00:00,all,
4,ASOSAWOS,ASOSAWOS_72384023155,149.3520,35.434000,-119.05500,1941-10-01 00:00:00+00:00,2022-12-31 00:00:00+00:00,all,
...,...,...,...,...,...,...,...,...,...
161,CIMIS,CIMIS_68,-38.0000,32.759575,-115.73207,1987-05-29 00:00:00+00:00,2024-01-22 13:36:19.951554+00:00,all,
163,CIMIS,CIMIS_75,410.0000,33.688450,-117.72118,1987-10-07 00:00:00+00:00,2024-01-22 13:36:19.951554+00:00,all,
164,CIMIS,CIMIS_75,410.0000,33.688450,-117.72118,1987-10-07 00:00:00+00:00,2024-01-22 13:36:19.951554+00:00,all,
165,CIMIS,CIMIS_76,160.0000,34.924000,-120.51200,1987-12-18 00:00:00+00:00,1993-07-01 00:00:00+00:00,santa_ana_wind,


In [None]:
# identify stations that are in the geographic region we are looking for
# santa ana wind event, focusing on LA and Orange counties


In [12]:
# randomly select 20 stations for fine-tuning of this procedure by HAID and VF
subset_stns = santa_ana_stns.sample(20)
subset_stns

Unnamed: 0,network,era-id,elevation,latitude,longitude,start_date,end_date,event_type,notes
47,ASOSAWOS,ASOSAWOS_72483023232,5.7912,38.507,-121.496,1973-01-01 00:00:00+00:00,2022-12-31 00:00:00+00:00,all,
65,ASOSAWOS,ASOSAWOS_72693024221,109.1184,44.133,-123.216,1973-01-01 00:00:00+00:00,2022-12-31 00:00:00+00:00,all,
49,ASOSAWOS,ASOSAWOS_72494023234,3.048,37.62,-122.366,1973-01-01 00:00:00+00:00,2022-12-31 00:00:00+00:00,all,
57,ASOSAWOS,ASOSAWOS_72594524283,64.3128,40.978,-124.105,1973-01-01 00:00:00+00:00,2022-12-31 00:00:00+00:00,all,
165,CIMIS,CIMIS_76,160.0,34.924,-120.512,1987-12-18 00:00:00+00:00,1993-07-01 00:00:00+00:00,santa_ana_wind,
42,ASOSAWOS,ASOSAWOS_72467523063,1980.2856,39.65,-106.917,1973-01-01 00:00:00+00:00,2022-12-31 00:00:00+00:00,all,
158,CIMIS,CIMIS_51,180.0,38.662,-122.866,1986-08-24 00:00:00+00:00,1994-03-28 00:00:00+00:00,santa_ana_wind,
73,ASOSAWOS,ASOSAWOS_72792024227,60.96,46.974,-122.905,1973-01-01 00:00:00+00:00,2022-12-31 00:00:00+00:00,all,
131,CIMIS,CIMIS_7,185.0,36.851248,-120.59098,1982-09-22 00:00:00+00:00,2024-01-22 13:36:19.951554+00:00,all,
22,ASOSAWOS,ASOSAWOS_72392793110,11.8872,34.2,-119.204,1944-04-01 00:00:00+00:00,2022-12-31 00:00:00+00:00,all,


### Step 2: Holistic / qualitative station evaluation

In [4]:
# phase 2 lok at full timeseries for flags

for stn in event_stns:
    want to pull out all flags noted, frequency of flags of time record
    

In [None]:
# look at timeseries during event for flags
event_start_date = 
event_end_date = 

# decision: do we add a few days prior/after event end date to evaluate "event anomaly"?

In [None]:
# look at all stations (5-20 view) for the event to qualitatively get a sense of how stations did during the event with regards to qaqc flags

### Step 3: Validate against other QA/QC'd data
* GHCNh data -- need to figure out how to read this funky file format first

In [None]:
# read in ghcnh data
## hector working on this

In [None]:
#

In [13]:
# initial test for identifying the event: large jumps on windspeed