# GHCNh false positive rate

## Load libraries

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import sys
import os

ghcnh_lib_path = "/Users/hector/ERA_work/historical-obs-platform/test_platform/scripts/3_qaqc_data/qaqc_eval_notebooks/GHCNh"
sys.path.append(ghcnh_lib_path)
from GHCNh_lib import GHCNh  # If GHCNh is was appended to path

sys.path.append(os.path.abspath("../scripts/3_qaqc_data"))
from qaqc_eval_utils import *

%matplotlib inline

%load_ext autoreload
%autoreload 2

## GHCNh dataset

### GHCNh: test the conversion to a geopandas DataFrame

In [None]:
%%time
ghcnh = GHCNh(stations_local=True)
ghcnh.select_wecc()
id = ghcnh.stations_df["id"].iloc[0]
ghcnh.read_data_from_url(id, save=True)
ghcnh.convert_df_to_gpd()
lon = ghcnh.station_data.Longitude.mean()
lat = ghcnh.station_data.Latitude.mean()
print("{}, {:.5f}, {:.5f}".format(id, lon, lat))
ghcnh.station_data.head(3)

## Train stations and event evaluation

### Read in stations

In [None]:
train_stns = pd.read_csv("../qaqc_training_station_list_events.csv")

### How to identify other events of interest using a start and end date

In [None]:
alt_start_date = "2007-10-20"
alt_end_date = "2007-10-24"
alt_event_stns = find_other_events(
    train_stns, event_start=alt_start_date, event_end=alt_end_date, buffer=14, subset=5
)
alt_event_stns.head(3)

### Original functionality of `subset_eval_stns` function, random sampling from training stations

In [None]:
eval_stations = subset_eval_stns(
    event_to_eval="santa_ana_wind", stn_list=train_stns, subset=5, return_stn_ids=True
)
eval_stations.head(3)

In [None]:
eval_stations["era-id"].values[-1]
# eval_stations.loc[eval_stations['era-id']==stn_id]

In [None]:
fig, ax = stn_visualize(
    stn_id=eval_stations["era-id"].values[-1],
    stn_list=eval_stations,
    event_to_eval="santa_ana_wind",
)

ax.plot(
    ghcnh.stations_df.longitude,
    ghcnh.stations_df.latitude,
    "x",
    markersize=4,
    transform=ccrs.PlateCarree(),
    mfc="none",
)

### New addition, specific station flag

<div style="width: 50em;">
Specific station flag introduced to control which station we are analyzing.
Useful for debugging; down the line this would help to manage using the same
station and not download different stations every time in the nearest stations
part (below).
</div>

In [None]:
""" Test when station is not in the training/event dataframe
This would lead to the error:
'ValueError: Station CIMIS_80 is not within the training/event dataset'
"""

# eval_stations = subset_eval_stns(
#     event_to_eval = 'santa_ana_wind',
#     stn_list = train_stns,
#     specific_station='CIMIS_80',
#     return_stn_ids = True
# )
# eval_stations

In [None]:
# specific_station = 'CIMIS_45'
specific_station = "CIMIS_75"

In [None]:
eval_stations = subset_eval_stns(
    event_to_eval="santa_ana_wind",
    stn_list=train_stns,
    specific_station=specific_station,
    return_stn_ids=True,
)
eval_stations

#### Plot 'specific station' from the training and the GHCNh stations in the 'area'

In [None]:
fig, ax = stn_visualize(
    stn_id=specific_station, stn_list=eval_stations, event_to_eval="santa_ana_wind"
)

ax.plot(
    ghcnh.stations_df.longitude,
    ghcnh.stations_df.latitude,
    "x",
    markersize=4,
    transform=ccrs.PlateCarree(),
    mfc="none",
)

## GHCNh stations distance to the testing stations

<div style="width: 50em;">
Create a distance-to-specific-station column for the GHCNh dataframe.
Then, sort by distance to have the nearest stations to the one we are evaluating.
</div>

### Create distance and sort it

In [None]:
stn_id = eval_stations["era-id"].values[-1]
point = eval_stations.loc[eval_stations["era-id"] == stn_id].geometry.iloc[0]
ghcnh_stations_df = ghcnh.stations_df.copy()
display(eval_stations.loc[eval_stations["era-id"] == stn_id, ["longitude", "latitude"]])
ghcnh_stations_df["distance_to_eval"] = ghcnh_stations_df.distance(
    point,
)
ghcnh_stations_df = ghcnh_stations_df.sort_values(by="distance_to_eval")
ghcnh_stations_df.head(5)

### Plot the closest 10 stations to the train/eval station

In [None]:
fig, ax = stn_visualize(
    stn_id=specific_station, stn_list=eval_stations, event_to_eval="santa_ana_wind"
)

plot_df = ghcnh_stations_df.iloc[:10]
ax.plot(
    plot_df.longitude,
    plot_df.latitude,
    "x",
    markersize=6,
    transform=ccrs.PlateCarree(),
    mfc="none",
);

In [None]:
era2ghcn_vars

In [None]:
# df, vars = return_ghcn_vars(ghcnh.station_data, 'tas')

In [None]:
# len(df.temperature_Quality_Code.dropna())/len(df.temperature_Quality_Code)

In [None]:
%%time
ds = pull_nc_from_aws(specific_station)  # manually retrieve a specific station
ds

In [None]:
%%time
df, MultiIndex, attrs, var_attrs, era_qc_vars = qaqc_ds_to_df(ds)
df.head(3)

In [None]:
%%time
df, MultiIndex, attrs, var_attrs, era_qc_vars = qaqc_ds_to_df(ds)
df.head(3)

### Why is the first time `qaqc_ds_to_df` much much slower than the second? 

<div style="width: 50em;">
The first time, `s3.open()` is called. This reads the dataset lazily. When converting to a DataFrame, the whole file needs
to be read into memory, which `s3.open()` does much more slowly than `s3.download()`.
</div>

<div style="width: 50em;">

_From ChatGPT:_

When comparing s3.open and s3.download, the performance difference typically depends on how data is accessed and processed. Here’s an overview of the two methods:

1. **`s3.open()`**:
Use Case: Opens a file from an S3 bucket as a file-like object.
Performance:
When using s3.open, data is streamed in smaller chunks, allowing for on-the-fly reading (useful when handling large files). This reduces memory overhead since the file is not loaded into memory at once.
However, this streaming approach can make accessing data slower, especially when performing repeated read operations, as it incurs overhead due to network latency and chunk retrieval.
2. **`s3.download()`**:
Use Case: Downloads a file from S3 and stores it locally or in memory.
Performance:
s3.download typically downloads the entire file at once, which can be faster when you need the whole file to be processed or manipulated. There’s less latency involved after the file is downloaded, as subsequent operations on the local or in-memory copy are fast.
However, downloading large files can be slower initially (compared to streaming) and can consume significant memory.
Summary:
s3.open is slower for operations that require reading large amounts of data repeatedly due to chunk-based streaming.
s3.download is faster when you need the entire file at once, as you download the full data and can work with it in memory without additional network operations.
When to Use Each:
Use s3.open() for large files or if you only need partial access to the file (e.g., processing chunks of data or streaming video).
Use s3.download() when you want the entire file available immediately or if you need faster access after the file is fully downloaded.
</div>

<div style="width: 50em;">
One way to handle this, especially for speed in debugging and testing, is to download the file to a temporary file, with the option to save it to disk if we want to. Since downloading the file is much faster, this process will be faster. Also, we can add logic to look locally first and, if the file is not there, download it from the S3 bucket.
</div>

In [None]:
%%time
ds = download_nc_from_aws(specific_station)
ds

#### Download the file to local folder with `save=True`

In [None]:
ds = download_nc_from_aws(specific_station, save=True)
ds

### Once the file was downloaded, reading from disk should be faster for testing

In [None]:
%%time
ds = download_nc_from_aws(specific_station)
ds

In [None]:
%%time
df, MultiIndex, attrs, var_attrs, era_qc_vars = qaqc_ds_to_df(ds)
df.head(3)

#### Check if the ds_to_df problem was fixed

`ds_to_df` function was deleting all flags and converting them to nans

In [None]:
fig, ax = plt.subplots(figsize=(7, 3))
ds.tas_eraqc.plot(ax=ax, marker=".", lw=0);

In [None]:
fig, ax = plt.subplots(figsize=(7, 3))
df.tas_eraqc.plot(ax=ax, marker=".", lw=0);

## Classification metrics: precision, recall, false positive, and false negative rates

<div style="width: 50em;">
    
**Accuracy** is the most obvious classification metric, which is defined as the number of correct predictions divided by the total number of observations.

The biggest problem with accuracy is that it will let us down in situations where one class occurs with much higher frequency than another.  
</div>

<div style="width: 50em;">

---
**Discussion**

- We are tasked to build a QA/QC that analyzes weather stations. The goal is to build a pipeline that recognizes "bad" values and flags them. Accuracy aims to flag as many "bad" values as the ground truth "bad" observations. 
- If our QA/QC gets a 1-% accuracy score, would we be happy with this outcome?
---
When accuracy is not a useful metric, we can instead focus on the uncommon class, in this case the flagged values:
- "If the QA/QC pipeline marks an observation as "bad", how likely is it that the observation is actually an error?", or
- "What percent of "real"/ground truth bad observations are correctly flagged by our pipeline?"

These are the concepts of **precision** and **recall**, respectively. 

</div>

<div style="width: 50em;">

In the figure below, "correct" observations are represented in blue and "bad" observations are red. We pick the "flag" class to be the **positive class**. The arrows denote the results of our QA/QC pipeline. A blue arrow means the pipeline classifies the observation as "good" and a red arrow means the pipeline flags the observation as "bad".

![errors](precision_recall.svg)

Notice that there are instances when the QA/QC makes a correct classification: either it classifies a "good" observation as good (true negative) or it flags a bad observation as bad (true positive). But occasionally it makes an error. There are two types of errors: we either flag a good observation as "bad" (false positive, also called type I error) or we classify a "real bad" observation as correct (false negative, also called type II error). 

* **Precision** is then defined as the number of true positives divided by the number of all positive *predictions*. Maximizing precision means **minimizing false positives**.
* **Recall** is defined as the number of true positives divided by the number of all positive *observations*. Maximizing recall means **minimizing false negatives**.
---
</div>

<div style="width: 50em;">
    
**In summary**:

Accuracy is the number of correct predictions divided by the total number of observations. For the flagged (positive) class:
- Minimizing false positives means maximizing precision.
- Minimizing false negatives means maximizing recall.
</div>

### False positive and false negative rates

<div style="width: 50em;">

The **false positive rate (FPR)** is the proportion of all negatives that still yield positive test outcomes, i.e., the conditional probability of a positive test result given an event that was not present. The false positive rate depends on the significance level. The **specificity** of the test is equal to 1 minus the false positive rate.

- In statistical hypothesis testing, this fraction is given the Greek letter α, and 1 − α is defined as the specificity of the test. 

- Increasing the specificity of the test lowers the probability of type I errors, but may raise the probability of type II errors (false negatives that reject the alternative hypothesis when it is true).

Complementarily, the **false negative rate (FNR)** is the proportion of positives which yield negative test outcomes with the test, i.e., the conditional probability of a negative test result given that the condition being looked for is present.

- In statistical hypothesis testing, this fraction is given the letter β. The "power" (or the "sensitivity") of the test is equal to 1 − β.

</div>

#### False positive rate (FPR)

<div style="width: 50em;">

**FPR** is a metric used to evaluate the performance of a classification model, particularly in binary classification. It measures the proportion of negative instances (i.e., instances that actually belong to the negative class) that are incorrectly classified as positive by the model.

$$\text{False Positive Rate (FPR)} = \frac{\text{False Positives (FP)}}{\text{False Positives (FP)} + \text{True Negatives (TN)}}\,\text{,}$$
where:

False Positives (FP): The number of instances where the model incorrectly predicted the positive class, but they actually belong to the negative class.

True Negatives (TN): The number of instances where the model correctly predicted the negative class.
</div>

<div style="width: 50em;">

**Interpretation:**

The false positive rate tells you how often your model incorrectly classifies negatives as positives.

- A low FPR is desirable, especially in cases where false positives carry significant costs (e.g., fraud detection, medical diagnoses).
- A high FPR means the model is making too many incorrect positive predictions, which can lead to undesirable outcomes.
</div>

#### False negative rate (FNR)

<div style="width: 50em;">
    
The **False Negative Rate (FNR)** is a metric used in binary classification to measure the proportion of positive instances that are incorrectly classified as negative by the model. In other words, it tells you how often the model fails to detect positives (i.e., actual positives that were classified as negatives).

$$\text{False Negative Rate (FNR)} = \frac{\text{False Negatives (FN)}}{\text{False Negatives (FN)} + \text{True Positives (TP)}} \, \text{,}$$
where

False Negatives (FN): The number of instances where the model incorrectly predicted the negative class, but they actually belong to the positive class.

True Positives (TP): The number of instances where the model correctly predicted the positive class.
</div>

<div style="width: 50em;">

**Interpretation:**

- A low FNR is desirable, especially in applications where missing positive instances is costly (e.g., failing to detect a disease or fraud).
- A high FNR means the model is missing too many actual positives, which can lead to harmful consequences.
</div>

<div style="width: 50em;">

**Relation to Sensitivity (Recall)**

The False Negative Rate (FNR) is related to Sensitivity (Recall), which measures the proportion of true positives identified by the model. Sensitivity is given by:
$$\text{Sensitivity (Recall)} = \frac{\text{True Positives (TP)}}{\text{True Positives (TP)} + \text{False Negatives (FN)}}$$
</div>

## FPR/FNR in terms of our QA/QC process

<img src="era_obs.png" alt="ERA_obs" width="700"/>

<div style="width: 50em;">

* **False positive rate (FPR)**

**FPR** measures the proportion of "good" observations that are incorrectly classified as "bad" by the pipeline.

$$\text{False Positive Rate (FPR)} = \frac{\text{False Positives (FP)}}{\text{False Positives (FP)} + \text{True Negatives (TN)}}\,\text{,}$$
where:

FP: Number of observations incorrectly flagged by the QA/QC.

TN: Number of observations that were correctly not flagged by QA/QC.
</div>

* **False negative rate (FNR)**

<div style="width: 50em;">
    
**FNR**: proportion of "bad" observations missed by the QA/QC. In other words, how often the QA/QC fails to detect "bad" values.

$$\text{FNR} = \frac{\text{False Negatives (FN)}}{\text{False Negatives (FN)} + \text{True Positives (TP)}} \, \text{,}$$
where

FN: Number of "bad" observations missed (not flagged) by the QA/QC.

TP: Number of observations correctly flagged by the QA/QC.
</div>

In [None]:
era2ghcn_vars

In [None]:
ghcn2era_vars

### GHCNh legacy quality flags

<div style="width: 50em;">

- GHCN dataset quality codes (flags) have a very annoying feature: the **legacy codes.**
- For some reason, GHCN flags are letters or `str(numbers)`, but the column also contains `int` or `float` values.
- It took a lot of digging and confusion, but whenever the flag is a letter or a str(number), it is
a GHCN flag. Whenever it is a float or int, the flag is legacy (this is what I understand so far).
- Before realizing this, the percentage of flagged data in GHCN was almost 100%, because there is a flag that means "data is ok".
- So, I had to rewrite this to differentiate between str(number) and actual float/int numbers (I put a comment in the percentage function below).
- Also, the next few cells show a little bit of this struggle and how the flags are handled.
</div>

In [None]:
ghcnh.station_data["temperature_Quality_Code"][
    ghcnh.station_data["temperature_Quality_Code"].isna()
]

In [None]:
ghcnh.station_data["temperature_Quality_Code"][
    ~ghcnh.station_data["temperature_Quality_Code"].isna()
]

In [None]:
bad = ~ghcnh.station_data["temperature_Quality_Code"].isna()

In [None]:
len(ghcnh.station_data["temperature"]), len(ghcnh.station_data["temperature"][bad])

In [None]:
"""# FOR NOW, new stations (not downloaded locally) are failing, data is not available """

# id = ghcnh_stations_df['id'].iloc[5]
# ghcnh.read_data_from_url(id, save=True)

In [None]:
id = ghcnh_stations_df["id"].iloc[1]
print(id)
ghcnh.read_data_from_url(id, save=True)
ghcnh.convert_df_to_gpd()

In [None]:
# flags = ["L","o","F","U","D","d","W","K","C","T","S","h","V","w","N","E","p","H",
#          "0","1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17"
#         ]
# for f in flags:
#     size = len(ghcnh.station_data['temperature_Quality_Code'][ghcnh.station_data['temperature_Quality_Code']==f])
#     if f.isnumeric():
#         size = size + len(ghcnh.station_data['temperature_Quality_Code'][ghcnh.station_data['temperature_Quality_Code']==int(f)])
#     if size>0:
#         print("{}:\t{}".format(f,size))

In [None]:
ghcnh.station_data["temperature_Quality_Code"].unique()

In [None]:
print(ghcnh.station_data["temperature_Source_Code"].unique())

flags = [
    "L",
    "o",
    "F",
    "U",
    "D",
    "d",
    "W",
    "K",
    "C",
    "T",
    "S",
    "h",
    "V",
    "w",
    "N",
    "E",
    "p",
    "H",
    "0",
    "1",
    "2",
    "3",
    "4",
    "5",
    "6",
    "7",
    "8",
    "9",
    "10",
    "11",
    "12",
    "13",
    "14",
    "15",
    "16",
    "17",
    0,
    1,
    2,
    3,
    4,
    5,
    6,
    7,
    8,
    9,
    10,
    11,
    12,
    13,
    14,
    15,
    16,
    17,
]
for f in flags:
    size = len(
        ghcnh.station_data["temperature_Quality_Code"][
            ghcnh.station_data["temperature_Quality_Code"] == f
        ]
    )
    if size > 0:
        print("{} {}:\t{}".format(type(f), f, size))

In [None]:
ghcnh.station_data["temperature_Quality_Code"].transform(type).unique()

In [None]:
# (223+4+12255+2+4+7786+13)/len(ghcnh.station_data['temperature_Quality_Code'])

### Percentage of observations flagged

In [None]:
def percentage_of_flags(df, var=None, ghcn=False, show=False):
    """Return the percentage of rows of ``df`` that carry a QC flag for ``var``.

    The QC column inspected is ``<var>_Quality_Code`` when ``ghcn`` is true
    and ``<var>_eraqc`` otherwise.

    * ERA (``ghcn=False``): every non-null entry counts as a flag.
    * GHCN (``ghcn=True``): a non-null entry counts as a flag only when it is
      *not* a bare number. Entries stored as ``int``/``float`` are treated as
      legacy codes and ignored; letters and numeric *strings* are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Station data containing the QC column for ``var``.
    var : str
        Variable name used to build the QC column name.
    ghcn : bool, default False
        Select the GHCN column naming and the legacy-code filtering.
    show : bool, default False
        If true, also print the percentage.

    Returns
    -------
    float
        ``100 * flagged_rows / total_rows``; ``0.0`` for an empty ``df``.

    Raises
    ------
    TypeError
        If ``var`` is not given.
    KeyError
        If the QC column is not present in ``df``.
    """
    if var is None:
        raise TypeError("`var` must be the name of the variable to evaluate")

    qc_var = var + ("_Quality_Code" if ghcn else "_eraqc")
    qc = df[qc_var]
    n_total = len(qc)

    if n_total == 0:
        # Nothing to evaluate; avoid dividing by zero on an empty station.
        perc = 0.0
    else:
        is_flag = qc.notna()

        # To deal with GHCN legacy flags: the mask is built from the very
        # column being evaluated (not from a global dataframe), so it is
        # always aligned with `df` and valid for any variable.
        if ghcn:
            legacy = qc.map(
                lambda value: isinstance(
                    value, (int, float, np.integer, np.floating)
                )
                and not isinstance(value, bool)
            ).astype(bool)
            is_flag = is_flag & ~legacy

        perc = 100 * int(is_flag.sum()) / n_total

    if show:
        print("{:.4f} % flagged values".format(perc))

    return perc

In [None]:
var = "tas"
ghcn_var = era2ghcn_vars[var]

print("\n{}".format(specific_station))
print(df.iloc[[0, -1]]["time"].values)
perc = percentage_of_flags(df, var, ghcn=False, show=True)

print("\n{}".format(ghcnh.station_data["Station_ID"].iloc[0]))
print(ghcnh.station_data.iloc[[0, -1]]["time"].values)
perc = percentage_of_flags(ghcnh.station_data, ghcn_var, ghcn=True, show=True)

In [None]:
print("=============================")
print("ERA QA/QC:\t{}".format(specific_station))
print(df.iloc[[0, -1]]["time"].values)
perc = percentage_of_flags(df, var, ghcn=False, show=True)

for i in range(3):
    id = ghcnh_stations_df["id"].iloc[i]
    ghcnh.read_data_from_url(id, save=True)
    ghcnh.convert_df_to_gpd()
    ghcnh_df = ghcnh.station_data.copy()

    tmp_df = return_ghcn_vars(ghcnh_df, var)
    print("=============================")
    print("{}".format(id))
    print(ghcnh_df.iloc[[0, -1]]["time"].values)
    perc = percentage_of_flags(tmp_df, ghcn_var, ghcn=True, show=True)

In [None]:
print("=============================")
print("ERA QA/QC:\t{}".format(specific_station))
print(df.iloc[[0, -1]]["time"].values)
perc = percentage_of_flags(df, var, ghcn=False, show=True)

for i in range(3):
    id = ghcnh_stations_df["id"].iloc[i]
    ghcnh.read_data_from_url(id, save=True)
    ghcnh.convert_df_to_gpd()
    ghcnh_df = ghcnh.station_data.copy()

    tmp_df = return_ghcn_vars(ghcnh_df, var)
    print("=============================")
    print("{}".format(id))
    print(ghcnh_df.iloc[[0, -1]]["time"].values)
    perc = percentage_of_flags(tmp_df, ghcn_var, ghcn=True, show=True)

### Station flag/time/coords statistics

In [None]:
def station_statistics(df, var=None, ghcn=False, show=False):
    """Summarize a station dataframe: variables, time span, coordinates, flags.

    Parameters
    ----------
    df : pandas.DataFrame
        Station data with a ``time`` column and either ``Longitude``/``Latitude``
        (``ghcn=True``) or ``lon``/``lat`` (``ghcn=False``) columns.
    var : optional
        Accepted for interface compatibility; not used by this function.
    ghcn : bool, default False
        Selects the coordinate column names, which side of ``era2ghcn_vars``
        (values vs. keys) is matched against the columns, and is forwarded
        to ``percentage_of_flags``.
    show : bool, default False
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    tuple
        ``(variables, t0, t1, nTimes, frequency, flag_percentages, coords)``:
        matched column names, first and last ``time`` values, number of rows,
        median difference between consecutive ``time`` values, a dict mapping
        each matched variable to its flagged percentage, and the set of
        distinct ``(lon, lat)`` pairs.
    """
    first_time, last_time = df.iloc[[0, -1]]["time"].values
    n_obs = len(df)

    lon_col, lat_col = ("Longitude", "Latitude") if ghcn else ("lon", "lat")
    unique_coords = set(zip(df[lon_col], df[lat_col]))

    # NOTE(review): the "h" unit is passed alongside the median of a diff,
    # which is presumably already a Timedelta -- confirm it has any effect.
    median_step = df["time"].diff().median()
    frequency = pd.Timedelta(median_step, "h")

    # GHCN frames use the GHCN-side names (mapping values); ERA frames use
    # the ERA-side names (mapping keys).
    if ghcn:
        known_names = list(era2ghcn_vars.values())
    else:
        known_names = list(era2ghcn_vars.keys())
    variables = [column for column in df.columns if column in known_names]

    flag_percentages = {
        name: percentage_of_flags(df, var=name, ghcn=ghcn, show=False)
        for name in variables
    }

    return (
        variables,
        first_time,
        last_time,
        n_obs,
        frequency,
        flag_percentages,
        unique_coords,
    )

In [None]:
ghcnh.station_data.columns

In [None]:
station_statistics(df)

In [None]:
station_statistics(ghcnh.station_data, ghcn=True)

In [None]:
def print_station_stats(stats):
    """Pretty-print the tuple produced by ``station_statistics``.

    Parameters
    ----------
    stats : tuple
        Seven items, in order: variables, first time, last time, number of
        observations, frequency, a mapping of variable name to flagged
        percentage, and an iterable of coordinates.
    """
    (
        variables,
        first_time,
        last_time,
        n_obs,
        frequency,
        flag_percentages,
        coords,
    ) = stats
    rule = "............................."

    print(f"Variables: {variables}")
    print(f"Timespan: {first_time}-{last_time}")
    print("Coordinates:")
    for coord in coords:
        print(f"{coord}")
    print(f"Number of observations: {n_obs}")
    print(f"Frequency: {frequency}")

    # Flag section: a title framed by two dotted rules, one line per variable.
    print(rule)
    print("Flag percentages:")
    print(rule)
    for name, percent in flag_percentages.items():
        print(f"{name}: {percent:.4f}%")

In [None]:
print("=============================")
print("ERA QA/QC:\t{}".format(specific_station))
print("-----------------------------")
stats = station_statistics(df)
print_station_stats(stats)
for i in range(3):
    id = ghcnh_stations_df["id"].iloc[i]
    ghcnh.read_data_from_url(id, save=True)
    ghcnh.convert_df_to_gpd()
    ghcnh_df = ghcnh.station_data.copy()
    stats = station_statistics(ghcnh_df, ghcn=True)
    print("\n=============================")
    print("{}".format(id))
    print("-----------------------------")
    print_station_stats(stats)

In [None]:
ghcnh_stations_df.iloc[1]

In [None]:
ghcnh_df[vars[var] + "_Quality_Code"]

In [None]:
ghcnh_df[vars[var]].plot()
ghcnh_df.dropna(subset=vars[var] + "_Quality_Code")[vars[var]].plot(marker=".", lw=0)

In [None]:
fig, ax = plt.subplots(figsize=(10, 3))
pd.DataFrame(ghcnh_df).iloc[5000:5200].plot(
    x="time", y="temperature", marker="x", lw=0, ax=ax
)
pd.DataFrame(ghcnh_df).iloc[5000:5200].dropna(subset=vars[var] + "_Quality_Code").plot(
    x="time", y="temperature", marker="o", mfc="none", lw=0, ax=ax
)

In [None]:
len(ghcnh_df.dropna(subset=vars[var] + "_Quality_Code")) / len(ghcnh_df)

In [None]:
print(
    len(ghcnh_df[vars[var] + "_Quality_Code"].dropna())
    / len(ghcnh_df[vars[var] + "_Quality_Code"])
)

In [None]:
ghcnh_df[vars[var] + "_Quality_Code"].unique()

In [None]:
np.setdiff1d(ghcnh_df[vars[var] + "_Quality_Code"].dropna().index, ghcnh_df.index)

In [None]:
# Boolean mask of rows whose quality-code entry is missing (no flag assigned).
# NOTE(review): `vars` must be a mapping defined in an earlier cell; the only
# visible assignment (`df, vars = return_ghcn_vars(...)`) is commented out.
ind = pd.isna(ghcnh_df[vars[var] + "_Quality_Code"])
len(np.where(ind)[0])
# Count on the frame the mask was built from. `plot_df` at this point may
# still be a different table left over from an earlier cell, and indexing it
# with this mask would be misaligned.
len(ghcnh_df[ind])

In [None]:
fig, ax = plt.subplots(figsize=(10, 3))
plot_df = pd.DataFrame(ghcnh_df)
plot_df.plot(
    x="time", y=vars[var], ax=ax, marker=".", lw=0.5, color="black", markersize=2
)
ax.fill_between(
    x=plot_df[ind].time,
    y1=ax.get_ylim()[0],
    y2=ax.get_ylim()[1],
    color="skyblue",
    alpha=0.5,
)
# plot_df = plot_df[ind]
# plot_df.plot(x="time", y=vars[var],
#              ax=ax, marker="o", lw=0.0, color="red",
#              markersize=4, label='Streaks', mfc='none')

In [None]:
fig, ax = plt.subplots(figsize=(10, 3))
plot_df = pd.DataFrame(ghcnh_df)
plot_df.plot(
    x="time", y=vars[var], ax=ax, marker=".", lw=0.5, color="black", markersize=2
)
plot_df = plot_df[plot_df[vars[var] + "_Quality_Code"] == "K"]
plot_df.plot(
    x="time",
    y=vars[var],
    ax=ax,
    marker="o",
    lw=0.0,
    color="red",
    markersize=4,
    label="Streaks",
    mfc="none",
)

In [None]:
fig, ax = plt.subplots(figsize=(10, 3))
plot_df = pd.DataFrame(ghcnh_df)
plot_df.plot(
    x="time", y=vars[var], ax=ax, marker=".", lw=0.5, color="black", markersize=2
)
plot_df = plot_df[plot_df[vars[var] + "_Quality_Code"] == "K"]
plot_df.plot(
    x="time",
    y=vars[var],
    ax=ax,
    marker="o",
    lw=0.0,
    color="red",
    markersize=4,
    label="Streaks",
    mfc="none",
)
ax.set_xlim(np.datetime64("2003-03-08"), np.datetime64("2003-03-23"))

In [None]:
fig, ax = plt.subplots(figsize=(10, 3))
plot_df = pd.DataFrame(ghcnh_df)
plot_df.plot(
    x="time", y=vars[var], ax=ax, marker=".", lw=0.5, color="black", markersize=2
)
plot_df = plot_df[plot_df[vars[var] + "_Quality_Code"] == "o"]
plot_df.plot(
    x="time",
    y=vars[var],
    ax=ax,
    marker="o",
    lw=0.0,
    color="red",
    markersize=4,
    label="Out of range",
    mfc="none",
)

In [None]:
fig, ax = plt.subplots(figsize=(10, 3))
plot_df = pd.DataFrame(ghcnh_df)
plot_df.plot(
    x="time", y=vars[var], ax=ax, marker=".", lw=0.5, color="black", markersize=2
)
plot_df = plot_df[plot_df[vars[var] + "_Quality_Code"] == "f"]
plot_df.plot(
    x="time",
    y=vars[var],
    ax=ax,
    marker="x",
    lw=0.0,
    color="red",
    markersize=4,
    label="Suspect or Error",
    mfc="none",
)

In [None]:
ghcnh_df.columns

In [None]:
ghcnh_df["relative_humidity_Measurement_Code"].unique()

In [None]:
ghcnh_df["relative_humidity_Quality_Code"].unique()

In [None]:
ghcnh_df["relative_humidity_Report_Type"].unique()

In [None]:
ghcnh_df["relative_humidity_Source_Code"].unique()

In [None]:
ghcnh_df["relative_humidity_Source_Station_ID"].unique()

In [None]:
# df[var+"_eraqc"]

In [None]:
s3 = s3fs.S3FileSystem(anon=False)

In [None]:
fname = specific_station
network = fname.split("_")[0]
s3_url = "s3://wecc-historical-wx/3_qaqc_wx_dev/{}/{}.nc".format(network, fname)

In [None]:
s3.download()

In [None]:
df.dropna(subset=vars["tas"])

In [None]:
len(df.temperature_Quality_Code.dropna()) / len(df.temperature_Quality_Code)

- ERA5 map
- Accumulated flags
- Interpolated stations or mean


In [None]:
ghcnh.convert_df_to_gpd(ghcnh.station_data)