# GHCNh false positive rate

In [1]:
# import libraries
import pandas as pd
import numpy as np
import xarray as xr
import geopandas as gpd
from geopandas import GeoDataFrame
from shapely.geometry import Point
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cf
from matplotlib.ticker import MaxNLocator
import sys
# Directory that holds GHCNh_lib.py; it is added to sys.path so the import below resolves.
# NOTE(review): absolute, machine-specific path — adjust when running on another machine.
ghcnh_lib_path = "/Users/hector/ERA_work/historical-obs-platform/test_platform/scripts/3_qaqc_data/qaqc_eval_notebooks/GHCNh"
sys.path.append(ghcnh_lib_path)
from GHCNh_lib import GHCNh # Relies on ghcnh_lib_path having been appended to sys.path above

%load_ext autoreload
%autoreload 2

In [2]:
%%time
# Load one example station and show its first rows.
# NOTE(review): the behaviour of the GHCNh methods below is defined in GHCNh_lib, not in
# this notebook; the comments describing them are assumptions to confirm there.
ghcnh = GHCNh(stations_local=True)  # presumably uses a locally stored station list — confirm
ghcnh.select_wecc()  # presumably restricts the station list to the WECC region — confirm
# Take the first station id from the stations table.
# NOTE: `id` shadows the Python builtin `id()` for the rest of the kernel session.
id = ghcnh.stations_df['id'].iloc[0]
# Presumably fetches this station's records into `ghcnh.station_data` and, with save=True,
# keeps a local copy — confirm in GHCNh_lib.
ghcnh.read_data_from_url(id, save=True)
ghcnh.convert_df_to_gpd()  # presumably converts the station data to a GeoDataFrame — confirm
# Station coordinates, taken as the mean of the per-record Longitude/Latitude columns.
lon = ghcnh.station_data.Longitude.mean()
lat = ghcnh.station_data.Latitude.mean()
print("{}, {:.5f}, {:.5f}".format(id, lon, lat))
# Last expression of the cell: display the first three records.
ghcnh.station_data.head(3)

  self.station_data = pd.read_csv(self.local_filename, sep='|')


CAW00025343, -133.05000, 54.25000
CPU times: user 3.09 s, sys: 280 ms, total: 3.37 s
Wall time: 3.37 s


Unnamed: 0,Station_ID,Station_name,Year,Month,Day,Hour,Minute,Latitude,Longitude,Elevation,...,precipitation_24_hour_Report_Type,precipitation_24_hour_Source_Code,precipitation_24_hour_Source_Station_ID,remarks,remarks_Measurement_Code,remarks_Quality_Code,remarks_Report_Type,remarks_Source_Code,remarks_Source_Station_ID,time
0,CAW00025343,LANGARA,1954,1,1,9,0,54.25,-133.05,41.1,...,,,,,,,,,,1954-01-01 09:00:00
1,CAW00025343,LANGARA,1954,1,1,21,0,54.25,-133.05,41.1,...,,,,,,,,,,1954-01-01 21:00:00
2,CAW00025343,LANGARA,1954,1,2,3,0,54.25,-133.05,41.1,...,,,,,,,,,,1954-01-02 03:00:00


In [28]:
def return_ghcn_vars(ghcn_df, input_var):
    '''
    Subset a GHCNh station dataframe to its station-info columns plus the
    columns belonging to a single variable, so callers do not have to deal
    with the full set of columns or know how GHCNh labels them.

    Parameters
    ----------
    ghcn_df : pd.DataFrame
        GHCNh station data. Must contain the station-info columns
        (Station_ID, Station_name, time, Latitude, Longitude, Elevation) and
        the GHCNh column of the requested variable.
    input_var : str
        Variable name following the ERA naming scheme (tas, tdps, ps, pr, etc.).

    Returns
    -------
    tuple (pd.DataFrame, dict)
        The subset dataframe (station-info columns followed by the variable's
        columns) and the ERA-name -> GHCNh-column-name mapping.

    Raises
    ------
    Exception
        If `input_var` is not a key of the mapping.
    KeyError
        If the mapped GHCNh column is not present in `ghcn_df` (e.g. 'rsds',
        which is mapped to the placeholder "N/A").
    '''
    # Station-ID, name, time, location and elevation columns, always returned.
    stn_info_cols = ['Station_ID', 'Station_name', 'time',
                     'Latitude', 'Longitude', 'Elevation']

    # The selection below is positional: the variable's own column plus the
    # columns that directly follow it (value, Measurement_Code, Quality_Code,
    # Report_Type, Source_Code, Source_Station_ID in the GHCNh layout).
    n_cols_per_var = 6

    era_to_ghcnh = {
        'tas': 'temperature',
        'tdps': 'dew_point_temperature',
        'tdps_derived': 'dew_point_temperature',
        'ps': 'station_level_pressure',
        # Sea-level pressure has its own GHCNh column; it previously pointed
        # at station_level_pressure, duplicating 'ps'.
        'psl': 'sea_level_pressure',
        'sfcWind_dir': 'wind_direction',
        'sfcWind': 'wind_speed',
        'hurs': 'relative_humidity',
        'rsds': "N/A",  # placeholder: no GHCNh column is mapped for this variable
        'pr': 'precipitation',
        'pr_1h': 'precipitation',
        'pr_5min': 'precipitation',
    }

    if input_var not in era_to_ghcnh:
        raise Exception(f"Variable {input_var} not in variables' dictionary")

    ghcnh_name = era_to_ghcnh[input_var]
    if ghcnh_name not in ghcn_df.columns:
        # KeyError matches what Index.get_loc would raise, with a clearer message.
        raise KeyError(
            f"GHCNh column '{ghcnh_name}' for variable {input_var} "
            "is not present in the dataframe"
        )

    start = ghcn_df.columns.get_loc(ghcnh_name)
    stop = start + n_cols_per_var
    # For wind, include wind gust (the next block of columns after wind_speed).
    if input_var == "sfcWind":
        stop += n_cols_per_var

    var_cols = list(ghcn_df.columns[start:stop])
    return ghcn_df.loc[:, stn_info_cols + var_cols], era_to_ghcnh

In [29]:
# Keep the station-info columns plus the air-temperature ('tas') columns of the loaded station.
# NOTE: `vars` (the mapping returned alongside the dataframe) shadows the Python builtin
# `vars()` for the rest of the kernel session.
df, vars = return_ghcn_vars(ghcnh.station_data, 'tas')

In [34]:
# Display the rows that have a value in the 'tas' column (result is shown, not assigned).
df.dropna(subset=[vars["tas"]])

Unnamed: 0,Station_ID,Station_name,time,Latitude,Longitude,Elevation,temperature,temperature_Measurement_Code,temperature_Quality_Code,temperature_Report_Type,temperature_Source_Code,temperature_Source_Station_ID
0,CAW00025343,LANGARA,1954-01-01 09:00:00,54.25,-133.05,41.1,3.9,,U,SAO-Airway,335.0,999999-25343
1,CAW00025343,LANGARA,1954-01-01 21:00:00,54.25,-133.05,41.1,1.7,,U,SAO-Airway,335.0,999999-25343
2,CAW00025343,LANGARA,1954-01-02 03:00:00,54.25,-133.05,41.1,3.3,,4,SAO-Airway,335.0,999999-25343
3,CAW00025343,LANGARA,1954-01-02 09:00:00,54.25,-133.05,41.1,2.8,,4,SAO-Airway,335.0,999999-25343
4,CAW00025343,LANGARA,1954-01-02 21:00:00,54.25,-133.05,41.1,1.1,,4,SAO-Airway,335.0,999999-25343
...,...,...,...,...,...,...,...,...,...,...,...,...
87612,CAW00025343,LANGARA,1999-12-31 06:00:00,54.25,-133.05,41.1,2.5,,1,SYMT-Synop,335.0,718990-25343
87613,CAW00025343,LANGARA,1999-12-31 12:00:00,54.25,-133.05,41.1,5.5,,1,SYMT-Synop,335.0,718990-25343
87614,CAW00025343,LANGARA,1999-12-31 15:00:00,54.25,-133.05,41.1,5.7,,1,SYMT-Synop,335.0,718990-25343
87615,CAW00025343,LANGARA,1999-12-31 18:00:00,54.25,-133.05,41.1,5.5,,1,SYMT-Synop,335.0,718990-25343


In [43]:
# Fraction of rows whose temperature quality code is present (non-null).
df["temperature_Quality_Code"].dropna().shape[0] / df["temperature_Quality_Code"].shape[0]

0.9602136547284804