# Station Matching

The goal of this notebook is to identify stations that changed IDs. The IDs fo these pairs of matching stations will be stored in a csv, which then be fed into concentation as a lookup table. Pairs will receive unique flags, with the older station receiving a different flag than the newer station.


## Step 0: Environment set-up

In [21]:
from shapely.geometry import Point
from shapely.ops import nearest_points

from functools import reduce
import datetime
from pandas import *
import boto3
import geopandas as gpd
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
from io import BytesIO, StringIO

import tempfile  # Used for downloading (and then deleting) netcdfs to local drive from s3 bucket

import s3fs

# import tempfile  # Used for downloading (and then deleting) netcdfs to local drive from s3 bucket
import os

# Silence warnings
import warnings
from shapely.errors import ShapelyDeprecationWarning

warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings(
    "ignore", category=ShapelyDeprecationWarning
)  # Warning is raised when creating Point object from coords. Can't figure out why.

plt.rcParams["figure.dpi"] = 300

In [4]:
# AWS credentials
s3 = s3fs.S3FileSystem #must be set to this to use such commands as ls
#s3 = boto3.resource('s3')
s3_client = boto3.client('s3')

## AWS buckets
bucket = "wecc-historical-wx"
qaqcdir = "3_qaqc_wx/VALLEYWATER/"
mergedir = "4_merge_wx/VALLEYWATER/"

In [5]:
# Define temporary directory in local drive for downloading data from S3 bucket
# If the directory doesn't exist, it will be created
# If we used zarr, this wouldn't be neccessary
temp_dir = "./tmp"
if not os.path.exists(temp_dir):
    os.mkdir(temp_dir)

In [6]:
def read_nc_from_s3_clean(network_name, station_id, temp_dir):
    """Read netcdf file containing station data for a single station of interest from AWS s3 bucket

    Parameters
    ----------
    network_name: str
        Name of network (i.e. "ASOSAWOS")
        Must correspond with a valid directory in the s3 bucket (i.e. "CAHYDRO", "CDEC", "ASOSAWOS")
    station_id: str
        Station identifier; i.e. the name of the netcdf file in the bucket (i.e. "ASOSAWOS_72012200114.nc")

    Returns
    -------
    station_data: xr.Dataset

    Notes
    -----
    The data is first downloaded from AWS into a tempfile, which is then deleted after xarray reads in the file
    I'd like to see us use a zarr workflow if possible to avoid this.

    """

    # Temp file for downloading from s3
    temp_file = tempfile.NamedTemporaryFile(
        dir=temp_dir, prefix="", suffix=".nc", delete=True
    )

    # Create s3 file system
    s3 = s3fs.S3FileSystem(anon=False)

    # Get URL to netcdf in S3
    s3_url = "s3://wecc-historical-wx/2_clean_wx/{}/{}.nc".format(
        network_name, station_id
    )

    # Read in the data using xarray
    s3_file_obj = s3.get(s3_url, temp_file.name)
    station_data = xr.open_dataset(temp_file.name, engine="h5netcdf").load()

    # Close temporary file
    temp_file.close()

    return station_data

In [7]:
def read_zarr_from_s3(station_id, temp_dir):
    """Read zarr file containing station data for a single station of interest from AWS s3 bucket

    Parameters
    ----------
    network_name: str
        Name of network (i.e. "ASOSAWOS")
        Must correspond with a valid directory in the s3 bucket (i.e. "CAHYDRO", "CDEC", "ASOSAWOS")
    station_id: str
        Station identifier; i.e. the name of the netcdf file in the bucket (i.e. "ASOSAWOS_72012200114.nc")

    Returns
    -------
    station_data: xr.Dataset

    Notes
    -----
    The data is first downloaded from AWS into a tempfile, which is then deleted after xarray reads in the file
    """

    # Temp file for downloading from s3
    temp_file = tempfile.NamedTemporaryFile(
        dir=temp_dir, prefix="", suffix=".zarr", delete=True
    )

    # Create s3 file system
    s3 = s3fs.S3FileSystem(anon=False)

    # Get URL to netcdf in S3
    s3_url = "s3://wecc-historical-wx/3_qaqc_wx/VALLEYWATER/VALLEYWATER_{}.zarr".format(
        station_id
    )
    print(s3_url)

    # Read in the data using xarray
    s3_file_obj = s3.get(s3_url, temp_file.name)
    station_data = xr.open_dataset(temp_file.name, engine="zarr").load()

    # Close temporary file
    temp_file.close()

    return station_data

In [8]:
def qaqc_ds_to_df(ds, verbose=False):
    """Converts xarray ds for a station to pandas df in the format needed for the pipeline

    Parameters
    ----------
    ds : xr.Dataset
        input data from the clean step
    verbose : bool, optional
        if True, provides runtime output to the terminal

    Returns
    -------
    df : pd.DataFrame
        converted xr.Dataset into dataframe
    MultiIndex : pd.Index
        multi-index of station and time
    attrs : list of str
        attributes from xr.Dataset
    var_attrs : list of str
        variable attributes from xr.Dataset
    era_qc_vars : list of str
        QAQC variables

    Notes
    -----
    This is the notebook friendly version (no logger statements).
    """
    ## Add qc_flag variable for all variables, including elevation;
    ## defaulting to nan for fill value that will be replaced with qc flag

    for key, val in ds.variables.items():
        if val.dtype == object:
            if key == "station":
                if str in [type(v) for v in ds[key].values]:
                    ds[key] = ds[key].astype(str)
            else:
                if str in [type(v) for v in ds.isel(station=0)[key].values]:
                    ds[key] = ds[key].astype(str)

    exclude_qaqc = [
        "time",
        "station",
        "lat",
        "lon",
        "qaqc_process",
        "sfcWind_method",
        "pr_duration",
        "pr_depth",
        "PREC_flag",
        "rsds_duration",
        "rsds_flag",
        "anemometer_height_m",
        "thermometer_height_m",
    ]  # lat, lon have different qc check

    raw_qc_vars = []  # qc_variable for each data variable, will vary station to station
    era_qc_vars = []  # our ERA qc variable
    old_era_qc_vars = []  # our ERA qc variable

    for var in ds.data_vars:
        if "q_code" in var:
            raw_qc_vars.append(
                var
            )  # raw qc variable, need to keep for comparison, then drop
        if "_qc" in var:
            raw_qc_vars.append(
                var
            )  # raw qc variables, need to keep for comparison, then drop
        if "_eraqc" in var:
            era_qc_vars.append(
                var
            )  # raw qc variables, need to keep for comparison, then drop
            old_era_qc_vars.append(var)

    print(f"era_qc existing variables:\n{era_qc_vars}")
    n_qc = len(era_qc_vars)

    for var in ds.data_vars:
        if var not in exclude_qaqc and var not in raw_qc_vars and "_eraqc" not in var:
            qc_var = var + "_eraqc"  # variable/column label

            # if qaqc var does not exist, adds new variable in shape of original variable with designated nan fill value
            if qc_var not in era_qc_vars:
                print(f"nans created for {qc_var}")
                ds = ds.assign({qc_var: xr.ones_like(ds[var]) * np.nan})
                era_qc_vars.append(qc_var)

    print("{} created era_qc variables".format(len(era_qc_vars) - len(old_era_qc_vars)))
    if len(era_qc_vars) != n_qc:
        print("{}".format(np.setdiff1d(old_era_qc_vars, era_qc_vars)))

    # Save attributes to inheret them to the QAQC'ed file
    attrs = ds.attrs
    var_attrs = {var: ds[var].attrs for var in list(ds.data_vars.keys())}

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        df = ds.to_dataframe()

    # instrumentation heights
    if "anemometer_height_m" not in df.columns:
        try:
            df["anemometer_height_m"] = (
                np.ones(ds["time"].shape) * ds.anemometer_height_m
            )
        except:
            print("Filling anemometer_height_m with NaN.", flush=True)
            df["anemometer_height_m"] = np.ones(len(df)) * np.nan
        finally:
            pass
    if "thermometer_height_m" not in df.columns:
        try:
            df["thermometer_height_m"] = (
                np.ones(ds["time"].shape) * ds.thermometer_height_m
            )
        except:
            print("Filling thermometer_height_m with NaN.", flush=True)
            df["thermometer_height_m"] = np.ones(len(df)) * np.nan
        finally:
            pass

    # De-duplicate time axis
    df = df[~df.index.duplicated()].sort_index()

    # Save station/time multiindex
    MultiIndex = df.index
    station = df.index.get_level_values(0)
    df["station"] = station

    # Station pd.Series to str
    station = station.unique().values[0]

    # Convert time/station index to columns and reset index
    df = df.droplevel(0).reset_index()

    # Add time variables needed by multiple functions
    df["hour"] = pd.to_datetime(df["time"]).dt.hour
    df["day"] = pd.to_datetime(df["time"]).dt.day
    df["month"] = pd.to_datetime(df["time"]).dt.month
    df["year"] = pd.to_datetime(df["time"]).dt.year
    df["date"] = pd.to_datetime(df["time"]).dt.date

    return df  # , MultiIndex, attrs, var_attrs, era_qc_vars

## Step 1: Identify Matching Stations

### Identical latitiude and longitudes

In [None]:
### Read in ASOSAWOS stations

s3_cl = boto3.client("s3")  # for lower-level processes

asosawos = s3_cl.get_object(
    Bucket="wecc-historical-wx", Key="1_raw_wx/ASOSAWOS/stationlist_ASOSAWOS.csv"
)
asosawos_list = pd.read_csv(BytesIO(asosawos["Body"].read()))


print(asosawos_list) # 399 records (last two are nans)

In [10]:
### Round asosawos down to 3 decimal points of accuracy
asosawos_round = asosawos_list.round({"LAT": 3, "LON": 3})

In [13]:
print(asosawos_round[asosawos_round.duplicated(subset=['LAT','LON'], keep=False)])

     Unnamed: 0    NCDCID   WBAN    COOPID CALL             NAME  \
231         231  20022657  24027  487845.0  RKS  ROCK SPRINGS AP   
397         397  20022657  24027  487845.0  RKS  ROCK SPRINGS AP   

             ALTNAME        COUNTRY  ST      COUNTY     LAT      LON  \
231  ROCK SPRINGS AP  UNITED STATES  WY  SWEETWATER  41.595 -109.053   
397              NaN  UNITED STATES  WY  SWEETWATER  41.595 -109.053   

          ELEV  UTC                             STNTYPE   STARTDATE  \
231  2059.8384   -7  AIRWAYS,ASOS,AWOS,COOP,USHCN,WXSVC  20010517.0   
397  2059.8384   -7  AIRWAYS,ASOS,AWOS,COOP,USHCN,WXSVC         NaN   

    GHCN-DailyID  Barometer_elev  Anemometer_elev  
231  USW00024027          6763.0             33.0  
397          NaN             NaN              NaN  


### Stations within a certain distance

#### approach 1: using sjoin

In [16]:
# Convert the data into GeoDataFrames
# using EPSG 3310

gdf_asosawos = gpd.GeoDataFrame(asosawos_list, 
                        geometry=[Point(lon, lat) for lon, lat in zip(asosawos_list['LON'], asosawos_list['LAT'])],
                        crs="EPSG:4326").to_crs(epsg=3310)  

In [17]:
# Create a buffer around points in gdf1 (e.g., 10 km buffer)
gdf_asosawos['buffer'] = gdf_asosawos.geometry.buffer(.1)  # Buffer in degrees, 0.1 degrees approx equals 10 km

In [None]:
# Perform a spatial join using the buffer
merged = gpd.sjoin(gdf_asosawos, gdf_asosawos[['geometry', 'buffer']], how="inner", predicate="within")

# The 'merged' GeoDataFrame contains points from gdf_isd that are within the buffer around points in gdf_asosawos

In [None]:
print(merged) # there are not ISD stations within 10km of an ASOSAWOS station missed by the exact matching

In [None]:
# find the nearest point in the geodataframe

# insert emtpy column

gdf_asosawos['nearest_station'] = pd.Series(dtype='U16')
gdf_asosawos['distance'] = pd.Series(dtype='float32')


In [None]:
for index, row in gdf_asosawos.iterrows():
    point = row.geometry
    multipoint = gdf_asosawos.drop(index, axis=0).geometry.unary_union
    queried_geom, nearest_geom = nearest_points(point, multipoint)
    dist_from_point = 
    gdf_asosawos.loc[index, 'nearest_geometry'] = nearest_geom
    gdf_asosawos.loc[index, 'distance'] = nearest_geom

In [26]:
gdf_asosawos

Unnamed: 0.1,Unnamed: 0,NCDCID,WBAN,COOPID,CALL,NAME,ALTNAME,COUNTRY,ST,COUNTY,...,STNTYPE,STARTDATE,GHCN-DailyID,Barometer_elev,Anemometer_elev,geometry,buffer,nearest_station,distance,nearest_geometry
0,0,10001445,93107,,NKX,SAN DIEGO MIRAMAR NAS,SAN DIEGO MIRAMAR NAS,UNITED STATES,CA,SAN DIEGO,...,"ASOS,MILITARY",19460501.0,USW00093107,-99999.0,,POINT (268594.210 -567731.751),"POLYGON ((268594.310 -567731.751, 268594.310 -...",,,POINT (268380.7766827465 -573515.895529184)
1,1,20001392,93115,,NRS,IMPERIAL BEACH REAM FLD NAS,IMPERIAL BEACH REAM FLD NAS,UNITED STATES,CA,SAN DIEGO,...,ASOS,19900601.0,USW00093115,20.0,33.0,POINT (271115.479 -600748.790),"POLYGON ((271115.579 -600748.790, 271115.579 -...",,,POINT (282674.1091138162 -599519.0084458143)
2,2,20001424,23199,,NJK,EL CENTRO NAF,EL CENTRO NAF,UNITED STATES,CA,IMPERIAL,...,ASOS,19900601.0,USW00023199,-99999.0,33.0,POINT (406379.109 -566292.451),"POLYGON ((406379.209 -566292.451, 406379.208 -...",,,POINT (414534.71525172977 -565668.0918465396)
3,3,20003403,23061,50130.0,ALS,ALAMOSA-BERGMAN FLD,ALAMOSA-BERGMAN FLD,UNITED STATES,CO,ALAMOSA,...,"ASOS,COOP,PLCD,WXSVC",19920901.0,USW00023061,7536.0,33.0,POINT (1244561.911 28820.201),"POLYGON ((1244562.011 28820.201, 1244562.010 2...",,,POINT (1304213.5410201757 44831.97136883857)
4,4,20003486,93058,56740.0,PUB,PUEBLO MEM AP,PUEBLO MEM AP,UNITED STATES,CO,PUEBLO,...,"ASOS,COOP,PLCD,WXSVC",19921001.0,USW00093058,4655.0,33.0,POINT (1347557.192 140631.522),"POLYGON ((1347557.292 140631.522, 1347557.291 ...",,,POINT (1318986.009418833 179847.9929464655)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
396,396,30003059,94086,,PNA,PINEDALE WENZ FLD AP,,UNITED STATES,WY,SUBLETTE,...,AWOS,,,,,POINT (835061.761 575502.976),"POLYGON ((835061.861 575502.976, 835061.861 57...",,,POINT (812853.3423008773 548915.6029244149)
397,397,20022657,24027,487845.0,RKS,ROCK SPRINGS AP,,UNITED STATES,WY,SWEETWATER,...,"AIRWAYS,ASOS,AWOS,COOP,USHCN,WXSVC",,,,,POINT (911903.967 450185.982),"POLYGON ((911904.067 450185.982, 911904.067 45...",,,POINT (911903.9670986698 450185.98208982404)
398,398,30083363,342,,FWZ,S PASS,,UNITED STATES,WY,FREMONT,...,AWOS,,,,,POINT (922215.161 554338.873),"POLYGON ((922215.261 554338.873, 922215.261 55...",,,POINT (922956.2946331993 587702.7931789225)
399,399,30083498,475,,SAA,SHIVELY FIELD AP,,UNITED STATES,WY,CARBON,...,AWOS,,,,,POINT (1098509.279 457151.554),"POLYGON ((1098509.379 457151.554, 1098509.378 ...",,,POINT (1062617.9563132052 492836.3751639202)


#### approach 2: distance function

In [20]:
## function to calculate the distance between points

def distance_sort_filter(row, df2, buffer=None, id=False):

    dist = df2.geometry.distance(row).sort_values()

    if buffer:
        dist = dist[dist<buffer]

    if id:
        distances = {df2.loc[idx]['WBAN']:value for idx,value in zip(dist.index, dist.values)}
    else:
        distances = {idx:value for idx,value in zip(dist.index, dist.values)}
    
    return distances

### Generate a csv with three columns
1. ID: station ID (which column do I use for this?)
2. match flag: stations that match each other have the same flags (1,2,3,4,etc.)
3. age flag: which station in each pair is older (1,2)

In [27]:
target = asosawos_round[asosawos_round['WBAN']==24027]
target

Unnamed: 0.1,Unnamed: 0,NCDCID,WBAN,COOPID,CALL,NAME,ALTNAME,COUNTRY,ST,COUNTY,LAT,LON,ELEV,UTC,STNTYPE,STARTDATE,GHCN-DailyID,Barometer_elev,Anemometer_elev
231,231,20022657,24027,487845.0,RKS,ROCK SPRINGS AP,ROCK SPRINGS AP,UNITED STATES,WY,SWEETWATER,41.595,-109.053,2059.8384,-7,"AIRWAYS,ASOS,AWOS,COOP,USHCN,WXSVC",20010517.0,USW00024027,6763.0,33.0
397,397,20022657,24027,487845.0,RKS,ROCK SPRINGS AP,,UNITED STATES,WY,SWEETWATER,41.595,-109.053,2059.8384,-7,"AIRWAYS,ASOS,AWOS,COOP,USHCN,WXSVC",,,,


In [None]:
# Run function - generate station map
def get_maps(subdf1, subdf2, shapepath):
    
    # ------------------------------------------------------------------------------------------------------------
    # Make a geodataframe.
    gdf1 = gpd.GeoDataFrame(subdf1, geometry=gpd.points_from_xy(subdf1.LON, subdf1.LAT))
    gdf2 = gpd.GeoDataFrame(subdf2, geometry=gpd.points_from_xy(subdf2.LON, subdf2.LAT))

    gdf1.set_crs(epsg=4326, inplace=True) # Set CRS
    gdf2.set_crs(epsg=4326, inplace=True) # Set CRS
    
    # Project data to match base tiles.
    gdf1_wm = gdf1.to_crs(epsg=3857) # Web mercator
    gdf2_wm = gdf2.to_crs(epsg=3857) # Web mercator

    # Read in geometry of continental US.
    us = gpd.read_file(shapepath)

    # Remove territories, AK, HI
    rem_list = ["HI", "AK", "MP", "GU", "AS", "PR", "VI"]
    us = us.loc[us.STUSPS.isin(rem_list) == False]

    # Use to clip stations
    us = us.to_crs(epsg = 3857)

    gdf1_us = gdf1_wm.clip(us)
    gdf2_us = gdf2_wm.clip(us)

    # ------------------------------------------------------------------------------------------------------------
    # Version 1 - full map
    ax = gdf1_us.plot(color='blue', figsize=(15, 15), alpha=0.6, markersize=5, legend=True)
    gdf2_us.plot(ax=ax, color='red', alpha=0.6, markersize=5, legend=True)  # Plot gdf2 on same axis, with a different color

    # Add basemap
    cx.add_basemap(ax, source=cx.providers.CartoDB.Positron)

    # Set up the axis
    ax.set_axis_off()

    # Optional: Create custom legend
    ax.legend(['asosawos', 'isd'], loc='upper right')

    
shapepath = "s3://wecc-historical-wx/0_maps/tl_2021_us_state"

### Find matches within a certain distance - Approach 3

using spatial join