# Station Matching


## Environment set-up

In [1]:
from shapely.geometry import Point
from shapely.ops import nearest_points

from functools import reduce
import datetime
from pandas import *
import boto3
import geopandas as gpd
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
from io import BytesIO, StringIO

## New logger function
from log_config import logger

# Import qaqc stage calc functions
try:
    from QAQC_pipeline import *
except:
    print("Error importing QAQC_pipeline.py")

# import tempfile  # Used for downloading (and then deleting) netcdfs to local drive from s3 bucket
import os

# Silence warnings
import warnings
from shapely.errors import ShapelyDeprecationWarning

warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings(
    "ignore", category=ShapelyDeprecationWarning
)  # Warning is raised when creating Point object from coords. Can't figure out why.

plt.rcParams["figure.dpi"] = 300

In [2]:
# AWS credentials
s3 = boto3.resource("s3")
s3_cl = boto3.client("s3")

## AWS buckets
bucket = "wecc-historical-wx"
qaqcdir = "3_qaqc_wx/"
mergedir = "4_merge_wx/"

## Step 1: Identify candidates for concatenation and upload to AWS

We do so by identifying stations with exactly matching latitudes and longitudes.

In [31]:
# A list of networks to be checked for concatenation
target_networks = [
    "VALLEYWATER"
]  # ["ASOSAWOS","VALLEYWATER", "MARITIME"]

In [32]:
def concatenation_check(station_list):
    """
    This function flags stations that need to be concatenated.

    Rules
    ------
        1.) Stations are flagged if they have identical latitudes and longitudes

    Parameters
    ------
        station_list: pd.DataFrame
            list of station information

    Returns
    -------
        if success:
            new_station_list: pd.DataFrame
                input station list with a flag column assigning an integer to each group of repeat latitudes and longitudes

        if failure:
            None

    """
    ##### Flag stations with identical latitudes and longitudes, then assign each group a unique integer

    # List of possible variable names for longitudes and latitudes
    lat_lon_list = ["LAT", "LON", "latitude", "longitude", "LATITUDE", "LONGITUDE", 'lat','lon']
    # Extract the latitude and longitude variable names from the input dataframe
    lat_lon_cols = [col for col in station_list.columns if col in lat_lon_list]

    # Generate column flagging duplicate latitudes and longitudes
    station_list["concat_subset"] = station_list.duplicated(
        subset=lat_lon_cols, keep=False
    )
    # within each group of identical latitudes and longitudes, assign a unique integer
    station_list["concat_subset"] = (
        station_list[station_list["concat_subset"] == True].groupby(lat_lon_cols).ngroup()
    )

    ##### Order station list by flag
    concat_station_list = station_list.sort_values("concat_subset")

    ##### Keep only flagged stations
    concat_station_list = concat_station_list[~concat_station_list["concat_subset"].isna()]

    ##### Format final list
    # Convert flags to integers - this is necessary for the final concatenation step
    concat_station_list["concat_subset"] = concat_station_list["concat_subset"].astype(
        "int32"
    )
    # Now keep only the ERA-ID and flag column
    era_id_list = ['ERA-ID','era-id']
    era_id_col = [col for col in station_list.columns if col in era_id_list]
    concat_station_list = concat_station_list[era_id_col + ["concat_subset"]]

    # Standardize ERA id to "ERA-ID" (this is specific to Valleywater stations)
    if 'era-id' in era_id_col:
        concat_station_list.rename(columns={"era-id": "ERA-ID"}, inplace=True)

    return concat_station_list

In [33]:
def apply_concat_check(station_names_list):
    """
    This function applies the conatenation check to a list of target stations. 
    It then upload a csv containing the ERA IDs and concatenation subset ID for 
    all identified stations in a network.

    Parameters
    ------
        station__names_list: pd.DataFrame
            list of target station names

    Returns
    -------
        if success:
            uploads list of stations to be concatenated to AWS
        if failure:
            None

    """
    final_list = pd.DataFrame([])
    for station in station_names_list:

        ##### Import station list of target station
        key = "2_clean_wx/{}/stationlist_{}_cleaned.csv".format(station,station)
        bucket_name = "wecc-historical-wx"
        list_import = s3_cl.get_object(
            Bucket=bucket,
            Key=key,
        )
        station_list = pd.read_csv(BytesIO(list_import["Body"].read()))

        ##### Apply concatenation check
        concat_list = concatenation_check(station_list)

        ##### Rename the flags for each subset to <station>_<subset number>
        concat_list["concat_subset"] = station + '_' + concat_list["concat_subset"].astype(str)

        ##### Append to final list of stations to concatenate
        final_list = pd.concat([final_list,concat_list])

        ##### Upload to QAQC directory in AWS
        new_buffer = StringIO()
        final_list.to_csv(new_buffer, index = False)
        content = new_buffer.getvalue()

        # the csv is stored in each station folder within 3_qaqc_wx
        s3_cl.put_object(
            Bucket = bucket_name,
            Body = content,
            Key = qaqcdir + station + "/concat_list_{}.csv".format(station)
        )
        
    return None

In [34]:
apply_concat_check(target_networks)

NameError: name 'pd' is not defined

## Step 2: Concatenate Stations

### The functions

In [None]:
def _multiindex_concat_nooverlap(m_old, m_new, name):
    """
    Formats MultiIndex, ensuring that there are no duplicate times in the time index.

    Rules
    ------
        1.) Drop duplicate times

    Parameters
    ------
        m_old: xr.Dataset
            older weather station dataset
        m_new: xr.Dataset   
            newer weather station dataset
        name: str
            newer station name
        
    Returns
    -------
        if success:
            return a dataframe with a re-formatted MultiIndex
        if failure:
            None
    """

    # combine time indices of two multiindexes
    tidx = (
        pd.concat(
            [
                pd.Series(m_old.get_level_values("time").values),
                pd.Series(m_new.get_level_values("time").values),
            ]
        )
        .reset_index()
        .drop(columns="index")
    )

    # idenitify if there are duplicate times
    tidx = tidx.rename(columns={0: "time"})
    tidx = tidx.sort_values("time").drop_duplicates(subset=["time"])

    # PULL the station name from m_new and set to the same length
    stnidx = (
        pd.Series(name, index=np.arange(len(tidx)), name="station")
        .reset_index()
        .drop(columns="index")
    )

    # combine into new df
    df_new = pd.concat([stnidx, tidx], axis=1)

    return df_new

In [None]:
def _concat_export_help(
    df_concat, network_name, attrs_new, station_names
):
    """
    Prepares the final concatenated dataset for export by 
    - updating the attributes and 
    - converting one of the mulit-index levels to the correct datatype
    then exports the dataset to AWS

    Rules
    ------
        1.) retains the name of the newest station

    Parameters
    ------
        df_concat: pd.DataFrame
            dataframe of concatenated dataframes
        network_name: str
            weather station network
        attrs_new: list of str
            attributes of newer dataframe that was input to concatenation
        station_name_new: str
            name of newer station
        station_name_old: str
            name of older station
        station_names: list of str
            library of station names, included the single new station name and a string of all the older station names

    Returns
    -------
        if success:
            None
            exports dataset of concatenated dataframes to AWS
        if failure:
            None
    """

    # Delete unnecessary columns and set index
    df_concat = df_concat.drop(["hour", "day", "month", "year", "date"], axis=1)
    df_to_export = df_concat.set_index(["station", "time"])

    # Convert concatenated dataframe to dataset
    ds_concat = df_to_export.to_xarray()

    # Convert datatype of station coordinate
    ds_concat.coords["station"] = ds_concat.coords["station"].astype("<U20")

    # Include past attributes
    for i in attrs_new:
        ds_concat.attrs[i] = attrs_new[i]

    # Update 'history' attribute
    timestamp = datetime.datetime.utcnow().strftime("%m-%d-%Y, %H:%M:%S")
    ds_concat.attrs["history"] = ds_concat.attrs[
        "history"
    ] + " \nstation_matching.ipynb run on {} UTC".format(timestamp)

    # Update 'comment' attribute
    ds_concat.attrs["comment"] = (
        "Intermediary data product. This data has been subjected to cleaning, QA/QC, but may not have been standardized."
    )

    # Extract old and new station names from name dictionary
    station_name_new = station_names["station_name_new"]
    station_name_old = station_names["old_stations"]

    # Add new qaqc_files_merged attribute
    ds_concat.attrs["qaqc_files_merged"] = (
        "{}, {} merged. Overlap retained from newer station data.".format(
            station_name_old,
            station_name_new  # extract old and new station names from name dictionary
        )
    )

    ## Export
    # ! a test name is used below
    # ! the final name will be that of the newer dataframe
    export_url = "s3://wecc-historical-wx/3_qaqc_wx/{}/{}_{}.zarr".format(
        network_name, "TEST_concat", station_name_new
    )
    print("Exporting....", export_url)
    # ds_concat.to_zarr(export_url, mode="w") ## WHEN READY TO EXPORT

    # ! output final concatenated dataset for testing
    return ds_concat 

    # return None

In [None]:
def _overlap_concat(df_new,df_old):
    """
    Handles the cases in which there is overlap between the two input stations

    Rules
    ------
        1.) concatenation: keep the newer station data in the time range in which both stations overlap

    Parameters
    ------
        df_new: pd.DataFrame
            weather station network
        df_old: pd.DataFrame
            weather station network

    Returns
    -------
        if success:
            return final concatenated dataset
        if failure:
            None
    """

    df_overlap = df_new[df_new["time"].isin(df_old["time"])]

    ##### Split datframes into subsets #####

    # Remove data in time overlap between old and new
    df_old_cleaned = df_old[~df_old["time"].isin(df_overlap["time"])]
    df_new_cleaned = df_new[~df_new["time"].isin(df_overlap["time"])]

    ##### Concatenate subsets #####
    df_concat = pd.concat([df_old_cleaned, df_overlap, df_new_cleaned])

    return df_concat

In [92]:
def _df_concat(df_1, df_2, attrs_1, attrs_2):
    """
    Performs concatenation of input datasets, handling two cases
        1.) temporal overlap between the datasets
        2.) no temporal overlap

    Rules
    ------
        1.) concatenation: keep the newer station data in the time range in which both stations overlap

    Parameters
    ------
        df_1: pd.DataFrame
            station data
        df_2: pd.DataFrame
            dtation data
        attrs_1: list of str
            attributes of df_1
        attrs_2:
            attributes of df_2

    Returns
    -------
        if success:
        returns
            df_concat: concatenated dataframe
            stn_n_to_keep: name of newer station
            stn_n_to_drop: name of older station
            attrs_new: attributes for newer station

        if failure:
            None
    """

    # determine which dataset is older
    if df_1["time"].max() < df_2["time"].max():
        # if df_1 has an earlier end tiem than df_2, then d_2 is newer
        # we also grab the name of the newer station in this step, for use later
        df_new = df_2
        attrs_new = attrs_2
        df_old = df_1

    else:
        df_new = df_1
        attrs_new = attrs_1
        df_old = df_2

    stn_n_to_keep = df_new["station"].unique()[0]
    stn_n_to_drop = df_old["station"].unique()[0]
    print(f"Station will be concatenated and saved as: {stn_n_to_keep}")

    # now set things up to determine if there is temporal overlap between df_new and df_old
    df_overlap = df_new[df_new["time"].isin(df_old["time"])]

    # If there is no overlap between the two time series, just concatenate
    if len(df_overlap) == 0:
        print("No overlap!")
        df_concat = pd.merge(df_old, df_new, how="outer")
        df_concat["station"] = stn_n_to_keep

    # If overlap exists, split into subsets and concatenate
    else:
        print("There is overlap")
        df_concat = _overlap_concat(df_old, df_new)

    return df_concat, stn_n_to_keep, stn_n_to_drop, attrs_new

In [94]:
def _more_than_2(network_name,stns_to_pair):
    """
    Performs pairwise concatenation on subsets of more than two stations flagged for concatenation

    Rules
    ------
        1.) concatenation: keep the newer station data in the time range in which both stations overlap

    Parameters
    ------
        network_name: string
            weather station network
        stns_to_pair: pd.DataFrame
            dataframe of the input station names

    Returns
    -------
        if success:
            returns concatenated dataframe, dictionary of old and new station names, and attributes of newest station
        if failure:
            None
    """

    print("Concatenating the following stations:", stns_to_pair)

    # Load datasets into a list
    datasets = [
        xr.open_zarr(
            "s3://wecc-historical-wx/3_qaqc_wx/{}/{}.zarr".format(
                network_name, stn
            ),
            consolidated=True,
        )
        for stn in stns_to_pair['ERA-ID']
    ]

    # Sort datasets by their max 'time'
    datasets_sorted = sorted(datasets, key=lambda ds: ds['time'].max())

    # Store station names, in order from oldest to newest
    names = [ds.coords["station"].values[0] for ds in datasets_sorted]

    print('newest station:', names[-1])

    # Setup for the while loop
    ds_1 = datasets_sorted[0]
    df_1, MultiIndex_1, attrs_1, var_attrs_1, era_qc_vars_1 = qaqc_ds_to_df(
        ds_1, verbose=False
    )
    i = 0
    end = len(datasets_sorted) -1

    while i < end:

        print('iteration:', i)

        ds_2 = datasets_sorted[i+1]
        df_2, MultiIndex_2, attrs_2, var_attrs_2, era_qc_vars_2 = qaqc_ds_to_df(
            ds_2, verbose=False
        )

        # Send to helper function for concatenation
        df_concat, stn_n_to_keep, stn_n_to_drop, attrs_new = _df_concat(
            df_1, df_2, attrs_1, attrs_2
        )

        df_1 = df_concat
        attrs_1 = attrs_new

        i += 1

    # Construct station names list, for updating attributes
    newest_station = names[-1] # Get last station name from station name list
    older_stations = ", ".join(names[:-1]) # Create a string containing all older station names
    station_names = {"station_name_new": newest_station, "old_stations": older_stations}

    print('Progression concatenation for 2+ stations is complete.')

    return df_concat, station_names, attrs_new

In [96]:
def concatenate_station_pairs2(network_name):
    """
    Coordinates the concatenation of input datasets and exports the final concatenated dataset.
    Also returns a list of the ERA-IDs of all stations that are concatenated.

    Parameters
    ------
        network_name: string
            weather station network

    Returns
    -------
        if success:
            return list of ERA-IDs are stations that are concatenated
            all processed datasets are exported to the merge folder in AWS and the original datasets are deleted
        if failure:
            None
    Notes
    -------
    Uses the following helper functions
        _df_concat(): concatenates two dataframes
        _overlap_concat(): used by _df_concat() to concatenate two stations with overlapping time ranges
        _more_than_2(): handles subsets with more than two stations, passing pairs to _df_concat() iteratively
        _concat_export_help(): formats and exports concatenated dataframe

    """
    # Initiate empty list, to which we will iteratively add the ERA-IDs of stations that are concatenated
    final_concat_list = []

    # Read in full concat station list
    print(network_name)
    concat_list = pd.read_csv(
        f"s3://wecc-historical-wx/3_qaqc_wx/{network_name}/concat_list_{network_name}.csv"
    )

    # Identify stns within designated network
    concat_by_network = concat_list.loc[
        concat_list.concat_subset.str.contains(network_name)
    ]

    # ! for testing
    concat_by_network = concat_list[concat_list["concat_subset"] == "ASOSAWOS_3"]
    # ! for testing

    # For MARITIME, remove these stations becuase they're actually separate stations
    if network_name == 'MARITIME':
        unique_pair_names = unique_pair_names[1:]
        unique_pair_name = unique_pair_name[~unique_pair_name["ERA-ID"].isin['MARITIME_LJPC1','MARITIME_LJAC1']]
    else: 
        pass

    unique_pair_names = concat_by_network.concat_subset.unique()
    print(
        f"There are {len(concat_by_network)} stations to be concatenated into {len(unique_pair_names)} station pairs within {network_name}..."
    )

    print(unique_pair_names)

    # Set up pairs
    for pair in unique_pair_names:
        print(pair)
        # pull out stations corresponding to pair name
        stns_to_pair = concat_by_network.loc[concat_by_network.concat_subset == pair]

        if len(stns_to_pair) == 2:  # 2 stations to concat together
            print("\n", stns_to_pair)

            # import this subset of datasets and convert to dataframe
            url_1 = "s3://wecc-historical-wx/3_qaqc_wx/{}/{}.zarr".format(
                network_name, stns_to_pair.iloc[0]["ERA-ID"]
            )
            url_2 = "s3://wecc-historical-wx/3_qaqc_wx/{}/{}.zarr".format(
                network_name, stns_to_pair.iloc[1]["ERA-ID"]
            )

            print("Retrieving....", url_1)
            print("Retrieving....", url_2)
            ds_1 = xr.open_zarr(url_1)
            ds_2 = xr.open_zarr(url_2)

            # convert to dataframes with corresponding information
            df_1, MultiIndex_1, attrs_1, var_attrs_1, era_qc_vars_1 = qaqc_ds_to_df(
                ds_1, verbose=False
            )
            df_2, MultiIndex_2, attrs_2, var_attrs_2, era_qc_vars_2 = qaqc_ds_to_df(
                ds_2, verbose=False
            )

            ##### Send to helper function for concatenation
            df_concat, stn_n_to_keep, stn_n_to_drop, attrs_new = _df_concat(
                df_1, df_2, attrs_1, attrs_2
            )

            station_names ={"station_name_new":stn_n_to_keep, "old_stations":stn_n_to_drop}

            ds_final = _concat_export_help(
                df_concat,
                network_name,
                attrs_new,
                station_names  # stn_n_to_keep, stn_n_to_drop
            )

            final_concat_list.extend(stns_to_pair["ERA-ID"].tolist())

            # return ds_final, final_concat_list

        else:
            # If there are more than 2 stations in the given subset, pass to _more_than_2()
            print("More than 2 stations within a subset")
            df_concat, station_names, attrs_new = _more_than_2(
                network_name,
                stns_to_pair
            )

            if df_concat is None: # If the concentation failed
                print('Concatenation of >2 stations was unsuccessful')
            else: # If it was successful, move on to the next steps
                # add station names to station name list
                final_concat_list.extend(stns_to_pair["ERA-ID"].tolist())

                ds_final = _concat_export_help(
                    df_concat,
                    network_name,
                    attrs_new,
                    station_names  # stn_n_to_keep, stn_n_to_drop
                )
    print("Concatenated stations: ", final_concat_list)
    return ds_final, final_concat_list
    # return final_concat_list

### Test

In [97]:
network_name = "ASOSAWOS"

In [98]:
ds_to_export, final_concat_list = concatenate_station_pairs2(network_name)

ASOSAWOS
There are 3 stations to be concatenated into 1 station pairs within ASOSAWOS...
['ASOSAWOS_3']
ASOSAWOS_3
More than 2 stations within a subset
Concatenating the following stations:                  ERA-ID concat_subset
6  ASOSAWOS_74003503145    ASOSAWOS_3
7  ASOSAWOS_72280503145    ASOSAWOS_3
8  ASOSAWOS_69960403145    ASOSAWOS_3
newest station: ASOSAWOS_74003503145
iteration: 0
Station will be concatenated and saved as: ASOSAWOS_69960403145
No overlap!
iteration: 1
Station will be concatenated and saved as: ASOSAWOS_74003503145
There is overlap
Progression concatenation for 2+ stations is complete.
Exporting.... s3://wecc-historical-wx/3_qaqc_wx/ASOSAWOS/TEST_concat_ASOSAWOS_74003503145.zarr
Concatenated stations:  ['ASOSAWOS_74003503145', 'ASOSAWOS_72280503145', 'ASOSAWOS_69960403145']


### CHECK

In [30]:
concat_list = pd.read_csv(
    f"s3://wecc-historical-wx/3_qaqc_wx/ASOSAWOS/concat_list_ASOSAWOS.csv"
)

In [31]:
stns_to_pair = concat_list[concat_list['concat_subset']=='ASOSAWOS_3']

In [32]:
stns_to_pair

Unnamed: 0,ERA-ID,concat_subset
6,ASOSAWOS_74003503145,ASOSAWOS_3
7,ASOSAWOS_72280503145,ASOSAWOS_3
8,ASOSAWOS_69960403145,ASOSAWOS_3


In [None]:
datasets = [ 
    xr.open_zarr(
        "s3://wecc-historical-wx/3_qaqc_wx/{}/{}.zarr".format(
            network_name, stn
        ),
        consolidated=True,
    )
    for stn in stns_to_pair['ERA-ID']
]

In [48]:
datasets_sorted = sorted(datasets, key=lambda ds: ds['time'].max()) # oldest is first

In [49]:
print(datasets_sorted[0].time.max())
print(datasets_sorted[1].time.max())
print(datasets_sorted[2].time.max())

<xarray.DataArray 'time' ()>
array('1987-12-31T23:00:00.000000000', dtype='datetime64[ns]')
<xarray.DataArray 'time' ()>
array('2009-12-31T23:51:00.000000000', dtype='datetime64[ns]')
<xarray.DataArray 'time' ()>
array('2022-08-31T23:57:00.000000000', dtype='datetime64[ns]')


In [58]:
names =[ ds.coords["station"].values[0] for ds in datasets_sorted]


In [99]:
ds_to_export

## Step 4: Mark stations that have been concatenated