# Station Matching


## Environment set-up

In [None]:
from shapely.geometry import Point
from shapely.ops import nearest_points

from functools import reduce
import datetime
from pandas import *
import boto3
import geopandas as gpd
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
from io import BytesIO, StringIO

## New logger function
from log_config import logger

# Import qaqc stage calc functions
try:
    from QAQC_pipeline import *
except:
    print("Error importing QAQC_pipeline.py")

# import tempfile  # Used for downloading (and then deleting) netcdfs to local drive from s3 bucket
import os

# Silence warnings
import warnings
from shapely.errors import ShapelyDeprecationWarning

warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings(
    "ignore", category=ShapelyDeprecationWarning
)  # Warning is raised when creating Point object from coords. Can't figure out why.

plt.rcParams["figure.dpi"] = 300

In [None]:
# AWS credentials
s3 = boto3.resource("s3")
s3_cl = boto3.client("s3")

## AWS buckets
bucket = "wecc-historical-wx"
qaqcdir = "3_qaqc_wx/"
mergedir = "4_merge_wx/"

## Step 1: Identify candidates for concatenation and upload to AWS

We do so by identifying stations with exactly matching latitudes and longitudes.

In [103]:
# A list of networks to be checked for concatenation
target_networks = ["ASOSAWOS","VALLEYWATER", "MARITIME"]

In [104]:
def concatenation_check(station_list):
    """
    This function flags stations that need to be concatenated.

    Rules
    ------
        1.) Stations are flagged if they have identical latitudes and longitudes

    Parameters
    ------
        station_list: pd.DataFrame
            list of station information

    Returns
    -------
        if success:
            new_station_list: pd.DataFrame
                input station list with a flag column assigning an integer to each group of repeat latitudes and longitudes

        if failure:
            None

    """
    ##### Flag stations with identical latitudes and longitudes, then assign each group a unique integer

    # List of possible variable names for longitudes and latitudes
    lat_lon_list = ["LAT", "LON", "latitude", "longitude", "LATITUDE", "LONGITUDE", 'lat','lon']
    # Extract the latitude and longitude variable names from the input dataframe
    lat_lon_cols = [col for col in station_list.columns if col in lat_lon_list]

    # Generate column flagging duplicate latitudes and longitudes
    station_list["concat_subset"] = station_list.duplicated(
        subset=lat_lon_cols, keep=False
    )
    # within each group of identical latitudes and longitudes, assign a unique integer
    station_list["concat_subset"] = (
        station_list[station_list["concat_subset"] == True].groupby(lat_lon_cols).ngroup()
    )

    ##### Order station list by flag
    concat_station_list = station_list.sort_values("concat_subset")

    ##### Keep only flagged stations
    concat_station_list = concat_station_list[~concat_station_list["concat_subset"].isna()]

    ##### Format final list
    # Convert flags to integers - this is necessary for the final concatenation step
    concat_station_list["concat_subset"] = concat_station_list["concat_subset"].astype(
        "int32"
    )
    # Now keep only the ERA-ID and flag column
    era_id_list = ['ERA-ID','era-id']
    era_id_col = [col for col in station_list.columns if col in era_id_list]
    concat_station_list = concat_station_list[era_id_col + ["concat_subset"]]

    # Standardize ERA id to "ERA-ID" (this is specific to Valleywater stations)
    if 'era-id' in era_id_col:
        concat_station_list.rename(columns={"era-id": "ERA-ID"}, inplace=True)

    return concat_station_list

In [105]:
def apply_concat_check(station_names_list):
    """
    This function applies the conatenation check to a list of target stations. 
    It then upload a csv containing the ERA IDs and concatenation subset ID for 
    all identified stations in a network.

    Parameters
    ------
        station__names_list: pd.DataFrame
            list of target station names

    Returns
    -------
        if success:
            uploads list of stations to be concatenated to AWS
        if failure:
            None

    """
    final_list = pd.DataFrame([])
    for station in station_names_list:

        ##### Import station list of target station
        key = "2_clean_wx/{}/stationlist_{}_cleaned.csv".format(station,station)
        bucket_name = "wecc-historical-wx"
        list_import = s3_cl.get_object(
            Bucket=bucket,
            Key=key,
        )
        station_list = pd.read_csv(BytesIO(list_import["Body"].read()))

        ##### Apply concatenation check
        concat_list = concatenation_check(station_list)

        ##### Rename the flags for each subset to <station>_<subset number>
        concat_list["concat_subset"] = station + '_' + concat_list["concat_subset"].astype(str)

        ##### Append to final list of stations to concatenate
        final_list = pd.concat([final_list,concat_list])

        ##### Upload to QAQC directory in AWS
        new_buffer = StringIO()
        final_list.to_csv(new_buffer, index = False)
        content = new_buffer.getvalue()

        # the csv is stored in each station folder within 3_qaqc_wx
        s3_cl.put_object(
            Bucket = bucket_name,
            Body = content,
            Key = qaqcdir + station + "/concat_list_{}.csv".format(station)
        )
        
    return None

In [106]:
apply_concat_check(target_networks)

## Step 2: Concatenate Stations

### The functions

In [120]:
def concatenate_station_pairs(network_name):
    """
    Concatenates two input datasets, deletes the originals, and exports the final concatenated dataset. 
    Also returns a list of the ERA-IDs of all stations that are concatenated.

    Rules
    ------
        1.) concatenation: keep the newer station data in the time range in which both stations overlap

    Parameters
    ------
        network_name: string
            weather station network

    Returns
    -------
        if success: 
            return list of ERA-IDs are stations that are concatenated
            all processed datasets are exported to the merge folder in AWS and the original datasets are deleted
        if failure:
            None
    """
    ##### Read in concatenation list of input network
    network_list = s3_cl.get_object(
        Bucket=bucket,
        Key="3_qaqc_wx/{}/concat_list_{}.csv".format(
            network_name, network_name, network_name
        ),
    )
    concat_list = pd.read_csv(BytesIO(network_list["Body"].read()))

    # ! you can truncate the concat list here, for testing
    concat_list = concat_list.head(4)
    # ! end

    subset_number = len(concat_list['concat_subset'].unique())

    # initiate empty list, to which we will iteratively add the ERA-IDs of stations that are concatenated
    final_concat_list = []

    for i in range(0,subset_number):

        # count the number of staions in subset i
        subset_i = concat_list[
            concat_list["concat_subset"].str.contains("{}".format(i))
        ]

        n = subset_i.count()[0]

        # if there are only two stations, proceed with concatenation
        if n == 2:
            try: 
                # retrieve ERA IDs in this subset of stations
                station_1 = subset_i["ERA-ID"].iloc[0]
                station_2 = subset_i["ERA-ID"].iloc[1]

                final_concat_list.append(station_1)
                final_concat_list.append(station_2)

                # import this subset of datasets and convert to dataframe
                url_1 = "s3://wecc-historical-wx/3_qaqc_wx/{}/{}.zarr".format(
                    network_name, station_1
                )
                url_2 = "s3://wecc-historical-wx/3_qaqc_wx/{}/{}.zarr".format(
                    network_name, station_2
                )

                ds_1 = xr.open_zarr(url_1)
                ds_2 = xr.open_zarr(url_2)

                df_1,MultiIndex_1,attrs_1,var_attrs_1,era_qc_vars_1 = qaqc_ds_to_df(ds_1, verbose=False)
                df_2, MultiIndex_2, attrs_2, var_attrs_2, era_qc_vars_2 = (qaqc_ds_to_df(ds_2, verbose=False))

                # determine which dataset is older
                if df_1["time"].max() < df_2["time"].max():
                    # if df_1 has an earlier end tiem than df_2, then d_2 is newer
                    # we also grab the name of the newer station in this step, for use later
                    df_new = df_2
                    ds_new = ds_2
                    MultiIndex_new = MultiIndex_2
                    attrs_new = attrs_2

                    df_old = df_1
                    ds_old = ds_1
                    MultiIndex_old = MultiIndex_1

                else:
                    df_new = df_1
                    ds_new = df_1
                    MultiIndex_new = MultiIndex_2
                    attrs_new = attrs_2

                    df_old = df_2
                    ds_old = ds_2
                    MultiIndex_old = MultiIndex_2

                # now set things up to determine if there is temporal overlap between df_new and df_old
                df_overlap = df_new[df_new["time"].isin(df_old["time"])]

                # if there is no overlap between the two time series, just concatenate
                if len(df_overlap) == 0:
                    df_concat = concat([df_old, df_new])

                # if not, split into subsets and concatenate
                else:
                    ##### Split datframes into subsets #####

                    # Remove data in time overlap between old and new
                    df_old_cleaned = df_old[~df_old["time"].isin(df_overlap["time"])]
                    df_new_cleaned = df_new[~df_new["time"].isin(df_overlap["time"])]

                    ##### Concatenate subsets #####
                    df_concat = concat([df_old_cleaned, df_overlap, df_new_cleaned])

                # ##### Now prepare the final concatenated dataframe for export
                station_name_new = MultiIndex_new.get_level_values("station")[1]
                MultiIndex_concat = MultiIndex_new.union(MultiIndex_old)
                MultiIndex_concat = pd.MultiIndex.from_tuples(
                    [(station_name_new, lvl1) for _, lvl1 in MultiIndex_concat],
                    names=MultiIndex_concat.names,
                )

                # drop duplicate rows that were potentially generated in the concatenation process
                df_concat = df_concat.drop_duplicates(subset=["time"])

                # drop 'station' and 'time'columns
                df_concat = df_concat.drop(["station", "time","hour","day","month","year","date"], axis=1)

                df_concat.index = MultiIndex_concat 

                # Convert concatenated dataframe to dataset
                ds_concat = df_concat.to_xarray()

                # #### Prepare for export #####

                # Convert datatype of station coordinate
                ds_concat.coords["station"] = ds_concat.coords["station"].astype("<U20")

                # # Include past attributes
                ds_concat.attrs.update(attrs_new)

                # Update 'history' attribute
                timestamp = datetime.datetime.utcnow().strftime("%m-%d-%Y, %H:%M:%S")
                ds_concat.attrs["history"] = ds_concat.attrs[
                    "history"
                ] + " \n maritime_merge.ipynb run on {} UTC".format(timestamp)

                # Update 'comment' attribute
                ds_concat.attrs["comment"] = (
                    "Final v1 data product. This data has been subjected to cleaning, QA/QC, and standardization."
                )

                # Add new qaqc_files_merged attribute
                station_name_old = MultiIndex_old.get_level_values("station")[1]
                ds_concat.attrs["qaqc_files_merged"] = (
                    "{}, {} merged. Overlap retained from newer station data.".format(
                        station_name_old, station_name_new
                    )
                )

                # ## Export ### 
                # ! a test name is used below 
                # ! the final name will be that of the newer dataframe
                # export_url = "s3://wecc-historical-wx/3_qaqc_wx/{}/{}_{}.zarr".format(
                #     network_name, "test_concat", station_name_new
                # )
                # ds_concat.to_zarr(export_url, mode="w")
            except Exception as e:
                print(
                    "Error concatenation stations of subset {}: {}".format(subset_i, e)
                )
        # if there are more than two stations in the subset, continue
        else:
            continue

    # return final_concat_list # ! this will be the final return statement, below is inlcluded for testing
    return (
        df_new,
        df_old,
        df_concat,
        ds_concat,
        final_concat_list,
    )  

### TEST

In [122]:
network_name = "MARITIME" # "VALLEYWATER", "MARITIME"

#### Test option 1

Run concatenate_station_pairs() as is, so the function does not export and instead returns df_concat, df_new, df_old, and df_overlap

In [123]:
(
    df_new,
    df_old,
    df_concat,
    ds_concat,
    final_concat_list,
) = concatenate_station_pairs(network_name)

Error concatenation stations of subset                  ERA-ID concat_subset
0  ASOSAWOS_99999903053    ASOSAWOS_0
1  ASOSAWOS_A0001403053    ASOSAWOS_0: group not found at path ''
Error concatenation stations of subset                  ERA-ID concat_subset
2  ASOSAWOS_72269593041    ASOSAWOS_1
3  ASOSAWOS_99999993041    ASOSAWOS_1: group not found at path ''


UnboundLocalError: local variable 'df_new' referenced before assignment

In [None]:
df_concat = df_concat.reset_index(level="time")

#### Test option 2: 

Run concatenate_station_pairs() with the first return statement uncommented and the second commented, and the export section uncommented. So that the function actually exports the concatenated datasets. I've generated all the concatention lists (for VALLEYWATER, MARITIME, and ASOSAWOS) needed to run the function.

In [None]:
output = concatenate_station_pairs(network_name)

In [None]:
# import output
# TODO: you'll need to change the url
url_output = "s3://wecc-historical-wx/3_qaqc_wx/{}/test_concat_{}.zarr".format(
    network_name, network_name
)

# TODO: open_zarr will be used for QAQC'd datasets
ds_concat = xr.open_zarr(url_output)

df_concat = ds_concat.to_dataframe()

In [None]:
network_list = s3_cl.get_object(
    Bucket=bucket,
    Key="3_qaqc_wx/{}/{}_concat_list_{}.csv".format(
        network_name, network_name, network_name
    ),
)
concat_list = pd.read_csv(BytesIO(network_list["Body"].read()))
station_1 = concat_list["ERA-ID"].iloc[0]
station_2 = concat_list["ERA-ID"].iloc[1]

# import this subset of datasets and convert to dataframe
url_1 = "s3://wecc-historical-wx/3_qaqc_wx/{}/{}.zarr".format(network_name, station_1)
url_2 = "s3://wecc-historical-wx/3_qaqc_wx/{}/{}.zarr".format(network_name, station_2)

ds_1 = xr.open_zarr(url_1)
ds_2 = xr.open_zarr(url_2)

df_1 = ds_1.to_dataframe()
df_2 = ds_2.to_dataframe()

In [None]:
# extract time index for plotting
df_1 = df_1.reset_index(level="time")
df_2 = df_2.reset_index(level="time")


df_concat = df_concat.reset_index(level="time")

In [None]:
if df_1["time"].max() < df_2["time"].max(): 
    # if df_1 has an earlier end tiem than df_2, then d_2 is newer
    # we also grab the name of the newer station in this step, for use later
    df_new = df_2
    ds_new = ds_2

    df_old = df_1
    ds_old = ds_1
else:
    df_new = df_1
    ds_new = ds_1

    df_old = df_2
    ds_old = ds_2

#### Onward

In [None]:
ds_concat

In [None]:
df_concat.head(4)

Check overlap

In [None]:
# now set things up to determine if there is temporal overlap between df_new and df_old
df_new_overlap = df_new[df_new["time"].isin(df_concat["time"])]
df_concat_overlap = df_concat[df_concat["time"].isin(df_new["time"])]

In [None]:
df_new_overlap.head(4)

In [None]:
df_concat_overlap.head(4)

Plot the two original datasets

In [None]:
vis_var = 'ps'

In [None]:
# Create a figure with a specific size
plt.figure(figsize=(8, 4))

# Plotting the time series of given dataframe
plt.plot(df_new["time"], df_new[vis_var])

# Plotting the time series of given dataframe
plt.plot(df_old["time"], df_old[vis_var])

# Giving title to the chart using plt.title
plt.title("input dfs")

# rotating the x-axis tick labels at 30degree
# towards right
plt.xticks(rotation=30, ha="right")

# Providing x and y label to the chart
plt.xlabel("time")
plt.ylabel(vis_var)

Plot the output dataset

In [None]:
# Create a figure with a specific size
plt.figure(figsize=(8, 4))

# Plotting the time series of given dataframe
plt.plot(df_concat["time"], df_concat[vis_var])

# Giving title to the chart using plt.title
plt.title("concatenated df")

# rotating the x-axis tick labels at 30degree
# towards right
plt.xticks(rotation=30, ha="right")

# Providing x and y label to the chart
plt.xlabel("time")
plt.ylabel(vis_var)

## Step 4: Mark stations that have been concatenated