# Station Matching


## Environment set-up

In [37]:
from shapely.geometry import Point
from shapely.ops import nearest_points

from functools import reduce
import datetime
from pandas import *
import boto3
import geopandas as gpd
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
from io import BytesIO, StringIO

## New logger function
from log_config import logger

# Import qaqc stage calc functions
try:
    from QAQC_pipeline import *
except:
    print("Error importing QAQC_pipeline.py")

# import tempfile  # Used for downloading (and then deleting) netcdfs to local drive from s3 bucket
import os

# Silence warnings
import warnings
from shapely.errors import ShapelyDeprecationWarning

warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings(
    "ignore", category=ShapelyDeprecationWarning
)  # Warning is raised when creating Point object from coords. Can't figure out why.

plt.rcParams["figure.dpi"] = 300

In [38]:
# AWS credentials
s3 = boto3.resource("s3")
s3_cl = boto3.client("s3")

## AWS buckets
bucket = "wecc-historical-wx"
qaqcdir = "3_qaqc_wx/"
mergedir = "4_merge_wx/"

## Step 1: Identify candidates for concatenation and upload to AWS

We do so by identifying stations with exactly matching latitudes and longitudes.

In [31]:
# A list of networks to be checked for concatenation
target_networks = [
    "VALLEYWATER"
]  # ["ASOSAWOS","VALLEYWATER", "MARITIME"]

In [32]:
def concatenation_check(station_list):
    """
    This function flags stations that need to be concatenated.

    Rules
    ------
        1.) Stations are flagged if they have identical latitudes and longitudes

    Parameters
    ------
        station_list: pd.DataFrame
            list of station information

    Returns
    -------
        if success:
            new_station_list: pd.DataFrame
                input station list with a flag column assigning an integer to each group of repeat latitudes and longitudes

        if failure:
            None

    """
    ##### Flag stations with identical latitudes and longitudes, then assign each group a unique integer

    # List of possible variable names for longitudes and latitudes
    lat_lon_list = ["LAT", "LON", "latitude", "longitude", "LATITUDE", "LONGITUDE", 'lat','lon']
    # Extract the latitude and longitude variable names from the input dataframe
    lat_lon_cols = [col for col in station_list.columns if col in lat_lon_list]

    # Generate column flagging duplicate latitudes and longitudes
    station_list["concat_subset"] = station_list.duplicated(
        subset=lat_lon_cols, keep=False
    )
    # within each group of identical latitudes and longitudes, assign a unique integer
    station_list["concat_subset"] = (
        station_list[station_list["concat_subset"] == True].groupby(lat_lon_cols).ngroup()
    )

    ##### Order station list by flag
    concat_station_list = station_list.sort_values("concat_subset")

    ##### Keep only flagged stations
    concat_station_list = concat_station_list[~concat_station_list["concat_subset"].isna()]

    ##### Format final list
    # Convert flags to integers - this is necessary for the final concatenation step
    concat_station_list["concat_subset"] = concat_station_list["concat_subset"].astype(
        "int32"
    )
    # Now keep only the ERA-ID and flag column
    era_id_list = ['ERA-ID','era-id']
    era_id_col = [col for col in station_list.columns if col in era_id_list]
    concat_station_list = concat_station_list[era_id_col + ["concat_subset"]]

    # Standardize ERA id to "ERA-ID" (this is specific to Valleywater stations)
    if 'era-id' in era_id_col:
        concat_station_list.rename(columns={"era-id": "ERA-ID"}, inplace=True)

    return concat_station_list

In [33]:
def apply_concat_check(station_names_list):
    """
    This function applies the conatenation check to a list of target stations. 
    It then upload a csv containing the ERA IDs and concatenation subset ID for 
    all identified stations in a network.

    Parameters
    ------
        station__names_list: pd.DataFrame
            list of target station names

    Returns
    -------
        if success:
            uploads list of stations to be concatenated to AWS
        if failure:
            None

    """
    final_list = pd.DataFrame([])
    for station in station_names_list:

        ##### Import station list of target station
        key = "2_clean_wx/{}/stationlist_{}_cleaned.csv".format(station,station)
        bucket_name = "wecc-historical-wx"
        list_import = s3_cl.get_object(
            Bucket=bucket,
            Key=key,
        )
        station_list = pd.read_csv(BytesIO(list_import["Body"].read()))

        ##### Apply concatenation check
        concat_list = concatenation_check(station_list)

        ##### Rename the flags for each subset to <station>_<subset number>
        concat_list["concat_subset"] = station + '_' + concat_list["concat_subset"].astype(str)

        ##### Append to final list of stations to concatenate
        final_list = pd.concat([final_list,concat_list])

        ##### Upload to QAQC directory in AWS
        new_buffer = StringIO()
        final_list.to_csv(new_buffer, index = False)
        content = new_buffer.getvalue()

        # the csv is stored in each station folder within 3_qaqc_wx
        s3_cl.put_object(
            Bucket = bucket_name,
            Body = content,
            Key = qaqcdir + station + "/concat_list_{}.csv".format(station)
        )
        
    return None

In [34]:
apply_concat_check(target_networks)

NameError: name 'pd' is not defined

## Step 2: Concatenate Stations

### The functions

In [21]:
def _multiindex_concat_nooverlap(m_old, m_new, name):
    """

    Rules
    ------
        1.) concatenation: keep the newer station data in the time range in which both stations overlap

    Parameters
    ------
        network_name: string
            weather station network

    Returns
    -------
        if success:
            return list of ERA-IDs are stations that are concatenated
            all processed datasets are exported to the merge folder in AWS and the original datasets are deleted
        if failure:
            None
    """

    # combine time indices of two multiindexes
    tidx = (
        pd.concat(
            [
                pd.Series(m_old.get_level_values("time").values),
                pd.Series(m_new.get_level_values("time").values),
            ]
        )
        .reset_index()
        .drop(columns="index")
    )

    # idenitify if there are duplicate times
    tidx = tidx.rename(columns={0: "time"})
    tidx = tidx.sort_values("time").drop_duplicates(subset=["time"])

    # PULL the station name from m_new and set to the same length
    stnidx = (
        pd.Series(name, index=np.arange(len(tidx)), name="station")
        .reset_index()
        .drop(columns="index")
    )

    # combine into new df (ugh)
    df_ugh = pd.concat([stnidx, tidx], axis=1)

    return df_ugh

In [54]:
def _concat_export_help(
    df_concat, network_name, attrs_new, station_names
):
    """
    Prepares the final concatenated dataset for export by 
    - updating the attributes and 
    - converting one of the mulit-index levels to the correct datatype
    then export the final dataset to AWS

    Rules
    ------
        1.) retains the name the newest station

    Parameters
    ------
        df_concat: pd.DataFrame
            dataframe of concatenated dataframes
        network_name: string
            weather station network
        attrs_new: pd.Dictionary
            attributes of newer dataframe that was input to concatenation
        station_name_new: string
            name of newer station
        station_name_old: string
            name of older station
        station_names: dictionary
            library of station names

    Returns
    -------
        if success:
            None
            export dataset of concatenated dataframes to AWS
        if failure:
            None
    """

    # Delete unnecessary columns and set index
    df_concat = df_concat.drop(["hour", "day", "month", "year", "date"], axis=1)
    df_to_export = df_concat.set_index(["station", "time"])

    ## Convert concatenated dataframe to dataset -- seeing duplicate timestamps here -- the exact same length as df2?
    ds_concat = df_to_export.to_xarray()

    # Convert datatype of station coordinate
    ds_concat.coords["station"] = ds_concat.coords["station"].astype("<U20")

    # Include past attributes -- do this manually?
    for i in attrs_new:
        ds_concat.attrs[i] = attrs_new[i]

    # Update 'history' attribute
    timestamp = datetime.datetime.utcnow().strftime("%m-%d-%Y, %H:%M:%S")
    ds_concat.attrs["history"] = ds_concat.attrs[
        "history"
    ] + " \nstation_matching.ipynb run on {} UTC".format(timestamp)

    # Update 'comment' attribute
    ds_concat.attrs["comment"] = (
        "Intermediary data product. This data has been subjected to cleaning, QA/QC, but may not have been standardized."
    )

    station_name_new = station_names['station_name_new']
    old_stations = station_names['old_stations'] 

    # Add new qaqc_files_merged attribute
    ds_concat.attrs["qaqc_files_merged"] = (
        "{}, {} merged. Overlap retained from newer station data.".format(
            old_stations, station_name_new
        )
    )

    ## Export ###
    # ! a test name is used below
    # ! the final name will be that of the newer dataframe
    export_url = "s3://wecc-historical-wx/3_qaqc_wx/{}/{}_{}.zarr".format(
        network_name, "TEST_concat", station_name_new
    )
    print("Exporting....", export_url)
    # ds_concat.to_zarr(export_url, mode="w") ## WHEN READY TO EXPORT

    return ds_concat #None

In [46]:
def _overlap_concat(df_new,df_old):
    """
    Handles the cases in which there is overlap between the two input stations

    Rules
    ------
        1.) concatenation: keep the newer station data in the time range in which both stations overlap

    Parameters
    ------
        network_name: string
            weather station network

    Returns
    -------
        if success:
            return final concatenated dataset
        if failure:
            None
    """

    df_overlap = df_new[df_new["time"].isin(df_old["time"])]

    ##### Split datframes into subsets #####

    # Remove data in time overlap between old and new
    df_old_cleaned = df_old[~df_old["time"].isin(df_overlap["time"])]
    df_new_cleaned = df_new[~df_new["time"].isin(df_overlap["time"])]

    ##### Concatenate subsets #####
    df_concat = pd.concat([df_old_cleaned, df_overlap, df_new_cleaned])

    return df_concat

In [None]:
def _more_than_2(network_name,stns_to_pair):
    """
    Perform pairwise concatenation on subsets of more than two stations flagged for concatenation

    Rules
    ------
        1.) concatenation: keep the newer station data in the time range in which both stations overlap

    Parameters
    ------
        network_name: string
            weather station network
        stns_to_pair: pd.DataFrame
            dataframe of the input station names

    Returns
    -------
        if success:
            return final concatenated dataframe
        if failure:
            None
    """

    print("\n", stns_to_pair)

    # Step 1: Load datasets
    datasets = [
        xr.open_zarr(
            "s3://wecc-historical-wx/3_qaqc_wx/{}/{}.zarr".format(
                network_name, stn
            ),
            consolidated=True,
        )
        for stn in stns_to_pair['ERA-ID']
    ]

    # Sort by max time
    datasets_sorted = sorted(datasets, key=lambda ds: ds.time.max().item(), reverse=True)

    # Progressive combination
    result = datasets_sorted[0]

    for ds in datasets_sorted[1:]:
        start1, end1 = result.time.min().item(), result.time.max().item()
        start2, end2 = ds.time.min().item(), ds.time.max().item()

        # overlap = not (end1 < start2 or end2 < start1)

        # if overlap:
        #     df_concat = _overlap_concat(result, ds)
        # else:
        #     df_concat = xr.concat([result, ds], dim="time").sortby("time")

    # Construct station names list, for updating attributes
    newest_station = datasets_sorted["ERA-ID"].iloc[0]
    older_stations = ", ".join(datasets_sorted.iloc[1:, 0].astype(str))
    station_names = {"station_name_new": newest_station, "old_stations": older_stations}

    return df_concat, station_names

In [48]:
def _ds_concat(ds_1, ds_2):
    """
    Carry out concatenation 

    Rules
    ------
        1.) concatenation: keep the newer station data in the time range in which both stations overlap

    Parameters
    ------
        network_name: string
            weather station network

    Returns
    -------
        if success:
            return list of ERA-IDs are stations that are concatenated
            all processed datasets are exported to the merge folder in AWS and the original datasets are deleted
        if failure:
            None
    """

    # convert to dataframes with corresponding information
    df_1, MultiIndex_1, attrs_1, var_attrs_1, era_qc_vars_1 = qaqc_ds_to_df(
        ds_1, verbose=False
    )
    df_2, MultiIndex_2, attrs_2, var_attrs_2, era_qc_vars_2 = qaqc_ds_to_df(
        ds_2, verbose=False
    )

    # determine which dataset is older
    if df_1["time"].max() < df_2["time"].max():
        # if df_1 has an earlier end tiem than df_2, then d_2 is newer
        # we also grab the name of the newer station in this step, for use later
        df_new = df_2
        attrs_new = attrs_2
        df_old = df_1

    else:
        df_new = df_1
        attrs_new = attrs_1
        df_old = df_2

    stn_n_to_keep = df_new["station"].unique()[0]
    stn_n_to_drop = df_old["station"].unique()[0]
    print(f"Station will be concatenated and saved as: {stn_n_to_keep}")

    # now set things up to determine if there is temporal overlap between df_new and df_old
    df_overlap = df_new[df_new["time"].isin(df_old["time"])]

    # If there is no overlap between the two time series, just concatenate
    if len(df_overlap) == 0:
        print("No overlap!")
        df_concat = pd.merge(df_old, df_new, how="outer")
        df_concat["station"] = stn_n_to_keep

    # If overlap exists, split into subsets and concatenate
    else:
        print("There is overlap")
        df_concat = _overlap_concat(df_old, df_new)

    return df_concat, stn_n_to_keep, stn_n_to_drop, attrs_new

In [66]:
def concatenate_station_pairs2(network_name):
    """
    Coordinates the concatenation of input datasets and exports the final concatenated dataset.
    Also returns a list of the ERA-IDs of all stations that are concatenated.

    Rules
    ------
        1.) concatenation: keep the newer station data in the time range in which both stations overlap

    Parameters
    ------
        network_name: string
            weather station network

    Returns
    -------
        if success:
            return list of ERA-IDs are stations that are concatenated
            all processed datasets are exported to the merge folder in AWS and the original datasets are deleted
        if failure:
            None
    Notes
    -------
    Uses the following helper functions
        _ds_concat(): concatenates two datasets
        _overlap_concat(): used by _ds_concat() to concatenates two stations with overlapping time ranges
        _more_than_2(): handles subsets with more than two stations, passing pairs to _ds_concat() iteratively
        _concat_export_help(): formats and exports concatenated dataset

    """
    # Initiate empty list, to which we will iteratively add the ERA-IDs of stations that are concatenated
    final_concat_list = []

    # Read in full concat station list
    print(network_name)
    concat_list = pd.read_csv(
        f"s3://wecc-historical-wx/3_qaqc_wx/{network_name}/concat_list_{network_name}.csv"
    )

    # Identify stns within designated network
    concat_by_network = concat_list.loc[
        concat_list.concat_subset.str.contains(network_name)
    ]

    # For MARITIME, remove these stations becuase they're actually separate stations
    if network_name == 'MARITIME':
        unique_pair_names = unique_pair_names[1:]
        unique_pair_name = unique_pair_name[~unique_pair_name["ERA-ID"].isin['MARITIME_LJPC1','MARITIME_LJAC1']]
    else: 
        pass

    # ! TESTING
    concat_by_network = concat_by_network.head(12)
    # ! TESTING

    unique_pair_names = concat_by_network.concat_subset.unique()
    print(
        f"There are {len(concat_by_network)} stations to be concatenated into {len(unique_pair_names)} station pairs within {network_name}..."
    )

    print(unique_pair_names)

    # Set up pairs
    for pair in unique_pair_names:
        print(pair)
        # pull out stations corresponding to pair name
        stns_to_pair = concat_by_network.loc[concat_by_network.concat_subset == pair]

        if len(stns_to_pair) == 2:  # 2 stations to concat together
            print("\n", stns_to_pair)

            # import this subset of datasets and convert to dataframe
            url_1 = "s3://wecc-historical-wx/3_qaqc_wx/{}/{}.zarr".format(
                network_name, stns_to_pair.iloc[0]["ERA-ID"]
            )
            url_2 = "s3://wecc-historical-wx/3_qaqc_wx/{}/{}.zarr".format(
                network_name, stns_to_pair.iloc[1]["ERA-ID"]
            )

            print("Retrieving....", url_1)
            print("Retrieving....", url_2)
            ds_1 = xr.open_zarr(url_1)
            ds_2 = xr.open_zarr(url_2)

            ##### Send to helper function for concatenation
            df_concat, stn_n_to_keep, stn_n_to_drop, attrs_new = _ds_concat(
                ds_1, ds_2
            )

            station_names ={"station_name_new":stn_n_to_keep, "old_stations":stn_n_to_drop}

            ds_final = _concat_export_help(
                df_concat,
                network_name,
                attrs_new,
                station_names  # stn_n_to_keep, stn_n_to_drop
            )

            final_concat_list.extend(stns_to_pair["ERA-ID"].tolist())

            # return ds_final, final_concat_list

        else:
            # If there are more than 2 stations in the given subset, pass to _more_than_2()
            print("More than 2 stations within a subset")
            df_concat, station_names = _more_than_2(
                network_name,
                stns_to_pair,
            )

            if df_concat is None: # If the concentation failed
                print('Concatenation of >2 stations was unsuccessful')
            else: # If it was successful, move on to the next steps
                # add station names to station name list
                final_concat_list.extend(stns_to_pair["ERA-ID"].tolist())

                ds_final = _concat_export_help(
                    df_concat,
                    network_name,
                    attrs_new,
                    station_names  # stn_n_to_keep, stn_n_to_drop
                )
    print("Concatenated stations: ", final_concat_list)
    return ds_final, final_concat_list
    # return final_concat_list

In [65]:
ds_to_export, final_concat_list = concatenate_station_pairs2("ASOSAWOS")

ASOSAWOS
There are 12 stations to be concatenated into 5 station pairs within ASOSAWOS...
['ASOSAWOS_0' 'ASOSAWOS_1' 'ASOSAWOS_2' 'ASOSAWOS_3' 'ASOSAWOS_4']
ASOSAWOS_0

                  ERA-ID concat_subset
0  ASOSAWOS_99999903053    ASOSAWOS_0
1  ASOSAWOS_A0001403053    ASOSAWOS_0
Retrieving.... s3://wecc-historical-wx/3_qaqc_wx/ASOSAWOS/ASOSAWOS_99999903053.zarr
Retrieving.... s3://wecc-historical-wx/3_qaqc_wx/ASOSAWOS/ASOSAWOS_A0001403053.zarr
Station will be concatenated and saved as: ASOSAWOS_A0001403053
No overlap!
Exporting.... s3://wecc-historical-wx/3_qaqc_wx/ASOSAWOS/TEST_concat_ASOSAWOS_A0001403053.zarr
Concatenated stations:  ['ASOSAWOS_99999903053', 'ASOSAWOS_A0001403053']
ASOSAWOS_1

                  ERA-ID concat_subset
2  ASOSAWOS_72269593041    ASOSAWOS_1
3  ASOSAWOS_99999993041    ASOSAWOS_1
Retrieving.... s3://wecc-historical-wx/3_qaqc_wx/ASOSAWOS/ASOSAWOS_72269593041.zarr
Retrieving.... s3://wecc-historical-wx/3_qaqc_wx/ASOSAWOS/ASOSAWOS_99999993041.zarr
Station w

  df_concat = pd.merge(df_old, df_new, how="outer")


Exporting.... s3://wecc-historical-wx/3_qaqc_wx/ASOSAWOS/TEST_concat_ASOSAWOS_72269593041.zarr
Concatenated stations:  ['ASOSAWOS_99999903053', 'ASOSAWOS_A0001403053', 'ASOSAWOS_72269593041', 'ASOSAWOS_99999993041']
ASOSAWOS_2

                  ERA-ID concat_subset
4  ASOSAWOS_72272093063    ASOSAWOS_2
5  ASOSAWOS_72272193063    ASOSAWOS_2
Retrieving.... s3://wecc-historical-wx/3_qaqc_wx/ASOSAWOS/ASOSAWOS_72272093063.zarr
Retrieving.... s3://wecc-historical-wx/3_qaqc_wx/ASOSAWOS/ASOSAWOS_72272193063.zarr
Station will be concatenated and saved as: ASOSAWOS_72272193063
There is overlap
Exporting.... s3://wecc-historical-wx/3_qaqc_wx/ASOSAWOS/TEST_concat_ASOSAWOS_72272193063.zarr
Concatenated stations:  ['ASOSAWOS_99999903053', 'ASOSAWOS_A0001403053', 'ASOSAWOS_72269593041', 'ASOSAWOS_99999993041', 'ASOSAWOS_72272093063', 'ASOSAWOS_72272193063']
ASOSAWOS_3
More than 2 stations within a subset

                  ERA-ID concat_subset
6  ASOSAWOS_74003503145    ASOSAWOS_3
7  ASOSAWOS_722805

TypeError: unhashable type: 'DataArray'

In [None]:
ds_to_export

#### original function

In [10]:
def concatenate_station_pairs(network_name):
    """
    Concatenates two input datasets, deletes the originals, and exports the final concatenated dataset. 
    Also returns a list of the ERA-IDs of all stations that are concatenated.

    Rules
    ------
        1.) concatenation: keep the newer station data in the time range in which both stations overlap

    Parameters
    ------
        network_name: string
            weather station network

    Returns
    -------
        if success: 
            return list of ERA-IDs are stations that are concatenated
            all processed datasets are exported to the merge folder in AWS and the original datasets are deleted
        if failure:
            None
    """
    ##### Read in concatenation list of input network
    network_list = s3_cl.get_object(
        Bucket=bucket,
        Key="3_qaqc_wx/{}/concat_list_{}.csv".format(
            network_name, network_name, network_name
        ),
    )
    concat_list = pd.read_csv(BytesIO(network_list["Body"].read()))

    # ! you can truncate the concat list here, for testing
    concat_list = concat_list.head(2)
    # ! end

    subset_number = len(concat_list['concat_subset'].unique())

    # initiate empty list, to which we will iteratively add the ERA-IDs of stations that are concatenated
    final_concat_list = []

    for i in range(0,subset_number):

        # count the number of staions in subset i
        subset_i = concat_list[
            concat_list["concat_subset"].str.contains("{}".format(i))
        ]

        n = subset_i.count()[0]

        # if there are only two stations, proceed with concatenation
        if n == 2:
            try: 
                # retrieve ERA IDs in this subset of stations
                station_1 = subset_i["ERA-ID"].iloc[0]
                station_2 = subset_i["ERA-ID"].iloc[1]

                # import this subset of datasets and convert to dataframe
                url_1 = "s3://wecc-historical-wx/3_qaqc_wx/{}/{}.zarr".format(
                    network_name, station_1
                )
                url_2 = "s3://wecc-historical-wx/3_qaqc_wx/{}/{}.zarr".format(
                    network_name, station_2
                )

                ds_1 = xr.open_zarr(url_1)
                ds_2 = xr.open_zarr(url_2)

                df_1,MultiIndex_1,attrs_1,var_attrs_1,era_qc_vars_1 = qaqc_ds_to_df(ds_1, verbose=False)
                df_2, MultiIndex_2, attrs_2, var_attrs_2, era_qc_vars_2 = qaqc_ds_to_df(ds_2, verbose=False)

                # determine which dataset is older
                if df_1["time"].max() < df_2["time"].max():
                    # if df_1 has an earlier end tiem than df_2, then d_2 is newer
                    # we also grab the name of the newer station in this step, for use later
                    df_new = df_2
                    ds_new = ds_2
                    MultiIndex_new = MultiIndex_2
                    attrs_new = attrs_2

                    df_old = df_1
                    ds_old = ds_1
                    MultiIndex_old = MultiIndex_1

                else:
                    df_new = df_1
                    ds_new = df_1
                    MultiIndex_new = MultiIndex_2
                    attrs_new = attrs_2

                    df_old = df_2
                    ds_old = ds_2
                    MultiIndex_old = MultiIndex_2

                # now set things up to determine if there is temporal overlap between df_new and df_old
                df_overlap = df_new[df_new["time"].isin(df_old["time"])]

                # if there is no overlap between the two time series, just concatenate
                if len(df_overlap) == 0:
                    df_concat = concat([df_old, df_new])

                # if not, split into subsets and concatenate
                else:
                    ##### Split datframes into subsets #####

                    # Remove data in time overlap between old and new
                    df_old_cleaned = df_old[~df_old["time"].isin(df_overlap["time"])]
                    df_new_cleaned = df_new[~df_new["time"].isin(df_overlap["time"])]

                    ##### Concatenate subsets #####
                    df_concat = concat([df_old_cleaned, df_overlap, df_new_cleaned])

                # ##### Now prepare the final concatenated dataframe for export
                station_name_new = MultiIndex_new.get_level_values("station")[1]
                
                # ! This is where Neil and I made the change to address the issues
                # ! 
                MultiIndex_old = pd.MultiIndex.from_tuples(
                    [(station_name_new, lvl1) for _, lvl1 in MultiIndex_old],
                    names=MultiIndex_new.names,
                )

                MultiIndex_concat = MultiIndex_new.union(MultiIndex_old)

                # drop duplicate rows that were potentially generated in the concatenation process
                df_concat = df_concat.drop_duplicates(subset=["time"])

                # drop 'station' and 'time'columns
                df_concat = df_concat.drop(["station", "time","hour","day","month","year","date"], axis=1)

                print('length of MultiIndex_new')
                print(len(MultiIndex_new))
                print("length of MultiIndex_old")
                print(len(MultiIndex_old))
                print("length of MultiIndex_concat")
                print(len(MultiIndex_concat))

                print("length of df_new")
                print(len(df_new))
                print("length of df_old")
                print(len(df_old))
                print("length of df_concat")
                print(len(df_concat))

                # ! This is where the issue! MultiIndex_concat and df_concat have difference lengths
                df_concat.index = MultiIndex_concat

                # # Convert concatenated dataframe to dataset
                # ds_concat = df_concat.to_xarray()

                # # #### Prepare for export #####

                # # Convert datatype of station coordinate
                # ds_concat.coords["station"] = ds_concat.coords["station"].astype("<U20")

                # # # Include past attributes
                # ds_concat.attrs.update(attrs_new)

                # # Update 'history' attribute
                # timestamp = datetime.datetime.utcnow().strftime("%m-%d-%Y, %H:%M:%S")
                # ds_concat.attrs["history"] = ds_concat.attrs[
                #     "history"
                # ] + " \n maritime_merge.ipynb run on {} UTC".format(timestamp)

                # # Update 'comment' attribute
                # ds_concat.attrs["comment"] = (
                #     "Final v1 data product. This data has been subjected to cleaning, QA/QC, and standardization."
                # )

                # # Add new qaqc_files_merged attribute
                # station_name_old = MultiIndex_old.get_level_values("station")[1]
                # ds_concat.attrs["qaqc_files_merged"] = (
                #     "{}, {} merged. Overlap retained from newer station data.".format(
                #         station_name_old, station_name_new
                #     )
                # )

                # ! this is here the renaming will go

                # !

                # ## Export ###
                # ! a test name is used below
                # ! the final name will be that of the newer dataframe
                # export_url = "s3://wecc-historical-wx/3_qaqc_wx/{}/{}_{}.zarr".format(
                #     network_name, "test_concat", station_name_new
                # )
                # ds_concat.to_zarr(export_url, mode="w")

                # record that the stations were concatenated
                final_concat_list.append(station_1)
                final_concat_list.append(station_2)

            except Exception as e:
                print(
                    "Error concatenating subset {}: {}".format(subset_i, e)
                )
        # if there are more than two stations in the subset, continue
        else:
            continue

    # return final_concat_list # ! this will be the final return statement, below is inlcluded for testing
    # return (
    #     df_new,
    #     df_old,
    #     df_concat,
    #     ds_concat,
    #     final_concat_list,
    # )

    return df_1, df_2, MultiIndex_1, MultiIndex_2, df_concat

### TEST

In [8]:
network_name = "MARITIME" # "VALLEYWATER", "MARITIME"

In [9]:
df_1, df_2, MultiIndex_1, MultiIndex_2 = concatenate_station_pairs(network_name)

length of MultiIndex_new
135209
length of MultiIndex_old
135209
length of MultiIndex_concat
135209
length of df_new
1409901
length of df_old
135209
length of df_concat
1509221
Error concatenating subset            ERA-ID concat_subset
0  MARITIME_LJAC1    MARITIME_0
1  MARITIME_LJPC1    MARITIME_0: Length mismatch: Expected axis has 1509221 elements, new values have 135209 elements


ValueError: too many values to unpack (expected 4)

In [None]:
# LJAC1 - this should be new
print(df_1['time'].min())
print(df_1["time"].max())

2005-04-01 02:00:00
2022-08-31 23:54:00


In [None]:
# LJPC1
print(df_2["time"].min())
print(df_2["time"].max())

2005-01-01 01:30:00
2022-08-31 23:20:00


In [6]:
# determine which dataset is older
if df_2["time"].max() > df_1["time"].max():
    # if df_1 has an earlier end tiem than df_2, then d_2 is newer
    # we also grab the name of the newer station in this step, for use later
    df_new = df_2
    MultiIndex_new = MultiIndex_2

    df_old = df_1
    MultiIndex_old = MultiIndex_1

else:
    df_new = df_1
    ds_new = df_1
    MultiIndex_new = MultiIndex_1

    df_old = df_2
    MultiIndex_old = MultiIndex_2

# now set things up to determine if there is temporal overlap between df_new and df_old
df_overlap = df_new[df_new["time"].isin(df_old["time"])]

In [7]:
# if there is no overlap between the two time series, just concatenate
if len(df_overlap) == 0:
    df_concat = concat([df_old, df_new])

# if not, split into subsets and concatenate
else:
    ##### Split datframes into subsets #####

    # Remove data in time overlap between old and new
    df_old_cleaned = df_old[~df_old["time"].isin(df_overlap["time"])]
    df_new_cleaned = df_new[~df_new["time"].isin(df_overlap["time"])]

    ##### Concatenate subsets #####
    df_concat = concat([df_old_cleaned, df_overlap, df_new_cleaned])

In [51]:
# ##### Now prepare the final concatenated dataframe for export
station_name_new = MultiIndex_new.get_level_values("station")[1]

MultiIndex_old = pd.MultiIndex.from_tuples(
    [(station_name_new, lvl1) for _, lvl1 in MultiIndex_old],
    names=MultiIndex_new.names,
)

MultiIndex_concat = MultiIndex_new.union(MultiIndex_old)


# MultiIndex_concat = pd.MultiIndex.from_tuples(
#     [(station_name_new, lvl1) for _, lvl1 in MultiIndex_concat],
#     names=MultiIndex_concat.names,
# )

In [54]:
# drop duplicate rows that were potentially generated in the concatenation process
df_concat = df_concat.drop_duplicates(subset=["time"])

# drop 'station' and 'time'columns
df_concat = df_concat.drop(["station", "time","hour","day","month","year","date"], axis=1)

df_concat.index = MultiIndex_concat

# Convert concatenated dataframe to dataset
ds_concat = df_concat.to_xarray()

In [55]:
ds_concat

Union is the issue - mismatch in timesteps

In [33]:
print(df_1['time'].min())
print(df_1["time"].max())

2005-04-01 02:00:00
2022-08-31 23:54:00


In [34]:
print(df_2["time"].min())
print(df_2["time"].max())

2005-01-01 01:30:00
2022-08-31 23:20:00


df_concat should span 2005-01-01 01:30:00 - 2022-08-31 23:54:00

In [53]:
len(MultiIndex_concat)

1509221

In [10]:
len(df_concat)

1509221

In [211]:
df_concat.columns

Index(['time', 'anemometer_height_m', 'elevation', 'elevation_eraqc', 'lat',
       'lon', 'sfcWind', 'sfcWind_dir', 'sfcWind_dir_eraqc', 'sfcWind_eraqc',
       'tas', 'tas_eraqc', 'thermometer_height_m', 'station', 'hour', 'day',
       'month', 'year', 'date', 'ps', 'ps_eraqc'],
      dtype='object')

In [11]:
# drop duplicate rows that were potentially generated in the concatenation process
df_concat_drop_dups = df_concat.drop_duplicates(subset=["time"])

In [12]:
len(df_concat_drop_dups)

1509221

In [38]:
# MultiIndex_concat and df_concat_drop_dups['time']
index_time = list(MultiIndex_1.get_level_values("time"))
#df_time = list(df_concat_drop_dups['time'])

In [45]:
MultiIndex_1.get_level_values("time")

DatetimeIndex(['2005-04-01 02:00:00', '2005-04-01 03:00:00',
               '2005-04-01 04:00:00', '2005-04-01 05:00:00',
               '2005-04-01 06:00:00', '2005-04-01 07:00:00',
               '2005-04-01 08:00:00', '2005-04-01 09:00:00',
               '2005-04-01 10:00:00', '2005-04-01 11:00:00',
               ...
               '2022-08-31 23:00:00', '2022-08-31 23:06:00',
               '2022-08-31 23:12:00', '2022-08-31 23:18:00',
               '2022-08-31 23:24:00', '2022-08-31 23:30:00',
               '2022-08-31 23:36:00', '2022-08-31 23:42:00',
               '2022-08-31 23:48:00', '2022-08-31 23:54:00'],
              dtype='datetime64[ns]', name='time', length=1409901, freq=None)

In [48]:
df_new['time']

0         2005-04-01 02:00:00
1         2005-04-01 03:00:00
2         2005-04-01 04:00:00
3         2005-04-01 05:00:00
4         2005-04-01 06:00:00
                  ...        
1409896   2022-08-31 23:30:00
1409897   2022-08-31 23:36:00
1409898   2022-08-31 23:42:00
1409899   2022-08-31 23:48:00
1409900   2022-08-31 23:54:00
Name: time, Length: 1409901, dtype: datetime64[ns]

In [46]:
df_concat['time']

0         2005-01-01 01:30:00
1         2005-01-01 02:30:00
2         2005-01-01 03:30:00
3         2005-01-01 04:30:00
4         2005-01-01 05:30:00
                  ...        
1409896   2022-08-31 23:30:00
1409897   2022-08-31 23:36:00
1409898   2022-08-31 23:42:00
1409899   2022-08-31 23:48:00
1409900   2022-08-31 23:54:00
Name: time, Length: 1509221, dtype: datetime64[ns]

In [214]:
dups = df_concat[df_concat['time'].duplicated(keep=False)]

In [215]:
dups

Unnamed: 0,time,anemometer_height_m,elevation,elevation_eraqc,lat,lon,sfcWind,sfcWind_dir,sfcWind_dir_eraqc,sfcWind_eraqc,...,tas_eraqc,thermometer_height_m,station,hour,day,month,year,date,ps,ps_eraqc


In [None]:
# drop 'station' and 'time'columns
df_concat = df_concat.drop(
    ["station", "time", "hour", "day", "month", "year", "date"], axis=1
)

df_concat.index = MultiIndex_concat

#### Test option 1

Run concatenate_station_pairs() as is, so the function does not export and instead returns df_concat, df_new, df_old, and df_overlap

In [130]:
(
    df_new,
    df_old,
    df_concat,
    ds_concat,
    final_concat_list,
) = concatenate_station_pairs(network_name)

Error concatenation stations of subset            ERA-ID concat_subset
0  MARITIME_LJAC1    MARITIME_0
1  MARITIME_LJPC1    MARITIME_0: Length mismatch: Expected axis has 1509221 elements, new values have 135209 elements
Error concatenation stations of subset            ERA-ID concat_subset
2  MARITIME_ICAC1    MARITIME_1
3  MARITIME_SMOC1    MARITIME_1: Length mismatch: Expected axis has 1367110 elements, new values have 282368 elements


UnboundLocalError: local variable 'ds_concat' referenced before assignment

In [None]:
df_concat = df_concat.reset_index(level="time")

#### Test option 2: 

Run concatenate_station_pairs() with the first return statement uncommented and the second commented, and the export section uncommented. So that the function actually exports the concatenated datasets. I've generated all the concatention lists (for VALLEYWATER, MARITIME, and ASOSAWOS) needed to run the function.

In [None]:
output = concatenate_station_pairs(network_name)

In [None]:
# import output
# TODO: you'll need to change the url
url_output = "s3://wecc-historical-wx/3_qaqc_wx/{}/test_concat_{}.zarr".format(
    network_name, network_name
)

# TODO: open_zarr will be used for QAQC'd datasets
ds_concat = xr.open_zarr(url_output)

df_concat = ds_concat.to_dataframe()

In [None]:
network_list = s3_cl.get_object(
    Bucket=bucket,
    Key="3_qaqc_wx/{}/{}_concat_list_{}.csv".format(
        network_name, network_name, network_name
    ),
)
concat_list = pd.read_csv(BytesIO(network_list["Body"].read()))
station_1 = concat_list["ERA-ID"].iloc[0]
station_2 = concat_list["ERA-ID"].iloc[1]

# import this subset of datasets and convert to dataframe
url_1 = "s3://wecc-historical-wx/3_qaqc_wx/{}/{}.zarr".format(network_name, station_1)
url_2 = "s3://wecc-historical-wx/3_qaqc_wx/{}/{}.zarr".format(network_name, station_2)

ds_1 = xr.open_zarr(url_1)
ds_2 = xr.open_zarr(url_2)

df_1 = ds_1.to_dataframe()
df_2 = ds_2.to_dataframe()

In [None]:
# extract time index for plotting
df_1 = df_1.reset_index(level="time")
df_2 = df_2.reset_index(level="time")


df_concat = df_concat.reset_index(level="time")

In [None]:
if df_1["time"].max() < df_2["time"].max(): 
    # if df_1 has an earlier end tiem than df_2, then d_2 is newer
    # we also grab the name of the newer station in this step, for use later
    df_new = df_2
    ds_new = ds_2

    df_old = df_1
    ds_old = ds_1
else:
    df_new = df_1
    ds_new = ds_1

    df_old = df_2
    ds_old = ds_2

#### Onward

In [None]:
ds_concat

In [None]:
df_concat.head(4)

Check overlap

In [None]:
# now set things up to determine if there is temporal overlap between df_new and df_old
df_new_overlap = df_new[df_new["time"].isin(df_concat["time"])]
df_concat_overlap = df_concat[df_concat["time"].isin(df_new["time"])]

In [None]:
df_new_overlap.head(4)

In [None]:
df_concat_overlap.head(4)

Plot the two original datasets

In [None]:
vis_var = 'ps'

In [None]:
# Create a figure with a specific size
plt.figure(figsize=(8, 4))

# Plotting the time series of given dataframe
plt.plot(df_new["time"], df_new[vis_var])

# Plotting the time series of given dataframe
plt.plot(df_old["time"], df_old[vis_var])

# Giving title to the chart using plt.title
plt.title("input dfs")

# rotating the x-axis tick labels at 30degree
# towards right
plt.xticks(rotation=30, ha="right")

# Providing x and y label to the chart
plt.xlabel("time")
plt.ylabel(vis_var)

Plot the output dataset

In [None]:
# Create a figure with a specific size
plt.figure(figsize=(8, 4))

# Plotting the time series of given dataframe
plt.plot(df_concat["time"], df_concat[vis_var])

# Giving title to the chart using plt.title
plt.title("concatenated df")

# rotating the x-axis tick labels at 30degree
# towards right
plt.xticks(rotation=30, ha="right")

# Providing x and y label to the chart
plt.xlabel("time")
plt.ylabel(vis_var)

## Step 4: Mark stations that have been concatenated