# Station Matching

The goal of this notebook is to identify stations that changed IDs. This has been known to occur for Maritime and ASOSOAWOS stations.


## Environment set-up

In [1]:
from shapely.geometry import Point
from shapely.ops import nearest_points

from functools import reduce
import datetime
from pandas import *
import boto3
import geopandas as gpd
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
from io import BytesIO, StringIO

import tempfile  # Used for downloading (and then deleting) netcdfs to local drive from s3 bucket

import s3fs

# import tempfile  # Used for downloading (and then deleting) netcdfs to local drive from s3 bucket
import os

# Silence warnings
import warnings
from shapely.errors import ShapelyDeprecationWarning

warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings(
    "ignore", category=ShapelyDeprecationWarning
)  # Warning is raised when creating Point object from coords. Can't figure out why.

plt.rcParams["figure.dpi"] = 300

In [2]:
# AWS credentials
s3 = boto3.resource("s3")
s3_cl = boto3.client("s3")

## AWS buckets
bucket = "wecc-historical-wx"
qaqcdir = "3_qaqc_wx/"
mergedir = "4_merge_wx/"

In [3]:
# Define temporary directory in local drive for downloading data from S3 bucket
# If the directory doesn't exist, it will be created
# If we used zarr, this wouldn't be neccessary
temp_dir = "./tmp"
if not os.path.exists(temp_dir):
    os.mkdir(temp_dir)

In [4]:
def read_nc_from_s3_clean(network_name, station_id, temp_dir):
    """Read netcdf file containing station data for a single station of interest from AWS s3 bucket

    Parameters
    ----------
    network_name: str
        Name of network (i.e. "ASOSAWOS")
        Must correspond with a valid directory in the s3 bucket (i.e. "CAHYDRO", "CDEC", "ASOSAWOS")
    station_id: str
        Station identifier; i.e. the name of the netcdf file in the bucket (i.e. "ASOSAWOS_72012200114.nc")

    Returns
    -------
    station_data: xr.Dataset

    Notes
    -----
    The data is first downloaded from AWS into a tempfile, which is then deleted after xarray reads in the file
    I'd like to see us use a zarr workflow if possible to avoid this.

    """

    # Temp file for downloading from s3
    temp_file = tempfile.NamedTemporaryFile(
        dir=temp_dir, prefix="", suffix=".nc", delete=True
    )

    # Create s3 file system
    s3 = s3fs.S3FileSystem(anon=False)

    # Get URL to netcdf in S3
    s3_url = "s3://wecc-historical-wx/2_clean_wx/{}/{}.nc".format(
        network_name, station_id
    )

    # Read in the data using xarray
    s3_file_obj = s3.get(s3_url, temp_file.name)
    station_data = xr.open_dataset(temp_file.name, engine="h5netcdf").load()

    # Close temporary file
    temp_file.close()

    return station_data

In [5]:
def read_zarr_from_s3(station_id, temp_dir):
    """Read zarr file containing station data for a single station of interest from AWS s3 bucket

    Parameters
    ----------
    network_name: str
        Name of network (i.e. "ASOSAWOS")
        Must correspond with a valid directory in the s3 bucket (i.e. "CAHYDRO", "CDEC", "ASOSAWOS")
    station_id: str
        Station identifier; i.e. the name of the netcdf file in the bucket (i.e. "ASOSAWOS_72012200114.nc")

    Returns
    -------
    station_data: xr.Dataset

    Notes
    -----
    The data is first downloaded from AWS into a tempfile, which is then deleted after xarray reads in the file
    """

    # Temp file for downloading from s3
    temp_file = tempfile.NamedTemporaryFile(
        dir=temp_dir, prefix="", suffix=".zarr", delete=True
    )

    # Create s3 file system
    s3 = s3fs.S3FileSystem(anon=False)

    # Get URL to netcdf in S3
    s3_url = "s3://wecc-historical-wx/3_qaqc_wx/VALLEYWATER/VALLEYWATER_{}.zarr".format(
        station_id
    )
    print(s3_url)

    # Read in the data using xarray
    s3_file_obj = s3.get(s3_url, temp_file.name)
    station_data = xr.open_dataset(temp_file.name, engine="zarr").load()

    # Close temporary file
    temp_file.close()

    return station_data

In [6]:
def qaqc_ds_to_df_concat(ds, verbose=False):
    """Converts xarray ds for a station to pandas df in the format needed for the pipeline

    Parameters
    ----------
    ds : xr.Dataset
        input data from the clean step
    verbose : bool, optional
        if True, provides runtime output to the terminal

    Returns
    -------
    df : pd.DataFrame
        converted xr.Dataset into dataframe
    MultiIndex : pd.Index
        multi-index of station and time
    attrs : list of str
        attributes from xr.Dataset
    var_attrs : list of str
        variable attributes from xr.Dataset
    era_qc_vars : list of str
        QAQC variables

    Notes
    -------
    Replaced all logger.info() statements with print()
    """

    ## Add qc_flag variable for all variables, including elevation;
    ## defaulting to nan for fill value that will be replaced with qc flag

    for key, val in ds.variables.items():
        if val.dtype == object:
            if key == "station":
                if str in [type(v) for v in ds[key].values]:
                    ds[key] = ds[key].astype(str)
            else:
                if str in [type(v) for v in ds.isel(station=0)[key].values]:
                    ds[key] = ds[key].astype(str)

    exclude_qaqc = [
        "time",
        "station",
        "lat",
        "lon",
        "qaqc_process",
        "sfcWind_method",
        "pr_duration",
        "pr_depth",
        "PREC_flag",
        "rsds_duration",
        "rsds_flag",
        "anemometer_height_m",
        "thermometer_height_m",
    ]  # lat, lon have different qc check

    raw_qc_vars = []  # qc_variable for each data variable, will vary station to station
    era_qc_vars = []  # our ERA qc variable
    # old_era_qc_vars = []  # our ERA qc variable

    for var in ds.data_vars:
        if "q_code" in var:
            raw_qc_vars.append(
                var
            )  # raw qc variable, need to keep for comparison, then drop
        if "_qc" in var:
            raw_qc_vars.append(
                var
            )  # raw qc variables, need to keep for comparison, then drop

    print("Existing observation and QC variables: {}".format(list(ds.keys())))

    # only in-fill nans for valid variables
    for var in ds.data_vars:
        if var not in exclude_qaqc and var not in raw_qc_vars and "_eraqc" not in var:
            qc_var = var + "_eraqc"  # variable/column label

            # if qaqc var does not exist, adds new variable in shape of original variable with designated nan fill value
            if qc_var not in era_qc_vars:
                ds = ds.assign({qc_var: xr.ones_like(ds[var]) * np.nan})
                era_qc_vars.append(qc_var)
                print(
                    "nans created for {}".format(qc_var),
                )
                ds = ds.assign({qc_var: xr.ones_like(ds[var]) * np.nan})

    n_qc = len(era_qc_vars)  # determine length of eraqc variables per station
    print("Created {0} era_qc variables: {1}".format(n_qc, era_qc_vars))

    # Save attributes to inheret them to the QAQC'ed file
    attrs = ds.attrs
    var_attrs = {var: ds[var].attrs for var in list(ds.data_vars.keys())}

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        df = ds.to_dataframe()

    # instrumentation heights
    if "anemometer_height_m" not in df.columns:
        try:
            df["anemometer_height_m"] = (
                np.ones(ds["time"].shape) * ds.anemometer_height_m
            )
        except:
            print("Filling anemometer_height_m with NaN.")
            df["anemometer_height_m"] = np.ones(len(df)) * np.nan

    if "thermometer_height_m" not in df.columns:
        try:
            df["thermometer_height_m"] = (
                np.ones(ds["time"].shape) * ds.thermometer_height_m
            )
        except:
            print("Filling thermometer_height_m with NaN.")
            df["thermometer_height_m"] = np.ones(len(df)) * np.nan

    # De-duplicate time axis
    df = df[~df.index.duplicated()].sort_index()

    # Save station/time multiindex
    MultiIndex = df.index
    station = df.index.get_level_values(0)
    df["station"] = station

    # Station pd.Series to str
    station = station.unique().values[0]

    # Convert time/station index to columns and reset index
    df = df.droplevel(0).reset_index()

    # Add time variables needed by multiple functions
    df["hour"] = pd.to_datetime(df["time"]).dt.hour
    df["day"] = pd.to_datetime(df["time"]).dt.day
    df["month"] = pd.to_datetime(df["time"]).dt.month
    df["year"] = pd.to_datetime(df["time"]).dt.year
    df["date"] = pd.to_datetime(df["time"]).dt.date

    return df, MultiIndex, attrs, var_attrs, era_qc_vars

## Step 1: Identify candidates for concatenation and upload to AWS

We do so by identifying stations with exactly matching latitudes and longitudes.

In [21]:
# A list of networks to be checked for concatenation
target_networks = ["ASOSAWOS","VALLEYWATER", "MARITIME"]

In [22]:
def concatenation_check(station_list):
    """
    This function flags stations that need to be concatenated.

    Rules
    ------
        1.) Stations are flagged if they have identical latitudes and longitudes

    Parameters
    ------
        station_list: pd.DataFrame
            list of station information

    Returns
    -------
        if success:
            new_station_list: pd.DataFrame
                input station list with a flag column assigning an integer to each group of repeat latitudes and longitudes

        if failure:
            None

    """
    ##### Flag stations with identical latitudes and longitudes, then assign each group a unique integer

    # List of possible variable names for longitudes and latitudes
    lat_lon_list = ["LAT", "LON", "latitude", "longitude", "LATITUDE", "LONGITUDE", 'lat','lon']
    # Extract the latitude and longitude variable names from the input dataframe
    lat_lon_cols = [col for col in station_list.columns if col in lat_lon_list]

    # Generate column flagging duplicate latitudes and longitudes
    station_list["concat_subset"] = station_list.duplicated(
        subset=lat_lon_cols, keep=False
    )
    # within each group of identical latitudes and longitudes, assign a unique integer
    station_list["concat_subset"] = (
        station_list[station_list["concat_subset"] == True].groupby(lat_lon_cols).ngroup()
    )

    ##### Order station list by flag
    concat_station_list = station_list.sort_values("concat_subset")

    ##### Keep only flagged stations
    concat_station_list = concat_station_list[~concat_station_list["concat_subset"].isna()]

    ##### Format final list
    # Convert flags to integers - this is necessary for the final concatenation step
    concat_station_list["concat_subset"] = concat_station_list["concat_subset"].astype(
        "int32"
    )
    # Now keep only the ERA-ID and flag column
    era_id_list = ['ERA-ID','era-id']
    era_id_col = [col for col in station_list.columns if col in era_id_list]
    concat_station_list = concat_station_list[era_id_col + ["concat_subset"]]

    # Standardize ERA id to "ERA-ID" (this is specific to Valleywater stations)
    if 'era-id' in era_id_col:
        concat_station_list.rename(columns={"era-id": "ERA-ID"}, inplace=True)

    return concat_station_list

In [23]:
def apply_concat_check(station_names_list):
    """
    This function applies the conatenation check to a list of target stations. 
    It then upload a csv containing the ERA IDs and concatenation subset ID for 
    all identified stations in a network.

    Parameters
    ------
        station__names_list: pd.DataFrame
            list of target station names

    Returns
    -------
        if success:
            uploads list of stations to be concatenated to AWS
        if failure:
            None

    """
    final_list = pd.DataFrame([])
    for station in station_names_list:

        ##### Import station list of target station
        key = "2_clean_wx/{}/stationlist_{}_cleaned.csv".format(station,station)
        bucket_name = "wecc-historical-wx"
        list_import = s3_cl.get_object(
            Bucket=bucket,
            Key=key,
        )
        station_list = pd.read_csv(BytesIO(list_import["Body"].read()))

        ##### Apply concatenation check
        concat_list = concatenation_check(station_list)

        ##### Rename the flags for each subset to <station>_<subset number>
        concat_list["concat_subset"] = station + '_' + concat_list["concat_subset"].astype(str)

        ##### Append to final list of stations to concatenate
        final_list = pd.concat([final_list,concat_list])

        ##### Upload to QAQC directory in AWS
        new_buffer = StringIO()
        final_list.to_csv(new_buffer, index = False)
        content = new_buffer.getvalue()

        # the csv is stored in each station folder within 3_qaqc_wx
        s3_cl.put_object(
            Bucket = bucket_name,
            Body = content,
            Key = qaqcdir + station + "/concat_list_{}.csv".format(station)
        )
        
    return None

In [24]:
apply_concat_check(target_networks)

## Step 2: Concatenate Stations

### The functions

In [7]:
# Lists of variables to be assigned

float64_variables = [
    "anemometer_height_m",
    "elevation",
    "lat",
    "lon",
    "pr_15min",
    "thermometer_height_m",
    "ps",
    "tas",
    "tdps",
    "pr",
    "sfcWind",
    "sfcWind_dir",
    "ps_altimeter",
    "pr_duration",
]
U16_variables = [
    "qaqc_process",
    "sfcWind_method",
    "ps_eraqc",
    "tas_eraqc",
    "tdps_eraqc",
    "pr_eraqc",
    "sfcWind_eraqc",
    "sfcWind_dir_eraqc",
    "elevation_eraqc",
    "ps_altimeter_eraqc",
    "pr_15min_eraqc",
    "ps_qc",
    "ps_altimeter_qc",
    "psl_qc",
    "tas_qc",
    "tdps_qc",
    "pr_qc",
    "pr_depth_qc",
    "sfcWind_qc",
    "sfcWind_dir_qc",
]


In [8]:
def convert_datatypes(ds):
    """
    Converts the datatypes of variables in a dataset based on external libraries. 
    Used in the station concatenation function.

    Parameters
    ------
        ds: xr.Dataset
            weather station network

    Returns
    -------
        if success:
            output dataset with coverted datatypes
        if failure:
            None
    Notes
    -------
    Uses the following externally defined dictionaries to assign datatypes to variables:
    float32_variables: List
            list of variables that will be converted to datatpe "float64"
    U16_variables: List
            list of variables that will be converted to datatpe "<U16"
    """
    # Generate lists of variables from the external dicionaries that are actually present in the input dataset
    existing_float64 = [
        key for key in float64_variables if key in list(ds.keys())
    ]
    existing_U16 = [key for key in U16_variables if key in list(ds.keys())]

    # Convert the datatypes of those variables, but only if those variables exist
    if len(existing_float64) == 0:
        pass
    else:
        ds[existing_float64] = ds[existing_float64].astype("float64")
    
    if len(existing_U16) == 0:
        pass
    else: 
        ds[existing_U16] = ds[existing_U16].astype("<U16")

    # And of the coordinates as well
    ds.coords["station"] = ds.coords["station"].astype("<U16")

    return ds

In [67]:
def concatenate_station_pairs(network_name):
    """
    Concatenates two input datasets, deletes the originals, and exports the final concatenated dataset. 
    Also returns a list of the ERA-IDs of all stations that are concatenated.

    Rules
    ------
        1.) concatenation: keep the newer station data in the time range in which both stations overlap

    Parameters
    ------
        network_name: string
            weather station network

    Returns
    -------
        if success: 
            return list of ERA-IDs are stations that are concatenated
            all processed datasets are exported to the merge folder in AWS and the original datasets are deleted
        if failure:
            None
    """
    ##### Read in concatenation list of input network
    network_list = s3_cl.get_object(
        Bucket=bucket,
        Key="3_qaqc_wx/{}/concat_list_{}.csv".format(
            network_name, network_name, network_name
        ),
    )
    concat_list = pd.read_csv(BytesIO(network_list["Body"].read()))

    # ! truncate the concat list
    concat_list = concat_list.head(2)
    # ! end

    subset_number = len(concat_list['concat_subset'].unique())

    # initiate empty list, to which we will iteratively add the ERA-IDs of stations that are concatenated
    final_concat_list = []

    for i in range(0,subset_number):

        # count the number of staions in subset i
        subset_i = concat_list[
            concat_list["concat_subset"].str.contains("{}".format(i))
        ]

        n = subset_i.count()[0]

        # if there are only two stations, proceed with concatenation
        if n == 2:
            try: 
                # retrieve ERA IDs in this subset of stations
                station_1 = subset_i["ERA-ID"].iloc[0]
                station_2 = subset_i["ERA-ID"].iloc[1]

                final_concat_list.append(station_1)
                final_concat_list.append(station_2)

                # import this subset of datasets and convert to dataframe
                url_1 = "s3://wecc-historical-wx/3_qaqc_wx/{}/{}.zarr".format(
                    network_name, station_1
                )
                url_2 = "s3://wecc-historical-wx/3_qaqc_wx/{}/{}.zarr".format(
                    network_name, station_2
                )

                ds_1 = xr.open_zarr(url_1)
                ds_2 = xr.open_zarr(url_2)

                # ! troubleshooting begings
                # df_1 = ds_1.to_dataframe()
                # df_2 = ds_2.to_dataframe()

                df_1,MultiIndex_1,attrs_1,var_attrs_1,era_qc_vars_1 = qaqc_ds_to_df_concat(ds_1, verbose=False)
                df_2, MultiIndex_2, attrs_2, var_attrs_2, era_qc_vars_2 = (qaqc_ds_to_df_concat(ds_2, verbose=False))
                # ! troubleshooting ends

                # # apply reset index only to 'time', as we will need that for concatenation
                # df_1 = df_1.reset_index(level="time")
                # df_2 = df_2.reset_index(level="time")

                # determine which dataset is older
                if df_1["time"].max() < df_2["time"].max():
                    # if df_1 has an earlier end tiem than df_2, then d_2 is newer
                    # we also grab the name of the newer station in this step, for use later
                    df_new = df_2
                    ds_new = ds_2
                    MultiIndex_new = MultiIndex_2
                    attrs_new = attrs_2
                    var_attrs_new = var_attrs_2

                    df_old = df_1
                    MultiIndex_old = MultiIndex_1
                    attrs_old = attrs_1
                    var_attrs_old = var_attrs_1

                else:
                    df_new = df_1
                    ds_new = df_1
                    MultiIndex_new = MultiIndex_2
                    attrs_new = attrs_2
                    var_attrs_new = var_attrs_2

                    df_old = df_2
                    MultiIndex_old = MultiIndex_2
                    attrs_old = attrs_2
                    var_attrs_old = var_attrs_2

                # now set things up to determine if there is temporal overlap between df_new and df_old
                df_overlap = df_new[df_new["time"].isin(df_old["time"])]

                # if there is no overlap between the two time series, just concatenate
                if len(df_overlap) == 0:
                    df_concat = concat([df_old, df_new])

                # if not, split into subsets and concatenate
                else:
                    ##### Split datframes into subsets #####

                    # Remove data in time overlap between old and new
                    df_old_cleaned = df_old[~df_old["time"].isin(df_overlap["time"])]
                    df_new_cleaned = df_new[~df_new["time"].isin(df_overlap["time"])]

                    ##### Concatenate subsets #####
                    df_concat = concat([df_old_cleaned, df_overlap, df_new_cleaned])

                # ##### Now prepare the final concatenated dataframe for export
                station_name_new = MultiIndex_new.get_level_values("station")[1]
                final_station_name = "{}".format(station_name_new)
                MultiIndex_concat = MultiIndex_new.union(MultiIndex_old)
                MultiIndex_concat = pd.MultiIndex.from_tuples(
                    [(final_station_name, lvl1) for _, lvl1 in MultiIndex_concat],
                    names=MultiIndex_concat.names,
                )

                # drop duplicate rows that were potentially generated in the concatenation process
                df_concat = df_concat.drop_duplicates(subset=["time"])

                # drop 'station' and 'time'columns
                df_concat = df_concat.drop(["station", "time"], axis=1)

                df_concat.index = MultiIndex_concat 

                # # Add 'time' back into multi index
                # df_concat.set_index("time", append=True, inplace=True)

                # Convert concatenated dataframe to dataset
                ds_concat = df_concat.to_xarray()

                # #### Update attributes and datatypes #####

                # # Include past attributes
                ds_concat.attrs.update(attrs_new)

                # Update 'history' attribute
                timestamp = datetime.datetime.utcnow().strftime("%m-%d-%Y, %H:%M:%S")
                ds_concat.attrs["history"] = ds_concat.attrs[
                    "history"
                ] + " \n maritime_merge.ipynb run on {} UTC".format(timestamp)

                # Update 'comment' attribute
                ds_concat.attrs["comment"] = (
                    "Final v1 data product. This data has been subjected to cleaning, QA/QC, and standardization."
                )

                # Add new qaqc_files_merged attribute
                station_name_old = MultiIndex_old.get_level_values("station")[1]
                ds_concat.attrs["qaqc_files_merged"] = (
                    "{}, {} merged. Overlap retained from newer station data.".format(
                        station_name_old, station_name_new
                    )
                )

                # ## Export ###
                # export_url = "s3://wecc-historical-wx/3_qaqc_wx/{}/{}_{}.zarr".format(
                #     network_name, "test_concat", station_name_new
                # )
                # ds_concat.to_zarr(export_url, mode="w")
            except Exception as e:
                print(
                    "Error concatenation stations of subset {}: {}".format(subset_i, e)
                )
        # if there are more than two stations in the subset, continue
        else:
            continue

    # return final_concat_list
    return (
        ds_new,
        ds_concat,
    )  # df_concat, df_new, df_old, ds_concat

### TEST

In [57]:
network_name = "ASOSAWOS"

In [68]:
(
    ds_new,
    ds_concat,
) = concatenate_station_pairs(network_name)

Existing observation and QC variables: ['anemometer_height_m', 'elevation', 'elevation_eraqc', 'lat', 'lon', 'ps', 'ps_altimeter', 'ps_altimeter_eraqc', 'ps_altimeter_qc', 'ps_eraqc', 'ps_qc', 'psl_qc', 'qaqc_process', 'sfcWind', 'sfcWind_dir', 'sfcWind_dir_eraqc', 'sfcWind_dir_qc', 'sfcWind_eraqc', 'sfcWind_method', 'sfcWind_qc', 'tas', 'tas_eraqc', 'tas_qc', 'tdps', 'tdps_eraqc', 'tdps_qc', 'thermometer_height_m']
nans created for elevation_eraqc
nans created for ps_eraqc
nans created for ps_altimeter_eraqc
nans created for sfcWind_eraqc
nans created for sfcWind_dir_eraqc
nans created for tas_eraqc
nans created for tdps_eraqc
Created 7 era_qc variables: ['elevation_eraqc', 'ps_eraqc', 'ps_altimeter_eraqc', 'sfcWind_eraqc', 'sfcWind_dir_eraqc', 'tas_eraqc', 'tdps_eraqc']
Existing observation and QC variables: ['anemometer_height_m', 'elevation', 'elevation_eraqc', 'lat', 'lon', 'ps', 'ps_altimeter', 'ps_altimeter_eraqc', 'ps_altimeter_qc', 'ps_eraqc', 'ps_qc', 'psl_qc', 'qaqc_process'

In [69]:
ds_new

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 3.07 kiB 3.07 kiB Shape (1, 393) (1, 393) Count 2 Graph Layers 1 Chunks Type float64 numpy.ndarray",393  1,

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 3.07 kiB 3.07 kiB Shape (1, 393) (1, 393) Count 2 Graph Layers 1 Chunks Type float64 numpy.ndarray",393  1,

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 3.07 kiB 3.07 kiB Shape (1, 393) (1, 393) Count 2 Graph Layers 1 Chunks Type float64 numpy.ndarray",393  1,

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 3.07 kiB 3.07 kiB Shape (1, 393) (1, 393) Count 2 Graph Layers 1 Chunks Type float64 numpy.ndarray",393  1,

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 3.07 kiB 3.07 kiB Shape (1, 393) (1, 393) Count 2 Graph Layers 1 Chunks Type float64 numpy.ndarray",393  1,

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 3.07 kiB 3.07 kiB Shape (1, 393) (1, 393) Count 2 Graph Layers 1 Chunks Type int64 numpy.ndarray",393  1,

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 3.07 kiB 3.07 kiB Shape (1, 393) (1, 393) Count 2 Graph Layers 1 Chunks Type int64 numpy.ndarray",393  1,

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 3.07 kiB 3.07 kiB Shape (1, 393) (1, 393) Count 2 Graph Layers 1 Chunks Type float64 numpy.ndarray",393  1,

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.54 kiB,1.54 kiB
Shape,"(1, 393)","(1, 393)"
Count,3 Graph Layers,1 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 1.54 kiB 1.54 kiB Shape (1, 393) (1, 393) Count 3 Graph Layers 1 Chunks Type numpy.ndarray",393  1,

Unnamed: 0,Array,Chunk
Bytes,1.54 kiB,1.54 kiB
Shape,"(1, 393)","(1, 393)"
Count,3 Graph Layers,1 Chunks
Type,numpy.ndarray,

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 3.07 kiB 3.07 kiB Shape (1, 393) (1, 393) Count 2 Graph Layers 1 Chunks Type float64 numpy.ndarray",393  1,

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.54 kiB,1.54 kiB
Shape,"(1, 393)","(1, 393)"
Count,3 Graph Layers,1 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 1.54 kiB 1.54 kiB Shape (1, 393) (1, 393) Count 3 Graph Layers 1 Chunks Type numpy.ndarray",393  1,

Unnamed: 0,Array,Chunk
Bytes,1.54 kiB,1.54 kiB
Shape,"(1, 393)","(1, 393)"
Count,3 Graph Layers,1 Chunks
Type,numpy.ndarray,

Unnamed: 0,Array,Chunk
Bytes,1.54 kiB,1.54 kiB
Shape,"(1, 393)","(1, 393)"
Count,3 Graph Layers,1 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 1.54 kiB 1.54 kiB Shape (1, 393) (1, 393) Count 3 Graph Layers 1 Chunks Type numpy.ndarray",393  1,

Unnamed: 0,Array,Chunk
Bytes,1.54 kiB,1.54 kiB
Shape,"(1, 393)","(1, 393)"
Count,3 Graph Layers,1 Chunks
Type,numpy.ndarray,

Unnamed: 0,Array,Chunk
Bytes,1.54 kiB,1.54 kiB
Shape,"(1, 393)","(1, 393)"
Count,3 Graph Layers,1 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 1.54 kiB 1.54 kiB Shape (1, 393) (1, 393) Count 3 Graph Layers 1 Chunks Type numpy.ndarray",393  1,

Unnamed: 0,Array,Chunk
Bytes,1.54 kiB,1.54 kiB
Shape,"(1, 393)","(1, 393)"
Count,3 Graph Layers,1 Chunks
Type,numpy.ndarray,

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 3.07 kiB 3.07 kiB Shape (1, 393) (1, 393) Count 2 Graph Layers 1 Chunks Type float64 numpy.ndarray",393  1,

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 3.07 kiB 3.07 kiB Shape (1, 393) (1, 393) Count 2 Graph Layers 1 Chunks Type float64 numpy.ndarray",393  1,

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 3.07 kiB 3.07 kiB Shape (1, 393) (1, 393) Count 2 Graph Layers 1 Chunks Type float64 numpy.ndarray",393  1,

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.54 kiB,1.54 kiB
Shape,"(1, 393)","(1, 393)"
Count,3 Graph Layers,1 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 1.54 kiB 1.54 kiB Shape (1, 393) (1, 393) Count 3 Graph Layers 1 Chunks Type numpy.ndarray",393  1,

Unnamed: 0,Array,Chunk
Bytes,1.54 kiB,1.54 kiB
Shape,"(1, 393)","(1, 393)"
Count,3 Graph Layers,1 Chunks
Type,numpy.ndarray,

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 3.07 kiB 3.07 kiB Shape (1, 393) (1, 393) Count 2 Graph Layers 1 Chunks Type float64 numpy.ndarray",393  1,

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.54 kiB,1.54 kiB
Shape,"(1, 393)","(1, 393)"
Count,3 Graph Layers,1 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 1.54 kiB 1.54 kiB Shape (1, 393) (1, 393) Count 3 Graph Layers 1 Chunks Type numpy.ndarray",393  1,

Unnamed: 0,Array,Chunk
Bytes,1.54 kiB,1.54 kiB
Shape,"(1, 393)","(1, 393)"
Count,3 Graph Layers,1 Chunks
Type,numpy.ndarray,

Unnamed: 0,Array,Chunk
Bytes,1.54 kiB,1.54 kiB
Shape,"(1, 393)","(1, 393)"
Count,3 Graph Layers,1 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 1.54 kiB 1.54 kiB Shape (1, 393) (1, 393) Count 3 Graph Layers 1 Chunks Type numpy.ndarray",393  1,

Unnamed: 0,Array,Chunk
Bytes,1.54 kiB,1.54 kiB
Shape,"(1, 393)","(1, 393)"
Count,3 Graph Layers,1 Chunks
Type,numpy.ndarray,

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 3.07 kiB 3.07 kiB Shape (1, 393) (1, 393) Count 2 Graph Layers 1 Chunks Type int64 numpy.ndarray",393  1,

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 3.07 kiB 3.07 kiB Shape (1, 393) (1, 393) Count 2 Graph Layers 1 Chunks Type float64 numpy.ndarray",393  1,

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.54 kiB,1.54 kiB
Shape,"(1, 393)","(1, 393)"
Count,3 Graph Layers,1 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 1.54 kiB 1.54 kiB Shape (1, 393) (1, 393) Count 3 Graph Layers 1 Chunks Type numpy.ndarray",393  1,

Unnamed: 0,Array,Chunk
Bytes,1.54 kiB,1.54 kiB
Shape,"(1, 393)","(1, 393)"
Count,3 Graph Layers,1 Chunks
Type,numpy.ndarray,

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 3.07 kiB 3.07 kiB Shape (1, 393) (1, 393) Count 2 Graph Layers 1 Chunks Type int64 numpy.ndarray",393  1,

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 3.07 kiB 3.07 kiB Shape (1, 393) (1, 393) Count 2 Graph Layers 1 Chunks Type float64 numpy.ndarray",393  1,

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.54 kiB,1.54 kiB
Shape,"(1, 393)","(1, 393)"
Count,3 Graph Layers,1 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 1.54 kiB 1.54 kiB Shape (1, 393) (1, 393) Count 3 Graph Layers 1 Chunks Type numpy.ndarray",393  1,

Unnamed: 0,Array,Chunk
Bytes,1.54 kiB,1.54 kiB
Shape,"(1, 393)","(1, 393)"
Count,3 Graph Layers,1 Chunks
Type,numpy.ndarray,

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 3.07 kiB 3.07 kiB Shape (1, 393) (1, 393) Count 2 Graph Layers 1 Chunks Type float64 numpy.ndarray",393  1,

Unnamed: 0,Array,Chunk
Bytes,3.07 kiB,3.07 kiB
Shape,"(1, 393)","(1, 393)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray


In [66]:
ds_concat

#### Test option 1

Run concatenate_station_pairs() as is, so the function does not export and instead returns df_concat, df_new, df_old, and df_overlap

In [None]:
df_concat, df_new, df_old, ds_concat = concatenate_station_pairs(network_name)

In [None]:
df_concat = df_concat.reset_index(level="time")

#### Test option 2: 

Run concatenate_station_pairs() with the first return statement uncommented and the second commented, and the export section uncommented. So that the function actually exports the concatenated datasets. I've generated all the concatention lists (for VALLEYWATER, MARITIME, and ASOSAWOS) needed to run the function.

In [None]:
output = concatenate_station_pairs(network_name)

In [None]:
# import output
# TODO: you'll need to change the url
url_output = "s3://wecc-historical-wx/3_qaqc_wx/{}/test_concat_{}.zarr".format(
    network_name, network_name
)

# TODO: open_zarr will be used for QAQC'd datasets
ds_concat = xr.open_zarr(url_output)

df_concat = ds_concat.to_dataframe()

In [None]:
network_list = s3_cl.get_object(
    Bucket=bucket,
    Key="3_qaqc_wx/{}/{}_concat_list_{}.csv".format(
        network_name, network_name, network_name
    ),
)
concat_list = pd.read_csv(BytesIO(network_list["Body"].read()))
station_1 = concat_list["ERA-ID"].iloc[0]
station_2 = concat_list["ERA-ID"].iloc[1]

# import this subset of datasets and convert to dataframe
url_1 = "s3://wecc-historical-wx/3_qaqc_wx/{}/{}.zarr".format(network_name, station_1)
url_2 = "s3://wecc-historical-wx/3_qaqc_wx/{}/{}.zarr".format(network_name, station_2)

ds_1 = xr.open_zarr(url_1)
ds_2 = xr.open_zarr(url_2)

df_1 = ds_1.to_dataframe()
df_2 = ds_2.to_dataframe()

In [None]:
# extract time index for plotting
df_1 = df_1.reset_index(level="time")
df_2 = df_2.reset_index(level="time")


df_concat = df_concat.reset_index(level="time")

In [None]:
if df_1["time"].max() < df_2["time"].max(): 
    # if df_1 has an earlier end tiem than df_2, then d_2 is newer
    # we also grab the name of the newer station in this step, for use later
    df_new = df_2
    ds_new = ds_2

    df_old = df_1
    ds_old = ds_1
else:
    df_new = df_1
    ds_new = ds_1

    df_old = df_2
    ds_old = ds_2

#### Onward

In [20]:
ds_concat

In [18]:
df_concat.head(4)

Unnamed: 0_level_0,time,anemometer_height_m,elevation,elevation_eraqc,lat,lon,pr,pr_depth_qc,pr_duration,pr_eraqc,...,psl,psl_eraqc,sfcWind,sfcWind_dir,sfcWind_dir_eraqc,sfcWind_eraqc,tas,tas_eraqc,tdps,tdps_eraqc
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ASOSAWOS_A0002694297,1996-07-02 07:59:00,10.06,61.0,,48.217,-122.633,0.0,9.0,1 days,,...,,,,,,,,,,
ASOSAWOS_A0002694297,1996-07-03 07:59:00,10.06,61.0,,48.217,-122.633,0.5,9.0,1 days,,...,,,,,,,,,,
ASOSAWOS_A0002694297,1996-07-04 07:59:00,10.06,61.0,,48.217,-122.633,6.4,9.0,1 days,,...,,,,,,,,,,
ASOSAWOS_A0002694297,1996-07-05 07:59:00,10.06,61.0,,48.217,-122.633,0.5,9.0,1 days,,...,,,,,,,,,,


Check overlap

In [19]:
# now set things up to determine if there is temporal overlap between df_new and df_old
df_new_overlap = df_new[df_new["time"].isin(df_concat["time"])]
df_concat_overlap = df_concat[df_concat["time"].isin(df_new["time"])]

In [None]:
df_new_overlap.head(4)

In [None]:
df_concat_overlap.head(4)

Plot the two original datasets

In [None]:
# Create a figure with a specific size
plt.figure(figsize=(8, 4))

# Plotting the time series of given dataframe
plt.plot(df_new["time"], df_new["pr"])

# Plotting the time series of given dataframe
plt.plot(df_old["time"], df_old["pr"])

# Giving title to the chart using plt.title
plt.title("input dfs")

# rotating the x-axis tick labels at 30degree
# towards right
plt.xticks(rotation=30, ha="right")

# Providing x and y label to the chart
plt.xlabel("time")
plt.ylabel("pr_15min")

plt.show()

Plot the output dataset

In [None]:
# Create a figure with a specific size
plt.figure(figsize=(8, 4))

# Plotting the time series of given dataframe
plt.plot(df_concat["time"], df_concat["pr"])

# Giving title to the chart using plt.title
plt.title("concatenated df")

# rotating the x-axis tick labels at 30degree
# towards right
plt.xticks(rotation=30, ha="right")

# Providing x and y label to the chart
plt.xlabel("time")
plt.ylabel("pr_15min")

plt.show()

## Step 4: Mark stations that have been concatenated