# Station Matching

The goal of this notebook is to identify stations that changed IDs. The IDs fo these pairs of matching stations will be stored in a csv, which then be fed into concentation as a lookup table. Pairs will receive unique flags, with the older station receiving a different flag than the newer station.


## Environment set-up

In [2]:
from shapely.geometry import Point
from shapely.ops import nearest_points

from functools import reduce
import datetime
from pandas import *
import boto3
import geopandas as gpd
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
from io import BytesIO, StringIO

import tempfile  # Used for downloading (and then deleting) netcdfs to local drive from s3 bucket

import s3fs

# import tempfile  # Used for downloading (and then deleting) netcdfs to local drive from s3 bucket
import os

# Silence warnings
import warnings
from shapely.errors import ShapelyDeprecationWarning

warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings(
    "ignore", category=ShapelyDeprecationWarning
)  # Warning is raised when creating Point object from coords. Can't figure out why.

plt.rcParams["figure.dpi"] = 300

In [3]:
# AWS credentials
s3 = s3fs.S3FileSystem #must be set to this to use such commands as ls
#s3 = boto3.resource('s3')
s3_client = boto3.client('s3')

## AWS buckets
bucket = "wecc-historical-wx"
qaqcdir = "3_qaqc_wx/VALLEYWATER/"
mergedir = "4_merge_wx/VALLEYWATER/"

In [4]:
# Define temporary directory in local drive for downloading data from S3 bucket
# If the directory doesn't exist, it will be created
# If we used zarr, this wouldn't be neccessary
temp_dir = "./tmp"
if not os.path.exists(temp_dir):
    os.mkdir(temp_dir)

In [5]:
def read_nc_from_s3_clean(network_name, station_id, temp_dir):
    """Read netcdf file containing station data for a single station of interest from AWS s3 bucket

    Parameters
    ----------
    network_name: str
        Name of network (i.e. "ASOSAWOS")
        Must correspond with a valid directory in the s3 bucket (i.e. "CAHYDRO", "CDEC", "ASOSAWOS")
    station_id: str
        Station identifier; i.e. the name of the netcdf file in the bucket (i.e. "ASOSAWOS_72012200114.nc")

    Returns
    -------
    station_data: xr.Dataset

    Notes
    -----
    The data is first downloaded from AWS into a tempfile, which is then deleted after xarray reads in the file
    I'd like to see us use a zarr workflow if possible to avoid this.

    """

    # Temp file for downloading from s3
    temp_file = tempfile.NamedTemporaryFile(
        dir=temp_dir, prefix="", suffix=".nc", delete=True
    )

    # Create s3 file system
    s3 = s3fs.S3FileSystem(anon=False)

    # Get URL to netcdf in S3
    s3_url = "s3://wecc-historical-wx/2_clean_wx/{}/{}.nc".format(
        network_name, station_id
    )

    # Read in the data using xarray
    s3_file_obj = s3.get(s3_url, temp_file.name)
    station_data = xr.open_dataset(temp_file.name, engine="h5netcdf").load()

    # Close temporary file
    temp_file.close()

    return station_data

In [6]:
def read_zarr_from_s3(station_id, temp_dir):
    """Read zarr file containing station data for a single station of interest from AWS s3 bucket

    Parameters
    ----------
    network_name: str
        Name of network (i.e. "ASOSAWOS")
        Must correspond with a valid directory in the s3 bucket (i.e. "CAHYDRO", "CDEC", "ASOSAWOS")
    station_id: str
        Station identifier; i.e. the name of the netcdf file in the bucket (i.e. "ASOSAWOS_72012200114.nc")

    Returns
    -------
    station_data: xr.Dataset

    Notes
    -----
    The data is first downloaded from AWS into a tempfile, which is then deleted after xarray reads in the file
    """

    # Temp file for downloading from s3
    temp_file = tempfile.NamedTemporaryFile(
        dir=temp_dir, prefix="", suffix=".zarr", delete=True
    )

    # Create s3 file system
    s3 = s3fs.S3FileSystem(anon=False)

    # Get URL to netcdf in S3
    s3_url = "s3://wecc-historical-wx/3_qaqc_wx/VALLEYWATER/VALLEYWATER_{}.zarr".format(
        station_id
    )
    print(s3_url)

    # Read in the data using xarray
    s3_file_obj = s3.get(s3_url, temp_file.name)
    station_data = xr.open_dataset(temp_file.name, engine="zarr").load()

    # Close temporary file
    temp_file.close()

    return station_data

In [7]:
def qaqc_ds_to_df(ds, verbose=False):
    """Converts xarray ds for a station to pandas df in the format needed for the pipeline

    Parameters
    ----------
    ds : xr.Dataset
        input data from the clean step
    verbose : bool, optional
        if True, provides runtime output to the terminal

    Returns
    -------
    df : pd.DataFrame
        converted xr.Dataset into dataframe
    MultiIndex : pd.Index
        multi-index of station and time
    attrs : list of str
        attributes from xr.Dataset
    var_attrs : list of str
        variable attributes from xr.Dataset
    era_qc_vars : list of str
        QAQC variables

    Notes
    -----
    This is the notebook friendly version (no logger statements).
    """
    ## Add qc_flag variable for all variables, including elevation;
    ## defaulting to nan for fill value that will be replaced with qc flag

    for key, val in ds.variables.items():
        if val.dtype == object:
            if key == "station":
                if str in [type(v) for v in ds[key].values]:
                    ds[key] = ds[key].astype(str)
            else:
                if str in [type(v) for v in ds.isel(station=0)[key].values]:
                    ds[key] = ds[key].astype(str)

    exclude_qaqc = [
        "time",
        "station",
        "lat",
        "lon",
        "qaqc_process",
        "sfcWind_method",
        "pr_duration",
        "pr_depth",
        "PREC_flag",
        "rsds_duration",
        "rsds_flag",
        "anemometer_height_m",
        "thermometer_height_m",
    ]  # lat, lon have different qc check

    raw_qc_vars = []  # qc_variable for each data variable, will vary station to station
    era_qc_vars = []  # our ERA qc variable
    old_era_qc_vars = []  # our ERA qc variable

    for var in ds.data_vars:
        if "q_code" in var:
            raw_qc_vars.append(
                var
            )  # raw qc variable, need to keep for comparison, then drop
        if "_qc" in var:
            raw_qc_vars.append(
                var
            )  # raw qc variables, need to keep for comparison, then drop
        if "_eraqc" in var:
            era_qc_vars.append(
                var
            )  # raw qc variables, need to keep for comparison, then drop
            old_era_qc_vars.append(var)

    print(f"era_qc existing variables:\n{era_qc_vars}")
    n_qc = len(era_qc_vars)

    for var in ds.data_vars:
        if var not in exclude_qaqc and var not in raw_qc_vars and "_eraqc" not in var:
            qc_var = var + "_eraqc"  # variable/column label

            # if qaqc var does not exist, adds new variable in shape of original variable with designated nan fill value
            if qc_var not in era_qc_vars:
                print(f"nans created for {qc_var}")
                ds = ds.assign({qc_var: xr.ones_like(ds[var]) * np.nan})
                era_qc_vars.append(qc_var)

    print("{} created era_qc variables".format(len(era_qc_vars) - len(old_era_qc_vars)))
    if len(era_qc_vars) != n_qc:
        print("{}".format(np.setdiff1d(old_era_qc_vars, era_qc_vars)))

    # Save attributes to inheret them to the QAQC'ed file
    attrs = ds.attrs
    # var_attrs = {var: ds[var].attrs for var in list(ds.data_vars.keys())}

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        df = ds.to_dataframe()

    # instrumentation heights
    if "anemometer_height_m" not in df.columns:
        try:
            df["anemometer_height_m"] = (
                np.ones(ds["time"].shape) * ds.anemometer_height_m
            )
        except:
            print("Filling anemometer_height_m with NaN.", flush=True)
            df["anemometer_height_m"] = np.ones(len(df)) * np.nan
        finally:
            pass
    if "thermometer_height_m" not in df.columns:
        try:
            df["thermometer_height_m"] = (
                np.ones(ds["time"].shape) * ds.thermometer_height_m
            )
        except:
            print("Filling thermometer_height_m with NaN.", flush=True)
            df["thermometer_height_m"] = np.ones(len(df)) * np.nan
        finally:
            pass

    # De-duplicate time axis
    df = df[~df.index.duplicated()].sort_index()

    # Save station/time multiindex
    MultiIndex = df.index
    station = df.index.get_level_values(0)
    df["station"] = station

    # Station pd.Series to str
    station = station.unique().values[0]

    # Convert time/station index to columns and reset index
    df = df.droplevel(0).reset_index()

    # Add time variables needed by multiple functions
    df["hour"] = pd.to_datetime(df["time"]).dt.hour
    df["day"] = pd.to_datetime(df["time"]).dt.day
    df["month"] = pd.to_datetime(df["time"]).dt.month
    df["year"] = pd.to_datetime(df["time"]).dt.year
    df["date"] = pd.to_datetime(df["time"]).dt.date

    return df  # , MultiIndex, attrs, var_attrs, era_qc_vars

## Step 1: Load station lists for testing

In [13]:
### Read in ASOSAWOS stations

s3_cl = boto3.client("s3")  # for lower-level processes

asosawos = s3_cl.get_object(
    Bucket="wecc-historical-wx", Key="2_clean_wx/ASOSAWOS/stationlist_ASOSAWOS_cleaned.csv"
)
asosawos_list = pd.read_csv(BytesIO(asosawos["Body"].read()))

In [14]:
valleywater = s3_cl.get_object(
    Bucket="wecc-historical-wx", Key="2_clean_wx/VALLEYWATER/stationlist_VALLEYWATER_cleaned.csv"
)
valleywater_list = pd.read_csv(BytesIO(valleywater["Body"].read()))

In [15]:
maritime = s3_cl.get_object(
    Bucket="wecc-historical-wx", Key="2_clean_wx/MARITIME/stationlist_MARITIME_cleaned.csv"
)
maritime_list = pd.read_csv(BytesIO(maritime["Body"].read()))

## Step 2: Identify candidates for concatenation

We do so using 1 of two methods
1. repeat lat, lons
2. using matching IDs, for stations in which those exist (NOT currently used)

### TESTING: using ICAO values

In [None]:
repeat_list = asosawos_list[asosawos_list.duplicated(subset=['ICAO'], keep=False)] 

# how many unique ICAO duplicates are there?
print(len(repeat_list['ICAO'].unique()))

print(repeat_list.groupby('ICAO').count().max())

46
ERA-ID              6
USAF                6
WBAN                6
STATION NAME        6
CTRY                6
                   ..
sfcWind_dir         6
sfcWind_dir_nobs    6
rsds                6
rsds_nobs           6
total_nobs          6
Length: 67, dtype: int64


In [18]:
print(len(repeat_list['ICAO'].unique()))

46


#### Investigate problem station KMLF

In [19]:
kmlf = repeat_list[repeat_list['ICAO']=='KMLF']

In [20]:
kmlf[['STATION NAME','LAT','LON','start_time','end_time']]

Unnamed: 0,STATION NAME,LAT,LON,start_time,end_time
61,MILFORD MUNICIPAL AP,38.417,-113.017,1948-07-23,1996-12-31
154,MILFORD MUNICIPAL AP,38.417,-113.017,1977-01-01,1983-04-30
166,MILFORD MUNICIPAL AP,38.417,-113.017,1983-05-01,1985-05-31
170,MILFORD MUNICIPAL AP,38.417,-113.017,1985-06-01,1989-05-01
185,MILFORD MUNI BRISCOE,38.417,-113.017,1997-01-01,2022-12-31
226,MILFORD MUNICIPAL AIRPORT,38.423,-113.011,2005-01-01,2022-12-31


### TESTING: using station locations (lat, lons)

In [53]:
# test dataframe

test = asosawos_list

In [None]:
lat_lon_list = ['LAT', 'LON', 'latitude', 'longitude', 'LATITUDE', 'LONGITUDE']


In [57]:
lat_lon_cols = [col for col in test.columns if col in lat_lon_list]

In [58]:
lat_lon_cols

['LAT', 'LON']

In [59]:
test['concat_flag'] = asosawos_list.duplicated(subset=lat_lon_cols, keep=False)

In [60]:
test['concat_flag'] = test[test['concat_flag']==True].groupby(lat_lon_cols).ngroup()

In [61]:
time_var_list = ['end_time','end-date']
end_time_col = [col for col in test.columns if col in time_var_list]

In [62]:
end_time_col

['end_time']

In [63]:
test = test.sort_values('concat_flag')
test = test.groupby(["concat_flag"]).apply(lambda x: x.sort_values(end_time_col)).reset_index(drop=True)

### Final Function

In [68]:
def concatenation_check(station_list):
    """
    This function flags stations that need to be concatenated 

    Rules
    ------
        1.) Stations are flagged as needing to be concatenated if they have identical latitudes and longitudes

    Parameters
    ------
        station_list: pd.DataFrame
            list of station information

    Returns
    -------
        if success:
            new_station_list: pd.DataFrame
                input station list with a flag column assigning and integer to each group of repeat latitudes and longitudes

        if failure:
            None
    Notes
    -------
    

    Example
    -------


    """
    ##### flag stations with identical lat lons, then assign each group a unique integer

    # list of possible variables names for longitudes and latitudes
    lat_lon_list = ['LAT', 'LON', 'latitude', 'longitude', 'LATITUDE', 'LONGITUDE']
    # extract the latitude and longitude variable names from the input dataframe
    lat_lon_cols = [col for col in station_list.columns if col in lat_lon_list]

    # generate column flagging duplicate latitudes and longitudes
    station_list['concat_flag'] = station_list.duplicated(subset=lat_lon_cols, keep=False)
    # within each group of identical latitudes and longitudes, assign a unique integer
    station_list['concat_flag'] = station_list[station_list['concat_flag']==True].groupby(lat_lon_cols).ngroup()
    
    ##### order station list by flag
    new_station_list = station_list.sort_values('concat_flag')

    ##### keep only flagged stations
    new_station_list = new_station_list[~new_station_list['concat_flag'].isna()]
    
    # # sort by end_time or end-date, depending on the station TODO: this is not necessary
    # time_var_list = ['end_time','end-date']
    # end_time_or_date = [col for col in station_list.columns if col in time_var_list]
    # new_station_list = new_station_list.groupby('concat_flag').apply(lambda x: x.sort_values(end_time_or_date)).reset_index(drop=True)

    return new_station_list

In [69]:
asosawos_out = concatenation_check(asosawos_list)
maritime_out = concatenation_check(maritime_list)
valleywater_out = concatenation_check(valleywater_list)


### Check that the check includes stations already flagged for concatenation

MARITIME MTYC1 - MEYC1

MARITIME SMOC1 - ICAC1

In [70]:
maritime_out

# Flagged Stations:
# MARITIME_LJAC1 <=> MARITIME_LJPC1
# MARITIME_ICAC1 <=> MARITIME_SMOC1	
# MARITIME_MEYC1 <=> MARITIME_MTYC1 <=> MARITIME_MYXC1

Unnamed: 0,ERA-ID,STATION_ID,OWNER,NAME,LOCATION,LATITUDE,LONGITUDE,in_terr_wecc,in_mar_wecc,NETWORK,...,hurs,hurs_nobs,sfcWind,sfcWind_nobs,sfcWind_dir,sfcWind_dir_nobs,rsds,rsds_nobs,total_nobs,concat_flag
18,MARITIME_LJAC1,ljac1,O,"9410230 - La Jolla, CA","32.867 N 117.257 W (32&#176;52'1"" N 117&#176;1...",32.867,-117.257,Y,,MARITIME,...,N,0,Y,1098253,Y,1099588,N,0,1409901,0.0
19,MARITIME_LJPC1,ljpc1,R,"La Jolla, CA (073)","32.867 N 117.257 W (32&#176;52'0"" N 117&#176;1...",32.867,-117.257,Y,,MARITIME,...,N,0,Y,105188,Y,104295,N,0,135209,0.0
17,MARITIME_ICAC1,icac1,O,9410840 - Santa Monica Pier,"34.008 N 118.500 W (34&#176;0'28"" N 118&#176;2...",34.008,-118.5,Y,,MARITIME,...,N,0,Y,997061,Y,996677,N,0,1084742,1.0
42,MARITIME_SMOC1,smoc1,O,"9410840 - Santa Monica, CA","34.008 N 118.500 W (34&#176;0'30"" N 118&#176;3...",34.008,-118.5,Y,,MARITIME,...,N,0,Y,52823,Y,53110,N,0,282368,1.0
23,MARITIME_MEYC1,meyc1,O,"9413450 - Monterey, CA","36.605 N 121.889 W (36&#176;36'18"" N 121&#176;...",36.605,-121.889,Y,,MARITIME,...,N,0,Y,415573,Y,415191,N,0,538763,2.0
24,MARITIME_MTYC1,mtyc1,O,"9413450 - Monterey, CA","36.605 N 121.889 W (36&#176;36'18"" N 121&#176;...",36.605,-121.889,Y,,MARITIME,...,N,0,Y,237168,Y,236901,N,0,874261,2.0
25,MARITIME_MYXC1,myxc1,CN,"Monterrey, CA","36.605 N 121.889 W (36&#176;36'18"" N 121&#176;...",36.605,-121.889,Y,,MARITIME,...,N,0,N,0,N,0,N,0,0,2.0


The previously identified stations are indeed flagged. Along with an additional pair: MARITIME_LJAC1 and MARITIME_LJPC1. And a third station included with MARITIME_MEYC1 abd MARITIME_MTYC1: MARITIME_MYXC1.

## Step 3: Filter out duplicates

Potential ways to check that two stations are duplicates
1. identical total_nobs
2. identical ERA IDs
3. identical end or start times 

In [62]:
# Extract flagged stations

asosawos_dup = asosawos_out[~asosawos_out['concat_flag'].isna()]
valleywater_dup = valleywater_out[~valleywater_out['concat_flag'].isna()]
maritime_dup = maritime_out[~maritime_out['concat_flag'].isna()]

In [65]:
valleywater_dup

Unnamed: 0,era-id,longitude,latitude,elevation,start-date,end-date,cleaned,time_cleaned,network,pr_15min_nobs,total_nobs,concat_flag
14,VALLEYWATER_6053,-122.081,37.3322,,"06-21-1974, 08:15:00","07-25-2017, 02:15:00",Y,"01-28-2025, 01:32:09",VALLEYWATER,1511017.0,1511017.0,0.0
41,VALLEYWATER_6144,-122.081,37.3322,,"08-04-2013, 07:15:00","01-10-2025, 17:15:00",Y,"01-28-2025, 01:32:09",VALLEYWATER,401033.0,401033.0,0.0


In [66]:
asosawos_dup

Unnamed: 0,ERA-ID,USAF,WBAN,STATION NAME,CTRY,STATE,ICAO,LAT,LON,ELEV(M),...,hurs,hurs_nobs,sfcWind,sfcWind_nobs,sfcWind_dir,sfcWind_dir_nobs,rsds,rsds_nobs,total_nobs,concat_flag
214,ASOSAWOS_99999903053,999999,3053,DONA ANA COUNTY AIRPORT AT SANTA TERESA,US,NM,K5T6,31.883,-106.717,1253.3,...,N,0,Y,90,Y,85,N,0,465,0.0
379,ASOSAWOS_A0001403053,A00014,3053,DONA ANA COUNTY AIRPORT AT SANTA TERESA,US,NM,K5T6,31.883,-106.717,1253.3,...,N,0,Y,381,Y,366,N,0,393,0.0
367,ASOSAWOS_72269593041,722695,93041,LAS CRUCES INTL AIRPORT,US,NM,KLRU,32.283,-106.917,1357.6,...,N,0,Y,429942,Y,367455,N,0,435510,1.0
211,ASOSAWOS_99999993041,999999,93041,LAS CRUCES MUNICIPAL AP,US,NM,KLRU,32.283,-106.917,1358.2,...,N,0,Y,8334,Y,7075,N,0,9031,1.0
140,ASOSAWOS_72272093063,722720,93063,BISBEE DOUGLAS INTL,US,AZ,KSVC,32.633,-108.167,1659.9,...,N,0,Y,180734,Y,156041,N,0,185610,2.0
227,ASOSAWOS_72272193063,722721,93063,GRANT COUNTY AIRPORT,US,NM,KSVC,32.633,-108.167,1637.7,...,N,0,Y,446284,Y,413346,N,0,451618,2.0
222,ASOSAWOS_74003503145,740035,3145,YUMA MCAS,US,AZ,KNYL,32.65,-114.617,64.9,...,N,0,Y,157305,Y,126730,N,0,163310,3.0
155,ASOSAWOS_72280503145,722805,3145,YUMA MCAS,US,AZ,KNYL,32.65,-114.617,64.9,...,N,0,Y,71005,Y,60739,N,0,71036,3.0
171,ASOSAWOS_69960403145,699604,3145,YUMA MCAS,US,AZ,KNYL,32.65,-114.617,64.9,...,N,0,Y,149695,Y,128239,N,0,157070,3.0
235,ASOSAWOS_72293193107,722931,93107,MARINE CORPS AIR STATION,US,CA,KNKX,32.867,-117.133,145.4,...,N,0,Y,174010,Y,131645,N,0,179796,4.0


### Final Function

In [None]:
def duplicate_check(station_list):
    """
    This function flags stations that are potentially duplicates

    Rules
    ------
        1.) Within stations flagged for concatenation, stations are flagged as potential duplicates 
            if either their start or end times are identical
            - TODO: brainstorm alternative approaches

    Parameters
    ------
        station_list: pd.DataFrame
            list of station information that has passed through the concatenation check

    Returns
    -------
        if success:
            new_station_list: pd.DataFrame


        if failure:
            None
    Notes
    -------

    """
    ##### flag stations with repeat end or start times
    
    time_end_list = ['end_time','end-date']
    time_start_list = ['start_time','start-date']

    end_time_or_date = [col for col in station_list.columns if col in time_var_list]

    new_station_list = new_station_list.groupby('concat_flag').apply(lambda x: x.sort_values(end_time_or_date)).reset_index(drop=True)

    return new_station_list

## Step 4: Concatenation

In [31]:
test = asosawos_list_concat.groupby(["ICAO"]).apply(lambda x: x.sort_values(["end_time"]))

In [None]:
# concatenate
new_list = concat([test,asosawos_list_duplicate,asosawos_list_remaining]).reset_index(drop=True)

In [40]:
output_list = reorder_station_list(asosawos_list,concat_list,duplicate_list)

### Final Function Outline

In [None]:
def concatenate_target_stations_old(concat_station_list):
    """
    Concatenates stations that have been flagged for concatenation
    
    Rules
    ------
        1.) 

    Parameters
    ------
        df: pd.dataframe
            staton data

    Returns
    -------
        if success:
            all processed datasets are exported to the merge folder in AWS and the original datasets are deleted
        if failure:
            None
    """
    ##### Load in stations for concatenation
    

    # Apply reset index only to 'time', as we will need that for concatenation
    df_old = df_old.reset_index(level='time')
    df_new = df_new.reset_index(level='time')

    ##### Split datframes into subsets #####
    # if there is overlap, then create subsets

    # if no overlap, just concatenate

    # Remove data in time overlap between old and new
    df_old_cleaned = df_old[~df_old['time'].isin(df_new['time'])]
    df_new_cleaned = df_new[~df_new['time'].isin(df_old['time'])]

    # Data in new input that overlaps in time with old input
    df_overlap = df_new[df_new['time'].isin(df_old['time'])]

    # Set index to new input for df_old_cleaned
    # We want the final dataset to show up as the new station, not the old
    final_station_name = "{}_{}".format(network_name,station_new)
    new_index = [final_station_name] * len(df_old_cleaned)

    df_old_cleaned.index = new_index
    df_old_cleaned.index.name = 'station'
    
    ##### Concatenate subsets #####

    df_concat = concat([df_old_cleaned, df_overlap, df_new_cleaned])

    # Add 'time' back into multi index
    df_concat.set_index('time', append=True, inplace=True)
    
    
    # Convert concatenated dataframe to dataset
    ds_concat = df_concat.to_xarray()

    ##### Update attributes and datatypes #####

    # Include past attributes
    ds_concat.attrs = ds_new.attrs 
    

    # Update 'history' attribute
    timestamp = datetime.datetime.utcnow().strftime("%m-%d-%Y, %H:%M:%S")
    ds_concat.attrs["history"] = ds_new.attrs[
            "history"
        ] + " \nmaritime_merge.ipynb run on {} UTC".format(timestamp)
    
    # Update 'comment' attribute
    ds_concat.attrs["comment"] = "Final v1 data product. This data has been subjected to cleaning, QA/QC, and standardization."
    
    # Add new qaqc_files_merged attribute
    ds_concat.attrs["qaqc_files_merged"] = "{}_{}, {}_{} merged. Overlap retained from newer station data.".format(network_name,station_old,network_name,station_new)
    
    # Convert all datatypes, to enable export     
    existing_float32 = [col for col in float32_variables if col in df_concat.columns]
    existing_U16 = [col for col in U16_variables if col in df_concat.columns]

    ds_concat[existing_float32] = ds_concat[existing_float32].astype('float32')
    ds_concat[existing_U16] = ds_concat[existing_U16].astype('U16')

    ds_concat.coords['station'] = ds_concat.coords['station'].astype('<U16')

     
    return None #


## CODE SCRAPS

In [None]:
def concatenate_target_stations_old(df):
    """
    Concatenates station data that has been flagged for concatenation
    
    Rules
    ------
        1.) 

    Parameters
    ------
        df: pd.dataframe
            staton data

    Returns
    -------
        if success:
            all processed datasets are exported to the merge folder in AWS and the original datasets are deleted
        if failure:
            None
    """

    # Apply reset index only to 'time', as we will need that for concatenation
    df_old = df_old.reset_index(level='time')
    df_new = df_new.reset_index(level='time')

    ##### Split datframes into subsets #####
    # if there is overlap, then create subsets

    # if no overlap, just concatenate

    # Remove data in time overlap between old and new
    df_old_cleaned = df_old[~df_old['time'].isin(df_new['time'])]
    df_new_cleaned = df_new[~df_new['time'].isin(df_old['time'])]

    # Data in new input that overlaps in time with old input
    df_overlap = df_new[df_new['time'].isin(df_old['time'])]

    # Set index to new input for df_old_cleaned
    # We want the final dataset to show up as the new station, not the old
    final_station_name = "{}_{}".format(network_name,station_new)
    new_index = [final_station_name] * len(df_old_cleaned)

    df_old_cleaned.index = new_index
    df_old_cleaned.index.name = 'station'
    
    ##### Concatenate subsets #####

    df_concat = concat([df_old_cleaned, df_overlap, df_new_cleaned])

    # Add 'time' back into multi index
    df_concat.set_index('time', append=True, inplace=True)
    
    
    # Convert concatenated dataframe to dataset
    ds_concat = df_concat.to_xarray()

    ##### Update attributes and datatypes #####

    # Include past attributes
    ds_concat.attrs = ds_new.attrs 
    

    # Update 'history' attribute
    timestamp = datetime.datetime.utcnow().strftime("%m-%d-%Y, %H:%M:%S")
    ds_concat.attrs["history"] = ds_new.attrs[
            "history"
        ] + " \nmaritime_merge.ipynb run on {} UTC".format(timestamp)
    
    # Update 'comment' attribute
    ds_concat.attrs["comment"] = "Final v1 data product. This data has been subjected to cleaning, QA/QC, and standardization."
    
    # Add new qaqc_files_merged attribute
    ds_concat.attrs["qaqc_files_merged"] = "{}_{}, {}_{} merged. Overlap retained from newer station data.".format(network_name,station_old,network_name,station_new)
    
    # Convert all datatypes, to enable export     
    existing_float32 = [col for col in float32_variables if col in df_concat.columns]
    existing_U16 = [col for col in U16_variables if col in df_concat.columns]

    ds_concat[existing_float32] = ds_concat[existing_float32].astype('float32')
    ds_concat[existing_U16] = ds_concat[existing_U16].astype('U16')

    ds_concat.coords['station'] = ds_concat.coords['station'].astype('<U16')

     
    return None #ds_concat


In [39]:
def reorder_station_list(station_list, concat_list, duplicate_list):
    """
    Reorders the input station list, necessary for concatenation
    
    Rules
    ------
        1.) 

    Parameters
    ------
        station_list: pd.dataframe

        concat_list: pd.dataframe

        duplicate_list: pd.dataframe

    Returns
    -------
        if success:
            output station list with stations to be concatenated at top, followed by potential duplicates
        if failure:
            None
    """

    ##### subsets of station list

    # stations that will be concatenated
    concat_stations = station_list[station_list['ICAO'].isin(concat_list)]

    # potential duplicate stations
    duplicate_stations = station_list[station_list['ICAO'].isin(duplicate_list)]

    # all remaining stations
    remaining_stations = station_list[~station_list['ICAO'].isin(duplicate_list+concat_list)]

    ##### sort concat list alphabetically, to ensure that stations with the same ICAO are grouped together
    concat_stations = concat_stations.sort_values('ICAO')
    duplicate_stations = duplicate_stations.sort_values('ICAO')

    ##### now within each ICAO, order by end time
    concat_stations = concat_stations.groupby(["ICAO"]).apply(lambda x: x.sort_values(["end_time"]))

    ##### concatenate susbets and reset index
    new_list = concat([concat_stations, duplicate_stations, remaining_stations]).reset_index(drop=True)

    return new_list

In [22]:
# check for presence of start and end times

time_check = repeat_list_subset.groupby('ICAO').apply(lambda x: x.isnull().any())

print('number of null start times:')
print(time_check['start_time'].sum())

print('number of null end times:')
print(time_check['end_time'].sum())


number of null start times:
0
number of null end times:
0


In [23]:
# check if the start and end times are identical

start_duplicate_check = repeat_list_subset.groupby('ICAO').apply(lambda x: x.duplicated(subset=['start_time'])).rename('check').reset_index()
end_duplicate_check = repeat_list_subset.groupby('ICAO').apply(lambda x: x.duplicated(subset=['end_time'])).rename('check').reset_index()


In [24]:
end_list = end_duplicate_check[end_duplicate_check['check']==True]['ICAO'].tolist()
start_list = start_duplicate_check[start_duplicate_check['check']==True]['ICAO'].tolist()

print(end_list)
print(start_list)

['KBOK', 'KMLF']
['K20V']


In [None]:
# what is going on with the stations that have duplicate start and end times? are they true duplicates?

repeat_list_subset[repeat_list_subset['ICAO'].isin(start_list + end_list)]

In [None]:
# load in single dc file from AWS
ds_1 = read_nc_from_s3_clean('ASOSAWOS', 'ASOSAWOS_72026294076', temp_dir) 
ds_2 = read_nc_from_s3_clean('ASOSAWOS', 'ASOSAWOS_A0000594076', temp_dir) 


#convert to formatted pandas dataframe
df_1 = qaqc_ds_to_df(ds_1, verbose=False)
df_2 = qaqc_ds_to_df(ds_2, verbose=False)

In [None]:
lon = df_1.lon.mean()
lat = df_1.lat.mean()
# print("{}, {:.5f}, {:.5f}".format(id, lon, lat))


# Plot time series of the data 
fig,ax = plt.subplots(figsize=(9,3))

df_1.plot(ax=ax, x="time", y="sfcWind")
df_2.plot(ax=ax, x="time", y="sfcWind")

ax.set_title("{}  ({:.3f}, {:.3f})".format(id,lon,lat))

In [None]:
def matching_check_old(station_list):
    """
    Resamples meteorological variables to hourly timestep according to standard conventions.

    Rules
    ------
        1.) 

    Parameters
    ------
        df: pd.DataFrame
            list of station information

    Returns
    -------
        if success:
            list
                list of ICAO values of stations that need to be concatenated
            list
                list of ICAO values of potential duplicate stations
        if failure:
            None
    """
    # Generate list of repeat ICAOs
    repeat_list = station_list[station_list.duplicated(subset=['ICAO'], keep=False)]
    repeat_list = repeat_list[['ICAO','ERA-ID','STATION NAME','start_time','end_time']]
    
    concat_list = repeat_list['ICAO'].unique().tolist()

    # And empty list to add potential duplicates to 
    duplicate_list = []

    ##### Generate boolean for whether or not there are null start and/or end times
    # TODO: may not be necessary
    time_check = repeat_list.groupby('ICAO').apply(lambda x: x.isnull().any())

    end_nan_list = time_check[time_check['end_time']==True]['ICAO'].tolist()
    start_nan_list = time_check[time_check['start_time']==True]['ICAO'].tolist()

    # add ICAOs of stations with nan start or end times to potential duplicates list
    duplicate_list = duplicate_list + start_nan_list + end_nan_list    

    duplicate_list = duplicate_list 
    
    ##### Identify ICAOs with duplicate start end times
    start_duplicate_check = repeat_list.groupby('ICAO').apply(lambda x: x.duplicated(subset=['start_time'])).rename('check').reset_index()
    end_duplicate_check = repeat_list.groupby('ICAO').apply(lambda x: x.duplicated(subset=['end_time'])).rename('check').reset_index()

    end_dup_list = end_duplicate_check[end_duplicate_check['check']==True]['ICAO'].tolist()
    start_dup_list = start_duplicate_check[start_duplicate_check['check']==True]['ICAO'].tolist()

    # add ICAOs of stations with nan start or end times to potential duplicates list
    duplicate_list = duplicate_list + start_dup_list + end_dup_list

    # Generate final list of ICAOs for stations to be concatenated
    concat_list = [x for x in concat_list if x not in duplicate_list]

    return concat_list, duplicate_list

In [34]:
# order the subset with only stations to concatenate

asosawos_list_concat['ICAO'] = pd.Categorical(asosawos_list_concat['ICAO'], categories=concat_list, ordered=True)

test_list = asosawos_list_concat.sort_values('ICAO').reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  asosawos_list_concat['ICAO'] = pd.Categorical(asosawos_list_concat['ICAO'], categories=concat_list, ordered=True)


### Stations within a certain distance

In [8]:
# Convert the data into GeoDataFrames
# using EPSG 3310

gdf_asosawos = gpd.GeoDataFrame(asosawos_list, 
                        geometry=[Point(lon, lat) for lon, lat in zip(asosawos_list['LON'], asosawos_list['LAT'])],
                        crs="EPSG:4326").to_crs(epsg=3310)  

#### approach 3: find the nearest point in the geodataframe

In [9]:
# insert emtpy columns

gdf_asosawos['nearest_station'] = pd.Series(dtype='U16')
gdf_asosawos['distance'] = pd.Series(dtype='float32')


In [None]:
for index, row in gdf_asosawos.iterrows():
    # geometry of individual row 
    point = row.geometry
    # returns a multipoint object with the geometries of every row in the gdf
    multipoint = gdf_asosawos.drop(index, axis=0).geometry.unary_union
    # 
    queried_geom, nearest_geom = nearest_points(point, multipoint)
    dist_from_point = 
    gdf_asosawos.loc[index, 'nearest_geometry'] = nearest_geom
    gdf_asosawos.loc[index, 'distance'] = nearest_geom

#### approach 2: distance function

In [20]:
## function to calculate the distance between points

def distance_sort_filter(row, df2, buffer=None, id=False):

    dist = df2.geometry.distance(row).sort_values()

    if buffer:
        dist = dist[dist<buffer]

    if id:
        distances = {df2.loc[idx]['WBAN']:value for idx,value in zip(dist.index, dist.values)}
    else:
        distances = {idx:value for idx,value in zip(dist.index, dist.values)}
    
    return distances

#### approach 1: using sjoin

In [13]:
# Create a buffer around points in gdf1 (e.g., 10 km buffer)
gdf_asosawos['buffer'] = gdf_asosawos.geometry.buffer(.1)  # Buffer in degrees, 0.1 degrees approx equals 10 km

In [None]:
# Perform a spatial join using the buffer
merged = gpd.sjoin(gdf_asosawos, gdf_asosawos[['geometry', 'buffer']], how="inner", predicate="within")

# The 'merged' GeoDataFrame contains points from gdf_isd that are within the buffer around points in gdf_asosawos

In [None]:
print(merged) # there are not ISD stations within 10km of an ASOSAWOS station missed by the exact matching

In [None]:
### Round asosawos down to 3 decimal points of accuracy
# asosawos_round = asosawos_list.round({"LAT": 3, "LON": 3})