# Set final variable order and names


Set the final order and names of variables so that they are consistent across datasets.

## Environment set-up

In [3]:
import datetime
import boto3
import geopandas as gpd
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
from io import BytesIO, StringIO

# Silence warnings
import warnings
from shapely.errors import ShapelyDeprecationWarning

# RuntimeWarning is silenced globally.
warnings.filterwarnings(action="ignore", category=RuntimeWarning)
# ShapelyDeprecationWarning is raised when creating a Point object from coords
# (cause not yet identified), so it is silenced as well.
warnings.filterwarnings(action="ignore", category=ShapelyDeprecationWarning)

# Figure resolution
plt.rcParams.update({"figure.dpi": 300})

In [4]:
# AWS S3 access: one resource handle and one client handle
s3 = boto3.resource("s3")
s3_cl = boto3.client("s3")

# Bucket name, followed by the key prefix of each processing stage
bucket = "wecc-historical-wx"
cleandir, qaqcdir, mergedir = "2_clean_wx/", "3_qaqc_wx/", "4_merge_wx/"

## Step 1: Assess differences in order and names

Load one sample station from each of three networks (ASOSAWOS, MARITIME, and VALLEYWATER) to compare their variable names and ordering.

In [5]:
# Load one ASOSAWOS station from the QA/QC stage and flatten it to a dataframe.
# The path is built from the `bucket` / `qaqcdir` settings of the set-up cell
# instead of repeating the literal bucket path.
url = f"s3://{bucket}/{qaqcdir}ASOSAWOS/ASOSAWOS_69007093217.zarr"
ds = xr.open_zarr(url)
df_a = ds.to_dataframe()

In [6]:
# Load one MARITIME station from the QA/QC stage and flatten it to a dataframe.
# The path is built from the `bucket` / `qaqcdir` settings of the set-up cell
# instead of repeating the literal bucket path.
url = f"s3://{bucket}/{qaqcdir}MARITIME/MARITIME_ANVC1.zarr"
ds = xr.open_zarr(url)
df_m = ds.to_dataframe()

In [7]:
# Load one VALLEYWATER station from the QA/QC stage and flatten it to a dataframe.
# The path is built from the `bucket` / `qaqcdir` settings of the set-up cell
# instead of repeating the literal bucket path.
url = f"s3://{bucket}/{qaqcdir}VALLEYWATER/VALLEYWATER_6001.zarr"
ds = xr.open_zarr(url)
df_v = ds.to_dataframe()

In [12]:
# Column labels of the three station dataframes, one Index per line
print(df_a.columns, df_m.columns, df_v.columns, sep="\n")

Index(['anemometer_height_m', 'elevation', 'elevation_eraqc', 'lat', 'lon',
       'pr', 'pr_depth_qc', 'pr_duration', 'pr_eraqc', 'pr_qc', 'ps_altimeter',
       'ps_altimeter_eraqc', 'ps_altimeter_qc', 'ps_qc', 'psl', 'psl_eraqc',
       'psl_qc', 'qaqc_process', 'sfcWind', 'sfcWind_dir', 'sfcWind_dir_eraqc',
       'sfcWind_dir_qc', 'sfcWind_eraqc', 'sfcWind_method', 'sfcWind_qc',
       'tas', 'tas_eraqc', 'tas_qc', 'tdps', 'tdps_eraqc', 'tdps_qc',
       'thermometer_height_m'],
      dtype='object')
Index(['anemometer_height_m', 'elevation', 'elevation_eraqc', 'lat', 'lon',
       'ps', 'ps_eraqc', 'sfcWind', 'sfcWind_dir', 'sfcWind_dir_eraqc',
       'sfcWind_eraqc', 'tas', 'tas_eraqc', 'thermometer_height_m'],
      dtype='object')
Index(['anemometer_height_m', 'elevation', 'elevation_eraqc', 'lat', 'lon',
       'pr_15min', 'pr_15min_eraqc', 'raw_qc', 'thermometer_height_m'],
      dtype='object')


In [None]:
# Placeholders for the standard variable order and the standard variable names.
# Both are still empty; the notes below are ideas that are not implemented yet.

var_order = []  # TODO: fill in the final variable order
# Idea: the order could be alphabetical within each variable type,
# with all variables containing 'qc' placed at the end.
var_names = {}  # TODO: fill in the final names (presumably a current-name -> final-name mapping; confirm). This could be a library.

In [71]:
def reorder_rename(df) -> pd.DataFrame:
    """
    Move the core observation variables to the front of a station dataframe.

    Rules
    ------
        1.) Columns named in the preferred order (ps, tas, tdps, pr, hurs, rsds,
            sfcWind, sfcWind_dir, pvp, svp) come first, in that order. Names
            that are not columns of `df` are skipped.
        2.) Every other column follows, in the order it already has in `df`.

    Parameters
    ------
        df: pd.DataFrame
            dataframe whose columns are to be reordered; it is not modified

    Returns
    -------
        pd.DataFrame
            a dataframe with the same columns as `df`, reordered by the rules above

    Notes
    -------
        Only reordering is implemented. Renaming, sending the QA/QC variables to
        the back, and removing the 'method' and 'duration' variables are still
        to be done (see the TODO comments in the body).
    """
    # Preferred leading order of the core variables
    desired_order = [
        "ps",
        "tas",
        "tdps",
        "pr",
        "hurs",
        "rsds",
        "sfcWind",
        "sfcWind_dir",
        "pvp",
        "svp",
    ]

    # Materialise the column labels once instead of once per membership test
    columns = list(df.columns)

    # Core variables that this dataframe actually has, in preferred order
    actual_order = [var for var in desired_order if var in columns]

    # Retain rest of variables at the bottom, in their existing order
    rest_of_vars = [var for var in columns if var not in desired_order]

    # TODO: extract qaqc variables, to be sent to the back
    # TODO: remove 'method' and 'duration' vars

    return df[actual_order + rest_of_vars]

In [72]:
# Try the reordering on the ASOSAWOS station and preview the first two rows
df_test = df_a.pipe(reorder_rename)
df_test.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,tas,tdps,pr,sfcWind,sfcWind_dir,anemometer_height_m,elevation,elevation_eraqc,lat,lon,...,sfcWind_dir_eraqc,sfcWind_dir_qc,sfcWind_eraqc,sfcWind_method,sfcWind_qc,tas_eraqc,tas_qc,tdps_eraqc,tdps_qc,thermometer_height_m
station,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
ASOSAWOS_69007093217,1991-01-02 12:00:00,19,19,0.0,4.1,90.0,,41.0,,36.683,-121.767,...,,1,,N,1,,1,28.0,1,
ASOSAWOS_69007093217,1991-01-02 13:00:00,19,19,,3.6,90.0,,41.0,,36.683,-121.767,...,,1,,N,1,,1,28.0,1,


In [62]:
# Preferred leading order of the core variables
# (the same list that reorder_rename uses internally)
desired_order = [
    "ps", "tas", "tdps", "pr", "hurs", "rsds",
    "sfcWind", "sfcWind_dir",
    "pvp", "svp",
]

In [None]:

# Desired variables that the ASOSAWOS dataframe actually contains, in desired order
actual_order = [name for name in desired_order if name in df_a.columns]



In [None]:
# QA/QC variables: every column of the ASOSAWOS dataframe whose name contains 'qc'.
# The dataframe columns are filtered rather than `actual_order`, because
# `actual_order` only holds names taken from `desired_order`, none of which
# contain 'qc', so filtering it could never match anything.
# not useful yet, but will be
qaqc_vars = [col for col in df_a.columns if "qc" in col]

In [None]:
# Retain rest of variables at the bottom: every ASOSAWOS column that is not
# one of the desired leading variables, in its existing order
rest_of_vars = [col for col in df_a.columns if col not in desired_order]