# Set final variable order 


Reorder the columns of each station dataframe into a fixed final order, so that every dataset has a consistent column layout.

## Environment set-up

In [None]:
import datetime
import boto3
import geopandas as gpd
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
from io import BytesIO, StringIO

# Silence warnings
import warnings
from shapely.errors import ShapelyDeprecationWarning

# Suppress RuntimeWarning messages for the rest of the session.
warnings.filterwarnings(action="ignore", category=RuntimeWarning)

# This Shapely deprecation warning is raised when creating a Point object from
# coords; the cause has not been identified, so it is suppressed as well.
warnings.filterwarnings(action="ignore", category=ShapelyDeprecationWarning)

# Render figures at 300 dpi.
plt.rcParams["figure.dpi"] = 300

In [2]:
# S3 handles: one boto3 resource and one boto3 client
s3 = boto3.resource("s3")
s3_cl = boto3.client("s3")

# Bucket name, followed by the directory prefix used for each processing stage
bucket = "wecc-historical-wx"
cleandir = "2_clean_wx/"
qaqcdir = "3_qaqc_wx/"
mergedir = "4_merge_wx/"

## Step 1: Reorder Columns

Load one dataset each from VALLEYWATER, ASOSAWOS, and MARITIME for testing.

In [3]:
# ASOSAWOS test case: open the zarr store at this S3 path with xarray and
# convert the resulting Dataset to a pandas DataFrame (df_a).
url = "s3://wecc-historical-wx/3_qaqc_wx/ASOSAWOS/ASOSAWOS_69007093217.zarr"
ds = xr.open_zarr(url)
df_a = ds.to_dataframe()

In [53]:
# MARITIME test case: open the zarr store at this S3 path with xarray and
# convert the resulting Dataset to a pandas DataFrame (df_m).
# Note that `url` and `ds` are rebound here, replacing the previous cell's values.
url = "s3://wecc-historical-wx/3_qaqc_wx/MARITIME/MARITIME_ANVC1.zarr"
ds = xr.open_zarr(url)
df_m = ds.to_dataframe()

In [54]:
# VALLEYWATER test case: open the zarr store at this S3 path with xarray and
# convert the resulting Dataset to a pandas DataFrame (df_v).
# Note that `url` and `ds` are rebound here, replacing the previous cell's values.
url = "s3://wecc-historical-wx/3_qaqc_wx/VALLEYWATER/VALLEYWATER_6001.zarr"
ds = xr.open_zarr(url)
df_v = ds.to_dataframe()

In [None]:
def reorder_variables(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return the input dataframe with its columns in the final output order.

    Rules
    ------
        1.) Columns whose names start with a priority prefix (see "prefixes" below)
            and do not contain "qc" come first, grouped by prefix in priority order
        2.) Columns that start with a priority prefix and contain "qc" come next,
            also grouped by prefix in priority order
        3.) All remaining columns follow, in their original relative order
        4.) Any column whose name contains "duration" or "method" is dropped

    Parameters
    ------
    df: pd.DataFrame
        dataframe to reorder; column labels are expected to be strings, since
        str.startswith is called on each of them

    Returns
    -------
    pd.DataFrame
        the retained columns of "df", selected in the order described above

    Notes
    -------
    Matching is by name only: prefixes use str.startswith, while "qc",
    "duration" and "method" are matched as substrings anywhere in the name.
    """
    # Priority prefixes, highest priority first
    prefixes = (
        "ps",
        "tas",
        "tdps",
        "pr",
        "hurs",
        "rsds",
        "sfcWind",
        "pvp",
        "svp",
    )
    # Substrings that mark a column for removal
    dropped_tokens = ("duration", "method")

    columns = list(df.columns)

    # Collect prefixed columns: prefix order first, original column order second
    prioritized = []
    for prefix in prefixes:
        for col in columns:
            if col.startswith(prefix):
                prioritized.append(col)

    # Partition the prefixed columns into data columns and qaqc flag columns
    data_cols = []
    flag_cols = []
    for col in prioritized:
        if "qc" in col:
            flag_cols.append(col)
        else:
            data_cols.append(col)

    # Everything without a priority prefix keeps its original relative order
    prioritized_lookup = set(prioritized)
    leftover_cols = [col for col in columns if col not in prioritized_lookup]

    # Assemble the full ordering, then discard the 'method' and 'duration' columns
    ordered = [
        col
        for col in data_cols + flag_cols + leftover_cols
        if all(token not in col for token in dropped_tokens)
    ]

    return df[ordered]

In [80]:
# Apply the reordering to the VALLEYWATER dataframe and preview the first two rows
df_test = reorder_variables(df_v)
df_test.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,pr_15min,elevation_eraqc,pr_15min_eraqc,raw_qc,anemometer_height_m,elevation,elevation_eraqc,lat,lon,pr_15min_eraqc,raw_qc,thermometer_height_m
station,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
VALLEYWATER_6001,1985-02-07 20:30:00,0.0,3.0,,Approved,,0.0,3.0,37.2471,-121.871,,Approved,
VALLEYWATER_6001,1985-02-07 20:45:00,0.0,3.0,,Approved,,0.0,3.0,37.2471,-121.871,,Approved,
