In [1]:
# Import libraries
import boto3
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import matplotlib
from io import BytesIO, StringIO
from datetime import datetime, timezone, date
import xarray as xr
import s3fs

In [2]:
# Station list loaded from a CSV on the local machine (absolute, user-specific path).
stn_list = pd.read_csv(
    filepath_or_buffer='/Users/victoriaford/Desktop/temp_clean_master_station_list.csv'
)

In [37]:
def update_stnlist_cleanvars(network, update=False):
    """Flag which core variables each cleaned station of one network provides.

    Reads ``2_clean_wx/temp_clean_master_station_list.csv`` from the
    ``wecc-historical-wx`` bucket, adds one column per core variable
    (initialised to "N"), then opens every ``.nc`` file found under
    ``2_clean_wx/<network>/`` and sets the matching column to "Y" for the
    station named by the dataset's ``station`` variable.

    Parameters
    ----------
    network : str
        Name of the network folder under ``2_clean_wx/`` (e.g. "MARITIME").
    update : bool, default False
        Only a falsy value is supported. No code path exists for updating an
        existing table, so a truthy value is rejected up front.

    Returns
    -------
    pandas.DataFrame
        Rows of the station list whose ``network`` column equals ``network``,
        taken after sorting the full list by network and resetting its index.

    Raises
    ------
    NotImplementedError
        If ``update`` is truthy.
    """
    if update:
        # Previously this fell through and crashed later with an
        # UnboundLocalError on df_clean; fail early with a clear message.
        raise NotImplementedError(
            "update=True is not supported: no existing table is loaded in that mode"
        )

    bucket_name = "wecc-historical-wx"
    directory = "2_clean_wx/"

    core_vars = ['tas', 'ps', 'tdps', 'hurs', 'pr', 'sfcWind', 'sfcWind_dir', 'rsds']
    # Dataset variable names that count as evidence for a core variable.
    alias_to_core = {
        'tdps_derived': 'tdps',
        'pr_1h': 'pr',
        'pr_24h': 'pr',
        'pr_5min': 'pr',
        'ps_altimeter': 'ps',
        'psl': 'ps',
        'ps_derived': 'ps',
    }

    s3 = boto3.resource("s3")
    s3_cl = boto3.client("s3")

    obj = s3_cl.get_object(Bucket=bucket_name, Key="2_clean_wx/temp_clean_master_station_list.csv")
    df_clean = pd.read_csv(BytesIO(obj['Body'].read()), encoding='utf8')

    # add in default columns of "N" to cleaned station list for all core variables
    for var in core_vars:
        df_clean[var] = "N"

    # list the cleaned station files (netCDF only) for this network
    cleandir = "{0}{1}/".format(directory, network)
    files = [
        str(item.key)
        for item in s3.Bucket(bucket_name).objects.filter(Prefix=cleandir)
        if str(item.key).endswith(".nc")
    ]
    print('{0}: {1} stations'.format(network, len(files)))

    fs = s3fs.S3FileSystem()  # one filesystem handle is enough for every file
    for file in files:
        aws_url = "s3://{0}/{1}".format(bucket_name, file)
        try:
            with fs.open(aws_url) as file_obj:
                # the context manager closes the dataset even if a step below fails
                with xr.open_dataset(file_obj, engine='h5netcdf') as ds:
                    found = {alias_to_core.get(name, name) for name in ds.variables}
                    present = [var for var in core_vars if var in found]
                    if present:
                        is_station = df_clean['era-id'] == ds.station.values[0]
                        df_clean.loc[is_station, present] = 'Y'
        except Exception as err:
            # Best effort: report the file and keep going. KeyboardInterrupt is
            # deliberately not caught so a long run can still be interrupted.
            print('{0} not opening: {1}'.format(file, err))
            continue

    # resort by network and reset index
    df_clean = df_clean.sort_values(by=['network']).reset_index(drop=True)

    return df_clean.loc[df_clean['network'] == network]

In [38]:
%%time
# Keep the result in a variable: the recorded run of this cell took ~18 min of wall time.
maritime_stnlist = update_stnlist_cleanvars(network='MARITIME')
maritime_stnlist

MARITIME: 80 stations
CPU times: user 1min 43s, sys: 36.5 s, total: 2min 20s
Wall time: 17min 59s


Unnamed: 0.1,Unnamed: 0,era-id,latitude,longitude,elevation,start-date,end-date,cleaned,time_cleaned,network,tas,ps,tdps,hurs,pr,sfcWind,sfcWind_dir,rsds
11621,11700,MARITIME_poro3,42.739,-124.498,,,,Y,2023-02-06 22:23:34+00:00,MARITIME,Y,Y,N,N,N,Y,Y,N
11622,11701,MARITIME_erkc1,40.778,-124.196,,,,N,,MARITIME,N,N,N,N,N,N,N,N
11623,11702,MARITIME_elxc1,36.815,-121.738,,,,N,,MARITIME,N,N,N,N,N,N,N,N
11624,11703,MARITIME_elqc1,36.818,-121.739,,,,N,,MARITIME,N,N,N,N,N,N,N,N
11625,11704,MARITIME_ehsc1,36.835,-121.738,,,,N,,MARITIME,N,N,N,N,N,N,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11740,11660,MARITIME_omhc1,37.801,-122.330,,,,Y,2023-02-06 22:20:04+00:00,MARITIME,N,N,N,N,N,Y,Y,N
11741,11661,MARITIME_ptac1,38.955,-123.741,,,,Y,2023-02-06 22:28:08+00:00,MARITIME,Y,Y,N,N,N,Y,Y,N
11742,11662,MARITIME_okxc1,37.811,-122.333,,,,Y,2023-02-06 22:19:17+00:00,MARITIME,Y,Y,N,N,N,Y,Y,N
11743,11663,MARITIME_ohbc1,33.720,-118.273,,,,Y,2023-02-06 22:18:31+00:00,MARITIME,N,Y,N,N,N,N,N,N


In [30]:
# Load a local sample file into a DataFrame; the context manager closes the
# dataset even if the conversion raises.
with xr.open_dataset('/Users/victoriaford/Downloads/VCAPCD_TO.nc') as ds:
    df = ds.to_dataframe()

# The preview has to be the cell's last expression to be displayed.
df.head()

In [None]:
def update_stnlist_cleanvars(bucket_name, directory, update = False):
    """Flag which core variables every cleaned station provides and save the table.

    Reads ``2_clean_wx/temp_clean_master_station_list.csv`` from
    ``bucket_name``, adds one column per core variable (initialised to "N"),
    opens every ``.nc`` object found under the ``directory`` prefix and sets
    the matching column to "Y" for the station named by the dataset's
    ``station`` variable. The resulting table is written back to
    ``2_clean_wx/temp_clean_master_station_list_withvars.csv`` in the same
    bucket.

    Derived or alternative variable names (``tdps_derived``, ``pr_1h``,
    ``pr_24h``, ``pr_5min``, ``ps_altimeter``, ``psl``, ``ps_derived``) count
    towards their core variable, matching the per-network definition of this
    function earlier in the notebook.

    Parameters
    ----------
    bucket_name : str
        S3 bucket holding both the station list and the cleaned files.
    directory : str
        Key prefix under which cleaned ``.nc`` files are listed.
    update : bool, default False
        Only a falsy value is supported. No code path exists for updating an
        existing table, so a truthy value is rejected up front.

    Returns
    -------
    pandas.DataFrame
        The full station list with the added variable columns, sorted by
        ``network`` with a fresh index.

    Raises
    ------
    NotImplementedError
        If ``update`` is truthy.
    """
    if update:
        # Previously this fell through and crashed later with an
        # UnboundLocalError on df_clean; fail early with a clear message.
        raise NotImplementedError(
            "update=True is not supported: no existing table is loaded in that mode"
        )

    core_vars = ['tas', 'ps', 'tdps', 'hurs', 'pr', 'sfcWind', 'sfcWind_dir', 'rsds']
    # Dataset variable names that count as evidence for a core variable.
    alias_to_core = {
        'tdps_derived': 'tdps',
        'pr_1h': 'pr',
        'pr_24h': 'pr',
        'pr_5min': 'pr',
        'ps_altimeter': 'ps',
        'psl': 'ps',
        'ps_derived': 'ps',
    }

    s3 = boto3.resource("s3")
    s3_cl = boto3.client('s3')

    obj = s3_cl.get_object(Bucket=bucket_name, Key="2_clean_wx/temp_clean_master_station_list.csv")
    df_clean = pd.read_csv(BytesIO(obj['Body'].read()), encoding='utf8')

    # add in default columns of "N" to cleaned station list
    for var in core_vars:
        df_clean[var] = "N"

    # list the cleaned station files (netCDF only) under the prefix
    files = [
        str(item.key)
        for item in s3.Bucket(bucket_name).objects.filter(Prefix=directory)
        if str(item.key).endswith(".nc")
    ]
    print(len(files))

    fs = s3fs.S3FileSystem()  # one filesystem handle is enough for every file
    for file in files:
        print(file)
        # open from the same bucket the keys were listed from
        aws_url = "s3://{0}/{1}".format(bucket_name, file)
        try:
            with fs.open(aws_url) as file_obj:
                # the context manager closes the dataset even if a step below fails
                with xr.open_dataset(file_obj, engine='h5netcdf') as ds:
                    found = {alias_to_core.get(name, name) for name in ds.variables}
                    present = [var for var in core_vars if var in found]
                    if present:
                        is_station = df_clean['era-id'] == ds.station.values[0]
                        df_clean.loc[is_station, present] = "Y"
        except Exception as err:
            # Best effort: report the file and keep going. KeyboardInterrupt is
            # deliberately not caught so a long run can still be interrupted.
            print('{0} not opening: {1}'.format(file, err))
            continue

    # resort by network and reset index
    df_clean = df_clean.sort_values(by=['network']).reset_index(drop=True)

    # save station chart to AWS
    csv_buffer = StringIO()
    df_clean.to_csv(csv_buffer, na_rep="NaN")
    s3_cl.put_object(
        Bucket=bucket_name,
        Body=csv_buffer.getvalue(),
        Key="2_clean_wx/temp_clean_master_station_list_withvars.csv",
    )

    return df_clean

In [None]:
# S3 bucket and key prefix handed to update_stnlist_cleanvars in the next cell.
bucket_name, directory = "wecc-historical-wx", "2_clean_wx/"

In [None]:
%%time
# note this takes a long time to run, so keep the returned table for reuse
stnlist_withvars = update_stnlist_cleanvars(bucket_name, directory)
stnlist_withvars