In [1]:
import pandas as pd
import numpy as np

In [2]:
df = np.load('maurer.pickle', allow_pickle= True)
df_attributes = np.load('attributes.pickle', allow_pickle= True)
att_csv = pd.read_csv('attributes.csv')
basin_list = pd.read_csv('basin_list.txt', header=None)

In [3]:
attributes = [#'gauge_id', 
               'p_mean', 'pet_mean', 'p_seasonality', 'frac_snow',
               'aridity', 'high_prec_freq', 'high_prec_dur', 
                #'high_prec_timing',
               'low_prec_freq', 'low_prec_dur', 
                #'low_prec_timing', 'geol_1st_class', 'glim_1st_class_frac', 'geol_2nd_class', 'glim_2nd_class_frac',
               'carbonate_rocks_frac', #'geol_porostiy', 
                'geol_permeability', 
                #'q_mean','runoff_ratio', 'slope_fdc', 'baseflow_index', 'stream_elas', 'q5',
               #'q95', 'high_q_freq', 'high_q_dur', 'low_q_freq', 'low_q_dur',
               #'zero_q_freq', 'hfd_mean', 'huc_02', 'gauge_name',
               'soil_depth_pelletier', 'soil_depth_statsgo', 'soil_porosity','soil_conductivity', 
                'max_water_content', 'sand_frac', 'silt_frac', 'clay_frac', 
                #'water_frac', 'organic_frac', 'other_frac', 
                #'gauge_lat','gauge_lon', 
                'elev_mean', 'slope_mean', 
                #'area_gages2',
                'area_geospa_fabric', 
                'frac_forest', 'lai_max', 'lai_diff', 'gvf_max','gvf_diff', 
                #'dom_land_cover_frac', 'dom_land_cover', 'root_depth_50','root_depth_99', 'hru08'
                ]

In [4]:
basin_list = basin_list[0].apply(lambda x: '0' + str(x) if len(str(x)) < 8 else str(x))

In [5]:
ymd = df['01013500'].groupby(['Year', 'Mnth']).size().reset_index()
ymd['d_cumsum'] = 0
for y in ymd['Year']:
    temp = ymd.loc[ymd['Year'] == y, 0].cumsum()
    ymd.loc[ymd['Year'] == y,'d_cumsum'] =  np.concatenate(([0], temp[:-1]))

In [6]:
for k,v in df.items():
    df[k].reset_index(inplace=True)
    df[k] = pd.DataFrame.merge(v, ymd, right_on=['Year', 'Mnth'], left_on=['Year', 'Mnth'])
    df[k]['n_day'] = df[k]['d_cumsum'] + df[k]['Day']
    df[k]['t'] = df[k]['Year'] + (df[k]['n_day'] - 1) / 366

In [7]:
df_filtered = {}
for (b) in basin_list: 
    if (df[b].shape[0] < 10593):
        basin_list = basin_list[basin_list!=b]
        continue
    df_filtered[b] = df[b][['Date','t', 'Dayl(s)', 'PRCP(mm/day)', 'SRAD(W/m2)', 'Tmax(C)', 'Tmin(C)', 'Vp(Pa)', 'Q']]

In [8]:
%%time
# Choose type of transform, i.e., 'standardize' or 'normalize' 
dist = 'gaussian'

if dist == 'gaussian':
    transform = 'standardize'
    log_P = True
    log_Q = True
    
if dist == 'gamma':
    transform = 'normalize'
    log_P = True
    log_Q = True
    gamma_shift = 1e-3

divide_by_area = True
cols = ['t','Dayl(s)', 'PRCP(mm/day)', 'SRAD(W/m2)', 'Tmax(C)', 'Tmin(C)', 'Vp(Pa)', 'Q']
epsilon = 1e-3

x_maxs, x_mins, x_means, x_stds = [], [], [], []
for k,v in df_filtered.items():
#     # Scale streamflow values by catchment area
    if divide_by_area:
        v['Q'] = v['Q']/df_attributes[k]['area_geospa_fabric'].values
    
    # Calculate mean (after scaling by area)
#     v['Q_mu'] = v['Q'].mean()
    
#     Log-transform precipitation
    if log_P: 
        v['PRCP(mm/day)'] = np.log(v['PRCP(mm/day)'] + epsilon)
    
    # Log-transform streamflow
    if log_Q: 
        v['Q'] = np.log(v['Q'] + epsilon)
    
#     x_maxs.append(v[cols].max().values)
#     x_mins.append(v[cols].min().values)
    x_means.append(v[cols].values)
    x_stds.append(v[cols].values)

# x_max = np.concatenate(x_maxs).reshape(-1,len(cols)).max(axis=0)
# x_min = np.concatenate(x_mins).reshape(-1,len(cols)).min(axis=0)
x_mean = np.concatenate(x_means, axis=0).mean(axis=0)
x_std = np.concatenate(x_stds, axis=0).std(axis = 0)

for k,v in df_filtered.items():
    for i, col in enumerate(cols):
        if transform == 'normalize':
            
            v[col] = (v[col] - x_min[i]) / (x_max[i] - x_min[i])
            
            if dist=="gamma":
                v['Q'] = v['Q'] + gamma_shift
            
            def rev_transform(x):
                x = x * (x_max[0] - x_min[0]) + x_min[0]
                if log_Q:
                    x = np.exp(x) - epsilon
                if dist == "gamma":
                    x = x - gamma_shift
                return x
            
            def rev_transform_tensor(x):
                x = x * (x_max[0] - x_min[0]) + x_min[0]
                if log_Q:
                    x = torch.exp(x) - epsilon 
                if dist == "gamma":
                    x = x - gamma_shift
                return x
        
        elif transform == 'standardize':
            
            v[col] = (v[col] - x_mean[i]) / x_std[i]
            
            #WARNING -- NO GAMMA SHIFT
            
            def rev_transform(x):
                x = x * x_std[0] + x_mean[0]
                if log_Q:
                    x = np.exp(x) - epsilon
                if dist == "gamma":
                    x = x - gamma_shift
                return x

            def rev_transform_tensor(x):
                x = x * x_std[0] + x_mean[0]
                if log_Q:
                    x = torch.exp(x) - epsilon 
                if dist == "gamma":
                    x = x - gamma_shift
                return x
        
        else:
            print("No transform has been applied.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

CPU times: user 2.56 s, sys: 317 ms, total: 2.88 s
Wall time: 2.88 s


In [9]:
# Each key in df_filtered represents a different basin,
# Q is the target variable
df_filtered['01022500']

Unnamed: 0,Date,t,Dayl(s),PRCP(mm/day),SRAD(W/m2),Tmax(C),Tmin(C),Vp(Pa),Q
0,1980-01-01,-1.731643,-1.685847,-1.263592,-1.091910,-1.521315,-1.521315,-1.065478,0.701785
1,1980-01-02,-1.731317,-1.669542,-1.263592,-1.059105,-1.451285,-1.451285,-1.054692,0.640415
2,1980-01-03,-1.730990,-1.659879,-1.263592,-1.003921,-1.524154,-1.524154,-1.104178,0.578850
3,1980-01-04,-1.730664,-1.648827,-1.263592,-1.104774,-1.831715,-1.831715,-1.178520,0.518092
4,1980-01-05,-1.730338,-1.648827,-1.263592,-1.279001,-1.991647,-1.991647,-1.199030,0.469766
...,...,...,...,...,...,...,...,...,...
10588,2008-12-27,1.730574,-1.697352,1.009416,-1.274967,-1.473051,-1.473051,-1.043021,1.154240
10589,2008-12-28,1.730900,-1.697352,0.866073,-1.477890,-0.822914,-0.822914,-0.648048,1.322055
10590,2008-12-29,1.731226,-1.697352,0.370540,-1.086582,-0.663928,-0.663928,-0.614612,1.436933
10591,2008-12-30,1.731553,-1.697352,0.035316,-1.260429,-1.029216,-1.029216,-0.898361,1.396015
