In [None]:
# %pip install progressbar xgboost matplotlib boto3 openpyxl tqdm hydroeval hydrotools

In [1]:
# hydrological packages
from hydrotools.nwm_client import utils 

# my packages
from evaluation_metric import MAPE, RMSE, KGE, PBias

# basic packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# system packages
from progressbar import ProgressBar
from datetime import datetime, date
import pickle
import warnings
warnings.filterwarnings("ignore")
import platform
import time

# data analysi packages
from scipy import optimize

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# deep learning packages
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset


In [2]:
if platform.system() == 'Windows':
    onedrive_path = 'E:/OneDrive/OneDrive - The University of Alabama/10.material/01.data/usgs_data/'
    box_path = 'C:/Users/snaserneisary/Box/NWM-ML/'

elif platform.system() == 'Darwin':
    onedrive_path = '/Users/savalan/Library/CloudStorage/OneDrive-TheUniversityofAlabama/02.projects/03.ciroh/04.data/'
    box_path = '/Users/savalan/Library/CloudStorage/Box-Box/NWM-ML/Data/NWM/ut/'
    
elif platform.system() == 'Linux':
    path_01 = '/home/snaserneisary/01.projects/01.ciroh_p8/NWM-ML/Savalan/'

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cpu


In [4]:
raw_training_data = pd.read_csv(path_01 + '03.output/raw_training_data.csv')
raw_training_data.pop('Unnamed: 0')
raw_training_data['station_id'] = raw_training_data['station_id'].astype('str')
raw_training_data.head()

Unnamed: 0,station_id,Lat,Long,Drainage_area_mi2,Mean_Basin_Elev_ft,Perc_Forest,Perc_Develop,Perc_Imperv,Perc_Herbace,Perc_Slop_30,...,datetime,flow_cfs,s1,s2,temperature_F,precipitation_in,storage,swe,NWM_flow,DOY
0,10011500,40.965225,-110.853508,174.0,9720.0,67.7,1.2,0.12,2.94,27.2,...,2012-03-13,45.356945,0.515038,0.857167,35.096,0.0,0.0,7.7,60.0,73
1,10011500,40.965225,-110.853508,174.0,9720.0,67.7,1.2,0.12,2.94,27.2,...,2012-03-14,49.75,0.515038,0.857167,35.258,0.0,0.0,7.45,62.0,74
2,10011500,40.965225,-110.853508,174.0,9720.0,67.7,1.2,0.12,2.94,27.2,...,2012-03-15,52.483334,0.515038,0.857167,36.86,0.0,0.0,7.35,65.0,75
3,10011500,40.965225,-110.853508,174.0,9720.0,67.7,1.2,0.12,2.94,27.2,...,2012-03-16,60.296875,0.515038,0.857167,38.12,0.0,0.0,7.25,63.0,76
4,10011500,40.965225,-110.853508,174.0,9720.0,67.7,1.2,0.12,2.94,27.2,...,2012-03-17,68.876045,0.515038,0.857167,38.102,0.04698,0.0,6.85,65.0,77


In [5]:
Training_DF = raw_training_data.copy()

### Editing the features based on the feature importance should be in the next cell!!!!!!!!!!!!!!!

In [6]:
# Editing the features based on the feature importance should be done here!!!!!!!!!!!!!!!

Training_DF.drop(['precipitation_in', 'temperature_F', 'Mean_Ann_Precip_in', 'Perc_Herbace', 'Perc_Forest',
                        'Mean_Basin_Elev_ft'], axis=1, inplace=True)



### Remove headwater stations!!!!!!!

In [7]:
headwater_stations = ['10011500', '10109000', '10113500', '10128500', '10131000', '10146400', '10150500', '10154200',
'10172700', '10172800', '10172952']
Training_DF = Training_DF[~raw_training_data['station_id'].isin(headwater_stations)]

In [8]:
Training_DF.datetime = pd.to_datetime(Training_DF.datetime)
Training_DF.head()

Unnamed: 0,station_id,Lat,Long,Drainage_area_mi2,Perc_Develop,Perc_Imperv,Perc_Slop_30,datetime,flow_cfs,s1,s2,storage,swe,NWM_flow,DOY
789,10126000,41.576321,-112.100782,7040.0,4.28,0.55,1.94,2012-03-01,1488.4375,0.515038,0.857167,85.588235,12.778571,1542.0,61
790,10126000,41.576321,-112.100782,7040.0,4.28,0.55,1.94,2012-03-02,1282.1875,0.515038,0.857167,86.712418,13.085714,1542.0,62
791,10126000,41.576321,-112.100782,7040.0,4.28,0.55,1.94,2012-03-03,1269.4791,0.515038,0.857167,87.359477,13.178571,1542.0,63
792,10126000,41.576321,-112.100782,7040.0,4.28,0.55,1.94,2012-03-04,1551.5625,0.515038,0.857167,88.163399,13.221429,1541.0,64
793,10126000,41.576321,-112.100782,7040.0,4.28,0.55,1.94,2012-03-05,1490.5209,0.515038,0.857167,89.026144,13.221429,1541.0,65


In [9]:
x_train_temp = Training_DF[Training_DF.datetime < '01-01-2015']
x_train_temp.pop('station_id')
x_train_temp.pop('datetime')
y_train_temp = x_train_temp['flow_cfs']
x_train_temp.pop('flow_cfs')
x_train_temp.head()

Unnamed: 0,Lat,Long,Drainage_area_mi2,Perc_Develop,Perc_Imperv,Perc_Slop_30,s1,s2,storage,swe,NWM_flow,DOY
789,41.576321,-112.100782,7040.0,4.28,0.55,1.94,0.515038,0.857167,85.588235,12.778571,1542.0,61
790,41.576321,-112.100782,7040.0,4.28,0.55,1.94,0.515038,0.857167,86.712418,13.085714,1542.0,62
791,41.576321,-112.100782,7040.0,4.28,0.55,1.94,0.515038,0.857167,87.359477,13.178571,1542.0,63
792,41.576321,-112.100782,7040.0,4.28,0.55,1.94,0.515038,0.857167,88.163399,13.221429,1541.0,64
793,41.576321,-112.100782,7040.0,4.28,0.55,1.94,0.515038,0.857167,89.026144,13.221429,1541.0,65


In [10]:
# Scale the train inputs of the NN model
# First we need to convert it from pandas dataframe to a numpy array 
y_train = y_train_temp.to_numpy()
x_train = x_train_temp.to_numpy()
scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
y_scaled_train = scaler.fit_transform(y_train.reshape(-1, 1))
y_scaled_train.shape

(8174, 1)

In [11]:
# Determining the test dataset. 
x_test_temp = Training_DF[Training_DF.datetime >= '01-01-2015']
x_test_temp.head()

Unnamed: 0,station_id,Lat,Long,Drainage_area_mi2,Perc_Develop,Perc_Imperv,Perc_Slop_30,datetime,flow_cfs,s1,s2,storage,swe,NWM_flow,DOY
874,10126000,41.576321,-112.100782,7040.0,4.28,0.55,1.94,2015-03-01,723.2917,0.515038,0.857167,79.712418,10.171429,1658.0,60
875,10126000,41.576321,-112.100782,7040.0,4.28,0.55,1.94,2015-03-02,742.03125,0.515038,0.857167,80.156863,10.185714,1668.0,61
876,10126000,41.576321,-112.100782,7040.0,4.28,0.55,1.94,2015-03-03,729.1875,0.515038,0.857167,80.620915,10.335714,1682.0,62
877,10126000,41.576321,-112.100782,7040.0,4.28,0.55,1.94,2015-03-04,792.375,0.515038,0.857167,80.96732,10.564286,1701.0,63
878,10126000,41.576321,-112.100782,7040.0,4.28,0.55,1.94,2015-03-05,848.1667,0.515038,0.857167,81.320261,10.678571,1720.0,64


In [12]:
# Scale the test inputs of the NN model
# First we need to convert it from pandas dataframe to a numpy array 
x_test_temp_1 = x_test_temp.copy()
station_index_list = x_test_temp_1['station_id']
x_test_temp_1.pop('station_id')
x_test_temp_1.pop('datetime')
y_test_temp_1 = x_test_temp_1['flow_cfs']
x_test_temp_1.pop('flow_cfs')
x_test_1_np = x_test_temp_1.reset_index(drop=True).to_numpy()
y_test_1_np = y_test_temp_1.reset_index(drop=True).to_numpy()
x_test_1_scaled = scaler.fit_transform(x_test_1_np)
y_scaled_test_1 = scaler.fit_transform(y_test_1_np.reshape(-1, 1))

In [13]:
# MLP

n_targets = 1
tries = 10
cri_temp_nse = np.zeros([3, n_targets, tries])
cri_temp_rmse = np.zeros([3, n_targets, tries])
cri_temp_r2 = np.zeros([3, n_targets, tries])
cri_temp_kge = np.zeros([3, n_targets, tries])
cri_temp_lognse = np.zeros([3, n_targets, tries])
print('finish')

finish


In [14]:
# Reshape input for MLP model
x_train_scaled_test = torch.Tensor(x_train_scaled)
y_train_scaled_test = torch.Tensor(y_scaled_train)
print('test shape', x_train_scaled_test.shape)
print('train shape', y_train_scaled_test.shape)

test shape torch.Size([8174, 12])
train shape torch.Size([8174, 1])


In [15]:
# Take data to the proper device (GPU or CPU)
# x_train_scaled_test = x_train_scaled_test.to(device)
# y_train_scaled_test = y_train_scaled_test.to(device)

In [None]:
# # Hyperparameters
# epochs = 100
# batch_size = 100
# learning_rate = 0.001
# decay = 1e-2
# validation_split = 0.2
# neurons = 150
# LD1=128
# LD2=128
# LD3=64
# LD4=64
# LD5=32
# LD6=16
# LD7=5

In [None]:
# start_time = time.time()

# # Create PyTorch datasets and dataloaders
# train_dataset = TensorDataset(x_train_scaled_test, y_train_scaled_test)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

# # Build the model
# model = nn.Sequential(
#     nn.Linear(x_test_1_scaled.shape[1], LD1),
#     nn.ReLU(),
#     nn.Linear(LD1, LD2),
#     nn.ReLU(),
#     nn.Linear(LD2, LD3),
#     nn.ReLU(),
#     nn.Linear(LD3, LD4),
#     nn.ReLU(),
#     nn.Linear(LD4, LD5),
#     nn.ReLU(),
#     nn.Linear(LD5, LD6),
#     nn.ReLU(),
#     nn.Linear(LD6, 1)
# ).to(device)

# # Define loss and optimizer
# criterion = nn.MSELoss()
# optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# # Training loop
# for epoch in range(epochs):
#     total_loss = 0.0
#     for inputs, labels in train_loader:
#         inputs, labels = inputs.to(device), labels.to(device)
#         optimizer.zero_grad()
#         outputs = model(inputs)
#         loss = criterion(outputs, labels)
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item()
#     print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}")
# print('finish')
# print("Run Time:" + " %s seconds " % (time.time() - start_time))
# #save model
# #torch.save(model.state_dict(), f"./mlp_models/mlp_model.pkl")

In [17]:
# # Build and load the model
device = torch.device('cpu') # for some reason had to change to cpu
models = nn.Sequential(
    nn.Linear(x_test_1_scaled.shape[1], LD1),
    nn.ReLU(),
    nn.Linear(LD1, LD2),
    nn.ReLU(),
    nn.Linear(LD2, LD3),
    nn.ReLU(),
    nn.Linear(LD3, LD4),
    nn.ReLU(),
    nn.Linear(LD4, LD5),
    nn.ReLU(),
    nn.Linear(LD5, LD6),
    nn.ReLU(),
    nn.Linear(LD6, 1)
).to(device)

models.load_state_dict(torch.load(f"{path_01}03.output/mlp/mlp_model.pkl", map_location=torch.device('cpu')))

EvalDF = pd.DataFrame(columns = cols)
SupplyEvalDF = pd.DataFrame(columns = supcols)

SitesDict = {}

for station_number in station_index_list.drop_duplicates():
    index = station_index_list == station_number # Finind the rows that have this station number.
    X_test_scaled = torch.Tensor(x_test_1_scaled[index])
    
    # Evaluation
    models.eval()
    with torch.no_grad():
        yhat_test= models(X_test_scaled)

    # Invert scaling for actual
    inv_yhat_test = scaler.inverse_transform(yhat_test.numpy())
    inv_yhat_test[inv_yhat_test<0] = 0 # THIS IS NOT CORRECT !!!!!!!!!!!!!!!
    nwm_test = pd.DataFrame(inv_yhat_test, columns=['MLP_flow'])
    Dfs = [nwm_test.reset_index(drop=True), x_test_temp[index].reset_index(drop=True)]

    #get reach id for model eval
    nhdreach = utils.crosswalk(usgs_site_codes=station_number)
    nhdreach = nhdreach['nwm_feature_id'].iloc[0]
    
    #save predictions
    mod = 'MLP'
    save_path = f"./Predictions/Hindcast/{mod}/{mod}_{nhdreach}.csv"
    #nwm_test.to_csv(save_path)
    
    # merge
    Eval_DF_mine = pd.concat(Dfs, axis=1)
    SitesDict[nhdreach] = Eval_DF_mine

    
    prediction_columns = ['NWM_flow', 'MLP_flow']
    observation_column = 'flow_cfs'
    
    #Get RMSE from the model
    rmse = RMSE(Eval_DF_mine, prediction_columns, observation_column)

    #Get Mean Absolute Percentage Error from the model
    mape = MAPE(Eval_DF_mine, prediction_columns, observation_column)

    #Get Percent Bias from the model
    pbias = PBias(Eval_DF_mine, prediction_columns, observation_column)

    #Get Kling-Gutz Efficiency from the model
    kge = KGE(Eval_DF_mine, prediction_columns, observation_column)
    
    #Get Volumetric values
    Eval_DF_mine.set_index('datetime', inplace = True, drop =True)
    flowcols = [f"{mod}_flow", 'flow_cfs', 'NWM_flow']
    SupplyEval = Eval_DF_mine[flowcols].copy()
    SupplyEval = SupplyEval*cfsday_AFday
    #set up cumulative monthly values
    SupplyEval['Year'] = SupplyEval.index.year

    for site in flowcols:
        SupplyEval[site] = SupplyEval.groupby(['Year'])[site].cumsum()  

    EOY_mod_vol_af = SupplyEval[f"{mod}_flow"].iloc[-1]
    EOY_obs_vol_af = SupplyEval["flow_cfs"].iloc[-1]
    EOY_nwm_vol_af = SupplyEval[f"NWM_flow"].iloc[-1]
    NWM_vol_diff_af = EOY_nwm_vol_af - EOY_obs_vol_af
    Mod_vol_diff_af = EOY_mod_vol_af - EOY_obs_vol_af
    NWM_Perc_diff = (NWM_vol_diff_af/EOY_obs_vol_af)*100
    Mod_Perc_diff = (Mod_vol_diff_af/EOY_obs_vol_af)*100
    
     #Get Performance Metrics from the model
    Srmse = RMSE(SupplyEval, prediction_columns, observation_column)
    Smape = MAPE(SupplyEval, prediction_columns, observation_column)
    Spbias = PBias(SupplyEval, prediction_columns, observation_column)
    Skge = KGE(SupplyEval, prediction_columns, observation_column)
    
    
    #save model performance
    sitestats = [station_number, nhdreach, rmse[0], rmse[1],  pbias[0], pbias[1], kge[0], kge[1], mape[0],mape[1]]
    EvalDF.loc[len(EvalDF)] = sitestats
    
    Supplystats = [station_number, nhdreach, Srmse[0], Srmse[1],  Spbias[0], Spbias[1], Skge[0], Skge[1], Smape[0],  
                 Smape[1],EOY_obs_vol_af, EOY_nwm_vol_af,EOY_mod_vol_af,NWM_vol_diff_af,Mod_vol_diff_af, NWM_Perc_diff, Mod_Perc_diff ]
    SupplyEvalDF.loc[len(SupplyEvalDF)] = Supplystats
    
    
    
    
    #put prediction DF into dictionary
    # Eval_DF_mine.reset_index(inplace = True)
    # Eval_DF_mine.sort_values(by=['datetime'], inplace=True)
    # Eval_DF_mine.set_index('datetime', inplace = True)
    
    
    
#save model results
#EvalDF.to_csv(f"./Predictions/Hindcast/{mod}/{mod}_Performance.csv")   
#SupplyEvalDF.to_csv(f"./Predictions/Hindcast/{mod}/{mod}_Supply_Performance.csv")

print('finish')


NameError: name 'RMSE' is not defined

In [45]:
# MODEL CLASS
import torch
import torch.nn as nn
import torch.optim as optim

class CustomMLP(nn.Module):
    def __init__(self, layer_sizes, device=None):
        super(CustomMLP, self).__init__()
        self.layers = nn.ModuleList()
        for i in range(len(layer_sizes) - 1):
            self.layers.append(nn.Linear(layer_sizes[i], layer_sizes[i + 1]))
        self.loss_function = nn.MSELoss()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device
        self.to(self.device)
        self.validation_indicator = 0

    def forward(self, x):
        for i in range(len(self.layers) - 1):
            x = torch.relu(self.layers[i](x))
        x = self.layers[-1](x)
        return x

    def train_model(self, train_loader, epochs, early_stopping_patience, learning_rate=None, save_path=None, val_loader=None):
        best_val_loss = float('inf')
        epochs_no_improve = 0
        optimizer = optim.Adam(self.layers.parameters(), lr=learning_rate)

        for epoch in range(epochs):
            self.train()  # Set the model to training mode
            for inputs, targets in train_loader:
                inputs, targets = inputs.to(self.device), targets.to(self.device)

                optimizer.zero_grad()
                outputs = self.forward(inputs)
                loss = self.loss_function(outputs, targets)
                loss.backward()
                optimizer.step()

            
            val_loss = 0
            if val_loader is not None:
                self.validation_indicator = 1
                val_loss = self.evaluate_model(val_loader)

                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    torch.save(self.state_dict(), save_path)
                    epochs_no_improve = 0
                else:
                    epochs_no_improve += 1

                # if epochs_no_improve == early_stopping_patience:
                #     print('Early stopping triggered')
                #     break
            print(f'Epoch {epoch+1}/{epochs}, Training Loss: {loss.item()}', f'Validation Loss: {val_loss}')
        self.validation_indicator = 0
        print('Training is done!')

    def evaluate_model(self, data_loader):
        self.eval()  # Set the model to evaluation mode
        total_loss = 0
        total = 0
        with torch.no_grad():
            for inputs, targets in data_loader:
                inputs, targets = inputs.to(self.device), targets.to(self.device)

                outputs = self.forward(inputs)
                loss = self.loss_function(outputs, targets)
                total_loss += loss.item() * inputs.size(0)
                total += inputs.size(0)
        avg_loss = total_loss / total
        if self.validation_indicator == 0:
            print(f'Validation Loss: {avg_loss}')
        return outputs if self.validation_indicator == 0 else avg_loss

    def save_model(self, file_path):
        torch.save(self.state_dict(), file_path)

    def load_model(self, file_path):
        self.load_state_dict(torch.load(file_path, map_location=self.device))

# Example Usage:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = CustomMLP(layer_sizes, learning_rate=0.01, device=device)
# model.train_model(train_loader, epochs=5, val_loader=val_loader, early_stopping_patience=5, save_path='best_model.pth')
# model.load_model('best_model.pth')


In [121]:
# I should modify the model evauation and testing part. 

In [113]:
# %load_ext autoreload
# %autoreload 2
mod='MLP'
epochs = 200
batch_size = 100
learning_rate = 0.001
early_stopping_patience=5
decay = 1e-2
path_model_parameters = f"{path_01}03.output/best_model.pkl"
#path_01 + '03.output/best_model.pkl'
layer_sizes = [x_train_scaled_test.shape[1] ,128, 128, 64, 64, 32, 16, 5, 1]
# neurons = 150
# LD1=128
# LD2=128
# LD3=64
# LD4=64
# LD5=32
# LD6=16
# LD7=5
# validation_split = 0.2

# I should add decay


In [114]:
# Create PyTorch datasets and dataloaders

#X_train, X_valid, y_train, y_valid = train_test_split(x_train_scaled_test, y_train_scaled_test)

train_dataset = TensorDataset(x_train_scaled_test, y_train_scaled_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
# validation_dataset = TensorDataset(X_valid, y_valid)
# validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)

In [115]:
mlp_model = CustomMLP(layer_sizes, device)

In [None]:
mlp_model.train_model(train_loader, epochs, early_stopping_patience, learning_rate, path_model_parameters)

Epoch 1/200, Training Loss: 0.0003272944886703044 Validation Loss: 0
Epoch 2/200, Training Loss: 0.00017656754062045366 Validation Loss: 0
Epoch 3/200, Training Loss: 0.0002624697226565331 Validation Loss: 0
Epoch 4/200, Training Loss: 0.0003445963084232062 Validation Loss: 0
Epoch 5/200, Training Loss: 0.0003139563777949661 Validation Loss: 0
Epoch 6/200, Training Loss: 0.0005090840859338641 Validation Loss: 0
Epoch 7/200, Training Loss: 0.0008741760393604636 Validation Loss: 0
Epoch 8/200, Training Loss: 0.0011440437519922853 Validation Loss: 0
Epoch 9/200, Training Loss: 0.001751930103637278 Validation Loss: 0
Epoch 10/200, Training Loss: 0.0005906976293772459 Validation Loss: 0
Epoch 11/200, Training Loss: 0.000531708647031337 Validation Loss: 0
Epoch 12/200, Training Loss: 0.0008743303478695452 Validation Loss: 0
Epoch 13/200, Training Loss: 0.0009528466034680605 Validation Loss: 0
Epoch 14/200, Training Loss: 0.0003838026023004204 Validation Loss: 0
Epoch 15/200, Training Loss: 0

In [104]:
mlp_model.load_model(path_model_parameters)

In [110]:
from evalaution_table import evtab

EvalDF = np.zeros([len(station_index_list.drop_duplicates()), 10])
SupplyEvalDF = np.zeros([len(station_index_list.drop_duplicates()), 17])

SitesDict = {}



for station_index, station_number in enumerate(station_index_list.drop_duplicates()):
    index = station_index_list == station_number # Finind the rows that have this station number.
    temp_x_scaled_test = torch.Tensor(x_test_1_scaled)
    temp_y_scaled_test = torch.Tensor(y_scaled_test_1)
    index_np = torch.tensor(index.to_numpy())

    test_dataset = TensorDataset(temp_x_scaled_test[index_np], temp_y_scaled_test[index_np])
    test_loader = DataLoader(test_dataset, batch_size=test_dataset.tensors[0].shape[0], shuffle=False)
    
    # Evaluation
    yhat_test = mlp_model.evaluate_model(test_loader)
    
    # Invert scaling for actual
    inv_yhat_test = scaler.inverse_transform(yhat_test.numpy())
    inv_yhat_test[inv_yhat_test<0] = 0 # THIS IS NOT CORRECT !!!!!!!!!!!!!!!
    nwm_test = pd.DataFrame(inv_yhat_test, columns=['MLP_flow'])
    Dfs = [nwm_test.reset_index(drop=True), x_test_temp[index].reset_index(drop=True)]

    #get reach id for model eval
    nhdreach = utils.crosswalk(usgs_site_codes=station_number)
    nhdreach = nhdreach['nwm_feature_id'].iloc[0]
    
    #save predictions
    #save_path = f"./Predictions/Hindcast/{mod}/{mod}_{nhdreach}.csv"
    #nwm_test.to_csv(save_path)
    
    # merge
    Eval_DF_mine = pd.concat(Dfs, axis=1)
    SitesDict[nhdreach] = Eval_DF_mine
    prediction_columns = ['NWM_flow', f"{mod}_flow"]
    observation_column = 'flow_cfs'
    result = evtab(Eval_DF_mine, prediction_columns, nhdreach, observation_column, mod)
    
    #save model performance
    EvalDF[station_index, :] = result[0]
    SupplyEvalDF[station_index, :] = result[1]
    
    

print('finish')


Validation Loss: 0.017544051632285118
Validation Loss: 0.0018668673001229763
Validation Loss: 0.0007869558758102357
Validation Loss: 0.003422613488510251
Validation Loss: 0.00023849683930166066
Validation Loss: 0.0036304083187133074
Validation Loss: 0.0017385041574016213
Validation Loss: 6.514557026093826e-05
Validation Loss: 0.00010211471089860424
finish


In [91]:
from evalaution_table import evtab
# # Build and load the model
device = torch.device('cpu') # for some reason had to change to cpu
models = nn.Sequential(
    nn.Linear(x_test_1_scaled.shape[1], LD1),
    nn.ReLU(),
    nn.Linear(LD1, LD2),
    nn.ReLU(),
    nn.Linear(LD2, LD3),
    nn.ReLU(),
    nn.Linear(LD3, LD4),
    nn.ReLU(),
    nn.Linear(LD4, LD5),
    nn.ReLU(),
    nn.Linear(LD5, LD6),
    nn.ReLU(),
    nn.Linear(LD6, 1)
).to(device)

models.load_state_dict(torch.load(f"{path_01}03.output/mlp/mlp_model.pkl", map_location=torch.device('cpu')))

EvalDF = np.zeros([len(tation_index_list.drop_duplicates()), 10])
SupplyEvalDF = np.zeros([len(tation_index_list.drop_duplicates()), 10])

SitesDict = {}

for station_index, station_number in enumerate(station_index_list.drop_duplicates()):
    index = station_index_list == station_number # Finind the rows that have this station number.
    X_test_scaled = torch.Tensor(x_test_1_scaled[index])
    
    # Evaluation
    models.eval()
    with torch.no_grad():
        yhat_test= models(X_test_scaled)

    # Invert scaling for actual
    inv_yhat_test = scaler.inverse_transform(yhat_test.numpy())
    inv_yhat_test[inv_yhat_test<0] = 0 # THIS IS NOT CORRECT !!!!!!!!!!!!!!!
    nwm_test = pd.DataFrame(inv_yhat_test, columns=['MLP_flow'])
    Dfs = [nwm_test.reset_index(drop=True), x_test_temp[index].reset_index(drop=True)]

    #get reach id for model eval
    nhdreach = utils.crosswalk(usgs_site_codes=station_number)
    nhdreach = nhdreach['nwm_feature_id'].iloc[0]
    
    #save predictions
    save_path = f"./Predictions/Hindcast/{mod}/{mod}_{nhdreach}.csv"
    #nwm_test.to_csv(save_path)
    
    # merge
    Eval_DF_mine = pd.concat(Dfs, axis=1)
    SitesDict[nhdreach] = Eval_DF_mine
    prediction_columns = ['NWM_flow', f"{mod}_flow"]
    observation_column = 'flow_cfs'
    result = evtab(Eval_DF_mine, prediction_columns, nhdreach, observation_column, mod)
    
    #save model performance
    EvalDF[station_index, :] = temp[0]
    SupplyEvalDF[station_index, :] = temp[1]
    
    

print('finish')


NameError: name 'LD1' is not defined

In [111]:
#put the model scores into a dataframe for comparison
mod = 'MLP'
#Evaluation columns for prediction time series
cols = ['USGSid', 'NHDPlusid', 'NWM_rmse', f"{mod}_rmse", 'NWM_pbias', f"{mod}_pbias", 
        'NWM_kge', f"{mod}__kge", 'NWM_mape',  f"{mod}_mape"]

#Evaluation columns for accumulated supply time series
supcols = ['USGSid', 'NHDPlusid', 'NWM_rmse', f"{mod}_rmse", 'NWM_pbias', f"{mod}_pbias", 
        'NWM_kge', f"{mod}__kge", 'NWM_mape',  f"{mod}_mape", 'Obs_vol', 'NWM_vol', f"{mod}_vol",
        'NWM_vol_err', f"{mod}_vol_err", 'NWM_vol_Perc_diff', f"{mod}_vol_Perc_diff"]
    
#save model results
EvalDF = pd.DataFrame(EvalDF, columns=cols)
SupplyEvalDF = pd.DataFrame(SupplyEvalDF, columns=supcols)

#EvalDF.to_csv(f"./Predictions/Hindcast/{mod}/{mod}_Performance.csv")   
#SupplyEvalDF.to_csv(f"./Predictions/Hindcast/{mod}/{mod}_Supply_Performance.csv")


In [None]:
# I should change EvalDf and SupplyEvalDF to numpy arrays and make class for the models


In [112]:
print("Model Performance for Daily cfs")
display(EvalDF)   
print("Model Performance for Daily Accumulated Supply (Acre-Feet)")
display(SupplyEvalDF )

Model Performance for Daily cfs


Unnamed: 0,USGSid,NHDPlusid,NWM_rmse,MLP_rmse,NWM_pbias,MLP_pbias,NWM_kge,MLP__kge,NWM_mape,MLP_mape
0,10126000.0,4605050.0,1861.439475,1234.432885,-53.15,-22.42,0.14,0.41,369.37,134.79
1,10130500.0,10092262.0,305.608055,402.679246,-114.84,-175.75,-0.8,-1.47,197.11,227.44
2,10134500.0,10277268.0,128.603024,258.96161,-235.17,-308.14,-1.48,-4.31,1014.93,337.42
3,10136500.0,10274616.0,776.608631,545.232159,-191.07,-56.79,-0.99,0.04,436.46,139.09
4,10137500.0,10274270.0,107.073089,143.927524,33.05,-60.19,0.48,0.24,38.66,147.01
5,10141000.0,10273232.0,1317.964422,558.986476,-412.87,11.47,-3.2,-0.55,1670.95,274.01
6,10155000.0,10373622.0,317.555275,388.588886,21.78,-29.7,0.47,0.58,123.39,172.92
7,10164500.0,10329013.0,110.84094,75.167216,-123.23,-115.93,-0.95,-0.29,130.71,234.83
8,10171000.0,10390290.0,3558.880086,94.177406,-2822.39,-55.65,-29.81,-0.4,3342.15,74.1


Model Performance for Daily Accumulated Supply (Acre-Feet)


Unnamed: 0,USGSid,NHDPlusid,NWM_rmse,MLP_rmse,NWM_pbias,MLP_pbias,NWM_kge,MLP__kge,NWM_mape,MLP_mape,Obs_vol,NWM_vol,MLP_vol,NWM_vol_err,MLP_vol_err,NWM_vol_Perc_diff,MLP_vol_Perc_diff
0,10126000.0,4605050.0,452527.8,236242.629231,-55.72,0.4,0.14,0.68,74.17,39.19,655183.188796,1149487.593,946119.0,494304.4,290935.811204,75.445221,44.405262
1,10130500.0,10092262.0,85364.63,109174.787262,-120.9,-149.2,-0.72,-1.33,163.42,139.3,77596.59434,157380.795,239854.40625,79784.2,162257.81191,102.819204,209.104295
2,10134500.0,10277268.0,48452.63,69323.782046,-213.52,-274.96,-1.77,-3.31,521.08,247.84,37262.276058,96970.683,136174.796875,59708.41,98912.520817,160.238217,265.449487
3,10136500.0,10274616.0,295913.1,131958.234031,-155.58,-33.35,-0.86,0.44,243.69,92.56,172949.053345,490788.534,345694.21875,317839.5,172745.165405,183.776363,99.882111
4,10137500.0,10274270.0,25299.9,43361.365904,38.01,-43.53,0.6,0.41,45.32,102.07,54744.014329,39350.652,118725.046875,-15393.36,63981.032546,-28.118804,116.873111
5,10141000.0,10273232.0,538254.8,146544.905923,-363.73,36.95,-2.97,-0.01,977.87,121.1,120033.770763,685973.241,112689.867188,565939.5,-7343.903575,471.483539,-6.118198
6,10155000.0,10373622.0,60665.19,60744.8327,31.76,-18.89,0.59,0.74,38.25,41.79,181010.801007,126493.587,227690.28125,-54517.21,46679.480243,-30.11821,25.788229
7,10164500.0,10329013.0,30349.85,22616.275105,-140.03,-90.5,-0.63,-0.06,183.25,121.79,19609.368842,59842.974,59008.765625,40233.61,39399.396783,205.175421,200.92129
8,10171000.0,10390290.0,1481821.0,34446.708416,-2948.53,-60.84,-38.73,0.06,3366.51,55.79,81024.872027,1912801.8,92719.296875,1831777.0,11694.424848,2260.758804,14.43313


In [19]:
print("Model Performance for Daily cfs")
display(EvalDF)   
print("Model Performance for Daily Accumulated Supply (Acre-Feet)")
display(SupplyEvalDF )

Model Performance for Daily cfs


Unnamed: 0,USGSid,NHDPlusid,NWM_rmse,MLP_rmse,NWM_pbias,MLP_pbias,NWM_kge,MLP__kge,NWM_mape,MLP_mape
0,10126000,4605050,1861.439475,692.033427,-53.15,-12.82,0.14,0.78,369.37,52.63
1,10130500,10092262,305.608055,120.717862,-114.84,20.07,-0.8,0.24,197.11,70.01
2,10134500,10277268,128.603024,47.540663,-235.17,39.93,-1.48,0.09,1014.93,170.26
3,10136500,10274616,776.608631,216.496869,-191.07,-12.84,-0.99,0.81,436.46,69.35
4,10137500,10274270,107.073089,137.583271,33.05,38.44,0.48,0.08,38.66,63.44
5,10141000,10273232,1317.964422,229.866345,-412.87,-8.4,-3.2,0.85,1670.95,96.5
6,10155000,10373622,317.555275,226.989005,21.78,2.64,0.47,0.83,123.39,64.87
7,10164500,10329013,110.84094,54.321582,-123.23,40.58,-0.95,0.06,130.71,120.7
8,10171000,10390290,3558.880086,42.771226,-2822.39,-1.92,-29.81,-0.09,3342.15,39.21


Model Performance for Daily Accumulated Supply (Acre-Feet)


Unnamed: 0,USGSid,NHDPlusid,NWM_rmse,MLP_rmse,NWM_pbias,MLP_pbias,NWM_kge,MLP__kge,NWM_mape,MLP_mape,Obs_vol,NWM_vol,MLP_vol,NWM_vol_err,MLP_vol_err,NWM_vol_Perc_diff,MLP_vol_Perc_diff
0,10126000,4605050,452527.8,135601.368137,-55.72,-11.09,0.14,0.74,74.17,17.65,655183.188796,1149487.593,767235.5625,494304.4,112052.373704,75.445221,17.102449
1,10130500,10092262,85364.63,18659.155726,-120.9,14.88,-0.72,0.61,163.42,40.42,77596.59434,157380.795,72287.570312,79784.2,-5309.024027,102.819204,-6.841826
2,10134500,10277268,48452.63,9943.377293,-213.52,33.6,-1.77,0.38,521.08,110.2,37262.276058,96970.683,21517.763672,59708.41,-15744.512386,160.238217,-42.253222
3,10136500,10274616,295913.1,21430.928363,-155.58,-10.09,-0.86,0.89,243.69,29.58,172949.053345,490788.534,206140.578125,317839.5,33191.52478,183.776363,19.191504
4,10137500,10274270,25299.9,41066.102563,38.01,46.02,0.6,0.13,45.32,46.84,54744.014329,39350.652,47727.761719,-15393.36,-7016.252611,-28.118804,-12.816474
5,10141000,10273232,538254.8,25166.789448,-363.73,-9.14,-2.97,0.87,977.87,37.3,120033.770763,685973.241,75881.4375,565939.5,-44152.333263,471.483539,-36.783259
6,10155000,10373622,60665.19,27428.799109,31.76,2.28,0.59,0.83,38.25,33.34,181010.801007,126493.587,167993.8125,-54517.21,-13016.988507,-30.11821,-7.191277
7,10164500,10329013,30349.85,14423.525165,-140.03,42.68,-0.63,0.13,183.25,78.26,19609.368842,59842.974,16443.640625,40233.61,-3165.728217,205.175421,-16.143958
8,10171000,10390290,1481821.0,7273.459034,-2948.53,2.53,-38.73,0.94,3366.51,22.11,81024.872027,1912801.8,65987.796875,1831777.0,-15037.075152,2260.758804,-18.558592


In [None]:
#save model results
EvalDF.to_csv(f"./Predictions/Hindcast/{mod}/{mod}_Performance.csv")

In [None]:
import importlib
importlib.reload(FigureGenerator)

In [None]:
import FigureGenerator

model = 'MLP'
plotname = 'MLP_TS_plot'
freq = 'D'
supply = True
title = 'Observed and Modeled flows for NHDPlus Reaches \n with Upstream Reservoirs in the Great Salt Lake Basin'
FigureGenerator.TS_plot(SitesDict, model, plotname, title, freq, supply)

In [None]:
plotname = 'MLP_ParityPlot'
FigureGenerator.Parity_plot(SitesDict, model, plotname)

In [None]:
import AWS_transfer
model = 'MLP'
state = 'ut'
AWS_transfer.Predictions2AWS(model, state)

In [None]:
reach = 10273232
variables =['NWM_flow', 'Obs_flow']
colors = ['blue', 'green']
model = 'MLP'
plotname = 'NWMFlow'
units = 'cfs'
y_lab = f"Flow ({units})"
title = f"Daily NWM Estimates \n Reach: {str(reach)}"


FigureGenerator.Var_TS_plot(SitesDict, reach, variables, colors, model,y_lab, plotname, title, units, supply = False)