In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
import math
import os
from io import open
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time
from scipy.stats import gaussian_kde
import scipy.stats as stats

In [2]:
class R2Loss(nn.Module):
    """Coefficient of determination (R^2) between predictions and targets.

    Despite the name, a larger value means a better fit: the result is
    ``1 - MSE / Var(y)``, so a perfect prediction gives 1.
    """

    def forward(self, y_pred, y):
        """Return ``1 - mean((y_pred - y)^2) / var(y)`` as a scalar tensor.

        The variance is the population variance (``unbiased=False``) over all
        elements of ``y``.
        """
        mean_sq_err = F.mse_loss(y_pred, y, reduction="mean")
        target_var = torch.var(y, unbiased=False)
        unexplained = mean_sq_err / target_var
        return 1.0 - unexplained

import subprocess as sp
import os
def get_gpu_memory():
    """Print and return the free memory reported by ``nvidia-smi`` for each GPU.

    Runs ``nvidia-smi --query-gpu=memory.free --format=csv``, drops the CSV
    header row, and keeps the leading integer of every remaining row (the
    number printed in front of the unit suffix).

    Returns:
        list[int]: one value per row reported by ``nvidia-smi``.
    """
    command = "nvidia-smi --query-gpu=memory.free --format=csv"
    raw_bytes = sp.check_output(command.split())
    # the output ends with a newline, so the last split element is empty and is dropped
    rows = raw_bytes.decode('ascii').split('\n')[:-1]
    # rows[0] is the CSV header
    memory_free_values = [int(row.split()[0]) for row in rows[1:]]
    print(memory_free_values)
    return memory_free_values
# use N2O model v1--GRU model
class N2OGRU(nn.Module):
    """GRU stack followed by a linear read-out (N2O model v1).

    Args:
        ninp: number of input features per time step.
        nhid: hidden size of the GRU.
        nlayers: number of stacked GRU layers.
        nout: number of output features per time step.
        dropout: probability used by the dropout layer in front of the
            read-out and, only when ``nlayers > 1``, between GRU layers.
    """

    def __init__(self, ninp, nhid, nlayers, nout, dropout):
        super().__init__()
        # inter-layer dropout is only handed to nn.GRU for stacked GRUs
        gru_kwargs = {"dropout": dropout} if nlayers > 1 else {}
        self.gru = nn.GRU(ninp, nhid, nlayers, **gru_kwargs)
        self.densor2 = nn.Linear(nhid, nout)
        self.nhid = nhid
        self.nlayers = nlayers
        self.drop = nn.Dropout(dropout)
        self.init_weights()

    def init_weights(self):
        """Zero the read-out bias and draw its weights from U(-0.1, 0.1)."""
        initrange = 0.1  # may change to a small value
        nn.init.zeros_(self.densor2.bias)
        nn.init.uniform_(self.densor2.weight, -initrange, initrange)

    def forward(self, inputs, hidden):
        """Run the GRU, then dropout and the linear read-out on every step.

        Returns:
            tuple: ``(output, hidden)`` where ``output`` is the read-out of the
            GRU sequence output and ``hidden`` is the GRU's final hidden state.
        """
        gru_out, hidden = self.gru(inputs, hidden)
        # no activation (e.g. exp) is applied after the linear layer
        return self.densor2(self.drop(gru_out)), hidden

    def init_hidden(self, bsz):
        """Return an all-zero hidden state of shape ``(nlayers, bsz, nhid)``.

        ``bsz`` is the batch size. The tensor is created from the first model
        parameter, so it shares that parameter's dtype and device.
        """
        reference = next(self.parameters())
        return reference.new_zeros(self.nlayers, bsz, self.nhid)
    

#spin-up: repeatedly feed data_sp through the model to warm up the hidden state
#data_sp is the spin-up input sequence you provide; bsz is its batch size
#return the hidden state reached after `cycle` passes, to start the simulation period from
def spinup(model,data_sp,cycle,bsz):
    """Warm up the recurrent state of ``model`` by replaying ``data_sp``.

    Args:
        model: object exposing ``init_hidden(bsz)`` and callable as
            ``model(inputs, hidden) -> (output, hidden)``.
        data_sp: spin-up input passed unchanged to ``model`` on every pass.
        cycle: number of passes over ``data_sp``; with 0 the freshly
            initialised hidden state is returned.
        bsz: batch size forwarded to ``model.init_hidden``.

    Returns:
        The hidden state returned by the last pass.
    """
    # use the model that was passed in, not a notebook-global one
    inihidden0 = model.init_hidden(bsz)
    for _ in range(cycle):
        # the outputs of the spin-up passes are discarded; only the state is kept
        _, inihidden0 = model(data_sp, inihidden0)
    return inihidden0
def my_loss(output, target):
    """Mean squared error over every element of ``output - target``."""
    residual = output - target
    return residual.pow(2).mean()
#for multi-task learning: weighted sum of the per-task MSE
def myloss_mul_sum(output, target,loss_weights):
    """Sum ``loss_weights[i] * MSE`` over the slices of the last (third) axis.

    ``output`` and ``target`` are indexed as ``[:, :, i]``; ``loss_weights`` is
    indexed by the same ``i``. With zero output channels the float ``0.0`` is
    returned.
    """
    total = 0.0
    for task in range(output.size(2)):
        residual = output[:, :, task] - target[:, :, task]
        total = total + loss_weights[task] * torch.mean(residual ** 2)
    return total
def scalar_maxmin(X):
    """Min-max scale ``X`` to [0, 1].

    Returns:
        tuple: ``(scaled, minimum, maximum)`` where ``scaled`` is
        ``(X - min) / (max - min)``.
    """
    lowest = X.min()
    highest = X.max()
    scaled = (X - lowest) / (highest - lowest)
    return scaled, lowest, highest

#build the model input by appending the initial state to every time step
#x should be size of [seq,batch,n_f1], x_ini be size of [1,batch,n_f2]
def load_ini(x,x_ini):
    """Concatenate ``x`` with the first time step of ``x_ini`` repeated along time.

    Only ``x_ini[0]`` is used; it is tiled ``x.size(0)`` times and joined to
    ``x`` on the feature axis, giving a tensor of size [seq, batch, n_f1 + n_f2].
    """
    n_batch, n_ini_feat = x_ini.size(1), x_ini.size(2)
    first_step = x_ini[0, :, :].view(1, n_batch, n_ini_feat)
    tiled = first_step.repeat(x.size(0), 1, 1)
    return torch.cat((x, tiled), 2)

class Statini_N2OGRU(nn.Module):
    """Two chained GRUs: the first predicts state variables, the second uses them.

    Args:
        ninp1: input width of the first GRU. ``forward`` feeds it ``W_inputs``
            concatenated with the initial states.
        ninp2: input width of the second GRU. ``forward`` feeds it ``W_inputs``
            concatenated with the first stage's ``nout1`` outputs.
        nhid: hidden size shared by both GRUs.
        nlayers: number of layers shared by both GRUs.
        nout1: output width of the first read-out.
        nout2: output width of the second read-out.
        dropout: probability for the read-out dropout and, only when
            ``nlayers > 1``, for the GRUs' inter-layer dropout.
    """

    def __init__(self, ninp1, ninp2, nhid, nlayers, nout1, nout2, dropout):
        super().__init__()
        # inter-layer dropout is only passed on for stacked GRUs
        gru_kwargs = {"dropout": dropout} if nlayers > 1 else {}
        self.gru1 = nn.GRU(ninp1, nhid, nlayers, **gru_kwargs)
        self.gru2 = nn.GRU(ninp2, nhid, nlayers, **gru_kwargs)
        self.densor1 = nn.Linear(nhid, nout1)
        self.densor2 = nn.Linear(nhid, nout2)
        self.nhid = nhid
        self.nlayers = nlayers
        self.drop = nn.Dropout(dropout)
        self.init_weights()

    def init_weights(self):
        """Zero both read-out biases and draw their weights from U(-0.1, 0.1)."""
        initrange = 0.1  # may change to a small value
        for head in (self.densor1, self.densor2):
            nn.init.zeros_(head.bias)
            nn.init.uniform_(head.weight, -initrange, initrange)

    def forward(self, W_inputs, stat_ini, hidden):
        """Run both stages and return ``(output, (hidden1, hidden2))``.

        ``hidden`` is a pair: ``hidden[0]`` seeds the first GRU and
        ``hidden[1]`` the second. The returned ``output`` lists the second
        stage's outputs first, followed by the first stage's outputs.
        """
        # stage 1: drivers + initial states -> state variables
        stage1_in = load_ini(W_inputs, stat_ini)
        seq1, hidden1 = self.gru1(stage1_in, hidden[0])
        output1 = self.densor1(self.drop(seq1))
        # stage 2: drivers + predicted states
        stage2_in = torch.cat((W_inputs, output1), 2)
        seq2, hidden2 = self.gru2(stage2_in, hidden[1])
        output2 = self.densor2(self.drop(seq2))
        # careful with the output order: stage 2 comes first
        return torch.cat((output2, output1), 2), (hidden1, hidden2)

    def init_hidden(self, bsz):
        """Return two zero hidden states of shape ``(nlayers, bsz, nhid)``; ``bsz`` is the batch size."""
        reference = next(self.parameters())
        shape = (self.nlayers, bsz, self.nhid)
        return (reference.new_zeros(*shape), reference.new_zeros(*shape))
    
class Statini_N2OGRU_v2(nn.Module):
    """Two chained GRUs where the second stage also receives initial fluxes.

    Args:
        ninp1: input width of the first GRU. ``forward`` feeds it ``W_inputs``
            concatenated with the initial states.
        ninp2: input width of the second GRU. ``forward`` feeds it ``W_inputs``,
            the first stage's ``nout1`` outputs and the initial fluxes.
        nhid: hidden size shared by both GRUs.
        nlayers: number of layers shared by both GRUs.
        nout1: output width of the first read-out.
        nout2: output width of the second read-out.
        dropout: probability for the read-out dropout and, only when
            ``nlayers > 1``, for the GRUs' inter-layer dropout.
    """

    def __init__(self, ninp1, ninp2, nhid, nlayers, nout1, nout2, dropout):
        super().__init__()
        # inter-layer dropout is only passed on for stacked GRUs
        gru_kwargs = {"dropout": dropout} if nlayers > 1 else {}
        self.gru1 = nn.GRU(ninp1, nhid, nlayers, **gru_kwargs)
        self.gru2 = nn.GRU(ninp2, nhid, nlayers, **gru_kwargs)
        self.densor1 = nn.Linear(nhid, nout1)
        self.densor2 = nn.Linear(nhid, nout2)
        self.nhid = nhid
        self.nlayers = nlayers
        self.drop = nn.Dropout(dropout)
        self.init_weights()

    def init_weights(self):
        """Zero both read-out biases and draw their weights from U(-0.1, 0.1)."""
        initrange = 0.1  # may change to a small value
        for head in (self.densor1, self.densor2):
            nn.init.zeros_(head.bias)
            nn.init.uniform_(head.weight, -initrange, initrange)

    def forward(self, W_inputs, stat_ini,flux_ini, hidden):
        """Run both stages and return ``(output, (hidden1, hidden2))``.

        ``hidden[0]`` seeds the first GRU and ``hidden[1]`` the second. The
        returned ``output`` lists the second stage's outputs first, followed by
        the first stage's outputs.
        """
        # stage 1: drivers + initial states -> state variables
        stage1_in = load_ini(W_inputs, stat_ini)
        seq1, hidden1 = self.gru1(stage1_in, hidden[0])
        output1 = self.densor1(self.drop(seq1))
        # stage 2: drivers + predicted states + initial fluxes
        stage2_in = load_ini(torch.cat((W_inputs, output1), 2), flux_ini)
        seq2, hidden2 = self.gru2(stage2_in, hidden[1])
        output2 = self.densor2(self.drop(seq2))
        # careful with the output order: stage 2 comes first
        return torch.cat((output2, output1), 2), (hidden1, hidden2)

    def init_hidden(self, bsz):
        """Return two zero hidden states of shape ``(nlayers, bsz, nhid)``; ``bsz`` is the batch size."""
        reference = next(self.parameters())
        shape = (self.nlayers, bsz, self.nhid)
        return (reference.new_zeros(*shape), reference.new_zeros(*shape))
    
    
class Statini_N2OGRU_v3(nn.Module):
    """Four independent first-stage GRUs feeding one second-stage GRU.

    Args:
        ninp1, nhid1, nlayers1, nout1: sequences whose entries 0..3 configure
            the four first-stage GRUs (``gru1_1`` .. ``gru1_4``) and their
            read-outs (``densor1_1`` .. ``densor1_4``).
        ninp2, nhid2, nlayers2, nout2: scalars configuring the second-stage
            GRU (``gru2``) and its read-out (``densor2``).
        dropout: probability for the read-out dropout and, for every GRU with
            more than one layer, its inter-layer dropout.
    """

    #number of first-stage (state) sub-models
    _N_STAGE1 = 4

    @staticmethod
    def _make_gru(ninp, nhid, nlayers, dropout):
        """Build a GRU, passing ``dropout`` on only when it has several layers."""
        gru_kwargs = {"dropout": dropout} if nlayers > 1 else {}
        return nn.GRU(ninp, nhid, nlayers, **gru_kwargs)

    def __init__(self, ninp1, ninp2, nhid1, nhid2, nlayers1, nlayers2, nout1, nout2, dropout):
        super().__init__()
        # registration order is kept: gru1_1..gru1_4, gru2, densor1_1..densor1_4, densor2, drop
        for k in range(self._N_STAGE1):
            setattr(self, "gru1_%d" % (k + 1),
                    self._make_gru(ninp1[k], nhid1[k], nlayers1[k], dropout))
        self.gru2 = self._make_gru(ninp2, nhid2, nlayers2, dropout)

        for k in range(self._N_STAGE1):
            setattr(self, "densor1_%d" % (k + 1), nn.Linear(nhid1[k], nout1[k]))
        self.densor2 = nn.Linear(nhid2, nout2)
        self.nhid1 = nhid1
        self.nhid2 = nhid2
        self.nlayers1 = nlayers1
        self.nlayers2 = nlayers2
        self.drop = nn.Dropout(dropout)
        self.init_weights()

    def init_weights(self):
        """Zero every read-out bias and draw the read-out weights from U(-0.1, 0.1)."""
        initrange = 0.1  # may change to a small value
        heads = [getattr(self, "densor1_%d" % (k + 1)) for k in range(self._N_STAGE1)]
        heads.append(self.densor2)
        for head in heads:
            nn.init.zeros_(head.bias)
            nn.init.uniform_(head.weight, -initrange, initrange)

    def forward(self, W_inputs, stat_ini,flux_ini, hidden):
        """Run the four state sub-models, then the second stage.

        ``stat_ini[k]`` and ``hidden[0][k]`` belong to first-stage sub-model
        ``k``; ``hidden[1]`` seeds the second-stage GRU.

        Returns:
            tuple: ``(output, hidden)``. ``output`` lists the second-stage
            outputs first, then the four first-stage outputs in order.
            ``hidden`` is ``((h1_1, h1_2, h1_3, h1_4), h2)``.
        """
        # layer 1 for states simulation
        stage1_outputs = []
        stage1_hidden = []
        for k in range(self._N_STAGE1):
            gru = getattr(self, "gru1_%d" % (k + 1))
            head = getattr(self, "densor1_%d" % (k + 1))
            stage_in = load_ini(W_inputs, stat_ini[k])
            seq, h_last = gru(stage_in, hidden[0][k])
            stage1_outputs.append(head(self.drop(seq)))
            stage1_hidden.append(h_last)

        # layer two for N2O O2 and N2 simulation: drivers + all states + initial fluxes
        stage2_in = torch.cat((W_inputs, *stage1_outputs), 2)
        stage2_in = load_ini(stage2_in, flux_ini)
        seq2, hidden2 = self.gru2(stage2_in, hidden[1])
        output2 = self.densor2(self.drop(seq2))
        # careful with the output order: stage 2 comes first
        output = torch.cat((output2, *stage1_outputs), 2)
        return output, (tuple(stage1_hidden), hidden2)

    def init_hidden(self, bsz):
        """Return zero hidden states as ``((h1_1, .., h1_4), h2)``; ``bsz`` is the batch size."""
        reference = next(self.parameters())
        stage1 = tuple(
            reference.new_zeros(self.nlayers1[k], bsz, self.nhid1[k])
            for k in range(self._N_STAGE1)
        )
        stage2 = reference.new_zeros(self.nlayers2, bsz, self.nhid2)
        return (stage1, stage2)
    
class Statini_sq_N2OGRU(nn.Module):
    """Two chained GRUs whose first stage receives a per-time-step state sequence.

    Args:
        ninp1: input width of the first GRU. ``forward`` feeds it ``W_inputs``
            concatenated with ``stat_ini_sq`` on the feature axis.
        ninp2: input width of the second GRU. ``forward`` feeds it ``W_inputs``
            concatenated with the first stage's ``nout1`` outputs.
        nhid: hidden size shared by both GRUs.
        nlayers: number of layers shared by both GRUs.
        nout1: output width of the first read-out.
        nout2: output width of the second read-out.
        dropout: probability for the read-out dropout and, only when
            ``nlayers > 1``, for the GRUs' inter-layer dropout.
    """

    def __init__(self, ninp1, ninp2, nhid, nlayers, nout1, nout2, dropout):
        super().__init__()
        # inter-layer dropout is only passed on for stacked GRUs
        gru_kwargs = {"dropout": dropout} if nlayers > 1 else {}
        self.gru1 = nn.GRU(ninp1, nhid, nlayers, **gru_kwargs)
        self.gru2 = nn.GRU(ninp2, nhid, nlayers, **gru_kwargs)
        self.densor1 = nn.Linear(nhid, nout1)
        self.densor2 = nn.Linear(nhid, nout2)
        self.nhid = nhid
        self.nlayers = nlayers
        self.drop = nn.Dropout(dropout)
        self.init_weights()

    def init_weights(self):
        """Zero both read-out biases and draw their weights from U(-0.1, 0.1)."""
        initrange = 0.1  # may change to a small value
        for head in (self.densor1, self.densor2):
            nn.init.zeros_(head.bias)
            nn.init.uniform_(head.weight, -initrange, initrange)

    def forward(self, W_inputs, stat_ini_sq, hidden):
        """Run both stages and return ``(output, (hidden1, hidden2))``.

        Unlike the ``load_ini``-based models, ``stat_ini_sq`` is concatenated
        as given, so it must match ``W_inputs`` on the first two axes. The
        returned ``output`` lists the second stage's outputs first.
        """
        # stage 1: drivers + state sequence -> state variables
        stage1_in = torch.cat((W_inputs, stat_ini_sq), 2)
        seq1, hidden1 = self.gru1(stage1_in, hidden[0])
        output1 = self.densor1(self.drop(seq1))
        # stage 2: drivers + predicted states
        stage2_in = torch.cat((W_inputs, output1), 2)
        seq2, hidden2 = self.gru2(stage2_in, hidden[1])
        output2 = self.densor2(self.drop(seq2))
        # careful with the output order: stage 2 comes first
        return torch.cat((output2, output1), 2), (hidden1, hidden2)

    def init_hidden(self, bsz):
        """Return two zero hidden states of shape ``(nlayers, bsz, nhid)``; ``bsz`` is the batch size."""
        reference = next(self.parameters())
        shape = (self.nlayers, bsz, self.nhid)
        return (reference.new_zeros(*shape), reference.new_zeros(*shape))
def get_ini(x,ind,nout):
    """Split ``x`` into one tensor per entry of ``ind``.

    For every position ``k``, the last-axis selection ``x[:, :, ind[k]]`` is
    viewed as ``(x.size(0), x.size(1), nout[k])``.

    Returns:
        list: the selected tensors, in the order of ``ind``.
    """
    n_seq, n_batch = x.size(0), x.size(1)
    return [
        x[:, :, columns].view(n_seq, n_batch, nout[k])
        for k, columns in enumerate(ind)
    ]

def Z_norm(X):
    """Standardise ``X`` with its own mean and (population) standard deviation.

    Returns:
        tuple: ``(normalised, mean, std)`` where ``mean`` is ``X.mean()`` and
        ``std`` is ``np.std`` of ``X`` converted to a NumPy array.
    """
    centre = X.mean()
    spread = np.std(np.array(X))
    normalised = (X - centre) / spread
    return normalised, centre, spread

def Z_norm_reverse(X,Xscaler,units_convert):
    """Undo a Z-normalisation and apply a unit conversion factor.

    ``Xscaler[0]`` is the offset (mean) and ``Xscaler[1]`` the scale (std);
    the de-normalised value is multiplied by ``units_convert``.
    """
    offset = Xscaler[0]
    scale = Xscaler[1]
    restored = X * scale + offset
    return restored * units_convert

def Z_norm_with_scaler(X,Xscaler):
    """Z-normalise ``X`` with a given scaler: ``(X - Xscaler[0]) / Xscaler[1]``."""
    offset = Xscaler[0]
    scale = Xscaler[1]
    centred = X - offset
    return centred / scale

#check whether start time is within the fertilized period
def dropout_check(start_t,fntime_ind):
    """Return True when ``start_t`` falls strictly inside ``(t-10, t+60)`` for any ``t``.

    ``fntime_ind`` is the iterable of fertilisation time indices; an empty
    iterable gives False.
    """
    return any(t - 10 < start_t < t + 60 for t in fntime_ind)
        
#sample data considering dropout and leadtime    
def sample_data(X,Y,slw,slw05,totsq,fnfeature_ind):
    """Cut X and Y into sliding windows and add extra windows around fertilisation.

    Windows are slices along axis 0 and are stacked along axis 1.

    Args:
        X, Y: tensors indexed as [time, batch, feature].
        slw: window length (number of time steps per sample).
        slw05: stride between consecutive window starts.
        totsq: total sequence length used to derive the number of windows.
        fnfeature_ind: feature index of X whose positive values mark
            fertilisation times (read from batch column 1 only).

    Returns:
        (X_new, Y_new): the collected windows, concatenated along axis 1.

    NOTE(review): X_new / Y_new are only bound inside the first loop, so a
    non-positive window count leaves them undefined.
    """
    # number of window start positions 0, slw05, 2*slw05, ...
    maxit=int((totsq-slw)/slw05+1)
    #find the fertilized time
    fntime_ind=np.where(X[:,1,fnfeature_ind].view(-1).to("cpu").numpy()>0)[0]
    #get sliding window data with dropout method
    for it in range(maxit):
        if it==0:
            # the first window is always kept and initialises the result
            X_new = X[slw05*it:slw05*it+slw,:,:]
            Y_new = Y[slw05*it:slw05*it+slw,:,:]
        else:
            # later windows are dropped when their start lies in (t-10, t+60) of a fertilisation time
            if not dropout_check(slw05*it,fntime_ind):
                X_new = torch.cat((X_new,X[slw05*it:slw05*it+slw,:,:]),1)
                Y_new = torch.cat((Y_new,Y[slw05*it:slw05*it+slw,:,:]),1)
    #get focused data only for fertilized period with random leading time
    for t in fntime_ind:
        for b in range(X.size(1)):
            # the last fertilisation time is skipped
            if t != fntime_ind[-1]:
                # window start drawn from [t-60, t-10); one draw per batch column
                # NOTE(review): for t < 60 this can be negative and would slice from the end - confirm inputs
                leadtime=np.random.randint(t-60,t-10)
        
                X_new = torch.cat((X_new,X[leadtime:leadtime+slw,b,:].view(slw,1,X.size(2))),1)
                Y_new = torch.cat((Y_new,Y[leadtime:leadtime+slw,b,:].view(slw,1,Y.size(2))),1)
    return X_new,Y_new

#sample fixed windows around every fertilization time
def sample_data_FN(X,Y,totsq,fnfeature_ind):
    """Collect the window ``[t-30, t+90)`` around every fertilisation time ``t``.

    Fertilisation times are the time indices where feature ``fnfeature_ind``
    of batch column 1 of ``X`` is positive. The windows of ``X`` and ``Y``
    (all batch columns) are concatenated along axis 1. ``totsq`` is accepted
    but not used.

    Returns:
        (X_new, Y_new, fntime_ind): stacked windows and the fertilisation
        time indices.
    """
    #find the fertilized time
    fert_flags = X[:, 1, fnfeature_ind].view(-1).to("cpu").numpy() > 0
    fntime_ind = np.where(fert_flags)[0]
    #one window per fertilization event; the first one initialises the result
    for k, t in enumerate(fntime_ind):
        lo, hi = t - 30, t + 90
        x_window = X[lo:hi, :, :]
        y_window = Y[lo:hi, :, :]
        if k == 0:
            X_new, Y_new = x_window, y_window
        else:
            X_new = torch.cat((X_new, x_window), 1)
            Y_new = torch.cat((Y_new, y_window), 1)
    return X_new, Y_new, fntime_ind

def my_loss_weighted(output, target, mask):
    """Mean of the element-wise squared error multiplied by ``mask``.

    The mean is taken over all elements, including those whose mask is zero.
    """
    squared_error = (output - target).pow(2)
    return (squared_error * mask).mean()

In [3]:
#Get the scaler from pretrain data set
#prepare input and output
# tyear blocks (start..end inclusive) of Tx time steps each make up the time axis of X/Y below
start=1
end=18
Tx=365 #timesteps
tyear=end-start+1
#out_names=['N2O_FLUX']
# target variables; this order fixes the column order of Y and Yscaler
out_names=['N2O_FLUX','CO2_FLUX','WTR_3','NH4_3','NO3_3']
# state variables = every target except N2O_FLUX; stat_values[i] is the initial
# value paired with stat_vars[i] (it is later normalised with Yscaler[i+1])
stat_vars=out_names.copy()
stat_vars.remove('N2O_FLUX')
stat_values=[-1.0,0.2,0.0,20.0]
n_out=len(out_names)
#'ATM_CO2' constant, AMENDED_C 0, fire n2o,'FIRE_CH4','STG_DEAD, total 25
f_names_c=['RESIDUE_C','HUMUS_C','LITTER_C','CO2_FLUX','O2_FLUX','AUTO_RESP','MICRO_C','SURF_RES','CH4_FLUX',\
         'SURF_DOC_FLUX','SUBS_DOC_FLUX','SURF_DIC_FLUX','SUBS_DIC_FLUX','NBP','SOC_1','SOC_3','SOC_5',\
         'H2_FLUX','ECO_HVST_C','ECO_LAI','ECO_GPP','ECO_RA','ECO_NPP','ECO_RH','TTL_DIC']

#constant:ACTV_LYR,'SURF_ICE',total 16
f_names_w=['ET','RUNOFF','WATER','DISCHG','SNOWPACK','WTR_1','WTR_3','WTR_5','SURF_WTR','ICE_1','ICE_2','ICE_3',\
           'PSI_1','PSI_3','PSI_5','WTR_TBL']


#constant:FIRE_N,total 24
f_names_n=['RESIDUE_N','HUMUS_N','FERTZR_N','NET_PL_EXCH_N','NH4','NO3','SURF_DON_FLUX','SUBS_DON_FLUX','SURF_DIN_FLUX',\
           'SUBS_DIN_FLUX','N2O_FLUX','NH3_FLUX','N2_FIXN','MICRO_N','NH4_1','NH4_3','NH4_5',\
           'NO3_1','NO3_3','NO3_5','NH4_RES','NO3_RES','ECO_HVST_N','N2_FLUX'] ######### data include the N2O_FLUX!!!!!!!!


#constant:,total 19
f_names_e=['RADN','TMAX_AIR','TMIN_AIR','HMAX_AIR','HMIN_AIR','WIND','PRECN','TMAX_SOIL_1','TMIN_SOIL_1',\
           'TMAX_SOIL_3','TMIN_SOIL_3','TMAX_SOIL_5','TMIN_SOIL_5','TMAX_LITTER','TMIN_LITTER','ECND_1','ECND_3','ECND_5',\
           'TTL_SALT_DISCHG']

#soil property total 15 with variation in new results
fp_names=['TSN','FBCU','PDOY','PDS','PDD','DDOY','PLANTT',\
          'LAT','TLB','TBKDS', 'TCSAND', 'TCSILT', 'TPH', 'TCEC', 'TSOC']

# subset of the soil/management properties that is fed to the model
selected_SP=['PDOY','PLANTT','TBKDS', 'TCSAND', 'TCSILT', 'TPH', 'TCEC', 'TSOC']

# full feature list, in the column order of the loaded X array (25+16+24+19+15 = 99 names)
f_names0=f_names_c+f_names_w+f_names_n+f_names_e+fp_names

# features actually used as model input, in model column order
f_names=['FERTZR_N','RADN','TMAX_AIR','TMIN_AIR','HMAX_AIR','HMIN_AIR','WIND','PRECN']+selected_SP

#remove_list=['CO2_FLUX','O2_FLUX','AUTO_RESP','CH4_FLUX','SURF_DOC_FLUX','SUBS_DOC_FLUX',\
#             'SURF_DIC_FLUX','SUBS_DIC_FLUX','H2_FLUX','ECO_GPP','ECO_RA','ECO_NPP','ECO_RH','ET',\
#             'RUNOFF','DISCHG','NET_PL_EXCH_N','SURF_DON_FLUX','SUBS_DON_FLUX','SURF_DIN_FLUX',\
#             'SUBS_DIN_FLUX','N2O_FLUX','NH3_FLUX','N2_FIXN','N2_FLUX','TTL_SALT_DISCHG']
#remove_list=['N2O_FLUX']
#for c in remove_list:
#    f_names.remove(c)


n_f0=len(f_names0)
n_f=len(f_names)
# ind[k] = column of feature f_names[k] inside the full f_names0 layout
ind=[]
for i in range(n_f):
    ind.append(f_names0.index(f_names[i]))
    
#ind=sorted(ind)
# rebuild f_names from ind so both stay in the same order (only matters if ind is sorted above)
f_names=[]
for i in ind:
    f_names.append(f_names0[i])

# position of the fertiliser feature among the selected inputs
fn_ind=f_names.index('FERTZR_N')
print(fn_ind)


fln=20 #20 for full 0-300, 15 for 80-240
sln=99
# batch axis of the pretraining arrays holds fln*sln columns
bsz0=fln*sln
# X: [time, batch, all features], Y: [time, batch, targets]; both are filled below
X=np.zeros([Tx*tyear,bsz0,n_f0],dtype=np.float32)
Y=np.zeros([Tx*tyear,bsz0,n_out],dtype=np.float32)
# one scaler pair per feature, as stored alongside the loaded data
Xscaler=np.zeros([n_f0,2])
#load ecosys results
# NOTE(review): absolute, machine-specific path; torch.load unpickles the file, so only load trusted data
basic_path='D:/machinelearning/pgml_progress/mesocosm/'
# part 1 fills feature columns 0..44
path_load = basic_path+'99points_metrix_scaled1_v9_X_part1.sav'
data0=torch.load(path_load)
X[:,:,0:45]=data0['InputX']
Xscaler[0:45,:]=data0['Xscaler']
# part 2 fills feature columns 45..83
path_load = basic_path+'99points_metrix_scaled1_v9_X_part2.sav'
data0=torch.load(path_load)
X[:,:,45:84]=data0['InputX']
Xscaler[45:84,:]=data0['Xscaler']
#read soil properties:
# soil properties fill the remaining columns 84..n_f0-1
path_load = basic_path+'99points_statv_v3_scaled1.sav'
data0=torch.load(path_load)
X[:,:,84:n_f0]=data0['Soil_p']
Xscaler[84:n_f0,:]=data0['Soil_p_scaler']

#use Z-norm to rescale every parameters
# Yscaler[i] = (mean, std) of target i, filled from Z_norm below
Yscaler=np.zeros([n_out,2])
#Z-norm for Y
# indout[i] = column of target out_names[i] in the full feature layout
indout=[]
for i in range(n_out):
    indout.append(f_names0.index(out_names[i]))
# targets are copied out of X before X is reduced to the selected inputs
Y[:,:,:]=X[:,:,indout]
for i in range(n_out):
    # undo the stored scaling: value*(s1-s0)+s0 with (s0, s1) = the loaded Xscaler row
    Y[:,:,i]=(Y[:,:,i]*(Xscaler[indout[i],1]-Xscaler[indout[i],0])+Xscaler[indout[i],0]) # convert back
    Y[:,:,i],Yscaler[i,0],Yscaler[i,1]=Z_norm(Y[:,:,i])       
#Z-norm for X
# keep only the selected input features; from here on X/Xscaler rows follow f_names order
X=X[:,:,ind]
Xscaler=Xscaler[ind,:]
for i in range(len(ind)):
    # undo the stored scaling, then overwrite the scaler row with (mean, std) from Z_norm
    X[:,:,i]=(X[:,:,i]*(Xscaler[i,1]-Xscaler[i,0])+Xscaler[i,0]) # convert back
    X[:,:,i],Xscaler[i,0],Xscaler[i,1]=Z_norm(X[:,:,i])  

# from_numpy shares memory with the NumPy arrays; X and Y are torch tensors from here on
Y=torch.from_numpy(Y)
X=torch.from_numpy(X)

#need to change to new Z_norm
#W_names=['TMAX_AIR','TMIN_AIR','HMAX_AIR','HMIN_AIR','TMAX_SOIL_1','TMIN_SOIL_1',\
#         'TMAX_SOIL_3','TMIN_SOIL_3','TMAX_SOIL_5','TMIN_SOIL_5','TMAX_LITTER','TMIN_LITTER']
#Diff_names=['TDIF_AIR','HDIF_AIR','TDIF_SOIL_1','TDIF_SOIL_3','TDIF_SOIL_5','TDIF_LITTER']
#need to change to new Z_norm
# W_names lists (max, min) pairs; Diff_names[i] is the new name for the i-th pair's min column
W_names=['TMAX_AIR','TMIN_AIR','HMAX_AIR','HMIN_AIR']
Diff_names=['TDIF_AIR','HDIF_AIR']
W_ind=[]
for i in range(len(W_names)):
    W_ind.append(f_names.index(W_names[i]))
#replace max-min
# each min column is replaced by (max - min) computed on de-normalised values,
# then re-normalised with Z_norm; its scaler row and feature name are updated
for i in range(len(Diff_names)):
    Vmax=X[:,:,W_ind[i*2]]*Xscaler[W_ind[i*2],1]+Xscaler[W_ind[i*2],0]
    Vmin=X[:,:,W_ind[i*2+1]]*Xscaler[W_ind[i*2+1],1]+Xscaler[W_ind[i*2+1],0] 
    X[:,:,W_ind[i*2+1]]=Vmax-Vmin
    X[:,:,W_ind[i*2+1]],Xscaler[W_ind[i*2+1],0],Xscaler[W_ind[i*2+1],1]=Z_norm(X[:,:,W_ind[i*2+1]])
    f_names[W_ind[i*2+1]]=Diff_names[i]
#scale the initials
# stat_values[i] uses Yscaler row i+1 because row 0 belongs to N2O_FLUX, which has no initial value
# (stat_values is overwritten in place, so this loop must run only once per definition of stat_values)
for i in range(len(stat_values)):
    stat_values[i]=Z_norm_with_scaler(stat_values[i],Yscaler[i+1,:])
print(stat_values)

print(X.size(),Y.size())
print(f_names)
print(Xscaler.shape,Yscaler.shape)

0
[0.45571126271708373, -1.2270035148495462, -1.9900609770509639, 1.2540460759836642]
torch.Size([6570, 1980, 16]) torch.Size([6570, 1980, 5])
['FERTZR_N', 'RADN', 'TMAX_AIR', 'TDIF_AIR', 'HMAX_AIR', 'HDIF_AIR', 'WIND', 'PRECN', 'PDOY', 'PLANTT', 'TBKDS', 'TCSAND', 'TCSILT', 'TPH', 'TCEC', 'TSOC']
(16, 2) (5, 2)


In [9]:
#####################prepare the retrain data
# for k-fold cross validation, pretest for speed

# NOTE(review): torch.load unpickles the file, so only load trusted data
path_load = basic_path + 'mesotest_data_org_v1.sav'
data0=torch.load(path_load)
# X1 and Y are sliced below in 24-row blocks per day as [hour, year, chamber, variable]
X1=data0['InputX1']
# X2 and X3 are indexed below as [day, year, chamber, variable]
X2=data0['InputX2']
X3=data0['Soil_p']
# NOTE(review): this rebinds Y, which held the pretraining targets in the earlier cell
Y=data0['OutputY']
days=122
nyear=3
totnchamber=6
# chamber ids; each one is held out once as the validation chamber in the loop below
c_index=[0,1,2,3,4,5]
#######################################k-fold choose 

for val_cn in c_index:
    c_val = [val_cn]
    nc_val = len(c_val)
    c_train= [x for i,x in enumerate(c_index) if i not in c_val] 
    nc_train = len(c_train)
    print(c_train)
    augn=1000
    print(X1.shape,X2.shape,X3.shape,Y.shape)
    pred_names=['N2O_FLUX','CO2_FLUX','NO3_3','NH4_3','WTR_3']
    #load data n
    Ynames_n = [0,1,2,3,4]
    #find the pred_names number in out_names, 
    #the no. of model output Y_train_pred[pred_names_n[i]] will be the related to Y_train[Ynames_n[i]]
    pred_names_n = []
    for i in range(len(pred_names)):
        pred_names_n.append(out_names.index(pred_names[i]))
        
        
    X1names = ['tair','swdown','precip','spRH'] 
    X2names = ['Obs_prec','Fertilizer']
    X3names=['TSN','FBCU','PDOY','PDS','PDD','DDOY','PLANTT',\
              'LAT','TLB','TBKDS', 'TCSAND', 'TCSILT', 'TPH', 'TCEC', 'TSOC']
    Ynames= ['N2O_FLUX','CO2_FLUX','NO3','NH4','WFPS']
    Y_units_convert=[-24.0,-24.0,1.0,1.0,(1-1.5/2.65)/100.0]

    X_train = np.zeros([days,augn*nyear*nc_train,len(f_names)],dtype=np.float32)
    Y_train = np.zeros([days,augn*nyear*nc_train,len(Ynames)],dtype=np.float32)
    Y_train_mask=np.zeros(Y_train.shape,dtype=np.float32)
    #for training without augmentation
    X_train_d = np.zeros([days,nyear*nc_train,len(f_names)],dtype=np.float32)
    Y_train_d = np.zeros([days,nyear*nc_train,len(Ynames)],dtype=np.float32)
    Y_train_d_mask=np.zeros(Y_train_d.shape,dtype=np.float32) 
    
    X_val=np.zeros([days,nyear*nc_val,len(f_names)],dtype=np.float32)
    Y_val=np.zeros([days,nyear*nc_val,len(Ynames)],dtype=np.float32)
    Y_val_mask=np.zeros(Y_val.shape,dtype=np.float32) 

    
    #Y_gt ground truth first day index, for initials creating
    Y_train_gt_1stind = np.zeros([nyear*nc_train,len(Ynames)], dtype=int)
    Y_val_gt_1stind = np.zeros([nyear*nc_val,len(Ynames)], dtype=int)
    print(Y_train_gt_1stind.shape,Y_val_gt_1stind.shape)
    #Method: Multidimensional Shifting using NumPy
    #method from https://ethankoch.medium.com/incredibly-fast-random-sampling-in-python-baf154bd836a
    #product index_array (num_samples,sample_size) within elements
    # constants
    # returning index
    num_samples = augn
    sample_size = 16 #sample 16 hours within one day
    num_elements = 24
    #elements = np.arange(num_elements)
    # probabilities should sum to 1
    probabilities = np.random.random(num_elements)
    probabilities /= np.sum(probabilities)
    def multidimensional_shifting(num_samples, sample_size, probabilities):
        """Draw ``sample_size`` distinct column indices for each of ``num_samples`` rows.

        Every row gets its own uniform random noise, rescaled so the row sums
        to one. The weights in ``probabilities`` are subtracted, and the
        ``sample_size`` positions with the smallest difference are returned,
        so positions with a larger weight are favoured.

        Returns:
            Integer array of shape ``(num_samples, sample_size)``. The order
            of indices within a row is whatever ``np.argpartition`` yields.
        """
        # one copy of the weight vector per requested row
        weights = np.tile(probabilities, (num_samples, 1))
        # random shifts, normalised row by row
        noise = np.random.random(weights.shape)
        row_totals = noise.sum(axis=1)[:, np.newaxis]
        noise = noise / row_totals
        # a large weight pushes the score down, and the smallest scores are kept
        scores = noise - weights
        partitioned = np.argpartition(scores, sample_size, axis=1)
        return partitioned[:, :sample_size]

    #sample data from mesocosm site chambers
    for d in range(days):
        #for training data with data augmentation
        for y in range(nyear):
            for c in range(nc_train):
                #get random sampled indexes
                sample_indexes = multidimensional_shifting(num_samples, sample_size, probabilities)
                #input data
                #temperature
                elements = np.tile(X1[d*24:(d+1)*24,y,c_train[c],0], (num_samples, 1)) # copy the hourly data num_samples times
                output_samples = np.take_along_axis(elements, sample_indexes, axis=1) # sample the data based on random indexes
                output_samples_tmax = output_samples.max(1)
                output_samples_tdif = output_samples_tmax-output_samples.min(1)
                X_train[d,augn*(y*nc_train+c):augn*(y*nc_train+c+1),2]=output_samples_tmax
                X_train[d,augn*(y*nc_train+c):augn*(y*nc_train+c+1),3]=output_samples_tdif
                X_train_d[d,y*nc_train+c,2] = np.max(X1[d*24:(d+1)*24,y,c_train[c],0])
                X_train_d[d,y*nc_train+c,3] = np.max(X1[d*24:(d+1)*24,y,c_train[c],0])-\
                                                        np.min(X1[d*24:(d+1)*24,y,c_train[c],0])
                #radiation need to convert from W/m-2 to MJ m-2 d-1, *3600*24*10-6
                elements = np.tile(X1[d*24:(d+1)*24,y,c_train[c],1], (num_samples, 1)) # copy the hourly data num_samples times
                output_samples = np.take_along_axis(elements, sample_indexes, axis=1) # sample the data based on random indexes
                output_samples_rad = output_samples.mean(1)*(3600.0*24.0*(10**(-6)))
                X_train[d,augn*(y*nc_train+c):augn*(y*nc_train+c+1),1]=output_samples_rad
                X_train_d[d,y*nc_train+c,1] = np.mean(X1[d*24:(d+1)*24,y,c_train[c],1])*(3600.0*24.0*(10**(-6)))
                #humidity
                elements = np.tile(X1[d*24:(d+1)*24,y,c_train[c],3], (num_samples, 1)) # copy the hourly data num_samples times
                output_samples = np.take_along_axis(elements, sample_indexes, axis=1) # sample the data based on random indexes
                output_samples_hmax = output_samples.max(1)
                output_samples_hdif = output_samples_hmax - output_samples.min(1)
                X_train[d,augn*(y*nc_train+c):augn*(y*nc_train+c+1),4]=output_samples_hmax
                X_train[d,augn*(y*nc_train+c):augn*(y*nc_train+c+1),5]=output_samples_hdif
                X_train_d[d,y*nc_train+c,4] = np.max(X1[d*24:(d+1)*24,y,c_train[c],3])
                X_train_d[d,y*nc_train+c,5] = np.max(X1[d*24:(d+1)*24,y,c_train[c],3]) - \
                                                        np.min(X1[d*24:(d+1)*24,y,c_train[c],3])
                #sample Y data
                for ffy in range(len(Ynames)): 
                    element=Y[d*24:(d+1)*24,y,c_train[c],ffy]
                    nan_nums=np.count_nonzero(np.isnan(element))
                    if  nan_nums < 16:
                        # copy the hourly data num_samples times
                        elements = np.tile(element, (num_samples, 1)) 
                        # sample the data based on random indexes
                        output_samples = np.take_along_axis(elements, sample_indexes, axis=1) 
                        #convert to right units (n2O g N m-2 h-1 to d-1)
                        output_samples_n2o = np.nanmean(output_samples,axis=1)
                        # need to be direction to soil
                        Y_train[d,augn*(y*nc_train+c):augn*(y*nc_train+c+1),ffy]= output_samples_n2o*Y_units_convert[ffy]
                        Y_train_mask[d,augn*(y*nc_train+c):augn*(y*nc_train+c+1),ffy] = (24.0-float(nan_nums))/24.0
                        
                        Y_train_d[d,y*nc_train+c,ffy]= np.nanmean(Y[d*24:(d+1)*24,y,c_train[c],ffy])*\
                                                        Y_units_convert[ffy] #convert 
                        Y_train_d_mask[d,y*nc_train+c,ffy] = (24.0-float(nan_nums))/24.0
                        #get the first day of ground truth
                        if Y_train_gt_1stind[y*nc_train+c,ffy] == 0:
                            Y_train_gt_1stind[y*nc_train+c,ffy] = d

                    else:
                        # if missing value >=16, we use -999 represent nan
                        Y_train[d,augn*(y*nc_train+c):augn*(y*nc_train+c+1),ffy]=-999.0 
                        Y_train_mask[d,augn*(y*nc_train+c):augn*(y*nc_train+c+1),ffy] = 0.0
                        
                        Y_train_d[d,y*nc_train+c,ffy]=-999.0 
                        Y_train_d_mask[d,y*nc_train+c,ffy] = 0.0
                #deal with other training variables
                #fertilizer
                X_train[d,augn*(y*nc_train+c):augn*(y*nc_train+c+1),0] = X2[d,y,c_train[c],1]
                X_train_d[d,y*nc_train+c,0] = X2[d,y,c_train[c],1]
                #wind
                X_train[d,augn*(y*nc_train+c):augn*(y*nc_train+c+1),6] = 0.05
                X_train_d[d,y*nc_train+c,6] = 0.05
                #precipitation
                X_train[d,augn*(y*nc_train+c):augn*(y*nc_train+c+1),7] = X2[d,y,c_train[c],0]
                X_train_d[d,y*nc_train+c,7] = X2[d,y,c_train[c],0]
                for i in range(len(selected_SP)):
                    X_train[d,augn*(y*nc_train+c):augn*(y*nc_train+c+1),8+i] = X3[d,y,c_train[c],X3names.index(selected_SP[i])]
                    X_train_d[d,y*nc_train+c,8+i] = X3[d,y,c_train[c],X3names.index(selected_SP[i])]


    #load the validation
    for d in range(days):
        for y in range(nyear):
            for c in range(nc_val):
                #temperature
                X_val[d,y*nc_val+c,2] = np.max(X1[d*24:(d+1)*24,y,c_val[c],0])
                X_val[d,y*nc_val+c,3] = np.max(X1[d*24:(d+1)*24,y,c_val[c],0])-\
                                                        np.min(X1[d*24:(d+1)*24,y,c_val[c],0])
                #radiation
                X_val[d,y*nc_val+c,1] = np.mean(X1[d*24:(d+1)*24,y,c_val[c],1])*(3600.0*24.0*(10**(-6)))
                #humidity
                X_val[d,y*nc_val+c,4] = np.max(X1[d*24:(d+1)*24,y,c_val[c],3])
                X_val[d,y*nc_val+c,5] = np.max(X1[d*24:(d+1)*24,y,c_val[c],3]) - \
                                                        np.min(X1[d*24:(d+1)*24,y,c_val[c],3])
                #Y data
                for ffy in range(len(Ynames)): 
                    element = Y[d*24:(d+1)*24,y,c_val[c],ffy]
                    nan_nums=np.count_nonzero(np.isnan(element))
                    if  nan_nums < 16:
                        Y_val[d,y*nc_val+c,ffy] = np.nanmean(element)*Y_units_convert[ffy] #convert 
                        Y_val_mask[d,y*nc_val+c,ffy] = (24.0-float(nan_nums))/24.0
                        #get the first day of ground truth
                        if Y_val_gt_1stind[y*nc_val+c,ffy] == 0:
                            Y_val_gt_1stind[y*nc_val+c,ffy] = d
                    else:
                        Y_val[d,y*nc_val+c,ffy] = -999.0 # if missing value >=16, we use -999 represent nan
                        Y_val_mask[d,y*nc_val+c,ffy] = 0.0
                #deal with other training variables
                #fertilizer
                X_val[d,y*nc_val+c,0] = X2[d,y,c_val[c],1]
                #wind
                X_val[d,y*nc_val+c,6] = 0.05
                #precipitation
                X_val[d,y*nc_val+c,7] = X2[d,y,c_val[c],0]
                for i in range(len(selected_SP)):
                    X_val[d,y*nc_val+c,8+i] = X3[d,y,c_val[c],X3names.index(selected_SP[i])]

    # Report the shapes of the feature/target arrays and of the scaler tables.
    print(*(arr.shape for arr in (X_train, Y_train, X_train_d, Y_train_d, X_val, Y_val)))
    print(Xscaler.shape, Yscaler.shape,X_train.shape[2])
    #Z-norm the matrix
    # Every input feature column is normalised in place with its own row of Xscaler;
    # the three input arrays share the same scaler rows.
    for feat in range(X_train.shape[2]):
        feat_scaler = Xscaler[feat,:]
        for arr in (X_train, X_train_d, X_val):
            arr[:,:,feat] = Z_norm_with_scaler(arr[:,:,feat], feat_scaler)
    # Target column Ynames_n[k] is normalised with the Yscaler row pred_names_n[k].
    for k, y_col in enumerate(Ynames_n):
        y_scaler = Yscaler[pred_names_n[k],:]
        for arr in (Y_train, Y_train_d, Y_val):
            arr[:,:,y_col] = Z_norm_with_scaler(arr[:,:,y_col], y_scaler)

    
    #transfer to cuda
    # Use the GPU when one is available and fall back to the CPU otherwise, so that
    # `device` is always defined before the tensors below are moved onto it.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    # Wrap the numpy arrays as tensors and move them onto the selected device.
    X_train = torch.from_numpy(X_train).to(device)
    Y_train = torch.from_numpy(Y_train).to(device)
    X_train_d = torch.from_numpy(X_train_d).to(device)
    Y_train_d = torch.from_numpy(Y_train_d).to(device)
    X_val = torch.from_numpy(X_val).to(device)
    Y_val = torch.from_numpy(Y_val).to(device)
    #transfer mask to the cuda
    Y_train_mask = torch.from_numpy(Y_train_mask).to(device)
    Y_val_mask = torch.from_numpy(Y_val_mask).to(device)
    Y_train_d_mask = torch.from_numpy(Y_train_d_mask).to(device)
    # Boolean companions of the weight masks: True where the weight is at least 0.25.
    # They are used further down with torch.masked_select when computing R2.
    Y_train_maskb = Y_train_mask.ge(0.25)
    Y_val_maskb = Y_val_mask.ge(0.25)
    Y_train_d_maskb = Y_train_d_mask.ge(0.25)

    print(X_train.size(),Y_train.min())
    # get_gpu_memory() runs `nvidia-smi`; only call it when CUDA is actually present so
    # the cell does not crash on a machine without an NVIDIA driver.
    if torch.cuda.is_available():
        get_gpu_memory()
    
    #print(Y_train_gt_1stind,'01!!!!!!!!!!!!!!!!')
    #print(Y_val_gt_1stind,'02!!!!!!!!!!!!!!!!')
    
    
    #retrain the model
    #########################prepare the fake initials
    #Generate initials considering the ground truth:
    #Y_gt is the ground truth, using for validate or train
    def generate_ini_stats(stat_values,Y_gt,Y_gt_1stind,aug_n,pred_names_n,Ynames_n):
        """Build the per-time-step initial-state tensor that accompanies the model inputs.

        Every (time, sample) position starts with the default values in ``stat_values``.
        Then, for each paired entry of ``pred_names_n`` / ``Ynames_n`` and each series,
        if a first-observation index is recorded (non-zero), the state column is
        overwritten from that time step to the end of the sequence with the value
        observed at that first time step.

        Args:
            stat_values: indexable of default values, one per state column.
            Y_gt: tensor indexed as [time, sample, target] holding the ground truth.
            Y_gt_1stind: 2-D integer array indexed as [series, target] with the first
                time index that has an observation; 0 is treated as "none recorded".
            aug_n: number of consecutive samples of ``Y_gt`` that belong to one series
                (sample columns ``cc*aug_n:(cc+1)*aug_n`` for series ``cc``).
            pred_names_n: output indices; entry ``p`` writes state column ``p - 1``.
            Ynames_n: matching target column indices into ``Y_gt`` and ``Y_gt_1stind``.

        Returns:
            Tensor of shape ``(Y_gt.size(0), Y_gt.size(1), len(stat_values))`` with the
            default float dtype, allocated on the same device as ``Y_gt``.
        """
        # Allocate next to Y_gt instead of relying on a `device` name from the
        # enclosing scope; the values are written from Y_gt anyway.
        statini_sq = torch.zeros([Y_gt.size(0),Y_gt.size(1),len(stat_values)],device=Y_gt.device)
        for i in range(len(stat_values)):
            statini_sq[:,:,i]=stat_values[i]
        for i in range(len(pred_names_n)):
            y_col = Ynames_n[i]
            state_col = pred_names_n[i]-1
            for cc in range(len(Y_gt_1stind)):
                # int() makes the index usable for slicing whatever integer-like type is stored.
                first = int(Y_gt_1stind[cc,y_col])
                if first == 0:
                    continue
                cols = slice(cc*aug_n,(cc+1)*aug_n)
                # view(1, aug_n) keeps the original shape check; broadcasting then holds the
                # first observed value constant over all remaining time steps.
                statini_sq[first:,cols,state_col] = Y_gt[first,cols,y_col].view(1,aug_n)
        return statini_sq
    
    #generate initials
    # Augmented training set: each series owns `augn` consecutive samples.
    stats_train_sq = generate_ini_stats(
        stat_values, Y_train, Y_train_gt_1stind, augn, pred_names_n[1:], Ynames_n[1:]
    )
    # Validation set: one sample per series.
    stats_val_sq = generate_ini_stats(
        stat_values, Y_val, Y_val_gt_1stind, 1, pred_names_n[1:], Ynames_n[1:]
    )

    #generate initials for the training data without augmentation (one sample per series)
    stats_train_d_sq = generate_ini_stats(
        stat_values, Y_train_d, Y_train_gt_1stind, 1, pred_names_n[1:], Ynames_n[1:]
    )

    

    def generate_fake_stats(stat_values,batch):
        """Return a (1, batch, len(stat_values)) tensor on `device` whose last axis
        holds the values of ``stat_values``, repeated for every batch entry."""
        n_stats = len(stat_values)
        filled = torch.zeros((1, batch, n_stats), device=device)
        for col in range(n_stats):
            # broadcast one default value over the whole (1, batch) plane
            filled[..., col] = stat_values[col]
        return filled
    
    
    
    ################train the model start
    model_version="n2o_gru_mesotest_v4_exp7"+"_val"+str(c_val[0]+1)+".sav"

    ####################freeze the second layer, only train the first layer
    #only train first layer and states variables and without augmentation
    ###load model
    n_a=64 #hidden state number
    n_l=2 #number of stacked GRU layers
    dropout=0.2
    model_load='n2o_gru_mesotest_v4_exp1.sav'
    path_load = basic_path+model_load
    # map_location lets a checkpoint saved on another device (e.g. a GPU) be loaded
    # on whatever device this run is using.
    checkpoint=torch.load(path_load, map_location=device)
    model1=Statini_sq_N2OGRU(n_f+len(stat_values),n_f+len(stat_values),n_a,n_l,len(stat_values),1,dropout)
    model1.load_state_dict(checkpoint['model_state_dict'])
    model1.to(device) #if the model does not fit in GPU memory, switch the device to cpu
    print(model1)
    # A checkpoint is only written once the validation loss drops below loss_val_best
    # AND the validation R2 exceeds R2_best; these are the starting thresholds.
    loss_val_best = 500000
    R2_best=0.2
    compute_r2=R2Loss()
    path_save = basic_path+model_version+'1l'

    ##freeze 2nd layer
    # Parameters whose name contains 'gru2' or 'densor2' stop receiving gradients;
    # the names that remain trainable are printed.
    for name, param in model1.named_parameters():
        if param.requires_grad and ('gru2' in name or 'densor2' in name):
            param.requires_grad = False
        if param.requires_grad:
            print(name)


    starttime=time.time()
    lr=0.1 #sgd
    lr_adam=0.0001*0.5
    optimizer = optim.Adam(model1.parameters(), lr=lr_adam) #add weight decay normally 1-9e-4
    # halve the learning rate every 5000 scheduler steps (one step per epoch below)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5000, gamma=0.5)
    slw=122
    print(X_train_d.size(),X_val.size())
    batch_total=X_train_d.size(1)
    batch_size=batch_total  # this is the batch size for training (here: the whole set in one batch)
    train_losses = []
    val_losses = []
    maxepoch=20000
    model1.train()
    
    # Phase 1 training loop: fit the parameters left trainable above on X_train_d,
    # scoring outputs pred_names_n[1:] against target columns Ynames_n[1:].
    for epoch in range(maxepoch):
        train_loss=0.0
        val_loss=0.0
        Y_pred_all=torch.zeros([Y_train_d.size(0),Y_train_d.size(1),len(out_names)],device=device)
        #shuffle the sample axis; the same permutation is applied to every training tensor
        shuffled_b=torch.randperm(X_train_d.size()[1])
        X_train_new=X_train_d[:,shuffled_b,:]
        Y_train_new=Y_train_d[:,shuffled_b,:]
        Y_train_mask_new = Y_train_d_mask[:,shuffled_b,:]
        Y_train_maskb_new = Y_train_d_maskb[:,shuffled_b,:]
        stats_train_sq_new = stats_train_d_sq[:,shuffled_b,:]
        n_batches=0
        for b_start in range(0,batch_total,batch_size):
            # The last mini-batch is shorter when batch_size does not divide batch_total;
            # it is still trained on instead of being dropped.
            b_end=min(b_start+batch_size,batch_total)
            # Clear gradients for every mini-batch so that each optimizer step only
            # uses the gradient of its own batch.
            optimizer.zero_grad()
            hidden = model1.init_hidden(b_end-b_start)
            Y_pred,hidden = model1(X_train_new[:,b_start:b_end,:],
                                   stats_train_sq_new[:,b_start:b_end,:],hidden)
            #need to adjust the loss for missing data based on mask
            loss = my_loss_weighted(Y_pred[:,:,pred_names_n[1:]],
                                    Y_train_new[:,b_start:b_end,Ynames_n[1:]],
                                    Y_train_mask_new[:,b_start:b_end,Ynames_n[1:]])
            hidden[0].detach_()
            hidden[1].detach_()
            loss.backward()
            optimizer.step()
            with torch.no_grad():
                train_loss=train_loss+loss.item()
                Y_pred_all[:,b_start:b_end,:]=Y_pred[:,:,:]
            n_batches+=1


        scheduler.step()
        #validation
        model1.eval()
        with torch.no_grad():
            # mean of the per-batch losses of this epoch
            train_loss=train_loss/max(n_batches,1)
            train_losses.append(train_loss)
            #mask out the points
            Y_train_new_masked=torch.masked_select(Y_train_new[:,:,Ynames_n[1:]], Y_train_maskb_new[:,:,Ynames_n[1:]])
            Y_pred_all_masked=torch.masked_select(Y_pred_all[:,:,pred_names_n[1:]], Y_train_maskb_new[:,:,Ynames_n[1:]])
            train_R2=compute_r2(Y_pred_all_masked.contiguous().view(-1),\
                                Y_train_new_masked.contiguous().view(-1)).item()
            ########################validation
            Y_val_pred=torch.zeros([Y_val.size(0),Y_val.size(1),len(out_names)],device=device)
            hidden = model1.init_hidden(X_val.size(1))
            #print(stats_val_sq[:,2,0])
            Y_val_pt, hidden = model1(X_val,stats_val_sq,hidden)
            Y_val_pred[:,:,:] = Y_val_pt[:,:,:]
            loss = my_loss_weighted(Y_val_pred[:,:,pred_names_n[1:]],Y_val[:,:,Ynames_n[1:]],Y_val_mask[:,:,Ynames_n[1:]])
            val_loss=loss.item()
            val_losses.append(val_loss)
            Y_val_masked=torch.masked_select(Y_val[:,:,Ynames_n[1:]], Y_val_maskb[:,:,Ynames_n[1:]])
            Y_val_pred_masked=torch.masked_select(Y_val_pred[:,:,pred_names_n[1:]], Y_val_maskb[:,:,Ynames_n[1:]])
            val_R2=compute_r2(Y_val_pred_masked.contiguous().view(-1),Y_val_masked.contiguous().view(-1)).item()
            # Keep the checkpoint only when both the validation loss and the validation R2 improve.
            if val_loss < loss_val_best and val_R2 > R2_best:
                loss_val_best=val_loss
                R2_best = val_R2
                # torch.save creates/overwrites the file itself, no need to touch it first
                torch.save({'epoch': epoch,
                        'model_state_dict': model1.state_dict(),
                        'R2': train_R2,
                        'loss': train_loss,
                        'los_val': val_loss,
                        'R2_val': val_R2,
                        }, path_save)
            if epoch%1000 == 999:
                print("finished training 1st layer epoch", epoch+1)
                mtime=time.time()
                print("train_loss: ", train_loss, "train_R2", train_R2,"val_loss:",val_loss,"val_R2", val_R2,\
                      "loss val best:",loss_val_best,"R2 val best:",R2_best, f"Spending time: {mtime - starttime}s")
            # stop early once the training fit is essentially perfect
            if train_R2 > 0.99:
                break
            #adding early stop
        model1.train()

    # Store the loss curves and the final (not necessarily best) weights of phase 1.
    path_fs = path_save+'fs'
    torch.save({'train_losses': train_losses,
                'val_losses': val_losses,
                'model_state_dict_fs': model1.state_dict(),
                }, path_fs)
    #####finished training first layer
    
    ####################train the second layer, freeze first layer

    ###load the trained model:
    n_a=64 #hidden state number
    n_l=2 #number of stacked GRU layers
    dropout=0.2
    # Start phase 2 from the best phase-1 checkpoint written above.
    path_load = basic_path+model_version+'1l'
    print(path_load)
    # map_location lets a checkpoint saved on another device be loaded on this run's device.
    checkpoint=torch.load(path_load, map_location=device)
    model1=Statini_sq_N2OGRU(n_f+len(stat_values),n_f+len(stat_values),n_a,n_l,len(stat_values),1,dropout)
    model1.load_state_dict(checkpoint['model_state_dict'])
    model1.to(device) #if the model does not fit in GPU memory, switch the device to cpu
    print(model1)
    # Thresholds a validation result must beat before a phase-2 checkpoint is written.
    loss_val_best = 500000
    R2_best=0.5
    path_save = basic_path+model_version+'2l'

    ###freeze 1st layer
    # Everything is trainable except parameters whose name contains 'gru1' or 'densor1';
    # the names that remain trainable are printed.
    for name, param in model1.named_parameters():
        param.requires_grad = not ('gru1' in name or 'densor1' in name)
        if param.requires_grad:
            print(name)

    lr=0.1 #sgd
    lr_adam=0.0001*0.5

    optimizer = optim.Adam(model1.parameters(), lr=lr_adam) #add weight decay normally 1-9e-4
    # halve the learning rate every 200 scheduler steps (one step per epoch below)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=200, gamma=0.5)
    slw=122
    print(X_train.size(),X_val.size())
    batch_total=X_train.size(1)
    batch_size=500  # this is the batch size for training
    train_losses = []
    val_losses = []
    maxepoch=800
    model1.train()
    # Phase 2 training loop: fit the parameters left trainable above on the augmented
    # set X_train, scoring output pred_names_n[0] against target column Ynames_n[0].
    for epoch in range(maxepoch):
        train_loss=0.0
        val_loss=0.0
        Y_pred_all=torch.zeros([Y_train.size(0),Y_train.size(1),len(out_names)],device=device)
        #shuffle the sample axis; the same permutation is applied to every training tensor
        shuffled_b=torch.randperm(X_train.size()[1])
        X_train_new=X_train[:,shuffled_b,:]
        Y_train_new=Y_train[:,shuffled_b,:]
        Y_train_mask_new = Y_train_mask[:,shuffled_b,:]
        Y_train_maskb_new = Y_train_maskb[:,shuffled_b,:]
        stats_train_sq_new = stats_train_sq[:,shuffled_b,:]
        n_batches=0
        for b_start in range(0,batch_total,batch_size):
            # The last mini-batch is shorter when batch_size does not divide batch_total;
            # it is still trained on instead of being dropped.
            b_end=min(b_start+batch_size,batch_total)
            # Gradients must be cleared before every mini-batch: clearing them only once
            # per epoch made each optimizer step use the sum of the gradients of all
            # earlier batches of that epoch.
            optimizer.zero_grad()
            hidden = model1.init_hidden(b_end-b_start)

            Y_pred,hidden = model1(X_train_new[:,b_start:b_end,:],
                                   stats_train_sq_new[:,b_start:b_end,:],hidden)
            #need to adjust the loss for missing data based on mask
            loss = my_loss_weighted(Y_pred[:,:,pred_names_n[0]],
                                    Y_train_new[:,b_start:b_end,Ynames_n[0]],
                                    Y_train_mask_new[:,b_start:b_end,Ynames_n[0]])
            hidden[0].detach_()
            hidden[1].detach_()
            loss.backward()
            optimizer.step()
            with torch.no_grad():
                train_loss=train_loss+loss.item()
                Y_pred_all[:,b_start:b_end,:]=Y_pred[:,:,:]
            n_batches+=1


        scheduler.step()
        #validation
        model1.eval()
        with torch.no_grad():
            # mean of the per-batch losses of this epoch
            train_loss=train_loss/max(n_batches,1)
            train_losses.append(train_loss)
            #mask out the points
            Y_train_new_masked=torch.masked_select(Y_train_new[:,:,Ynames_n[0]], Y_train_maskb_new[:,:,Ynames_n[0]])
            Y_pred_all_masked=torch.masked_select(Y_pred_all[:,:,pred_names_n[0]], Y_train_maskb_new[:,:,Ynames_n[0]])
            train_R2=compute_r2(Y_pred_all_masked.contiguous().view(-1),\
                                Y_train_new_masked.contiguous().view(-1)).item()
            ########################validation
            Y_val_pred=torch.zeros([Y_val.size(0),Y_val.size(1),len(out_names)],device=device)
            hidden = model1.init_hidden(X_val.size(1))
            #print(stats_val_sq[:,2,0])
            Y_val_pt, hidden = model1(X_val,stats_val_sq,hidden)
            Y_val_pred[:,:,:] = Y_val_pt[:,:,:]
            loss = my_loss_weighted(Y_val_pred[:,:,pred_names_n[0]],Y_val[:,:,Ynames_n[0]],Y_val_mask[:,:,Ynames_n[0]])
            val_loss=loss.item()
            val_losses.append(val_loss)
            Y_val_masked=torch.masked_select(Y_val[:,:,Ynames_n[0]], Y_val_maskb[:,:,Ynames_n[0]])
            Y_val_pred_masked=torch.masked_select(Y_val_pred[:,:,pred_names_n[0]], Y_val_maskb[:,:,Ynames_n[0]])
            val_R2=compute_r2(Y_val_pred_masked.contiguous().view(-1),Y_val_masked.contiguous().view(-1)).item()
            # Keep the checkpoint only when both the validation loss and the validation R2 improve.
            if val_loss < loss_val_best and val_R2 > R2_best:
                loss_val_best=val_loss
                R2_best = val_R2
                # torch.save creates/overwrites the file itself, no need to touch it first
                torch.save({'epoch': epoch,
                        'model_state_dict': model1.state_dict(),
                        'R2': train_R2,
                        'loss': train_loss,
                        'los_val': val_loss,
                        'R2_val': val_R2,
                        }, path_save)
            print("finished training epoch", epoch+1)
            mtime=time.time()
            print("train_loss: ", train_loss, "train_R2", train_R2,"val_loss:",val_loss,"val_R2", val_R2,\
                  "loss val best:",loss_val_best,"R2 val best:",R2_best, f"Spending time: {mtime - starttime}s")
            # stop early once the training fit is essentially perfect
            if train_R2 > 0.99:
                break
            #adding early stop
        model1.train()
    endtime=time.time()
    # Store the loss curves and the final (not necessarily best) weights of phase 2.
    path_fs = path_save+'fs'
    torch.save({'train_losses': train_losses,
                'val_losses': val_losses,
                'model_state_dict_fs': model1.state_dict(),
                }, path_fs)
    print("final train_loss:",train_loss,"final train_R2:",train_R2,"val_loss:",val_loss,"loss validation best:",loss_val_best)
    print(f"total Training time: {endtime - starttime}s")

[1, 2, 3, 4, 5]
(2928, 3, 6, 4) (122, 3, 6, 2) (122, 3, 6, 15) (2928, 3, 6, 5)
(15, 5) (3, 5)
(122, 15000, 16) (122, 15000, 5) (122, 15, 16) (122, 15, 5) (122, 3, 16) (122, 3, 5)
(16, 2) (5, 2) 16
cuda
torch.Size([122, 15000, 16]) tensor(-514441.6250, device='cuda:0')
[5158]
Statini_sq_N2OGRU(
  (gru1): GRU(20, 64, num_layers=2, dropout=0.2)
  (gru2): GRU(20, 64, num_layers=2, dropout=0.2)
  (densor1): Linear(in_features=64, out_features=4, bias=True)
  (densor2): Linear(in_features=64, out_features=1, bias=True)
  (drop): Dropout(p=0.2, inplace=False)
)
gru1.weight_ih_l0
gru1.weight_hh_l0
gru1.bias_ih_l0
gru1.bias_hh_l0
gru1.weight_ih_l1
gru1.weight_hh_l1
gru1.bias_ih_l1
gru1.bias_hh_l1
densor1.weight
densor1.bias
torch.Size([122, 15, 16]) torch.Size([122, 3, 16])
finished training 1st layer epoch 1000
train_loss:  3.0660083293914795 train_R2 0.5410485863685608 val_loss: 4.032071113586426 val_R2 0.5333629846572876 loss val best: 4.030301570892334 R2 val best: 0.5335549116134644 Spendi

KeyboardInterrupt: 