In [None]:
"""
Created on Tue Apr 05 15:43 2022

Script to train DNN on prepared input

Author: @claraburgard

"""

In [None]:
import datetime
import glob
import os
import shutil
import time
from contextlib import redirect_stdout

import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import xarray as xr
from tensorflow import keras
from tqdm.notebook import trange, tqdm

from basal_melt_neural_networks.constants import *
import basal_melt_neural_networks.diagnostic_functions as diag
import basal_melt_neural_networks.data_formatting as dfmt

READ IN DATA

In [None]:
# Identifiers of the runs whose prepared input tables are used for training
run_list = ['OPM006', 'OPM016', 'OPM018', 'OPM031']

# Input tables and trained models live under the same interim data folder
interim_dir = '/bettik/burgardc/DATA/NN_PARAM/interim/'
inputpath_data = interim_dir + 'INPUT_DATA/'
outputpath_nn_models = interim_dir + 'NN_MODELS/'

# Folder receiving the per-experiment log files
outputpath_doc = '/bettik/burgardc/SCRIPTS/basal_melt_neural_networks/custom_doc/'

In [None]:
# Set new_timetag to True to start a new experiment tagged with the current
# date and time; leave it False to keep working in the experiment identified
# by the fixed tag below.
new_timetag = False
if new_timetag:
    # Read the clock only once so that the date part and the time part always
    # describe the same instant (two separate reads could straddle midnight).
    timetag = datetime.datetime.now().strftime('%Y%m%d-%H%M')
else:
    timetag = '20220414-1706'

# Per-experiment folders for the model files and for the documentation/logs
new_path_model = outputpath_nn_models+timetag+'/'
new_path_doc = outputpath_doc+timetag+'/'

for folder_kind, folder_path in (('model', new_path_model), ('doc', new_path_doc)):
    if not os.path.isdir(folder_path):
        # Create first, report afterwards: the message is only shown when the
        # folder really exists.
        os.mkdir(folder_path)
        print("I did not find this folder ("+timetag+") in "+folder_kind+" folder so I created a new one, I hope that's ok!")
    else:
        print("This folder ("+timetag+") in "+folder_kind+" folder exists already!")

In [None]:
# Archive a copy of every run's input table inside the experiment's model folder
for nemo_run in run_list:

    input_filename = 'dataframe_input_'+nemo_run+'.csv'
    src = inputpath_data + input_filename
    dst = new_path_model + input_filename

    # shutil.copy returns only once the file has been written and raises if the
    # copy fails, so the files are guaranteed to be complete when they are read
    # back later (a shell `cp` started through os.popen is not waited for and
    # its errors go unnoticed).
    shutil.copy(src, dst)

    print('imported input data to '+nemo_run+' model folder')


In [None]:
# Read the archived input table of every run and stack them into one DataFrame.
run_frames = []

for nemo_run in run_list:
    print(nemo_run)

    # The first three CSV columns are parsed as the index and then discarded,
    # leaving only the data columns with a plain integer index.
    clean_df_nrun = pd.read_csv(
        new_path_model + 'dataframe_input_'+nemo_run+'.csv',
        index_col=[0,1,2],
    ).reset_index(drop=True)

    run_frames.append(clean_df_nrun)

# Concatenate once at the end: DataFrame.append no longer exists in pandas >= 2.0
# and re-copied the accumulated data on every iteration.
all_input_df = pd.concat(run_frames, ignore_index=True)

DIVIDE INTO TRAIN AND TEST DATASET

In [None]:
# Seed of the random split: fixing it puts the same rows in the training and
# test sets on every run, so results for a given timetag can be reproduced.
split_seed = 42

# 70 % of the rows, drawn at random, form the training set ...
data_train = all_input_df.sample(frac=0.7, axis=0, random_state=split_seed)
# ... and all remaining rows form the test set. Rows are removed by index label,
# which requires the index of all_input_df to be unique.
data_test  = all_input_df.drop(data_train.index)

In [None]:
# Separate the target column from the predictor columns in both subsets
target_name = 'melt_m_ice_per_y'

x_train = data_train.drop(columns=[target_name])
y_train = data_train[target_name]

x_test = data_test.drop(columns=[target_name])
y_test = data_test[target_name]

# Quick overview of the resulting sizes
print('Original data shape was : ', all_input_df.shape)
print('x_train : ', x_train.shape, 'y_train : ', y_train.shape)
print('x_test  : ', x_test.shape, 'y_test  : ', y_test.shape)

DATA NORMALIZATION

In [None]:
# Normalisation: every variable is centred on its training mean and divided by
# a spread measure chosen through norm_method. All statistics come from the
# training set only and are then applied to both the training and test sets.
norm_method = 'interquart' #'std', 'interquart', 'minmax'

x_mean = x_train.mean()
y_mean = y_train.mean()

if norm_method == 'std':
    x_range  = x_train.std()
    y_range  = y_train.std()
elif norm_method == 'interquart':
    # Note: despite the name, this is the 10th-to-90th percentile range
    x_range  = x_train.quantile(0.9) - x_train.quantile(0.1)
    y_range  = y_train.quantile(0.9) - y_train.quantile(0.1)
elif norm_method == 'minmax':
    x_range  = x_train.max() - x_train.min()
    y_range  = y_train.max() - y_train.min()
else:
    # Fail loudly on a typo instead of continuing with undefined ranges (or
    # with stale ranges left in the kernel by a previous execution).
    raise ValueError(
        "Unknown norm_method '"+str(norm_method)+"', expected 'std', 'interquart' or 'minmax'"
    )

x_train_norm = (x_train - x_mean)/x_range
x_test_norm = (x_test - x_mean)/x_range

y_train_norm = (y_train - y_mean)/y_range
y_test_norm = (y_test - y_mean)/y_range


In [None]:
# Store the normalisation constants so they can be re-applied later:
# one row for the means, one row for the ranges, one column per variable.
summary_df = pd.DataFrame({'x_mean': x_mean, 'x_range': x_range}).T

# The target variable gets its own column, in the same (mean, range) row order
summary_df['melt_m_ice_per_y'] = [y_mean, y_range]

norm_csv_file = new_path_model + 'dataframe_norm_training_data_'+timetag+'.csv'
summary_df.to_csv(norm_csv_file)

In [None]:
# Convert the normalised pandas objects to plain numpy arrays for Keras
x_train_arr = np.array(x_train_norm)
y_train_arr = np.array(y_train_norm)

x_test_arr = np.array(x_test_norm)
y_test_arr = np.array(y_test_norm)

BUILD THE MODEL

In [None]:
def get_model_v1(shape):
    """Build and compile a fully connected network with three hidden layers.

    The network consists of an input layer, three ReLU dense layers with
    32, 64 and 32 units, and a single linear output unit. It is compiled with
    the Adam optimizer and mean squared error loss, and reports MAE and MSE.

    Parameters
    ----------
    shape : tuple
        Shape of one input sample, handed to ``keras.layers.Input``.

    Returns
    -------
    The compiled ``keras.models.Sequential`` model.
    """
    hidden_units = (32, 64, 32)

    layer_stack = [keras.layers.Input(shape, name="InputLayer")]
    layer_stack += [
        keras.layers.Dense(n_units, activation='relu', name='Dense_n'+str(position))
        for position, n_units in enumerate(hidden_units, start=1)
    ]
    layer_stack.append(keras.layers.Dense(1, name='Output'))

    model = keras.models.Sequential(layer_stack)
    model.compile(optimizer='adam', loss='mse', metrics=['mae', 'mse'])
    return model

def get_model_v2(shape):
    """Build and compile a network without hidden layers.

    The input layer is connected directly to a single linear output unit.
    The model is compiled with the Adam optimizer and mean squared error loss,
    and reports MAE and MSE.

    Parameters
    ----------
    shape : tuple
        Shape of one input sample, handed to ``keras.layers.Input``.

    Returns
    -------
    The compiled ``keras.models.Sequential`` model.
    """
    layer_stack = [
        keras.layers.Input(shape, name="InputLayer"),
        keras.layers.Dense(1, name='Output'),
    ]

    model = keras.models.Sequential(layer_stack)
    model.compile(optimizer='adam', loss='mse', metrics=['mae', 'mse'])
    return model

TRAIN THE MODEL

In [None]:
# Number of input variables = length of one (the first) training sample
first_sample = x_train_arr[0]
input_size = len(first_sample)

In [None]:
# Instantiate the network for one-dimensional samples of length input_size
model = get_model_v1(shape=(input_size,))

# Print the layer-by-layer overview of the model
model.summary()

In [None]:
# Training hyperparameters: number of epochs and samples per gradient update
epoch_nb, batch_siz = 100, 1024

In [None]:
# Document the experiment (data, model architecture, training settings) in a
# log file inside the experiment's doc folder.
with open(new_path_doc+'info_'+timetag+'.log', 'w') as log_file:
    log_file.writelines([
        f'Timetag: {timetag} \n',
        '----- DATA ----- \n',
        f'Training data from: {run_list}\n',
        f'Norm method: {norm_method}\n',
        f'Original data shape was : {all_input_df.shape}\n',
        f'x_train : {x_train.shape}, y_train : {y_train.shape}\n',
        f'x_test  : {x_test.shape}, y_test  : {y_test.shape}\n',
        'Input variables: ' + ','.join(str(column) for column in x_train_norm.columns) + '\n',
        '\n',
        '----- MODEL ----- \n',
    ])

    # model.summary() prints to stdout, so redirect stdout into the log file
    with redirect_stdout(log_file):
        model.summary()

    log_file.writelines([
        '\n',
        '----- TRAINING ----- \n',
        f'Epochs: {epoch_nb}\n',
        f'Batch size: {batch_siz}',
    ])

In [None]:
# Reduce the learning rate by a factor of 5 when the validation loss has not
# improved for 5 epochs. min_lr must lie below the optimizer's initial learning
# rate for this to have any effect: the models are compiled with
# optimizer='adam', whose Keras default learning rate is 0.001, so a floor of
# 0.001 would never allow a reduction.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                                                 patience=5, min_lr=1e-5)

# Train the model and measure the wall-clock duration of the fit
time_start = time.time()
history = model.fit(x_train_arr,
                    y_train_arr,
                    epochs          = epoch_nb,
                    batch_size      = batch_siz,
                    verbose         = 1,
                    validation_data = (x_test_arr, y_test_arr),
                    callbacks       = [reduce_lr])
time_end = time.time()
timelength = time_end - time_start

# Append the training time to the experiment's info log, i.e. the file
# new_path_doc + 'info_<timetag>.log', rather than to a separate
# '<timetag>.log' next to the experiment folder.
with open(new_path_doc+'info_'+timetag+'.log','a') as file:
    file.write('\n Training time (in s): '+str(timelength))

# Save the trained model in the experiment's model folder
model.save(new_path_model + 'model_nn_'+timetag+'.h5')

In [None]:
# Turn the per-epoch metrics recorded during training (history.history)
# into a table with one row per epoch and one column per metric
hist_df = pd.DataFrame(history.history)

# Save that table next to the trained model
hist_csv_file = new_path_model+'history_'+timetag+'.csv'
with open(hist_csv_file, 'w') as csv_handle:
    hist_df.to_csv(csv_handle)

QUICK EVALUATION

In [None]:
# Evaluate on the test set; score holds the loss followed by the compiled
# metrics (mae, mse), in that order
score = model.evaluate(x_test_arr, y_test_arr, verbose=1)

for position, metric_label in enumerate(('loss', 'mae', 'mse')):
    print('x_test / {:<10}: {:5.4f}'.format(metric_label, score[position]))

In [None]:
# Lowest validation mean absolute error reached over all epochs
best_val_mae = min(history.history["val_mae"])
print("min( val_mae ) : {:.4f}".format(best_val_mae))

In [None]:
# Training/validation curve pairs handed to the plotting helper, keyed by a label
history_curves = {
    'MSE' : ['mse', 'val_mse'],
    'MAE' : ['mae', 'val_mae'],
    'LOSS': ['loss', 'val_loss'],
}
diag.plot_history(history, plot=history_curves)