In [None]:
"""
Created on Fri Oct 21 16:58 2022

Check whether a neural network can be trained to reproduce latitude and longitude (latlon) exactly

Author: Clara Burgard
"""

In [None]:
import glob
import datetime
import time
import sys
import random

import numpy as np
import xarray as xr
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import trange, tqdm

import tensorflow as tf
from tensorflow import keras

import xgboost
import shap

import basal_melt_neural_networks.model_functions as modf
import basal_melt_neural_networks.prep_input_data as indat
import basal_melt_neural_networks.postprocessing_functions as pp

In [None]:
# IPython magic: use the interactive Qt5 matplotlib backend for the figures created below.
%matplotlib qt5

In [None]:
######### READ IN OPTIONS

# Network size, input profile option and normalisation method.
mod_size = 'medium'  # one of: 'mini', 'small', 'medium', 'large', 'extra_large'
TS_opt = 'extrap'  # one of: extrap, whole, thermocline
norm_method = 'std'  # one of: std, interquart, minmax

# Cross-validation fold: which time block / ice shelf is left out (0 = none).
tblock_out, isf_out = 1, 0

In [None]:
######### READ IN DATA

# Root folders for the input data, the trained models and the documentation.
inputpath_data = '/bettik/burgardc/DATA/NN_PARAM/interim/INPUT_DATA/'
outputpath_nn_models = '/bettik/burgardc/DATA/NN_PARAM/interim/NN_MODELS/'
outputpath_doc = '/bettik/burgardc/SCRIPTS/basal_melt_neural_networks/custom_doc/'

tblock_dim = range(1,14)
isf_dim = [10,11,12,13,18,22,23,24,25,30,31,33,38,39,40,42,43,44,45,47,48,51,52,53,54,55,58,61,65,66,69,70,71,73,75]

# Exactly one of tblock_out / isf_out must be set; it selects the model folder.
# Fail right away otherwise: path_model would stay undefined and the notebook
# would only crash with a NameError once training has finished.
if (tblock_out > 0) and (isf_out == 0):
    path_model = outputpath_nn_models+'CV_TBLOCK/'
elif (isf_out > 0) and (tblock_out == 0):
    path_model = outputpath_nn_models+'CV_ISF/'
else:
    raise ValueError(
        "Exactly one of tblock_out and isf_out must be > 0 "
        "(got tblock_out="+str(tblock_out)+", isf_out="+str(isf_out)+")"
    )

# NOTE(review): disabled check kept from the original; `os` and `timetag` are not
# defined anywhere in the visible notebook, so it cannot be re-enabled as is.
#new_path_doc = outputpath_doc+timetag+'/'
#if not os.path.isdir(new_path_doc):
#    print("I did not find this folder ("+timetag+") in doc folder! :( ")

inputpath_CVinput = inputpath_data+'EXTRAPOLATED_ISFDRAFT_CHUNKS_CV/'

# All four files share the same suffix identifying the cross-validation fold.
cv_suffix = '_CV_noisf'+str(isf_out).zfill(3)+'_notblock'+str(tblock_out).zfill(3)+'.nc'

input_data_train_norm = xr.open_dataset(inputpath_CVinput + 'train_data' + cv_suffix)
input_data_val_norm = xr.open_dataset(inputpath_CVinput + 'val_data' + cv_suffix)
latlon_train_norm = xr.open_dataset(inputpath_CVinput + 'trainlatlon_data' + cv_suffix)
latlon_val_norm = xr.open_dataset(inputpath_CVinput + 'vallatlon_data' + cv_suffix)

In [None]:
## prepare input and target

# Predictors: every variable except melt, theta_in and salinity_in, for the
# chosen normalisation, stacked into one array along a new 'variable' dimension.
x_train_norm = (
    input_data_train_norm
    .drop_vars(['melt_m_ice_per_y','theta_in','salinity_in'])
    .sel(norm_method=norm_method)
    .to_array()
    .load()
)
x_val_norm = (
    input_data_val_norm
    .drop_vars(['melt_m_ice_per_y','theta_in','salinity_in'])
    .sel(norm_method=norm_method)
    .to_array()
    .load()
)

# Targets: the lat/lon datasets without salinity_in, stacked the same way.
y_train_norm = (
    latlon_train_norm
    .drop_vars(['salinity_in'])
    .sel(norm_method=norm_method)
    .to_array()
    .load()
)
y_val_norm = (
    latlon_val_norm
    .drop_vars(['salinity_in'])
    .sel(norm_method=norm_method)
    .to_array()
    .load()
)

In [None]:
# Inspect the training targets, transposed as they are passed to model.fit below.
y_train_norm.T

In [None]:
######### TRAIN THE MODEL

# Hyperparameters
activ_fct = 'relu' #LeakyReLU
epoch_nb = 35
batch_siz = 512
# First axis of the stacked predictor array = number of input variables.
input_size = x_train_norm.shape[0]

# Callbacks: halve the learning rate when val_loss stalls, and stop early
# (restoring the best weights) when it has not improved for 10 epochs.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_delta=0.0005,  #, min_delta=0.1
    min_lr=1e-7,
)
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    mode="auto",
    patience=10,
    #min_delta=0.000001,
    baseline=None,
    restore_best_weights=True,
    verbose=0,
)

# Last argument is presumably the number of outputs (latitude, longitude) - TODO confirm in modf.get_model
model = modf.get_model(mod_size, input_size, activ_fct, 2)

time_start = time.time()
time_start0 = datetime.datetime.now()
print(time_start0)

# The arrays are transposed so that samples run along the first axis.
history = model.fit(
    x_train_norm.T.values,
    y_train_norm.T.values,
    validation_data=(x_val_norm.T.values, y_val_norm.T.values),
    epochs=epoch_nb,
    batch_size=batch_siz,
    callbacks=[reduce_lr, early_stop],
)

time_end = time.time()
timelength = time_end - time_start  # wall-clock training time in seconds

time_end0 = datetime.datetime.now()
print(time_end0)

In [None]:
# Persist the trained network in the cross-validation model folder.
# NOTE(review): the file name says 'medium' regardless of mod_size.
model.save(f"{path_model}model_medium_latlon.h5")
# maybe limit it to 60 epochs?

In [None]:
# Reload the saved network and predict normalised latitude/longitude for the validation inputs.
model = keras.models.load_model(path_model + 'model_medium_latlon.h5')
y_out_norm = model.predict(x_val_norm.T.values)

# Wrap the predictions in a DataArray that shares the validation index and
# labels the two output columns.
y_out_norm_xr = xr.DataArray(data=y_out_norm.squeeze()).rename({'dim_0': 'index'})
y_out_norm_xr = y_out_norm_xr.assign_coords({'index': x_val_norm.index,'dim_1': ['latitude','longitude']})

# Normalisation metrics of this cross-validation fold, for the chosen method.
# drop_vars replaces the deprecated Dataset.drop and matches the rest of the notebook.
norm_metrics_file = xr.open_dataset(inputpath_CVinput + 'metricslatlon_norm_CV_noisf'+str(isf_out).zfill(3)+'_notblock'+str(tblock_out).zfill(3)+'.nc')
norm_metrics = norm_metrics_file.sel(norm_method=norm_method).drop_vars('norm_method').to_dataframe()

In [None]:
# denormalise the output
# Predicted latitude back to physical values, using the 'mean_vars' and
# 'range_vars' rows of the normalisation metrics.
lat_out = pp.denormalise_vars(y_out_norm_xr.sel(dim_1='latitude'),
                         norm_metrics['latitude'].loc['mean_vars'],
                         norm_metrics['latitude'].loc['range_vars'])

# Reference latitude: select the configured norm_method (not a hard-coded 'std')
# so it always matches the metrics used to denormalise it.
lat_target = pp.denormalise_vars(latlon_val_norm['latitude'].sel(norm_method=norm_method),
                         norm_metrics['latitude'].loc['mean_vars'],
                         norm_metrics['latitude'].loc['range_vars'])

In [None]:
# Predicted longitude back to physical values, using the 'mean_vars' and
# 'range_vars' rows of the normalisation metrics.
lon_out = pp.denormalise_vars(y_out_norm_xr.sel(dim_1='longitude'),
                         norm_metrics['longitude'].loc['mean_vars'],
                         norm_metrics['longitude'].loc['range_vars'])

# Reference longitude: select the configured norm_method (not a hard-coded 'std')
# so it always matches the metrics used to denormalise it.
lon_target = pp.denormalise_vars(latlon_val_norm['longitude'].sel(norm_method=norm_method),
                         norm_metrics['longitude'].loc['mean_vars'],
                         norm_metrics['longitude'].loc['range_vars'])

In [None]:
# Each series gets the index of the dataset its values come from: predictions
# were made from the validation inputs, references come from the lat/lon file
# (the original had the two index sources swapped).
lat_out_pd_s = pd.Series(lat_out.values,index=input_data_val_norm.index,name='predicted_lat')
lat_target_pd_s = pd.Series(lat_target.values,index=latlon_val_norm.index,name='reference_lat')

In [None]:
# Each series gets the index of the dataset its values come from: predictions
# were made from the validation inputs, references come from the lat/lon file
# (the original had the two index sources swapped).
lon_out_pd_s = pd.Series(lon_out.values,index=input_data_val_norm.index,name='predicted_lon')
lon_target_pd_s = pd.Series(lon_target.values,index=latlon_val_norm.index,name='reference_lon')

In [None]:
# Value ranges used to draw the 1:1 reference lines in the scatter plots
# (xx for longitude, yy for latitude).
xx, yy = range(-180, 180), range(-90, -60)

In [None]:
# Predicted vs. reference longitude on the validation data, with the 1:1 line.
# Open a dedicated figure so the plot does not land in whatever figure is current.
plt.figure()
plt.scatter(lon_out_pd_s,lon_target_pd_s,s=5,alpha=0.2)
plt.plot(xx,xx,'k-')
plt.xlabel('predicted longitude')
plt.ylabel('reference longitude')

In [None]:
# Predicted vs. reference latitude on the validation data, with the 1:1 line.
plt.figure()
plt.scatter(lat_out_pd_s,lat_target_pd_s,s=5,alpha=0.2)
plt.plot(yy,yy,'k-')
plt.xlabel('predicted latitude')
plt.ylabel('reference latitude')

In [None]:
# train an XGBoost model on the same predictors/targets as the neural network.
# It is stored as xgb_model so that the Keras `model` is not overwritten, and
# the unused shap.datasets.boston() call of the original has been dropped
# (leftover from the shap example; the dataset is removed in recent shap releases).
xgb_model = xgboost.XGBRegressor().fit(x_train_norm.T.values, y_train_norm.T.values)

# explain the model's predictions using SHAP
# (same syntax works for LightGBM, CatBoost, scikit-learn, transformers, Spark, etc.)
explainer = shap.Explainer(xgb_model)
shap_values = explainer(x_train_norm.T.values)
#shap.plots.bar(shap_values)

In [None]:
# Inspect the stacked training predictors.
x_train_norm

In [None]:
# Print each input variable with its position along the 'variable' dimension.
# Iterating over the coordinate itself avoids the hard-coded count of 12.
for i, var_name in enumerate(x_train_norm['variable'].values):
    print(i, var_name)

In [None]:
# Shape of the SHAP value array; the cells below index it with three axes.
shap_values.values.shape

In [None]:
# SHAP bar plot for output column 0 (presumably latitude - TODO confirm the target order).
shap.plots.bar(shap_values[:,:,0])

In [None]:
# SHAP bar plot for output column 1 (presumably longitude - TODO confirm the target order).
shap.plots.bar(shap_values[:,:,1])

In [None]:
# put some order in the file
# Convert the predicted/reference series to xarray, merge each pair into one
# dataset and sort it along 'y'.

# latitude
lat_out_xr = lat_out_pd_s.to_xarray()
lat_target_xr = lat_target_pd_s.to_xarray()
lat_to_compare = xr.merge(
    [lat_out_xr.transpose(), lat_target_xr.transpose()]
).sortby('y')

# longitude
lon_out_xr = lon_out_pd_s.to_xarray()
lon_target_xr = lon_target_pd_s.to_xarray()
lon_to_compare = xr.merge(
    [lon_out_xr.transpose(), lon_target_xr.transpose()]
).sortby('y')

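TRY SHUFFLING: shuffle one input variable at a time in the validation data and check how the predicted latitude/longitude change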

In [None]:
# Take a separate copy of the 'isf_area' predictor and permute its values in
# place; x_val_norm itself is left untouched.
shuffled_isf_area = x_val_norm.sel(variable='isf_area').copy(deep=True)
np.random.shuffle(shuffled_isf_area.values)

In [None]:
# Original (unshuffled) isf_area values, to compare with the shuffled copy.
x_val_norm.sel(variable='isf_area').values

In [None]:
# Shuffled isf_area values.
shuffled_isf_area.values

In [None]:
# Shuffled isf_area as a DataArray, including its coordinates.
shuffled_isf_area

In [None]:
# Replace isf_area by its shuffled copy. xr.concat appends the shuffled variable
# at the end, so re-select the variables in their original order: the network
# expects its input columns in the order it was trained with.
x_val_norm_shuffled = xr.concat(
    [x_val_norm.drop_sel(variable='isf_area'), shuffled_isf_area],
    dim='variable',
).sel(variable=x_val_norm['variable'].values)


In [None]:
# Inspect the validation predictors with the shuffled isf_area.
x_val_norm_shuffled

In [None]:
# Predict from the SHUFFLED validation inputs. The original passed the
# unshuffled x_val_norm here, which made this experiment identical to the baseline.
# The variables are re-selected in the training order because the shuffled array
# may have been assembled with the shuffled variable appended last.
model = keras.models.load_model(path_model + 'model_medium_latlon.h5')
y_out_shuffled_norm = model.predict(
    x_val_norm_shuffled.sel(variable=x_val_norm['variable'].values).T.values
)

In [None]:
# Wrap the shuffled-input predictions in a DataArray that carries the validation
# index and labels the two output columns.
y_out_shuffled_norm_xr = (
    xr.DataArray(data=y_out_shuffled_norm.squeeze())
    .rename({'dim_0': 'index'})
    .assign_coords({'index': x_val_norm.index,'dim_1': ['latitude','longitude']})
)

In [None]:
# denormalise the output
# Shuffled-input predictions back to physical values, using the 'mean_vars' and
# 'range_vars' rows of the normalisation metrics.
lat_out_shuffled = pp.denormalise_vars(
    y_out_shuffled_norm_xr.sel(dim_1='latitude'),
    norm_metrics['latitude'].loc['mean_vars'],
    norm_metrics['latitude'].loc['range_vars'],
)
lon_out_shuffled = pp.denormalise_vars(
    y_out_shuffled_norm_xr.sel(dim_1='longitude'),
    norm_metrics['longitude'].loc['mean_vars'],
    norm_metrics['longitude'].loc['range_vars'],
)

In [None]:
# Each series gets the index of the dataset its values come from: predictions
# were made from the validation inputs, references come from the lat/lon file
# (the original had the two index sources swapped).
lat_out_pd_s_shuffled = pd.Series(lat_out_shuffled.values,index=input_data_val_norm.index,name='predicted_lat')
lat_target_pd_s = pd.Series(lat_target.values,index=latlon_val_norm.index,name='reference_lat')

In [None]:
# Each series gets the index of the dataset its values come from: predictions
# were made from the validation inputs, references come from the lat/lon file
# (the original had the two index sources swapped).
lon_out_pd_s_shuffled = pd.Series(lon_out_shuffled.values,index=input_data_val_norm.index,name='predicted_lon')
lon_target_pd_s = pd.Series(lon_target.values,index=latlon_val_norm.index,name='reference_lon')

In [None]:
# Latitude RMSE with the shuffled input.
np.sqrt((lat_out_pd_s_shuffled - lat_target_pd_s).pow(2).mean())

In [None]:
# Longitude RMSE with the shuffled input.
np.sqrt((lon_out_pd_s_shuffled - lon_target_pd_s).pow(2).mean())

In [None]:
# Latitude RMSE with the original (unshuffled) input, for comparison.
np.sqrt((lat_out_pd_s - lat_target_pd_s).pow(2).mean())

In [None]:
# Longitude RMSE with the original (unshuffled) input, for comparison.
np.sqrt((lon_out_pd_s - lon_target_pd_s).pow(2).mean())

In [None]:
# Permutation test: shuffle one input variable at a time in the validation data,
# predict with the trained network, then print the RMSE against the reference
# and draw predicted-vs-reference scatter plots.

# Loop-invariant work is done once: load the network, remember the variable
# order it was trained with, and build the reference series.
model = keras.models.load_model(path_model + 'model_medium_latlon.h5')
variable_order = x_val_norm['variable'].values
lat_target_pd_s = pd.Series(lat_target.values,index=latlon_val_norm.index,name='reference_lat')
lon_target_pd_s = pd.Series(lon_target.values,index=latlon_val_norm.index,name='reference_lon')

for vv in x_val_norm['variable']:
    # Shuffle a copy of the current variable; x_val_norm itself stays untouched.
    shuffled_var = x_val_norm.sel(variable=vv).copy()
    np.random.shuffle(shuffled_var.values)
    x_val_norm_shuffled = xr.concat([x_val_norm.drop_sel(variable=vv.values),shuffled_var], dim='variable')
    # concat appends the shuffled variable last; restore the original column
    # order, otherwise the network receives every later variable in the wrong slot.
    x_val_norm_shuffled = x_val_norm_shuffled.sel(variable=variable_order)

    y_out_shuffled_norm = model.predict(x_val_norm_shuffled.T.values)
    y_out_shuffled_norm_xr = xr.DataArray(data=y_out_shuffled_norm.squeeze()).rename({'dim_0': 'index'})
    y_out_shuffled_norm_xr = y_out_shuffled_norm_xr.assign_coords({'index': x_val_norm.index,'dim_1': ['latitude','longitude']})

    # denormalise the output
    lat_out_shuffled = pp.denormalise_vars(y_out_shuffled_norm_xr.sel(dim_1='latitude'),
                         norm_metrics['latitude'].loc['mean_vars'],
                         norm_metrics['latitude'].loc['range_vars'])

    lon_out_shuffled = pp.denormalise_vars(y_out_shuffled_norm_xr.sel(dim_1='longitude'),
                         norm_metrics['longitude'].loc['mean_vars'],
                         norm_metrics['longitude'].loc['range_vars'])

    # Predictions carry the index of the validation inputs they were computed from.
    lat_out_pd_s_shuffled = pd.Series(lat_out_shuffled.values,index=input_data_val_norm.index,name='predicted_lat')
    lon_out_pd_s_shuffled = pd.Series(lon_out_shuffled.values,index=input_data_val_norm.index,name='predicted_lon')

    print(vv.values)
    print('Latitude RMSE:',np.sqrt(((lat_out_pd_s_shuffled - lat_target_pd_s)**2).mean()))
    print('Longitude RMSE:',np.sqrt(((lon_out_pd_s_shuffled - lon_target_pd_s)**2).mean()))

    # Latitude: predicted vs. reference with the 1:1 line.
    plt.figure()
    plt.scatter(lat_out_pd_s_shuffled.values,lat_target_pd_s.values,c='r',s=20,alpha=0.03)
    plt.plot(yy,yy,'k-')
    plt.title(str(vv.values))
    plt.xlabel('predicted latitude')
    plt.ylabel('reference latitude')
    plt.xlim(-90,-60)
    plt.ylim(-90,-60)

    # Longitude: predicted vs. reference with the 1:1 line (titled like the
    # latitude figure so the two can be matched to the shuffled variable).
    plt.figure()
    plt.scatter(lon_out_pd_s_shuffled.values,lon_target_pd_s.values,c='b',s=20,alpha=0.03)
    plt.plot(xx,xx,'k-')
    plt.title(str(vv.values))
    plt.xlabel('predicted longitude')
    plt.ylabel('reference longitude')
    plt.xlim(-180,180)
    plt.ylim(-180,180)

In [None]:
# Close every open figure (the loop above opens two per input variable).
plt.close('all')

In [None]:
# Inspect the reference latitude series.
lat_target_pd_s

In [None]:
# Inspect the validation predictors without the variable last visited by the loop above (vv).
x_val_norm.drop_sel(variable=vv.values).copy()

In [None]:
# Name of the variable last visited by the loop above.
vv.values