In [None]:
"""
Created on Wed Jun 09 14:36 2021

Prepare proof of concept with a very simple DNN to parameterise the sub-shelf melt

Author: @claraburgard

"""

In [None]:
import glob

import numpy as np
import pandas as pd
import tensorflow as tf
import xarray as xr
from tensorflow import keras
from tqdm.notebook import trange, tqdm

from basal_melt_neural_networks.constants import *
import basal_melt_neural_networks.diagnostic_functions as diag

In [None]:
# IPython magic: select matplotlib's Qt5 backend for the figures of this notebook.
%matplotlib qt5

READ IN DATA

In [None]:
# Root directory of the interim data products. Every input path below is derived
# from it, so pointing the notebook at another copy of the data is a one-line change.
inputpath_root = '/bettik/burgardc/SCRIPTS/basal_melt_param/data/interim/'

# Sub-directories. The trailing '/' is required because file names are appended
# to these strings by plain concatenation further down.
inputpath_data = inputpath_root + 'NEMO_eORCA025.L121_ANT_STEREO/'
inputpath_mask = inputpath_root + 'ANTARCTICA_IS_MASKS/nemo_5km_withdask/'
inputpath_profiles = inputpath_root + 'T_S_PROF/nemo_5km/'
inputpath_plumes = inputpath_root + 'PLUMES/nemo_5km/'
inputpath_boxes = inputpath_root + 'BOXES/nemo_5km/'

FOR EACH POINT:
- T and S profiles at the front (the z dimension is decomposed into one input feature per depth level)
- Distance to front
- Distance to the grounding line
- Local slope ice draft
- Local slope bedrock
- Ice draft depth
- Bathymetry
- Ice draft concentration
- Horizontal coordinates (lon, lat)
- Mean bathymetry at entry (to add in future)
- Max bathymetry (to add in future)
- Target: melt m ice per yr

In [None]:
# dIF, dGL, longitude, latitude
file_isf_orig = xr.open_dataset(inputpath_mask + 'nemo_5km_isf_masks_and_info_and_distance_new.nc')

# Step 1: keep the ice shelves whose 'front_bot_depth_max' is a finite number.
has_finite_front_depth = np.isfinite(file_isf_orig['front_bot_depth_max'])
nonnan_Nisf = file_isf_orig['Nisf'].where(has_finite_front_depth, drop=True).astype(int)
file_isf_nonnan = file_isf_orig.sel(Nisf=nonnan_Nisf)

# Step 2: among those, keep the ice shelves with 'isf_area_here' >= 2500
# (presumably km^2 -- TODO confirm the unit against the mask file).
is_large_enough = file_isf_nonnan['isf_area_here'] >= 2500
large_isf = file_isf_nonnan['Nisf'].where(is_large_enough, drop=True)
file_isf = file_isf_nonnan.sel(Nisf=large_isf)

In [None]:
# T and S profiles
file_TS_orig = xr.open_dataset(inputpath_profiles+'T_S_mean_prof_corrected_km_contshelf_and_offshore_1980-2018.nc')
# Restrict the profiles to the ice shelves retained in file_isf.
file_TS = file_TS_orig.sel(Nisf=file_isf.Nisf)
# Keep one profile domain only (coordinate value 50; presumably a distance in km -- TODO confirm).
file_TS_dom = file_TS.sel(profile_domain=50)

In [None]:
# Box and plume characteristics. In the part of the notebook visible here only
# plume_charac is used afterwards (for the local ice slope).
box_charac_all_2D = xr.open_dataset(inputpath_boxes + 'nemo_5km_boxes_2D.nc')
box_charac_all_1D = xr.open_dataset(inputpath_boxes + 'nemo_5km_boxes_1D.nc')
plume_charac = xr.open_dataset(inputpath_plumes+'nemo_5km_plume_characteristics.nc')

In [None]:
# Local slope
# Take 'alpha' for the 'appenB' option and drop the 'option' coordinate left over
# from the selection, so that it does not end up in the merged dataset.
local_ice_slope = plume_charac['alpha'].sel(option='appenB').drop('option')

In [None]:
def cut_domain_stereo(var_to_cut, map_lim_x, map_lim_y):
    """Crop an object with ``x`` and ``y`` coordinates to a rectangular window.

    Parameters
    ----------
    var_to_cut
        Object exposing ``.x``, ``.y`` and ``.sel`` (xarray objects in this notebook).
    map_lim_x, map_lim_y
        Two-element limits for x and y. They are passed to ``in_range``, which
        treats the smaller value as inclusive and the larger one as exclusive.

    Returns
    -------
    The selection of ``var_to_cut`` on the x and y values inside the limits.
    """
    x_coord = var_to_cut.x
    y_coord = var_to_cut.y
    x_inside = x_coord.where(in_range(x_coord, map_lim_x), drop=True)
    y_inside = y_coord.where(in_range(y_coord, map_lim_y), drop=True)
    return var_to_cut.sel(x=x_inside, y=y_inside)

def in_range(in_xy,txy):
    """Tell whether values lie in the half-open interval [min(txy), max(txy)).

    Parameters
    ----------
    in_xy
        Scalar or array-like supporting ``>=``, ``<`` and ``&``.
    txy
        Iterable of limits; its order does not matter because min and max are taken.

    Returns
    -------
    Boolean (or boolean array) that is True where the lower limit is reached
    and the upper limit is not.
    """
    lower_limit, upper_limit = min(txy), max(txy)
    reaches_lower = in_xy >= lower_limit
    below_upper = in_xy < upper_limit
    return reaches_lower & below_upper

In [None]:
# Limits of the square window, applied to both x and y
# (presumably metres in the stereographic projection -- TODO confirm).
map_lim = [-3000000, 3000000]

# Open the three input files first ...
file_mask_orig = xr.open_dataset(inputpath_data + 'other_mask_vars_Ant_stereo.nc')
file_other = xr.open_dataset(inputpath_data + 'corrected_draft_bathy_isf.nc')  # , chunks={'x': chunk_size, 'y': chunk_size})
file_conc = xr.open_dataset(inputpath_data + 'isfdraft_conc_Ant_stereo.nc')

# ... then crop each of them to the same window.
file_mask_orig_cut = cut_domain_stereo(file_mask_orig, map_lim, map_lim)
file_other_cut = cut_domain_stereo(file_other, map_lim, map_lim)
file_conc_cut = cut_domain_stereo(file_conc, map_lim, map_lim)

In [None]:
# bathymetry, ice draft, concentration
# Pick the single variable needed from each cropped dataset.
file_bed_orig = file_mask_orig_cut['bathy_metry']
file_draft = file_other_cut['corrected_isfdraft'] 
file_isf_conc = file_conc_cut['isfdraft_conc']

In [None]:
# Local bedrock slope, read from its own file in the mask directory.
file_bedrock_slope = xr.open_dataset(inputpath_mask+'nemo_5km_bedrock_slope.nc')
local_bedrock_slope = file_bedrock_slope['bedrock_slope']

In [None]:
# One melt file per year; sorting the names puts them in chronological order
# (assuming the year is the only varying part of the file name -- TODO confirm).
melt_files = list(sorted(glob.glob(inputpath_data + 'cavity_melt_*_Ant_stereo.nc')))

# Stack the files along a temporary 'new_time' axis, remove the original 'time'
# dimension of each file, then rename the stacked axis to 'time' and label it
# with the years 1980..2018.
ds_melt = (
    xr.open_mfdataset(melt_files, concat_dim='new_time', combine='nested')  # , chunks={'x': chunksize, 'y': chunksize})
    .squeeze('time')
    .rename({'new_time': 'time'})
    .assign_coords(time=np.arange(1980, 2019))
)
ds_melt_cutted = cut_domain_stereo(ds_melt, map_lim, map_lim)

In [None]:
# Convert the melt to the target variable: scale by yearinsec / rho_i, flip the
# sign, and divide by the ice-shelf concentration. yearinsec and rho_i are not
# defined in this notebook; presumably they come from the constants star import.
# NOTE(review): where file_isf_conc is 0 this gives +/-inf, which the later
# dropna() does not remove -- confirm the concentration is never 0 where melt exists.
melt_rate = (-1*(ds_melt_cutted*yearinsec/rho_i)/file_isf_conc).load()

Collect all 2D data in one dataset

In [None]:
# Start from the four fields taken from the ice-shelf file, then merge in the
# remaining fields one after the other.
geometry_2D = file_isf[['dGL', 'dIF', 'longitude', 'latitude']]
for extra_field in (local_ice_slope, local_bedrock_slope, file_draft, file_bed_orig):
    geometry_2D = geometry_2D.merge(extra_field)

SUBSAMPLE DATA

Select one ice shelf

In [None]:
# ID of the ice shelf of interest: compared against 'ISF_mask' and used as the
# 'Nisf' label in the selections below.
kisf_of_int = 66

In [None]:
# Grid cells belonging to the selected ice shelf.
in_selected_isf = file_isf['ISF_mask'] == kisf_of_int

# Mask the 2D fields and the melt to that ice shelf; drop=True removes the
# coordinate labels where the mask is False everywhere.
geometry_2D_isf = geometry_2D.where(in_selected_isf, drop=True)
melt_rate_isf = melt_rate.where(in_selected_isf, drop=True).load()

# Per-ice-shelf quantities are selected by label instead.
TS_isf = file_TS_dom.sel(Nisf=kisf_of_int)
max_front_depth = file_isf['front_bot_depth_max'].sel(Nisf=kisf_of_int)

Select one time step for now

In [None]:
# Index of the time step used to build the training table.
tt = 0
melt_rate_isf_tt = melt_rate_isf.isel(time=tt)

# Keep only the profile levels whose depth is smaller than the maximum front depth.
keep_levels = TS_isf.depth < max_front_depth
TS_isf_tt = TS_isf.isel(time=tt).where(keep_levels, drop=True)

# Remove the coordinates that are not needed in the dataframe.
for coord_name in ('profile_domain', 'Nisf', 'time'):
    TS_isf_tt = TS_isf_tt.drop(coord_name)

PREPARE DATAFRAME

In [None]:
# Column names for the profile features: one per retained depth level, numbered
# with three zero-padded digits (T_000, T_001, ... and likewise for S_ and d_).
level_tags = [str(ii).zfill(3) for ii in range(len(TS_isf_tt.depth))]
T_list = ['T_' + tag for tag in level_tags]
S_list = ['S_' + tag for tag in level_tags]
depth_list = ['d_' + tag for tag in level_tags]

Convert T and S to dataframe

In [None]:
# Table of the T and S profile; its rows are read by position (one per depth
# level) when the T_xxx / S_xxx columns are filled below.
TS_isf_df = TS_isf_tt.to_dataframe()

Convert 2D time-independent data to dataframe

In [None]:
# Number of grid cells in the bounding box of the selected ice shelf. It is no
# longer needed to build the table but is kept for inspection.
length_df = len(geometry_2D_isf.x)*len(geometry_2D_isf.y)

# One row per grid cell, one column per geometry variable.
geo_df = geometry_2D_isf.drop('x').drop('y').to_dataframe()

# The T and S profile does not vary within the ice shelf, so each depth level
# becomes one column holding the same scalar in every row. Assigning a scalar
# to a column already broadcasts over all rows, so a single pass over the
# columns is enough (the former outer loop over grid cells repeated exactly
# the same assignments length_df times).
theta_profile = TS_isf_df['theta_ocean'].values
salinity_profile = TS_isf_df['salinity_ocean'].values
for ii, icol in enumerate(T_list):
    geo_df[icol] = theta_profile[ii]
for ii, icol in enumerate(S_list):
    geo_df[icol] = salinity_profile[ii]
        

Convert melt to dataframe

In [None]:
# Remove the coordinates that must not become dataframe columns ...
melt_without_coords = (
    melt_rate_isf_tt
    .drop('x')
    .drop('y')
    .drop('longitude')
    .drop('latitude')
)

# ... then flatten to a table, discard the 'mapping' column, turn the index into
# ordinary columns (these are the merge keys) and discard 'time'.
melt_df = (
    melt_without_coords
    .to_dataframe()
    .drop(['mapping'], axis=1)
    .reset_index()
    .drop(['time'], axis=1)
)

Merge all and clean NaN-rows

In [None]:
# Attach the melt (target) to the geometry/profile features, matching rows on
# the x and y keys. A left join keeps every row of geo_df.
merged_df = pd.merge(geo_df, melt_df, how='left', on=['x', 'y'])

# Rows with a NaN in any column are discarded; x and y were only needed as
# merge keys and are not input features.
clean_df = merged_df.dropna().drop(columns=['x', 'y'])

# Fail here with a clear message rather than further down in the training cells.
assert not clean_df.empty, 'No complete rows left after merging and dropping NaNs'

DIVIDE INTO TRAIN AND TEST DATASET

In [None]:
# Seed of the random split: with a fixed seed the same rows end up in the
# training set on every run, so the scores below are comparable between runs.
split_seed = 42

# 70 % of the rows for training, the remaining rows for testing.
data_train = clean_df.sample(frac=0.7, axis=0, random_state=split_seed)
# NOTE(review): dropping by index label assumes the index of clean_df is unique.
data_test  = clean_df.drop(data_train.index)

In [None]:
# Name of the column the network has to predict; all other columns are inputs.
target_name = 'melt_cavity'

x_train, y_train = data_train.drop([target_name], axis=1), data_train[target_name]
x_test, y_test = data_test.drop([target_name], axis=1), data_test[target_name]

# Quick check of the sizes of the full table and of the two subsets.
print('Original data shape was : ',clean_df.shape)
print('x_train : ',x_train.shape, 'y_train : ',y_train.shape)
print('x_test  : ',x_test.shape,  'y_test  : ',y_test.shape)

### 3.2 - Data normalization
**Note :** 
 - All input data must be normalized, train and test.  
 - To do this we will **subtract the mean** and **divide by the standard deviation**.  
 - But test data should not be used in any way, even for normalization.  
 - The mean and the standard deviation will therefore only be calculated with the train data.

In [None]:
#display(x_train.describe().style.format("{0:.2f}").set_caption("Before normalization :"))

# Features normalised column by column, each with its own mean and standard deviation.
scalar_features = ['dGL','dIF','alpha','bedrock_slope','corrected_isfdraft','bathy_metry','longitude','latitude']

x_train_norm = x_train.copy()
x_test_norm = x_test.copy()

# The statistics come from the training set only and are applied to both sets.
# Subtracting / dividing by a Series aligns it with the column names.
scalar_mean = x_train[scalar_features].mean()
scalar_std = x_train[scalar_features].std()
x_train_norm[scalar_features] = (x_train[scalar_features] - scalar_mean) / scalar_std
x_test_norm[scalar_features] = (x_test[scalar_features] - scalar_mean) / scalar_std

# Temperature and salinity columns share one mean and one standard deviation
# per variable: the mean and the std of the per-column means.
mean_T = x_train[T_list].mean().mean()
std_T = x_train[T_list].mean().std()
mean_S = x_train[S_list].mean().mean()
std_S = x_train[S_list].mean().std()

x_train_norm[T_list] = (x_train[T_list] - mean_T) / std_T
x_test_norm[T_list] = (x_test[T_list] - mean_T) / std_T

x_train_norm[S_list] = (x_train[S_list] - mean_S) / std_S
x_test_norm[S_list] = (x_test[S_list] - mean_S) / std_S

#display(x_train_norm.describe().style.format("{0:.2f}").set_caption("After normalization :"))
#display(x_train_norm.head(5).style.format("{0:.2f}").set_caption("Few lines of the dataset :"))

# Keras is fed plain numpy arrays.
x_train_arr, y_train_arr = np.array(x_train_norm), np.array(y_train)
x_test_arr,  y_test_arr  = np.array(x_test_norm),  np.array(y_test)


## Step 4 - Build a model
More information about:
 - [Optimizer](https://www.tensorflow.org/api_docs/python/tf/keras/optimizers)
 - [Activation](https://www.tensorflow.org/api_docs/python/tf/keras/activations)
 - [Loss](https://www.tensorflow.org/api_docs/python/tf/keras/losses)
 - [Metrics](https://www.tensorflow.org/api_docs/python/tf/keras/metrics)

In [None]:
def get_model_v1(shape):
    """Build and compile a small fully connected regression network.

    The network has three hidden ReLU layers (32, 64 and 32 units) followed by
    a single linear output unit. It is compiled with the Adam optimizer, a mean
    squared error loss, and 'mae' and 'mse' as metrics.

    Parameters
    ----------
    shape
        Shape of one input sample, passed to ``keras.layers.Input``.

    Returns
    -------
    The compiled ``keras.models.Sequential`` model.
    """
    # (layer name, number of units) of the hidden layers, in order
    hidden_layers = (('Dense_n1', 32), ('Dense_n2', 64), ('Dense_n3', 32))

    model = keras.models.Sequential()
    model.add(keras.layers.Input(shape, name="InputLayer"))
    for layer_name, n_units in hidden_layers:
        model.add(keras.layers.Dense(n_units, activation='relu', name=layer_name))
    model.add(keras.layers.Dense(1, name='Output'))

    model.compile(optimizer='adam', loss='mse', metrics=['mae', 'mse'])
    return model

## Step 5 - Train the model
### 5.1 - Get it

In [None]:
# Number of input features = length of one row of the training array.
input_size = len(x_train_arr[0])

In [None]:
# Build the network for samples that are 1D vectors of input_size features.
model=get_model_v1( (input_size,) )

# Print the layers and the number of parameters.
model.summary()

### 5.2 - Train it

In [None]:
# Train for 60 epochs with mini-batches of 10 samples.
# NOTE(review): the test set doubles as validation data here, so the val_*
# curves and the test score in the evaluation cell are based on the same samples.
history = model.fit(x_train_arr,
                    y_train_arr,
                    epochs          = 60,
                    batch_size      = 10,
                    verbose         = 1,
                    validation_data = (x_test_arr, y_test_arr))

## Step 6 - Evaluate
### 6.1 - Model evaluation
MAE =  Mean Absolute Error (between the labels and predictions)  
An MAE equal to 3 represents an average prediction error of 3 m ice per yr.

In [None]:
# Evaluate on the test set. score holds the loss followed by the metrics in the
# order given at compile time in get_model_v1: [loss, mae, mse].
score = model.evaluate(x_test_arr, y_test_arr, verbose=1)

print('x_test / loss      : {:5.4f}'.format(score[0]))
print('x_test / mae       : {:5.4f}'.format(score[1]))
print('x_test / mse       : {:5.4f}'.format(score[2]))

### 6.2 - Training history
What was the best result during our training ?

In [None]:
# Table built from history.history (the per-epoch values recorded during training).
df=pd.DataFrame(data=history.history)
display(df)

In [None]:
# Smallest validation MAE reached over all epochs.
print("min( val_mae ) : {:.4f}".format( min(history.history["val_mae"]) ) )

In [None]:
# Plot the training and validation curves with the project helper. The dict maps
# a label ('MSE', 'MAE', 'LOSS') to the history keys passed for that label.
diag.plot_history(history, plot={'MSE' :['mse', 'val_mse'],
                                'MAE' :['mae', 'val_mae'],
                                'LOSS':['loss','val_loss']})

## Step 7 - Make a prediction
The data must be normalized with the parameters (mean, std) previously used.

In [None]:
# Time step used for the prediction example (a different one than the training step).
tt_val = 20

melt_rate_isf_tt_val = melt_rate_isf.isel(time=tt_val)
TS_isf_tt_val = TS_isf.isel(time=tt_val).where(TS_isf.depth < max_front_depth, drop=True).drop('profile_domain').drop('Nisf')

TS_isf_df_val = TS_isf_tt_val.to_dataframe()

melt_df_val = melt_rate_isf_tt_val.drop('x').drop('y').drop('longitude').drop('latitude').to_dataframe().drop(['mapping'],axis=1).reset_index().drop(['time'],axis=1)

# Number of grid cells in the bounding box of the ice shelf (kept for inspection).
length_df = len(geometry_2D_isf.x)*len(geometry_2D_isf.y)

# The geometry does not depend on time: reuse the training table and overwrite
# only the T and S columns with the profile of the new time step. Assigning a
# scalar to a column broadcasts over all rows, so one pass over the columns is
# enough (the former outer loop over grid cells repeated the same assignments).
geo_df_val = geo_df.copy()
theta_profile_val = TS_isf_df_val['theta_ocean'].values
salinity_profile_val = TS_isf_df_val['salinity_ocean'].values
for ii, icol in enumerate(T_list):
    geo_df_val[icol] = theta_profile_val[ii]
for ii, icol in enumerate(S_list):
    geo_df_val[icol] = salinity_profile_val[ii]

# Attach the melt of that time step, drop incomplete rows and the merge keys.
merged_df_val = pd.merge(geo_df_val,melt_df_val,how='left',on=['x','y'])

clean_df_val = merged_df_val.dropna()
clean_df_val = clean_df_val.drop(['x'], axis=1).drop(['y'], axis=1)

y_val = clean_df_val['melt_cavity']
x_val = clean_df_val.drop(['melt_cavity'], axis=1)

x_val_norm = x_val.copy()

# Normalise with the statistics of the TRAINING set (x_train), exactly as was
# done for the training and test inputs.
for ccol in ['dGL','dIF','alpha','bedrock_slope','corrected_isfdraft','bathy_metry','longitude','latitude']:
    mean = x_train[ccol].mean()
    std  = x_train[ccol].std()
    x_val_norm[ccol] = (x_val[ccol] - mean) / std

# One shared mean / std for all temperature columns and one for all salinity columns.
mean_T = x_train[T_list].mean().mean()
std_T = x_train[T_list].mean().std()
mean_S = x_train[S_list].mean().mean()
std_S = x_train[S_list].mean().std()

x_val_norm[T_list] = (x_val[T_list] - mean_T) / std_T
x_val_norm[S_list] = (x_val[S_list] - mean_S) / std_S

#display(x_val_norm.describe().style.format("{0:.2f}").set_caption("After normalization :"))
#display(x_val_norm.head(5).style.format("{0:.2f}").set_caption("Few lines of the dataset :"))

x_val_arr, y_val_arr = np.array(x_val_norm), np.array(y_val)

#my_data=np.array(x_val_arr)#.reshape(1,13)

In [None]:
# First sample of the validation table, reshaped to a batch of one (1, input_size).
my_data = x_val_arr[0,:].reshape(1,input_size)

In [None]:
# Target value of that first sample, displayed for reference.
y_val_arr[0]

In [None]:
# Predict the melt for the single sample and print it next to the reference value.
predictions = model.predict( my_data )
print("Prediction : {:.2f} m ice per y".format(predictions[0][0]))
print("Reality    : {:.2f} m ice per y".format(y_val_arr[0]))

======== TO KEEP FOR THE FUTURE =========

In [None]:
# For each column - for normalization with min and max

normalized_clean_df = clean_df.copy()

# Columns scaled individually to [0, 1]. 'time' is listed for a future version
# of the table: the cells that currently build clean_df drop it, so only the
# columns that actually exist are scaled (indexing a missing column would
# raise a KeyError).
minmax_candidates = ['dGL','dIF','alpha','bedrock_slope','corrected_isfdraft','bathy_metry','longitude','latitude','melt_cavity','time']
minmax_columns = [ccol for ccol in minmax_candidates if ccol in clean_df.columns]

for ccol in minmax_columns:
    max_ccol = clean_df[ccol].max()
    min_ccol = clean_df[ccol].min()
    normalized_clean_df[ccol] = (clean_df[ccol] - min_ccol)/(max_ccol - min_ccol)

# Temperature and salinity columns share one min / max per variable, taken over
# all depth-level columns together.
max_T = clean_df[T_list].max().max()
min_T = clean_df[T_list].min().min()
max_S = clean_df[S_list].max().max()
min_S = clean_df[S_list].min().min()

normalized_clean_df[T_list] = (clean_df[T_list] - min_T)/(max_T - min_T)
normalized_clean_df[S_list] = (clean_df[S_list] - min_S)/(max_S - min_S)