In [None]:
"""
Created on Tue Apr 26 12:20 2022

Prepare input for a given experiment (marked by timetag)

Author: @claraburgard

"""

In [None]:
import datetime
import glob
import os
import time
from contextlib import redirect_stdout

import distributed
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import xarray as xr
from dask import delayed
from tensorflow import keras
from tqdm.notebook import trange, tqdm

from basal_melt_neural_networks.constants import *
import basal_melt_neural_networks.diagnostic_functions as diag
import basal_melt_neural_networks.data_formatting as dfmt

In [None]:
# Start the distributed client used by the rest of the notebook.
# All settings are listed one per line so they are easy to tune.
client = distributed.Client(
    n_workers=8,
    dashboard_address=':8795',
    local_directory='/tmp',
    memory_limit='4GB',
)

READ IN DATA

In [None]:
# Simulations whose per-ice-shelf CSV files are gathered into the input dataset.
run_list = [
    'OPM006',
    'OPM016',
    'OPM018',
    'OPM031-2',
]

# Temperature/salinity input format; the values handled further down are
# 'whole_prof' and 'extrapolated'.
TS_input = 'whole_prof'

# Ice-shelf identifiers to include (used to build the CSV file names).
isf_list = [
    10, 11, 12, 13, 18,
    21, 22, 23, 24, 25,
    30, 31, 33, 38, 39,
    40, 42, 43, 44, 45, 47, 48,
    51, 52, 53, 54, 55, 58,
    61, 65, 66, 69,
    70, 71, 73, 75,
]

In [None]:
# Input data and trained-model folders share the same "interim" root;
# the documentation folder lives in the scripts tree.
_interim_root = '/bettik/burgardc/DATA/NN_PARAM/interim/'

inputpath_data = _interim_root + 'INPUT_DATA/'
outputpath_nn_models = _interim_root + 'NN_MODELS/'
outputpath_doc = '/bettik/burgardc/SCRIPTS/basal_melt_neural_networks/custom_doc/'

In [None]:
# Experiment tag (format YYYYMMDD-HHMM). Set new_timetag to True to stamp a
# new experiment with the current date and time; leave it False to reuse the
# hard-coded tag of an existing experiment.
new_timetag = False
if new_timetag:
    # Read the clock once so that date and time cannot disagree
    # (two separate reads could straddle midnight).
    timetag = datetime.datetime.now().strftime('%Y%m%d-%H%M')
else:
    # Earlier experiment tag: '20220422-1055'
    timetag = '20220425-1050'


def _ensure_experiment_dir(parent_path, tag, label):
    """Return parent_path + tag + '/', creating that folder if it is missing.

    Parameters
    ----------
    parent_path : str
        Folder (with trailing slash) under which the experiment folder lives.
    tag : str
        Experiment timetag, used as the folder name.
    label : str
        Short name of the parent folder, only used in the printed message.

    Returns
    -------
    str
        Path of the experiment folder, with trailing slash.
    """
    experiment_path = parent_path + tag + '/'
    if os.path.isdir(experiment_path):
        print("This folder ("+tag+") in "+label+" folder exists already!")
    else:
        # makedirs also creates missing parents; the message is printed only
        # once the folder really exists.
        os.makedirs(experiment_path, exist_ok=True)
        print("I did not find this folder ("+tag+") in "+label+" folder so I created a new one, I hope that's ok!")
    return experiment_path


new_path_model = _ensure_experiment_dir(outputpath_nn_models, timetag, 'model')
new_path_doc = _ensure_experiment_dir(outputpath_doc, timetag, 'doc')
new_path_input = _ensure_experiment_dir(inputpath_data, timetag, 'input')

SAVE SELECTED DATA TO ONE NETCDF (FROM RUN LIST AND ISF LIST)

In [None]:
# Sub-folder of inputpath_data holding the per-ice-shelf CSV files,
# for each supported temperature/salinity input format.
csv_subdir_by_ts_input = {
    'whole_prof': 'WHOLE_PROF/',
    'extrapolated': 'EXTRAPOLATED_ISFDRAFT/',
}
if TS_input not in csv_subdir_by_ts_input:
    # Fail here with a clear message instead of a NameError on csv_path later.
    raise ValueError(
        "Unknown TS_input '" + str(TS_input) + "', expected one of "
        + str(sorted(csv_subdir_by_ts_input))
    )
csv_path = inputpath_data + csv_subdir_by_ts_input[TS_input]

# Read one CSV per (run, ice shelf) pair and collect the tables in a list;
# they are concatenated once at the end rather than growing the dataset
# inside the loop.
input_frames = []
for nemo_run in run_list:
    for kisf in tqdm(isf_list):
        csv_file = csv_path + 'dataframe_input_isf' + str(kisf).zfill(3) + '_' + nemo_run + '.csv'
        frame_nrun_kisf = pd.read_csv(csv_file, index_col=[0, 1, 2])
        # The three index columns are discarded; rows get a plain integer index.
        input_frames.append(frame_nrun_kisf.reset_index(drop=True))

# ignore_index=True gives one continuous 0..N-1 index over all files, which
# becomes the 'index' dimension of the xarray dataset.
all_input_df = pd.concat(input_frames, ignore_index=True).to_xarray()

all_input_df.to_netcdf(new_path_input + 'dataset_input_' + timetag + '.nc', 'w')

DIVIDE INTO TRAIN AND TEST DATASET

In [None]:
# Open the merged input dataset of this experiment from its NetCDF file.
input_nc_file = new_path_input + 'dataset_input_' + timetag + '.nc'
all_input_df = xr.open_mfdataset(input_nc_file)

In [None]:
# Fraction of all samples that goes into the training set; the rest is the test set.
sample_frac = 0.7
# Fixed seed so that re-running the notebook reproduces the same split
# (the split is written to disk further down).
split_seed = 42

all_indexes = all_input_df.index
n_train = int(np.round(len(all_indexes) * sample_frac))

# Draw the training indices without replacement.
rng = np.random.default_rng(split_seed)
random_sample = rng.choice(all_indexes.values, size=n_train, replace=False)

data_train = all_input_df.sel(index=random_sample)
data_test = all_input_df.drop_sel(index=random_sample)

# Every sample must end up in exactly one of the two sets.
assert data_train.sizes['index'] + data_test.sizes['index'] == len(all_indexes)

In [None]:
# Separate the target variable (melt) from the predictors in both sets.
target_var = 'melt_m_ice_per_y'

y_train = data_train[target_var]
x_train = data_train.drop_vars([target_var])

y_test = data_test[target_var]
x_test = data_test.drop_vars([target_var])

# The shape helper comes from the data_formatting module (dfmt); use the
# qualified name on every line.
print('Original data shape was : ', dfmt.print_shape_xr_ds(all_input_df))
print('x_train : ',dfmt.print_shape_xr_ds(x_train), 'y_train : ',len(y_train))
print('x_test  : ',dfmt.print_shape_xr_ds(x_test),  'y_test  : ',len(y_test))

DATA NORMALIZATION

In [None]:
# Normalisation: subtract the training mean and divide by a measure of the
# training spread. Supported methods: 'std', 'interquart', 'minmax'.
norm_method = 'interquart'

x_mean = x_train.mean()
y_mean = y_train.mean()

if norm_method == 'std':
    x_range  = x_train.std()
    y_range  = y_train.std()
elif norm_method == 'interquart':
    # Despite the label, the spread used is the 0.9 quantile minus the 0.1 quantile.
    x_range  = x_train.quantile(0.9) - x_train.quantile(0.1)
    y_range  = y_train.quantile(0.9) - y_train.quantile(0.1)
elif norm_method == 'minmax':
    x_range  = x_train.max() - x_train.min()
    y_range  = y_train.max() - y_train.min()
else:
    # Without this branch an unknown method only fails later with a NameError.
    raise ValueError(
        "Unknown norm_method '" + str(norm_method)
        + "', expected 'std', 'interquart' or 'minmax'"
    )

# The test set is scaled with the statistics of the training set.
x_train_norm = (x_train - x_mean)/x_range
x_test_norm = (x_test - x_mean)/x_range

y_train_norm = (y_train - y_mean)/y_range
y_test_norm = (y_test - y_mean)/y_range

Write normalization factors to netcdf (to be used in application of the NN)

In [None]:
# Gather the normalisation factors of predictors and target into one dataset
# with a 'metric' dimension: one entry for the means, one for the ranges.
# The range entry is labelled 'std_vars' whichever norm_method was chosen.
mean_ds = xr.merge([x_mean, y_mean]).assign_coords({'metric': 'mean_vars'})
range_ds = xr.merge([x_range, y_range]).assign_coords({'metric': 'std_vars'})
summary_ds = xr.concat([mean_ds, range_ds], dim='metric')

norm_factors_file = new_path_model + 'dataset_norm_training_factors_' + timetag + '.nc'
summary_ds.to_netcdf(norm_factors_file)

Write training and test dataset to netcdf

In [None]:
# Recombine normalised predictors and target, then write the training set
# and the test set to their own NetCDF files in the model folder.
train_nc_file = new_path_model + 'dataset_norm_training_data_' + timetag + '.nc'
data_train_norm = xr.merge([x_train_norm, y_train_norm])
data_train_norm.to_netcdf(train_nc_file, mode='w')

test_nc_file = new_path_model + 'dataset_norm_test_data_' + timetag + '.nc'
data_test_norm = xr.merge([x_test_norm, y_test_norm])
data_test_norm.to_netcdf(test_nc_file, mode='w')

WRITE INFORMATION ABOUT THE TRAINING DATA TO THE DOC LOG

In [None]:
# Summarise the experiment settings and the dataset shapes in a plain-text log
# stored in the documentation folder of this timetag.
with open(new_path_doc+'info_'+timetag+'.log','w') as file:
    file.write('Timetag: '+timetag+' \n')
    file.write('----- DATA ----- \n')
    # NOTE(review): 'Taraining' looks like a typo for 'Training'; left unchanged
    # in case existing logs are searched for this label.
    file.write('Taraining data from: '+str(run_list)+'\n')
    file.write('Ice shelves: '+str(isf_list)+'\n')
    file.write('Input T and S format: '+TS_input+'\n')
    file.write('Norm method: '+norm_method+'\n')
    # The shape helper lives in the data_formatting module (dfmt), so it is
    # called through that alias.
    file.write('x_train : '+str(dfmt.print_shape_xr_ds(x_train_norm))+', y_train : '+str(y_train_norm.values.shape)+'\n')
    file.write('x_test  : '+str(dfmt.print_shape_xr_ds(x_test_norm))+', y_test  : '+str(y_test_norm.values.shape)+'\n')