In [None]:
"""
Created on Thu Sep 08 11:15 2022

This script prepares the normalising coefficients and the input data for training on the whole dataset.

Author: Clara Burgard
"""

In [None]:
import datetime
import glob
import time

import distributed
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
import xarray as xr
from tqdm.notebook import trange, tqdm
#from tqdm import trange, tqdm

from basal_melt_neural_networks.constants import *
import basal_melt_neural_networks.diagnostic_functions as diag
import basal_melt_neural_networks.data_formatting as dfmt
import basal_melt_neural_networks.prep_input_data as indat

In [None]:
# Dask client used by this notebook: 4 workers, '6GB' memory limit,
# scratch files under /tmp, dashboard served on port 8795.
client = distributed.Client(
    n_workers=4,
    memory_limit='6GB',
    local_directory='/tmp',
    dashboard_address=':8795',
)

PREPARE THE CONTEXT OF THE INPUT DATA

In [None]:
# Root folder of the preformatted input data; the profile-specific
# sub-folders used below are appended to this path.
inputpath_data = '/bettik/burgardc/DATA/NN_PARAM/interim/INPUT_DATA/'

# Time blocks 1-16 and 21-49 (17-20 are not part of the set).
tblock_dim = [*range(1, 17), *range(21, 50)]

# Numbers of the ice shelves to process.
isf_dim = [
    10, 11, 12, 13, 18, 22, 23, 24, 25, 30, 31, 33,
    38, 39, 40, 42, 43, 44, 45, 47, 48, 51, 52, 53,
    54, 55, 58, 61, 65, 66, 69, 70, 71, 73, 75,
]
    

prepare metrics

In [None]:
def combine_csv_per_timeblock(tt, isf_dim, TS_opt, inputpath_data):

    """
    Combines the csv files of all requested ice shelves for one time block.

    Parameters
    ----------
    tt : int
        Time block whose csv files are read.
    isf_dim : list
        Ice shelf numbers whose csv files are read, in this order.
    TS_opt : str
        Type of input temperature and salinity profiles to use. Can be 'extrap', 'whole', 'thermocline'
    inputpath_data : str
        Path to folder where to find the preformatted csv files.

    Returns
    -------
    train_input_df: xr.Dataset or None
        Rows of all requested ice shelves stacked along one ``index`` dimension
        numbered contiguously from 0. None if ``isf_dim`` is empty.

    Raises
    ------
    ValueError
        If ``TS_opt`` is not one of the known profile options.
    """

    ## which profile option are we using for temperature and salinity
    profile_subdirs = {
        'extrap': 'EXTRAPOLATED_ISFDRAFT_CHUNKS/',
        'whole': 'WHOLE_PROF_CHUNKS/',
        'thermocline': 'THERMOCLINE_CHUNKS/',
    }
    if TS_opt not in profile_subdirs:
        # Fail early with a clear message instead of an unbound-variable error later on.
        raise ValueError(
            "Unknown TS_opt " + repr(TS_opt) + ", expected one of " + str(sorted(profile_subdirs))
        )
    inputpath_prof = inputpath_data + profile_subdirs[TS_opt]

    ### read one table per ice shelf
    frames = []
    for kisf in tqdm(isf_dim):
        csv_file = (inputpath_prof + 'dataframe_input_isf' + str(kisf).zfill(3)
                    + '_' + str(tt).zfill(3) + '.csv')
        # The first three csv columns are read as the index and then discarded,
        # leaving a plain 0..n-1 row index.
        frames.append(pd.read_csv(csv_file, index_col=[0, 1, 2]).reset_index(drop=True))

    if not frames:
        return None

    # Concatenate once at the end (linear cost) instead of growing an xarray
    # dataset inside the loop; ignore_index gives the contiguous 0-based index.
    return pd.concat(frames, ignore_index=True).to_xarray()

In [None]:
# Time blocks handled in this pass: 21-49 only
# (the full set would be 1-16 plus 21-49).
tblock_dim = list(range(21, 50))
isf_dim = [
    10, 11, 12, 13, 18, 22, 23, 24, 25, 30, 31, 33,
    38, 39, 40, 42, 43, 44, 45, 47, 48, 51, 52, 53,
    54, 55, 58, 61, 65, 66, 69, 70, 71, 73, 75,
]
TS_opt = 'extrap'

# The combined files are written into the folder of the chosen profile option.
outputpath_CVinput = inputpath_data + {
    'extrap': 'EXTRAPOLATED_ISFDRAFT_CHUNKS/',
    'whole': 'WHOLE_PROF_CHUNKS/',
    'thermocline': 'THERMOCLINE_CHUNKS/',
}[TS_opt]

# One netCDF file per time block, containing all ice shelves.
for tt in tblock_dim:
    print(tt)

    nc_file = outputpath_CVinput + 'dataframe_input_allisf_' + str(tt).zfill(3) + '.nc'
    var_train = combine_csv_per_timeblock(tt, isf_dim, TS_opt, inputpath_data)
    var_train.to_netcdf(nc_file)

In [None]:
def _load_input_chunks(inputpath_prof, tblocks, isfs):
    """Read the csv of every (time block, ice shelf) pair and stack them into one xr.Dataset (None if there is no pair)."""
    frames = []
    for tt in tblocks:
        print(tt)
        for kisf in isfs:
            csv_file = (inputpath_prof + 'dataframe_input_isf' + str(kisf).zfill(3)
                        + '_' + str(tt).zfill(3) + '.csv')
            # The first three csv columns are read as the index and then discarded.
            frames.append(pd.read_csv(csv_file, index_col=[0, 1, 2]).reset_index(drop=True))

    if not frames:
        return None

    # Single concatenation (linear cost); ignore_index yields a contiguous
    # 0-based 'index' dimension after conversion to xarray.
    return pd.concat(frames, ignore_index=True).to_xarray()


def prepare_input_data_whole_summer_paper(tblock_dim, isf_dim, tblock_out, isf_out, TS_opt, inputpath_data):

    """
    Computes normalisation metrics and normalised predictors and target for cross-validation.

    Parameters
    ----------
    tblock_dim : list
        List of all time blocks to conduct the cross-validation on.
    isf_dim : list
        List of all ice shelves to conduct the cross-validation on.
    tblock_out : int
        Time block to leave out in cross-validation. 0 means no time block is left out.
    isf_out : int
        Ice shelf to leave out in cross-validation. 0 means no ice shelf is left out.
    TS_opt : str
        Type of input temperature and salinity profiles to use. Can be 'extrap', 'whole', 'thermocline'
    inputpath_data : str
        Path to folder where to find the preformatted csv files.

    Returns
    -------
    summary_ds_all: xr.Dataset
        Normalisation metrics for the methods 'std', 'interquart' and 'minmax',
        concatenated along ``norm_method``; 'mean_vars' and 'range_vars' are
        selected from its ``metric`` dimension for the normalisation.
    var_train_norm: xr.Dataset
        Dataset containing normalised training predictors and target.
    var_val_norm: xr.Dataset
        Dataset containing normalised validation predictors and target.

    Raises
    ------
    ValueError
        If ``TS_opt`` is unknown, if the combination of ``tblock_out`` and
        ``isf_out`` is not supported (e.g. both left out at once), or if the
        element to leave out is not part of ``tblock_dim`` / ``isf_dim``.
    """

    ## are we dealing with leave-one-out cross-validation over time blocks?
    tblock_list = list(tblock_dim)
    if tblock_out > 0:
        tblock_list.remove(tblock_out)

    ## are we dealing with leave-one-out cross-validation over ice shelves?
    isf_list = list(isf_dim)
    if isf_out > 0:
        isf_list.remove(isf_out)

    ## which profile option are we using for temperature and salinity
    profile_subdirs = {
        'extrap': 'EXTRAPOLATED_ISFDRAFT_CHUNKS/',
        'whole': 'WHOLE_PROF_CHUNKS/',
        'thermocline': 'THERMOCLINE_CHUNKS/',
    }
    if TS_opt not in profile_subdirs:
        raise ValueError(
            "Unknown TS_opt " + repr(TS_opt) + ", expected one of " + str(sorted(profile_subdirs))
        )
    inputpath_prof = inputpath_data + profile_subdirs[TS_opt]

    ## decide what the validation set is made of (before any expensive reading)
    nothing_left_out = (isf_out == 0) and (tblock_out == 0)
    if (tblock_out > 0) and (isf_out == 0):
        tt_val = [tblock_out]
        isf_val = isf_list
    elif (isf_out > 0) and (tblock_out == 0):
        isf_val = [isf_out]
        tt_val = tblock_list
    elif nothing_left_out:
        isf_val = isf_list
        tt_val = tblock_list
    else:
        # Previously this case only printed the message and then crashed on an
        # undefined variable; raise explicitly instead.
        raise ValueError("I don't know how to handle leave ice shelves AND time blocks out, please teach me!")

    ### prepare training dataset
    train_input_df = _load_input_chunks(inputpath_prof, tblock_list, isf_list)

    ## prepare validation dataset
    if nothing_left_out:
        # Same files as the training set: reuse the loaded data instead of reading it twice.
        val_input_df = train_input_df
    else:
        val_input_df = _load_input_chunks(inputpath_prof, tt_val, isf_val)

    ## prepare input and target
    y_train = train_input_df['melt_m_ice_per_y']
    x_train = train_input_df.drop_vars(['melt_m_ice_per_y'])

    ## normalise
    # NOTE(review): the original called a bare `compute_norm_metrics`, which is
    # neither defined nor imported in this notebook; it is assumed to be the
    # function of that name in prep_input_data (imported as `indat`) - confirm.
    norm_summary_list = [
        indat.compute_norm_metrics(x_train, y_train, norm_method)
        for norm_method in ['std', 'interquart', 'minmax']
    ]
    summary_ds_all = xr.concat(norm_summary_list, dim='norm_method')

    var_mean = summary_ds_all.sel(metric='mean_vars')
    var_range = summary_ds_all.sel(metric='range_vars')

    var_train_norm = (train_input_df - var_mean)/var_range
    var_val_norm = (val_input_df - var_mean)/var_range

    return summary_ds_all, var_train_norm, var_val_norm

In [None]:
tblock_dim = [*range(1, 17), *range(21, 50)]
isf_dim = [
    10, 11, 12, 13, 18, 22, 23, 24, 25, 30, 31, 33,
    38, 39, 40, 42, 43, 44, 45, 47, 48, 51, 52, 53,
    54, 55, 58, 61, 65, 66, 69, 70, 71, 73, 75,
]
TS_opt = 'extrap'

# Results are stored in the folder of the chosen profile option.
outputpath_CVinput = inputpath_data + {
    'extrap': 'EXTRAPOLATED_ISFDRAFT_CHUNKS/',
    'whole': 'WHOLE_PROF_CHUNKS/',
    'thermocline': 'THERMOCLINE_CHUNKS/',
}[TS_opt]

# Both "left out" selectors are 0 - presumably meaning that the whole dataset
# is used (confirm against prepare_input_data_CV).
tblock_out = 0
isf_out = 0
metrics_ds, var_train_norm, var_val_norm = indat.prepare_input_data_CV(
    tblock_dim, isf_dim, tblock_out, isf_out, TS_opt, inputpath_data
)
#metrics_ds = indat.prepare_input_data_CV_onlymetrics(tblock_dim, isf_dim, tblock_out, isf_out, TS_opt, inputpath_data)

# Write the metrics and both normalised datasets to netCDF.
for dataset, nc_name in (
    (metrics_ds, 'metrics_norm_wholedataset_orig_christoph.nc'),
    (var_train_norm, 'train_data_wholedataset_orig_christoph.nc'),
    (var_val_norm, 'val_data_wholedataset_orig_christoph.nc'),
):
    dataset.to_netcdf(outputpath_CVinput + nc_name)

PREPARE ADDITIONAL INPUT VARIABLES

FOR WHOLE PROFILES, PUT ALL CHUNKS TOGETHER

In [None]:
## which profile option are we using for temperature and salinity
# NOTE(review): TS_opt, tblock_dim and isf_dim are not set in this cell; on a
# top-to-bottom run they are still 'extrap' and time blocks 1-16 + 21-49 from
# the previous cell, although the output names below say "tblocks1to13" and
# later cells read these files from WHOLE_PROF_CHUNKS/ - confirm the intended
# values and set them explicitly here.
profile_subdirs = {
    'extrap': 'EXTRAPOLATED_ISFDRAFT_CHUNKS/',
    'whole': 'WHOLE_PROF_CHUNKS/',
    'thermocline': 'THERMOCLINE_CHUNKS/',
}
if TS_opt not in profile_subdirs:
    raise ValueError(
        "Unknown TS_opt " + repr(TS_opt) + ", expected one of " + str(sorted(profile_subdirs))
    )
inputpath_prof = inputpath_data + profile_subdirs[TS_opt]

chunk_frames = []
tblock_list = []   # time block of every row, aligned with the combined index
isf_list = []      # ice shelf number of every row, aligned with the combined index

for tt in tblock_dim:
    print(tt)

    for kisf in isf_dim:

        # The first three csv columns are read as the index and then discarded.
        clean_df_nrun_kisf = pd.read_csv(
            inputpath_prof + 'dataframe_input_isf'+str(kisf).zfill(3)+'_'+str(tt).zfill(3)+'.csv',
            index_col=[0,1,2],
        ).reset_index(drop=True)
        chunk_frames.append(clean_df_nrun_kisf)

        n_rows = len(clean_df_nrun_kisf)
        tblock_list.extend([int(tt)] * n_rows)
        isf_list.extend([int(kisf)] * n_rows)

# Concatenate once (linear cost) instead of growing an xarray dataset inside
# the loop; ignore_index yields the contiguous 0-based 'index' dimension.
train_input_df = pd.concat(chunk_frames, ignore_index=True).to_xarray()

train_input_df.to_netcdf(inputpath_prof + 'dataframe_allisf_tblocks1to13.nc')
index_ds = xr.Dataset({'Nisf': (['index'], isf_list), 'tblock': (['index'], tblock_list)}, coords={'index': train_input_df.index})
index_ds.to_netcdf(inputpath_prof + 'indexing_allisf_tblocks1to13.nc')

CV over shelves

In [None]:
tblock_dim = range(1,14)
# Full set of ice shelves of the cross-validation.
isf_dim = [10,11,12,13,18,22,23,24,25,30,31,33,38,39,40,42,43,44,45,47,48,51,52,53,54,55,58,61,65,66,69,70,71,73,75]
# Ice shelves still to be processed in this pass.
# NOTE(review): the original assigned this shortened list to isf_dim itself, so
# it was also handed to the preparation function as the complete set of ice
# shelves; it looks like a resume list, hence the split - confirm.
isf_out_todo = [38,55,58,61,65,66,69,70,71,73,75]
TS_opt = 'whole'
norm_method = 'std'

outputpath_CVinput = inputpath_data + {
    'extrap': 'EXTRAPOLATED_ISFDRAFT_CHUNKS_CV/',
    'whole': 'WHOLE_PROF_CHUNKS_CV/',
    'thermocline': 'THERMOCLINE_CHUNKS_CV/',
}[TS_opt]

# Rows per dask chunk (an integer rather than the float 1e6).
# This size seems to work; it could probably be increased.
chunk_rows = 1000000

inputpath_prof = inputpath_data+'WHOLE_PROF_CHUNKS/'
ds_all = xr.open_mfdataset(inputpath_prof + 'dataframe_allisf_tblocks1to13.nc').chunk({'index': chunk_rows})
ds_idx = xr.open_mfdataset(inputpath_prof + 'indexing_allisf_tblocks1to13.nc').chunk({'index': chunk_rows})

for isf_out in tqdm(isf_out_todo):

    print(isf_out)
    tblock_out = 0  # leave-one-out over ice shelves only
    data_train_norm, data_val_norm = indat.prepare_normed_input_data_CV_metricsgiven(
        tblock_dim, isf_dim, tblock_out, isf_out, TS_opt, inputpath_data, norm_method,
        ds_all=ds_all, ds_idx=ds_idx,
    )

    # Both files share the suffix identifying the method and what was left out.
    file_suffix = '_CV_norm'+norm_method+'_noisf'+str(isf_out).zfill(3)+'_notblock'+str(tblock_out).zfill(3)+'.nc'
    data_train_norm.to_netcdf(outputpath_CVinput + 'train_data' + file_suffix)
    data_val_norm.to_netcdf(outputpath_CVinput + 'val_data' + file_suffix)

    # Drop the references before the next iteration builds new datasets.
    del data_train_norm
    del data_val_norm

In [None]:
# The cross-validation loop of the previous cell deletes data_train_norm at the
# end of every iteration, so on a top-to-bottom run the name no longer exists.
# Look it up defensively: show the dataset if it is still in memory (e.g. after
# an interrupted loop), otherwise show a notice instead of raising NameError.
globals().get('data_train_norm', 'data_train_norm is not in memory (it is deleted after being written to disk)')

CV over time

In [None]:
# Full set of time blocks of the cross-validation.
tblock_dim = range(1,14)
# CONTINUE AT 6: time blocks still to be processed in this pass.
# NOTE(review): the original assigned range(6,14) to tblock_dim itself, so it
# was also handed to the preparation function as the complete set of time
# blocks; it looks like a resume range, hence the split - confirm.
tblock_out_todo = range(6,14)
isf_dim = [10,11,12,13,18,22,23,24,25,30,31,33,38,39,40,42,43,44,45,47,48,51,52,53,54,55,58,61,65,66,69,70,71,73,75]
TS_opt = 'whole'
norm_method = 'std'

outputpath_CVinput = inputpath_data + {
    'extrap': 'EXTRAPOLATED_ISFDRAFT_CHUNKS_CV/',
    'whole': 'WHOLE_PROF_CHUNKS_CV/',
    'thermocline': 'THERMOCLINE_CHUNKS_CV/',
}[TS_opt]

# Rows per dask chunk (an integer rather than the float 1e6).
chunk_rows = 1000000

inputpath_prof = inputpath_data+'WHOLE_PROF_CHUNKS/'
ds_all = xr.open_mfdataset(inputpath_prof + 'dataframe_allisf_tblocks1to13.nc').chunk({'index': chunk_rows})
ds_idx = xr.open_mfdataset(inputpath_prof + 'indexing_allisf_tblocks1to13.nc').chunk({'index': chunk_rows})

for tblock_out in tqdm(tblock_out_todo):

    print(tblock_out)
    isf_out = 0  # leave-one-out over time blocks only
    data_train_norm, data_val_norm = indat.prepare_normed_input_data_CV_metricsgiven(
        tblock_dim, isf_dim, tblock_out, isf_out, TS_opt, inputpath_data, norm_method,
        ds_all=ds_all, ds_idx=ds_idx,
    )

    # Both files share the suffix identifying the method and what was left out.
    file_suffix = '_CV_norm'+norm_method+'_noisf'+str(isf_out).zfill(3)+'_notblock'+str(tblock_out).zfill(3)+'.nc'
    data_train_norm.to_netcdf(outputpath_CVinput + 'train_data' + file_suffix)
    data_val_norm.to_netcdf(outputpath_CVinput + 'val_data' + file_suffix)

    # Drop the references before the next iteration builds new datasets.
    del data_train_norm
    del data_val_norm