In [None]:
"""
Created on Thu Sep 08 11:15 2022

This script is to prepare the normalising coefficients and input data for the cross-validation

Author: Clara Burgard
"""

In [None]:
import datetime
import glob
import time

import distributed
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
import xarray as xr
from tqdm.notebook import trange, tqdm
#from tqdm import trange, tqdm

from basal_melt_neural_networks.constants import *
import basal_melt_neural_networks.diagnostic_functions as diag
import basal_melt_neural_networks.data_formatting as dfmt
import basal_melt_neural_networks.prep_input_data as indat

In [None]:
# Create the Dask distributed client used by the computations in this notebook.
# The keyword arguments set the number of workers, the dashboard port,
# the scratch directory and the memory limit.
client = distributed.Client(
    n_workers=10,
    dashboard_address=':8795',
    local_directory='/tmp',
    memory_limit='6GB',
)

PREPARE THE CONTEXT OF THE INPUT DATA

In [None]:
# Root directory of the interim input data; all other paths are built from it.
inputpath_data = '/bettik/burgardc/DATA/NN_PARAM/interim/INPUT_DATA/'

# Time blocks (1 to 13) covered by the cross-validation.
tblock_dim = range(1, 14)

# Identifiers of the ice shelves ("isf") covered by the cross-validation.
isf_dim = [
    10, 11, 12, 13, 18,
    22, 23, 24, 25,
    30, 31, 33, 38, 39,
    40, 42, 43, 44, 45, 47, 48,
    51, 52, 53, 54, 55, 58,
    61, 65, 66, 69,
    70, 71, 73, 75,
]
    

CV over time: normalisation metrics

In [None]:
# Cross-validation over time: leave one time block out at a time and write the
# normalisation metrics returned by `indat.prepare_input_data_CV_onlymetrics`
# to one netCDF file per left-out block.
#
# Relies on `inputpath_data` from the configuration cell.
tblock_dim = range(1, 14)
isf_dim = [10,11,12,13,18,22,23,24,25,30,31,33,38,39,40,42,43,44,45,47,48,51,52,53,54,55,58,61,65,66,69,70,71,73,75]
TS_opt = 'whole'

# Output sub-directory for each temperature/salinity profile option.
cv_subdirs = {
    'extrap': 'EXTRAPOLATED_ISFDRAFT_CHUNKS_CV/',
    'whole': 'WHOLE_PROF_CHUNKS_CV/',
    'thermocline': 'THERMOCLINE_CHUNKS_CV/',
}
if TS_opt not in cv_subdirs:
    # Fail loudly rather than silently reusing an `outputpath_CVinput`
    # left over from a previously executed cell.
    raise ValueError(
        "Unknown TS_opt {!r}; expected one of {}".format(TS_opt, sorted(cv_subdirs))
    )
outputpath_CVinput = inputpath_data + cv_subdirs[TS_opt]

# presumably 0 means "no ice shelf left out" - confirm in prep_input_data
isf_out = 0

# NOTE: only the metrics are produced here. An earlier variant used
# `indat.prepare_input_data_CV`, which also returned the train/validation data.
for tblock_out in tqdm(tblock_dim):
    metrics_ds = indat.prepare_input_data_CV_onlymetrics(
        tblock_dim, isf_dim, tblock_out, isf_out, TS_opt, inputpath_data
    )
    metrics_file = 'metrics_norm_CV_noisf{:03d}_notblock{:03d}.nc'.format(isf_out, tblock_out)
    metrics_ds.to_netcdf(outputpath_CVinput + metrics_file)

CV over shelves: normalisation metrics

In [None]:
# Cross-validation over ice shelves: leave one ice shelf out at a time and
# write the normalisation metrics returned by
# `indat.prepare_input_data_CV_onlymetrics` to one netCDF file per left-out shelf.
#
# Relies on `inputpath_data` from the configuration cell.
tblock_dim = range(1, 14)
isf_dim = [10,11,12,13,18,22,23,24,25,30,31,33,38,39,40,42,43,44,45,47,48,51,52,53,54,55,58,61,65,66,69,70,71,73,75]
TS_opt = 'whole'

# Output sub-directory for each temperature/salinity profile option.
cv_subdirs = {
    'extrap': 'EXTRAPOLATED_ISFDRAFT_CHUNKS_CV/',
    'whole': 'WHOLE_PROF_CHUNKS_CV/',
    'thermocline': 'THERMOCLINE_CHUNKS_CV/',
}
if TS_opt not in cv_subdirs:
    # Fail loudly rather than silently reusing an `outputpath_CVinput`
    # left over from a previously executed cell.
    raise ValueError(
        "Unknown TS_opt {!r}; expected one of {}".format(TS_opt, sorted(cv_subdirs))
    )
outputpath_CVinput = inputpath_data + cv_subdirs[TS_opt]

# presumably 0 means "no time block left out" - confirm in prep_input_data
tblock_out = 0

# NOTE: only the metrics are produced here. An earlier variant used
# `indat.prepare_input_data_CV`, which also returned the train/validation data.
for isf_out in tqdm(isf_dim):
    metrics_ds = indat.prepare_input_data_CV_onlymetrics(
        tblock_dim, isf_dim, tblock_out, isf_out, TS_opt, inputpath_data
    )
    metrics_file = 'metrics_norm_CV_noisf{:03d}_notblock{:03d}.nc'.format(isf_out, tblock_out)
    metrics_ds.to_netcdf(outputpath_CVinput + metrics_file)
    

FOR WHOLE PROFILES, PUT ALL CHUNKS TOGETHER

In [None]:
# Merge the per-ice-shelf, per-time-block CSV chunks into one dataset along a
# single running 'index' dimension, and store which ice shelf and time block
# each row came from in a second dataset.
#
# Relies on `TS_opt`, `tblock_dim`, `isf_dim` and `inputpath_data` defined in
# earlier cells.

## which profile option are we using for temperature and salinity
prof_subdirs = {
    'extrap': 'EXTRAPOLATED_ISFDRAFT_CHUNKS/',
    'whole': 'WHOLE_PROF_CHUNKS/',
    'thermocline': 'THERMOCLINE_CHUNKS/',
}
if TS_opt not in prof_subdirs:
    # Fail loudly rather than silently reusing an `inputpath_prof`
    # left over from a previously executed cell.
    raise ValueError(
        "Unknown TS_opt {!r}; expected one of {}".format(TS_opt, sorted(prof_subdirs))
    )
inputpath_prof = inputpath_data + prof_subdirs[TS_opt]

chunk_list = []    # one xarray Dataset per (time block, ice shelf) file
tblock_list = []   # time block of every row, aligned with the final index
isf_list = []      # ice shelf id of every row, aligned with the final index
n_rows_total = 0   # number of rows collected so far = first index of the next chunk

for tt in tqdm(tblock_dim):
    for kisf in isf_dim:
        chunk_file = inputpath_prof + 'dataframe_input_isf' + str(kisf).zfill(3) + '_' + str(tt).zfill(3) + '.csv'

        # The first three CSV columns form the original index; it is dropped
        # and replaced by a simple running integer index.
        clean_df_nrun_kisf = pd.read_csv(chunk_file, index_col=[0, 1, 2]).reset_index(drop=True)
        n_rows = len(clean_df_nrun_kisf)

        # Shift the chunk's index so it continues where the previous chunk stopped.
        clean_ds_nrun_kisf = clean_df_nrun_kisf.to_xarray().assign_coords(
            {'index': np.arange(n_rows_total, n_rows_total + n_rows)}
        )

        chunk_list.append(clean_ds_nrun_kisf)
        tblock_list.extend([tt] * n_rows)
        isf_list.extend([kisf] * n_rows)
        n_rows_total += n_rows

# Concatenate once at the end: concatenating inside the loop copies the
# accumulated data again on every iteration.
# (Despite its name, `train_input_df` is an xarray Dataset.)
train_input_df = xr.concat(chunk_list, dim='index')
train_input_df.to_netcdf(inputpath_prof + 'dataframe_allisf_tblocks1to13.nc')

index_ds = xr.Dataset(
    {'Nisf': (['index'], isf_list), 'tblock': (['index'], tblock_list)},
    coords={'index': train_input_df.index},
)
index_ds.to_netcdf(inputpath_prof + 'indexing_allisf_tblocks1to13.nc')

CV over shelves: normalised training and validation data

In [None]:
# Cross-validation over ice shelves: write the normalised training and
# validation data returned by
# `indat.prepare_normed_input_data_CV_metricsgiven` to netCDF files.
#
# Relies on `inputpath_data` from the configuration cell.
tblock_dim = range(1, 14)
isf_dim = [10,11,12,13,18,22,23,24,25,30,31,33,38,39,40,42,43,44,45,47,48,51,52,53,54,55,58,61,65,66,69,70,71,73,75]
TS_opt = 'whole'
norm_method = 'std'

# Output sub-directory for each temperature/salinity profile option.
cv_subdirs = {
    'extrap': 'EXTRAPOLATED_ISFDRAFT_CHUNKS_CV/',
    'whole': 'WHOLE_PROF_CHUNKS_CV/',
    'thermocline': 'THERMOCLINE_CHUNKS_CV/',
}
if TS_opt not in cv_subdirs:
    # Fail loudly rather than silently reusing an `outputpath_CVinput`
    # left over from a previously executed cell.
    raise ValueError(
        "Unknown TS_opt {!r}; expected one of {}".format(TS_opt, sorted(cv_subdirs))
    )
outputpath_CVinput = inputpath_data + cv_subdirs[TS_opt]

# NOTE(review): the input directory is fixed to the whole-profile chunks
# regardless of TS_opt - confirm this is intended.
inputpath_prof = inputpath_data + 'WHOLE_PROF_CHUNKS/'
# Original author note: seems to work, the chunk size could probably be increased.
ds_all = xr.open_mfdataset(inputpath_prof + 'dataframe_allisf_tblocks1to13.nc')
ds_idx = xr.open_mfdataset(inputpath_prof + 'indexing_allisf_tblocks1to13.nc')

# presumably 0 means "no time block left out" - confirm in prep_input_data
tblock_out = 0

# NOTE(review): only the first ice shelf is processed (`isf_dim[0:1]`).
# Loop over `isf_dim` to produce the files for every shelf.
for isf_out in tqdm(isf_dim[0:1]):
    data_train_norm, data_val_norm = indat.prepare_normed_input_data_CV_metricsgiven(
        tblock_dim, isf_dim, tblock_out, isf_out, TS_opt, inputpath_data, norm_method,
        ds_all=ds_all, ds_idx=ds_idx,
    )
    # Shared tail of the training and validation file names.
    file_suffix = '_CV_norm' + norm_method + '_noisf{:03d}_notblock{:03d}.nc'.format(isf_out, tblock_out)
    data_train_norm.to_netcdf(outputpath_CVinput + 'train_data' + file_suffix)
    data_val_norm.to_netcdf(outputpath_CVinput + 'val_data' + file_suffix)

In [None]:
# Display the normalised training data from the last iteration of the preceding loop.
data_train_norm

CV over time: normalised training and validation data

In [None]:
# Cross-validation over time: leave one time block out at a time and write the
# normalised training and validation data returned by
# `indat.prepare_normed_input_data_CV_metricsgiven` to netCDF files.
#
# Relies on `inputpath_data` from the configuration cell.
tblock_dim = range(1, 14)
isf_dim = [10,11,12,13,18,22,23,24,25,30,31,33,38,39,40,42,43,44,45,47,48,51,52,53,54,55,58,61,65,66,69,70,71,73,75]
TS_opt = 'whole'
norm_method = 'std'

# Output sub-directory for each temperature/salinity profile option.
cv_subdirs = {
    'extrap': 'EXTRAPOLATED_ISFDRAFT_CHUNKS_CV/',
    'whole': 'WHOLE_PROF_CHUNKS_CV/',
    'thermocline': 'THERMOCLINE_CHUNKS_CV/',
}
if TS_opt not in cv_subdirs:
    # Fail loudly rather than silently reusing an `outputpath_CVinput`
    # left over from a previously executed cell.
    raise ValueError(
        "Unknown TS_opt {!r}; expected one of {}".format(TS_opt, sorted(cv_subdirs))
    )
outputpath_CVinput = inputpath_data + cv_subdirs[TS_opt]

# NOTE(review): the input directory is fixed to the whole-profile chunks
# regardless of TS_opt - confirm this is intended.
inputpath_prof = inputpath_data + 'WHOLE_PROF_CHUNKS/'
ds_all = xr.open_dataset(inputpath_prof + 'dataframe_allisf_tblocks1to13.nc')
ds_idx = xr.open_dataset(inputpath_prof + 'indexing_allisf_tblocks1to13.nc')

# presumably 0 means "no ice shelf left out" - confirm in prep_input_data
isf_out = 0

for tblock_out in tqdm(tblock_dim):
    data_train_norm, data_val_norm = indat.prepare_normed_input_data_CV_metricsgiven(
        tblock_dim, isf_dim, tblock_out, isf_out, TS_opt, inputpath_data, norm_method,
        ds_all=ds_all, ds_idx=ds_idx,
    )
    # Shared tail of the training and validation file names.
    file_suffix = '_CV_norm' + norm_method + '_noisf{:03d}_notblock{:03d}.nc'.format(isf_out, tblock_out)
    data_train_norm.to_netcdf(outputpath_CVinput + 'train_data' + file_suffix)
    data_val_norm.to_netcdf(outputpath_CVinput + 'val_data' + file_suffix)