In [None]:
"""
Created on Thu Sep 08 11:15 2022

This script is to prepare the normalising coefficients and input data for the training on the whole dataset

Author: Clara Burgard
"""

In [1]:
import numpy as np
import xarray as xr
#from tqdm.notebook import trange, tqdm
from tqdm import trange, tqdm
import glob
import matplotlib as mpl
import seaborn as sns
import datetime
import time

from multimelt.constants import *
import summer_paper.data_formatting_NN as dfmt
import summer_paper.prep_input_data_NN as indat

import distributed

In [2]:
client = distributed.Client(n_workers=8, dashboard_address=':8795', local_directory='/tmp', memory_limit='4GB')

PREPARE THE CONTEXT OF THE INPUT DATA

In [4]:
inputpath_data = '/bettik/burgardc/DATA/NN_PARAM/interim/INPUT_DATA/' 

tblock_dim = np.arange(1,14).tolist()+np.arange(21,50).tolist()
isf_dim = [10,11,12,13,18,22,23,24,25,30,31,33,38,39,40,42,43,44,45,47,48,51,52,53,54,55,58,61,65,66,69,70,71,73,75]
    

prepare metrics

In [5]:
def combine_csv_per_timeblock(tt, isf_dim, TS_opt, inputpath_data):

    """
    Combines all csv of ice shelves of one time block
    
    Parameters
    ----------
    tblock_dim : list
        List of all time blocks to conduct the cross-validation on.
    isf_dim : list
        List of all ice shelves to conduct the cross-validation on.
    tblock_out : int
        Time block to leave out in cross-validation.
    isf_out : list
        Ice shelf to leave out in cross-validation.
    TS_opt : str
        Type of input temperature and salinity profiles to use. Can be 'extrap', 'whole', 'thermocline'
    inputpath_data : str
        Path to folder where to find the preformatted csv files.

    Returns
    -------
    summary_ds_all: xr.Dataset
        Dataset containing mean and denominator of the normalisation.
    var_train_norm: xr.Dataset
        Dataset containing normalised training predictors and target.
    var_val_norm: xr.Dataset
        Dataset containing normalised validation predictors and target.
    """
    
    ## which profile option are we using for temperature and salinity
    if TS_opt == 'extrap':
        inputpath_prof = inputpath_data+'EXTRAPOLATED_ISFDRAFT_CHUNKS/'
    elif TS_opt == 'whole':
        inputpath_prof = inputpath_data+'WHOLE_PROF_CHUNKS/'
    elif TS_opt == 'thermocline':
        inputpath_prof = inputpath_data+'THERMOCLINE_CHUNKS/'

    ### prepare training dataset

    train_input_df = None        

    for kisf in tqdm(isf_dim): 

        #print(kisf)
        clean_df_nrun_kisf = pd.read_csv(inputpath_prof + 'dataframe_input_isf'+str(kisf).zfill(3)+'_'+str(tt).zfill(3)+'_new.csv',index_col=[0,1,2])
        if 'profile_domain' in clean_df_nrun_kisf.columns:
            clean_df_nrun_kisf = clean_df_nrun_kisf.drop(['profile_domain'], axis=1)
        clean_df_nrun_kisf.reset_index(drop=True, inplace=True)
        #print('here1')
        clean_ds_nrun_kisf = clean_df_nrun_kisf.to_xarray()

        #print('here2')
        if train_input_df is None:
            train_input_df = clean_ds_nrun_kisf.copy()
        else:
            new_index = clean_ds_nrun_kisf.index.values + train_input_df.index.max().values+1
            clean_ds_nrun_kisf = clean_ds_nrun_kisf.assign_coords({'index': new_index})
            train_input_df = xr.concat([train_input_df, clean_ds_nrun_kisf], dim='index') 
    
    return train_input_df

In [6]:
tblock_dim = np.arange(1,14).tolist()+np.arange(21,50).tolist()
#tblock_dim = np.arange(21,50).tolist()
isf_dim = [10,11,12,13,18,22,23,24,25,30,31,33,38,39,40,42,43,44,45,47,48,51,52,53,54,55,58,61,65,66,69,70,71,73,75]
TS_opt = 'extrap'

if TS_opt == 'extrap':
    outputpath_CVinput = inputpath_data+'EXTRAPOLATED_ISFDRAFT_CHUNKS/'
elif TS_opt == 'whole':
    outputpath_CVinput = inputpath_data+'WHOLE_PROF_CHUNKS/'
elif TS_opt == 'thermocline':
    outputpath_CVinput = inputpath_data+'THERMOCLINE_CHUNKS/'

In [1]:
#for tt in tblock_dim:
for tt in range(41,50):
    print(tt)
        
    var_train = combine_csv_per_timeblock(tt, isf_dim, TS_opt, inputpath_data)
    var_train.to_netcdf(outputpath_CVinput + 'dataframe_input_allisf_'+str(tt).zfill(3)+'.nc')

41


NameError: name 'combine_csv_per_timeblock' is not defined

In [8]:
df_tt_list = []
max_idx_prev = 0
for tt in tqdm(tblock_dim):
    df_tt = xr.open_mfdataset(outputpath_CVinput + 'dataframe_input_allisf_'+str(tt).zfill(3)+'.nc')
    if tt > 1:
        df_tt = df_tt.assign_coords({'index': (df_tt.index + max_idx_prev + 1).astype('int64')})
    max_idx_prev = df_tt.index.max()
    df_tt_list.append(df_tt.load())

  0%|          | 0/42 [00:00<?, ?it/s]

In [9]:
df_tt_all = xr.concat(df_tt_list, dim='index')

In [10]:
df_tt_all.to_netcdf(outputpath_CVinput + 'dataframe_input_allisf_timeblocks_training_summerpaper_notnormed_new.nc')

In [11]:
df_tt_all_mean = df_tt_all.mean('index')
df_tt_all_std = df_tt_all.std('index')

In [12]:
norm_method = 'std'
norm_mean = df_tt_all_mean.assign_coords({'metric': 'mean_vars', 'norm_method': norm_method})
norm_range = df_tt_all_std.assign_coords({'metric': 'range_vars', 'norm_method': norm_method})
summary_metrics = xr.concat([norm_mean, norm_range], dim='metric').assign_coords({'norm_method': norm_method})
summary_metrics.to_netcdf(outputpath_CVinput + 'metrics_norm_wholedataset_origexcept26_christoph_new.nc')


In [13]:
var_mean = summary_metrics.sel(metric='mean_vars')
var_range = summary_metrics.sel(metric='range_vars')

var_train_norm = (df_tt_all - var_mean)/var_range
var_train_norm.to_netcdf(outputpath_CVinput + 'train_data_wholedataset_origexcept26_christoph.nc')
