In [None]:
"""
Created on Wed Apr 05 13:52 2023

Check distribution of input variables

@author: Clara Burgard
"""

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xarray as xr

In [None]:
%matplotlib qt5

In [None]:
# Directory that the figure-saving cells prepend to their PDF file names.
# NOTE(review): hard-coded absolute path; it has to end with '/' because the
# file names are appended by plain string concatenation.
plot_path = '/bettik/burgardc/PLOTS/NN_plots/input_vars/'

In [None]:
# Root folder of the interim input data and the sub-folder from which the two
# netCDF files below are read.
inputpath_data = '/bettik/burgardc/DATA/NN_PARAM/interim/INPUT_DATA/'
outputpath_CVinput = inputpath_data+'EXTRAPOLATED_ISFDRAFT_CHUNKS/'

# Reference sample, restricted to the 'std' normalisation method.
# NOTE(review): the variable is named train_data but the file that is read is
# 'val_data_wholedataset.nc' -- confirm this is the intended file.
train_data = xr.open_dataset(outputpath_CVinput + 'val_data_wholedataset.nc').sel(norm_method='std')

# Normalisation metrics for the same method, as a DataFrame whose rows are
# looked up by label ('mean_vars', 'range_vars') in the following cells.
norm_metrics_file = xr.open_dataset(outputpath_CVinput + 'metrics_norm_wholedataset.nc')
# drop_vars is the supported replacement for the deprecated Dataset.drop; it
# removes the scalar 'norm_method' coordinate left behind by .sel().
norm_metrics = norm_metrics_file.sel(norm_method='std').drop_vars('norm_method').to_dataframe()

In [None]:
# Undo the normalisation of the reference data, i.e. apply the inverse of
# (x - mean_vars) / range_vars using the rows of the metrics table.
scale_per_var = norm_metrics.loc['range_vars']
offset_per_var = norm_metrics.loc['mean_vars']
train_data_notnormed = train_data * scale_per_var + offset_per_var
 

In [None]:
def _read_and_normalise(csv_file):
    """Read one input CSV and return (raw frame, normalised frame).

    The normalised frame is (raw - mean_vars) / range_vars, with both rows
    taken from the notebook-level ``norm_metrics`` table.
    """
    raw_df = pd.read_csv(csv_file)
    normed_df = (raw_df - norm_metrics.loc['mean_vars']) / norm_metrics.loc['range_vars']
    return raw_df, normed_df


# Run bf663
inputpath_data = '/bettik/burgardc/DATA/NN_PARAM/interim/INPUT_DATA/SMITH_bf663_EXTRAPDRAFT_CHUNKS/'
indata_df_bf663, indata_df_bf663_normed = _read_and_normalise(
    inputpath_data + 'dataframe_shuffledinput_allisf_1980-2040_bf663.csv'
)

# Run bi646 (inputpath_data is re-pointed to this run's folder, as before)
inputpath_data = '/bettik/burgardc/DATA/NN_PARAM/interim/INPUT_DATA/SMITH_bi646_EXTRAPDRAFT_CHUNKS/'
indata_df_bi646, indata_df_bi646_normed = _read_and_normalise(
    inputpath_data + 'dataframe_shuffledinput_allisf_1980-2040_bi646.csv'
)


In [None]:
# Each plotted variable next to the title used for its panel. Keeping the two
# side by side makes it impossible for the lists to drift out of step.
_variables_and_labels = [
    ('dGL', 'Distance GL [m]'),
    ('dIF', 'Distance IF [m]'),
    ('corrected_isfdraft', 'Ice draft depth \n [m below sea-level]'),
    ('bathy_metry', 'Bathymetry \n [m below sea-level]'),
    ('slope_bed_lon', 'Slope bed lon [rad]'),
    ('slope_bed_lat', 'Slope bed lat [rad]'),
    ('slope_ice_lon', 'Slope ice lon [rad]'),
    ('slope_ice_lat', 'Slope ice lat [rad]'),
    ('theta_in', 'Temperature \n [degrees C]'),
    ('salinity_in', 'Salinity [psu]'),
    ('T_mean', 'Temperature mean \n [degrees C]'),
    ('S_mean', 'Salinity mean [psu]'),
    ('T_std', 'Temperature std \n [degrees C]'),
    ('S_std', 'Salinity std [psu]'),
    ('melt_m_ice_per_y', 'Melt [m ice per y]'),
]

# Column names looked up in the data, and the matching panel titles.
input_list = [name for name, _ in _variables_and_labels]
label_list = [label for _, label in _variables_and_labels]

In [None]:
# One figure per input variable: density histograms of the de-normalised
# reference data (grey) and the two SMITH runs (orange: bf663, red: bi646),
# all binned over the same window, saved as one PDF per variable.
n_bins = 50
for vv in input_list:
    # Shared window: smallest 1st percentile to largest 99th percentile.
    lower_candidates = [
        train_data_notnormed[vv].quantile(0.01).values,
        indata_df_bf663[vv].quantile(0.01),
        indata_df_bi646[vv].quantile(0.01),
    ]
    upper_candidates = [
        train_data_notnormed[vv].quantile(0.99).values,
        indata_df_bf663[vv].quantile(0.99),
        indata_df_bi646[vv].quantile(0.99),
    ]
    min_all = min(lower_candidates)
    max_all = max(upper_candidates)

    plt.figure()
    samples_and_colours = (
        (train_data_notnormed[vv], 'grey'),
        (indata_df_bf663[vv], 'orange'),
        (indata_df_bi646[vv], 'red'),
    )
    for sample, colour in samples_and_colours:
        plt.hist(sample, range=(min_all, max_all), bins=n_bins, alpha=0.4, color=colour, density=True)
    plt.title(vv)
    plt.savefig(plot_path+'distrib_input_'+vv+'.pdf')

In [None]:
# Overview figure on a 4x4 grid: for every input variable, overlaid count
# histograms of the normalised reference data (grey) and the two normalised
# SMITH runs (orange: bf663, red: bi646).
f = plt.figure()
f.set_size_inches(8.25*1.5, 8.25*1.5)

# Axes are kept in a dict keyed by panel number.
ax = {}
n_bins = 50
for k, vv in enumerate(input_list):
    # Shared window: smallest 1st percentile to largest 99th percentile of
    # the three normalised samples, so all three use the same bins.
    min_all = min([train_data[vv].quantile(0.01).values,
                   indata_df_bf663_normed[vv].quantile(0.01),
                   indata_df_bi646_normed[vv].quantile(0.01)])
    max_all = max([train_data[vv].quantile(0.99).values,
                   indata_df_bf663_normed[vv].quantile(0.99),
                   indata_df_bi646_normed[vv].quantile(0.99)])

    # The enumerate index doubles as panel number (subplots are 1-based).
    ax[k] = f.add_subplot(4, 4, k+1)

    samples_and_colours = (
        (train_data[vv], 'grey'),
        (indata_df_bf663_normed[vv], 'orange'),
        (indata_df_bi646_normed[vv], 'red'),
    )
    for sample, colour in samples_and_colours:
        ax[k].hist(sample, range=(min_all, max_all), bins=n_bins, alpha=0.35, color=colour)

    ax[k].set_title(label_list[k])

f.tight_layout()
sns.despine()

In [None]:
# Seaborn variant of the overview figure: for every input variable, three
# histplot calls (normalised reference data and the two normalised SMITH
# runs) drawn into the same panel of a 4x4 grid, with seaborn's defaults for
# binning and colours.
#
# The 1%/99% window that used to be computed here was derived from the
# un-normalised data and never passed to any plotting call, so it has been
# dropped together with the unused counters.
f = plt.figure()
f.set_size_inches(8.25*1.25, 8.25*1.25)

# Axes are kept in a dict keyed by panel number.
ax = {}
for k, vv in enumerate(input_list):
    ax[k] = f.add_subplot(4, 4, k+1)

    # Name the target axes explicitly instead of relying on pyplot's
    # "current axes" happening to be the subplot that was just added.
    sns.histplot(train_data[vv].to_dataframe(), x=vv, ax=ax[k])
    sns.histplot(indata_df_bf663_normed, x=vv, ax=ax[k])
    sns.histplot(indata_df_bi646_normed, x=vv, ax=ax[k])

f.tight_layout()
sns.despine()

In [None]:
# Overview figure on a 4x4 grid: for every input variable, overlaid count
# histograms of the de-normalised reference data (grey) and the two SMITH
# runs (orange: bf663, red: bi646). The figure is saved as a single PDF.
f = plt.figure()
f.set_size_inches(8.25*1.25, 8.25*1.25)

# Axes are kept in a dict keyed by panel number.
ax = {}
n_bins = 50
for k, vv in enumerate(input_list):
    # Shared window: smallest 1st percentile to largest 99th percentile of
    # the three samples, so all three use the same bins.
    min_all = min([train_data_notnormed[vv].quantile(0.01).values,
                   indata_df_bf663[vv].quantile(0.01),
                   indata_df_bi646[vv].quantile(0.01)])
    max_all = max([train_data_notnormed[vv].quantile(0.99).values,
                   indata_df_bf663[vv].quantile(0.99),
                   indata_df_bi646[vv].quantile(0.99)])

    # The enumerate index doubles as panel number (subplots are 1-based).
    ax[k] = f.add_subplot(4, 4, k+1)

    samples_and_colours = (
        (train_data_notnormed[vv], 'grey'),
        (indata_df_bf663[vv], 'orange'),
        (indata_df_bi646[vv], 'red'),
    )
    for sample, colour in samples_and_colours:
        ax[k].hist(sample, range=(min_all, max_all), bins=n_bins, alpha=0.4, color=colour)

    # Only the human-readable label is used as title; the earlier
    # set_title(vv) call was overwritten immediately and had no effect.
    ax[k].set_title(label_list[k])

f.tight_layout()
sns.despine()
plt.savefig(plot_path+'distrib_input_allvar.pdf')

In [None]:
# Display one column of the bf663 run.
# NOTE(review): `vv` is not assigned in this cell; it holds whatever value a
# previously executed cell left behind (e.g. the last entry of input_list
# after one of the plotting loops), so the output depends on execution order.
indata_df_bf663[vv]

In [None]:
# Display the un-normalised bi646 input DataFrame as the cell output.
indata_df_bi646

In [None]:
# Seaborn experiment on a 4x4 grid: for every input variable, three histplot
# calls (de-normalised reference data in grey, bf663 in orange, bi646 in red)
# drawn into the same panel with stat='percent'.
# NOTE(review): every call passes both x and y (y being the row index), which
# presumably makes seaborn draw a two-dimensional histogram of value against
# row position rather than a one-dimensional distribution -- confirm that
# this is intended.
f = plt.figure()
f.set_size_inches(8.25*1.25, 8.25*1.25)

# Axes are kept in a dict keyed by the running panel counter i.
ax={}

i = 0
# nn and ii are assigned but never read in this cell.
nn=0
ii=0
# n_bins is only referenced in the commented-out arguments below.
n_bins = 50
for vv in input_list:
    
    # Smallest 1st / largest 99th percentile across the three samples.
    # NOTE(review): neither value is passed to any call in this cell.
    min_all =  min([train_data_notnormed[vv].quantile(0.01).values,indata_df_bf663[vv].quantile(0.01),indata_df_bi646[vv].quantile(0.01)])
    max_all =  max([train_data_notnormed[vv].quantile(0.99).values,indata_df_bf663[vv].quantile(0.99),indata_df_bi646[vv].quantile(0.99)])

    # Subplot numbers are 1-based, hence i+1.
    ax[i] = f.add_subplot(4,4,i+1)

    # NOTE(review): y='index' assumes the frame returned by to_dataframe()
    # exposes something named 'index' -- TODO confirm.
    sns.histplot(train_data_notnormed[vv].to_dataframe(), x=vv, y='index', alpha = 0.4, color='grey', ax=ax[i], stat='percent') # , bins=n_bins
    sns.histplot(indata_df_bf663, x=vv, y=indata_df_bf663.index, alpha = 0.4, color='orange', ax=ax[i], stat='percent') #, bins=n_bins
    sns.histplot(indata_df_bi646, x=vv, y=indata_df_bi646.index, alpha = 0.4, color='red', ax=ax[i], stat='percent') #, bins=n_bins

    #ax[i].set_title(vv)
    #ax[i].set_xticklabels(rotation=45)

    i = i+1
    



f.tight_layout()
sns.despine()

In [None]:
# Close-up of a single variable: density histograms of the de-normalised
# reference data (grey) and the two SMITH runs (orange: bf663, red: bi646)
# over a narrower window than the overview figures.
n_bins = 50
vv = 'slope_ice_lon'

# NOTE(review): the bi646 run uses the 0.05/0.95 quantiles while the other
# two samples use 0.1/0.9; kept exactly as found -- confirm it is intended.
lower_candidates = [
    train_data_notnormed[vv].quantile(0.1).values,
    indata_df_bf663[vv].quantile(0.1),
    indata_df_bi646[vv].quantile(0.05),
]
upper_candidates = [
    train_data_notnormed[vv].quantile(0.9).values,
    indata_df_bf663[vv].quantile(0.9),
    indata_df_bi646[vv].quantile(0.95),
]
min_all = min(lower_candidates)
max_all = max(upper_candidates)

plt.figure()
samples_and_colours = (
    (train_data_notnormed[vv], 'grey'),
    (indata_df_bf663[vv], 'orange'),
    (indata_df_bi646[vv], 'red'),
)
for sample, colour in samples_and_colours:
    plt.hist(sample, range=(min_all, max_all), bins=n_bins, alpha=0.25, color=colour, density=True)
plt.title(vv)

In [None]:
# Scratch check: collect the minimum of the current variable `vv` (set by an
# earlier cell) in each of the three samples, then take the 0.95 quantile of
# those three minima and display it.
minimum_per_sample = np.array([
    train_data_notnormed[vv].min().values,
    indata_df_bf663[vv].min(),
    indata_df_bi646[vv].min(),
])
min_all = np.quantile(minimum_per_sample, 0.95)
min_all