## SHAP on R2B5 NARVAL data

In [1]:
import os
import sys
import gc
import time
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
import shap

from tensorflow.keras.models import load_model
from tensorflow import nn 

# Add path with my_classes to sys.path
# (inserted at position 0 so that this directory is searched before any other entry)
sys.path.insert(0, '/pf/b/b309170/workspace_icon-ml/cloud_cover_parameterization/')

import importlib
import my_classes
# Re-execute my_classes so that edits made to it after the first import are picked up
# without restarting the kernel.
importlib.reload(my_classes)

from my_classes import load_data
from my_classes import read_mean_and_std

# Seed NumPy's global random generator so that the np.random draws in this notebook
# are reproducible.
# np.random.seed(10)
np.random.seed(100)

In [2]:
# Base directory from which the other locations in this notebook are derived.
root_path = '/pf/b/b309170'
# data_path = os.path.join(root_path,
#                          'my_work/NARVAL/data_var_vertinterp_R02B05')

# Directory containing the saved models.
model_path = os.path.join(
    root_path,
    'workspace_icon-ml/cloud_cover_parameterization/grid_column_based_QUBICC_R02B05/saved_models',
)

### We use SHAP on the R2B5 NARVAL data (Key Point 3)

In [3]:
# Variables requested from load_data; data_dict is afterwards indexed by these names.
ORDER_OF_VARS_NARVAL = ['qv', 'qc', 'qi', 'temp', 'pres', 'zg', 'fr_land', 'clc']

# Load all days of the vertically interpolated R02B05 NARVAL data.
data_dict = load_data(
    source='narval',
    days='all',
    vert_interp=True,
    resolution='R02B05',
    order_of_vars=ORDER_OF_VARS_NARVAL,
)

qv
qc
qi
temp
pres


In [4]:
# Sanity check for bad data points: no temperature entry may be exactly 0.
zero_temp_indices = np.where(data_dict['temp'] == 0)
# np.where returns one index array per dimension; each of the three must be empty.
for dim in range(3):
    assert zero_temp_indices[dim].size == 0

# The index arrays are no longer needed; free them right away.
del zero_temp_indices
gc.collect()

2923

In [5]:
# Unpack the three dimensions of the cloud-cover array. The names reflect how the axes are
# used further down (axis 0 repeated per time step, axis 1 indexed per vertical layer);
# assumes load_data returns clc as (time, layer, horizontal cell) — TODO confirm.
(TIME_STEPS, VERT_LAYERS, HORIZ_FIELDS) = data_dict['clc'].shape

In [6]:
# Reshaping into nd-arrays of equaling shapes (don't reshape in the vertical):
# 'zg' and 'fr_land' are tiled TIME_STEPS times along a new leading axis so that they
# can be flattened alongside the other variables.
# Only a missing variable is tolerated (and reported). Any other failure, e.g. running
# out of memory while tiling, is no longer silently swallowed by a bare `except`.
for static_key in ('zg', 'fr_land'):
    try:
        static_field = data_dict[static_key]
    except KeyError:
        print('Variable %s not found in data_dict; it is not broadcast in time.' % static_key)
        continue
    data_dict[static_key] = np.repeat(np.expand_dims(static_field, 0), TIME_STEPS, axis=0)

In [7]:
# One sample should contain a column of information: every variable is flattened to 1-D,
# and variables with a vertical axis contribute one feature per retained layer.
data_dict_reshaped = {}

for key, field in data_dict.items():
    if field.shape[1] != VERT_LAYERS:
        # No vertical axis: a single flattened feature under the original name.
        data_dict_reshaped[key] = np.reshape(field, -1)
        continue
    # Removing data above 21kms: the first four layers (indices 0-3) are skipped.
    for layer in range(4, VERT_LAYERS):
        # Layer index 4 is named '<key>_21', index 5 '<key>_22', and so on.
        layer_key = '{}_{:d}'.format(key, layer + 17)
        data_dict_reshaped[layer_key] = np.reshape(field[:, layer, :], -1)

In [8]:
# Build a DataFrame in which every entry of data_dict_reshaped becomes one column
# (dictionary keys are the column names), then preview the first rows.
df = pd.DataFrame.from_dict(data_dict_reshaped, orient='columns')
df.head()

Unnamed: 0,qv_21,qv_22,qv_23,qv_24,qv_25,qv_26,qv_27,qv_28,qv_29,qv_30,...,clc_38,clc_39,clc_40,clc_41,clc_42,clc_43,clc_44,clc_45,clc_46,clc_47
0,3e-06,3e-06,3e-06,3e-06,6e-06,2.4e-05,8.9e-05,0.000198,0.000254,0.000111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3e-06,3e-06,3e-06,3e-06,5e-06,2e-05,9.9e-05,0.000155,0.000234,9.1e-05,...,0.0,0.000827,0.00154,0.068071,0.0,0.0,0.0,0.0,0.0,0.0
2,3e-06,3e-06,3e-06,3e-06,6e-06,2.7e-05,3.1e-05,0.000106,0.000218,0.00014,...,0.0,0.729135,0.202046,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3e-06,3e-06,3e-06,2e-06,7e-06,2.3e-05,8.5e-05,0.000273,0.000234,0.000278,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3e-06,3e-06,3e-06,3e-06,6e-06,2.7e-05,8.5e-05,0.000277,0.000229,0.000292,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
def split_input_output(dataset, layers=range(21, 48)):
    """Move the cloud-cover target columns out of a column-based DataFrame.

    For every layer number in *layers* the column ``clc_<layer>`` is copied into a
    new DataFrame and removed from *dataset*.

    Note:
        *dataset* is modified in place: afterwards it only holds the remaining
        (input) columns.

    Args:
        dataset: DataFrame containing a ``clc_<layer>`` column for each requested layer.
        layers: Iterable of integer layer numbers. Defaults to 21..47 (inclusive),
            the layers this notebook works with.

    Returns:
        A DataFrame with the ``clc_*`` columns in the order given by *layers*,
        carrying the same index as *dataset*.

    Raises:
        KeyError: If a requested column is missing. All columns are selected before
            any is removed, so *dataset* is left untouched in that case.
    """
    clc_columns = ['clc_%d' % layer for layer in layers]
    # Select everything in one step (instead of growing a frame column by column);
    # this also validates all names before the input is mutated.
    output_df = dataset[clc_columns].copy()
    dataset.drop(columns=clc_columns, inplace=True)
    return output_df

In [10]:
# Separate the clc target columns from df (df keeps only the input columns afterwards).
output_df = split_input_output(df)

# Convert both frames to float32 arrays.
input_narval = np.asarray(df, dtype=np.float32)
output_narval = np.asarray(output_df, dtype=np.float32)

#### Remove columns that were constant in at least one of the training folds

In [11]:
# Column indices of the features to drop (constant in at least one training fold).
remove_fields = [27, 28, 29, 30, 31, 32, 135, 136, 137]

# Guard against a changed feature layout: the indices above refer to 163 input columns.
samples_narval, no_of_features = input_narval.shape
assert no_of_features == 163

# Drop the columns and keep the feature count in sync.
input_narval = np.delete(input_narval, remove_fields, axis=1)
no_of_features -= len(remove_fields)

#### Scale the data according to the model's mean and std

In [12]:
# Read the mean and standard deviation stored next to the fold-2 model
# (read_mean_and_std is a project helper; its exact return types are not visible here).
mean_2, std_2 = read_mean_and_std(model_path+'/cloud_cover_R2B5_QUBICC/cross_validation_column_based_fold_2.txt')
# Standardize the NARVAL inputs with these statistics.
# NOTE(review): presumably mean_2/std_2 hold one value per remaining feature
# (163 - 9 = 154) so that they broadcast over the rows — confirm.
input_narval = (input_narval-mean_2)/std_2

#### Load the model

In [13]:
# Show which files are stored in the model directory.
os.listdir(model_path+'/cloud_cover_R2B5_QUBICC')

['cross_validation_column_based_fold_3.txt',
 '.ipynb_checkpoints',
 'cross_validation_column_based_fold_3.h5',
 'delete_this.ipynb',
 'cross_validation_column_based_fold_2.txt',
 'cross_validation_column_based_fold_2.yaml',
 'cross_validation_column_based_fold_2.h5',
 'cross_validation_column_based_fold_1.txt',
 'cross_validation_column_based_fold_3.yaml',
 'scaler_100.txt',
 'cross_validation_column_based_fold_1.h5',
 'cross_validation_column_based_fold_1.yaml']

In [14]:
# Passing custom objects to load_model is currently disabled:
# custom_objects = {}
# custom_objects['leaky_relu'] = nn.leaky_relu

# File name of the saved fold-2 model inside the model directory.
fold_2 = 'cross_validation_column_based_fold_2.h5'

# Load the fold-2 model from disk.
model_fold_2 = load_model(os.path.join(model_path+'/cloud_cover_R2B5_QUBICC', fold_2))

#### Setting up SHAP

In [None]:
# Load column-based R2B5 training data and swap its axes; the result is indexed by
# sample along axis 0 in the following cells.
path_train_data = '/pf/b/b309170/my_work/icon-ml_data/cloud_cover_parameterization/grid_column_based_QUBICC_R02B05/based_on_var_interpolated_data'
input_train = np.load(path_train_data + '/cloud_cover_input_qubicc.npy').T

In [None]:
# Number of training samples to draw.
no_samples_train = 10000

# Draw the row indices uniformly at random (with replacement, so duplicates are possible).
rand_indices_train = np.random.randint(
    low=0, high=input_train.shape[0], size=no_samples_train
)

In [None]:
# Preprocess the training data: keep only the randomly drawn rows, drop the same
# columns as for the NARVAL input, convert to float32 and standardize.
# The rows are selected first because np.delete returns a copy; deleting columns
# from the full training array would duplicate all of it only to discard most rows.
# The resulting array is identical to deleting/casting first and subsampling afterwards.
input_train = input_train[rand_indices_train]
input_train = np.delete(input_train, remove_fields, axis=1)
input_train = np.float32(input_train)
input_train = (input_train-mean_2)/std_2

In [None]:
# Initialize the JavaScript methods SHAP uses for visualization in the notebook.
shap.initjs()

# An explainer takes any combination of a model and masker (usually the training data!) and
# returns a callable subclass object that implements the particular estimation algorithm
# (e.g. 'kernel' or 'deep') that was chosen; here the 'deep' variant is used.
# The data passed in (the preprocessed training subsample) is taken to compute the base value.
explainer_shap = shap.DeepExplainer(model=model_fold_2, data=input_train)

In [None]:
# Statistics written per model output, in this order: (heading, reduction over the samples).
# All reductions accumulate in float64.
shap_summaries = (
    ('Mean SHAP values:\n',
     lambda values: np.mean(values, axis=0, dtype=np.float64)),
    ('\nVariance SHAP values:\n',
     lambda values: np.var(values, axis=0, dtype=np.float64)),
    ('\nMean absolute SHAP values:\n',
     lambda values: np.mean(np.abs(values), axis=0, dtype=np.float64)),
)

#I doubt we can look at much more than 10000 at a time
# for no_samples_narval in [10, 10**2, 10**3, 10**4, 10**5]:
for no_samples_narval in [10000]:
    # Random subset of the NARVAL samples (drawn with replacement).
    rand_indices_narval = np.random.randint(0, input_narval.shape[0], no_samples_narval)

    # Estimate the SHAP values on a subset of the data (you can try all but then gets slower)
    # and time the computation.
    # It's not great to disable the additivity check but we are talking about differences of O(10^{-1}).
    t0 = time.time()
    shap_values = explainer_shap.shap_values(X=input_narval[rand_indices_narval], check_additivity=False)
    elapsed_time = time.time() - t0

    # Append to a file: no_samples_narval, elapsed_time, mean shap values,
    # variance shap values, mean absolute shap values
    with open('/pf/b/b309170/workspace_icon-ml/iconml_clc/additional_content/shap_values/averaged_shap_values/r2b5_column-based_fold_2_on_narval_r2b5.txt', 'a') as report_file:
        report_file.write('Number of NARVAL samples: %d\n'%no_samples_narval)
        report_file.write('Elapsed time: %.3f\n'%elapsed_time)
        for heading, reduce_over_samples in shap_summaries:
            report_file.write(heading)
            # One list (rounded to 3 decimals) for each of the first 27 entries of shap_values;
            # presumably one entry per model output clc_21..clc_47 — confirm.
            for output_idx in range(27):
                summary = np.around(reduce_over_samples(shap_values[output_idx]), 3)
                report_file.write(str(list(summary)))
        report_file.write('\n\n')