In [1]:
import numpy as np
import numpy.random as npr
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from tqdm.notebook import tqdm
import os
import pickle
import time

Calculation of SHAP values, run in parallel if the node has more than one GPU.

In [2]:
# Build one torch.cuda.device handle for every CUDA device index torch reports.
available_gpus = list(map(torch.cuda.device, range(torch.cuda.device_count())))
print(torch.cuda.is_available())
# Show each handle next to the device index it wraps.
display([(handle, handle.idx) for handle in available_gpus])

True


[(<torch.cuda.device at 0x1461009ca500>, 0),
 (<torch.cuda.device at 0x1460f9a50310>, 1),
 (<torch.cuda.device at 0x1461009f16f0>, 2),
 (<torch.cuda.device at 0x1461009f2380>, 3)]

In [3]:
# Previously used data sets, kept for reference:
# data = np.load('../../Processed/TrainData/R2B5_vcg_20221118-153949.npz')
# data_path = '../../local_data/TrainData/20230210-131835-R2B5_y13y16_vcg-fluxes_rho_fluct_neglect.npz'
data_path = '../../local_data/TrainData/20230131-171851-R2B5_y13y16_vcg-fluxes_rho_fluct.npz'
data = np.load(data_path)
print(data.files)

# Pull the train/val/test splits and the X/Y description arrays out of the archive.
X_train, X_val, X_test = data['X_train'], data['X_val'], data['X_test']
Y_train, Y_val, Y_test = data['Y_train'], data['Y_val'], data['Y_test']
X_expl, Y_expl = data['X_expl'], data['Y_expl']

# Optional resampling of the last axis to 32 points; disabled, so inputs pass
# through unchanged.
transform_to_unet_shape = False
x_transform = nn.Upsample(size=(32), mode='linear') if transform_to_unet_shape else nn.Identity()


def _as_channels_first(array):
    """Convert an input array to a torch Tensor, swap its last two axes
    (pytorch channels-first layout) and apply ``x_transform``."""
    return x_transform(torch.Tensor(array).permute(0, 2, 1))


# Inputs are converted and permuted one split at a time; targets are only converted.
X_train = _as_channels_first(X_train)
X_val = _as_channels_first(X_val)
X_test = _as_channels_first(X_test)
Y_train = torch.Tensor(Y_train)
Y_val = torch.Tensor(Y_val)
Y_test = torch.Tensor(Y_test)

# NOTE(review): the mask file carries a different timestamp/suffix than data_path —
# confirm that it belongs to this data set.
y_mask_path = '../../local_data/TrainData/20230210-131835-R2B5_y13y16_vcg-fluxes_rho_fluct_neglect_Ymask.pickle'
with open(y_mask_path, 'rb') as handle:
    Y_mask = pickle.load(handle)

# Report the tensor shapes and the number of description entries.
for split_name, split in (('X_train', X_train), ('X_val', X_val), ('X_test', X_test),
                          ('Y_train', Y_train), ('Y_val', Y_val), ('Y_test', Y_test)):
    print(f'{split_name} shape: ', split.shape)
for expl_name, expl in (('X_expl', X_expl), ('Y_expl', Y_expl)):
    print(f'len {expl_name}', len(expl))

['X_train', 'X_val', 'X_test', 'Y_train', 'Y_val', 'Y_test', 'X_expl', 'Y_expl', 'train_coords', 'val_coords', 'test_coords']
X_train shape:  torch.Size([1613616, 9, 23])
X_val shape:  torch.Size([201702, 9, 23])
X_test shape:  torch.Size([201702, 9, 23])
Y_train shape:  torch.Size([1613616, 189])
Y_val shape:  torch.Size([201702, 189])
Y_test shape:  torch.Size([201702, 189])
len X_expl 207
len Y_expl 189


In [4]:
from convection_param.HelperFuncs import unique_unsorted

# Input variables to drop from the predictor set (use an empty list to keep all).
vars_to_neglect = ['qr','qi','qs']
# vars_to_neglect = []

# Variable names taken from the first element of every X_expl entry, as returned
# by unique_unsorted; True marks the variables that are kept.
expl_var_names = unique_unsorted([entry[0] for entry in X_expl])
vars_to_neglect_mask = np.isin(expl_var_names, vars_to_neglect, invert=True)
print(vars_to_neglect_mask)

# Apply the mask along axis 1 of every input split.
X_train, X_val, X_test = (
    split[:, vars_to_neglect_mask, :] for split in (X_train, X_val, X_test)
)
# Keep the description array consistent with the filtered inputs.
X_expl = np.array([entry for entry in X_expl if entry[0] not in vars_to_neglect])

[ True  True  True False False False  True  True  True]


In [5]:
batch_size = 1024
batch_size_val = 1024

# Pair inputs with targets for every split.
train_data, val_data, test_data = (
    TensorDataset(inputs, targets)
    for inputs, targets in ((X_train, Y_train), (X_val, Y_val), (X_test, Y_test))
)
# torch.save([train_data, val_data, test_data], '../../local_data/TrainData/20230111-165428-R2B5_y13y16_vcg-fluxes_rho_fluct.torch_data')

# Only the training loader shuffles; validation and test keep the stored order.
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
eval_loader_kwargs = dict(batch_size=batch_size_val, shuffle=False)
val_dataloader = DataLoader(val_data, **eval_loader_kwargs)
test_dataloader = DataLoader(test_data, **eval_loader_kwargs)

# Sanity check: print the shapes of the first validation batch only.
for X, y in val_dataloader:
    print('---------------------------------------')
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

---------------------------------------
Shape of X [N, C, H, W]: torch.Size([1024, 6, 23])
Shape of y: torch.Size([1024, 189]) torch.float32


In [6]:
from convection_param.NetworksTorch import ResDNN, Sequential, Unet, SeqConv

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(f"Using {device} device")

# Alternative architectures that were tried (kept for reference):
# model = ResDNN(in_size=23*9,
#                out_size=189,
#                n_neurons=2048,
#                bn=True,
#                n_layers_per_block=1,
#                n_levels=10,
#                activation=nn.ReLU())
# model = Sequential(X_train.shape[1]*X_train.shape[2], 189, 512, F.relu, True, True)
# model = SeqConv(n_channels=X_train.shape[1],
#                 n_feature_channels=512,
#                 column_height=23,
#                 n_hidden=200,
#                 n_layers=1,
#                 output_dim=189,
#                 kernel_size=4).to(device)
# model = Convolutional(n_channels=9, n_feature_channels=10, output_dim=189)

# Hyperparameters of the U-Net that is actually used.
unet_config = dict(
    n_channels=6,
    n_classes=8,
    output_channels_total=189,
    n_levels=2,
    n_features=512,
    bn1=False,
    bn2=False,
    column_height=23,
    activation=F.leaky_relu,
    linear=False,
)
model = Unet(**unet_config)
# print(model)

# model = nn.DataParallel(model)
model.to(device)

# Mean-squared-error objective optimised with Adam.
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0003)

Using cuda device


In [7]:
def train(tepoch, model, loss_fn, optimizer, epoch, writer=None):
    """Run one optimisation pass over all batches yielded by ``tepoch``.

    Args:
        tepoch: Iterable of ``(X, y)`` batches that also exposes ``.total`` and
            ``.set_postfix(...)`` (presumably a tqdm-wrapped DataLoader — confirm
            against the caller).
        model: Network to train; switched to training mode here.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar loss tensor.
        optimizer: Optimizer stepped once per batch.
        epoch: Step value passed to ``writer.add_scalar``.
        writer: Optional logger with an ``add_scalar(tag, value, step)`` method.

    Batches are moved to the module-level ``device`` before the forward pass.
    """
    n_batches = tepoch.total
    model.train()
    batch_losses = []
    for step, (inputs, targets) in enumerate(tepoch):
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Forward pass and loss for this batch.
        loss = loss_fn(model(inputs), targets)
        batch_loss = loss.item()
        batch_losses.append(batch_loss)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Refresh the progress-bar postfix only every 300th batch.
        if step % 300 == 0:
            tepoch.set_postfix(loss=batch_loss)
    if writer:
        # Log the summed batch losses divided by tepoch.total.
        writer.add_scalar('epoch_loss', sum(batch_losses)/n_batches, epoch)

def test(dataloader, model, loss_fn, epoch, writer=None):
    # size = len(dataloader.dataset) # number of samples
    num_batches = len(dataloader)
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
    test_loss /= num_batches
    if writer:
        writer.add_scalar('epoch_loss', test_loss, epoch)
    print(f"Avg val loss: {test_loss:>8f} \n")

In [8]:
# Directory of the trained model whose weights are loaded and explained below.
model_path = "../../Models/NewFormat/Torch/20230320-112327R2B5_vlr_unet_adam_lr0.0003_y13y16full_fluxes_prescaledeps1_wqrqstend_worhoprestemp_torch_rhofluctneglect_alldays_woqrqiqs_hpoed_lrelu/"
load_path = os.path.join(model_path, 'model.state_dict')
model_name = os.path.basename(load_path).replace('.state_dict','')

# map_location remaps the stored tensors onto the device selected for this session,
# so a checkpoint written on a (multi-)GPU node also loads on a CPU-only node or a
# node with fewer GPUs instead of failing while restoring the saved CUDA device.
# NOTE: torch.load unpickles the file — only load checkpoints from trusted sources.
checkpoint = torch.load(load_path, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [None]:
import shap
import ray
import copy
import datetime

# One independent SHAP run (own background / explain sample) per seed.
seeds = [745, 3452, 1458, 2489, 646]
# seed = 4523455
# seed = 42

# Samples drawn per seed: the first N_BACKGROUND rows form the DeepExplainer
# background set, the remaining N_EXPLAIN rows are the samples being explained.
N_BACKGROUND = 500
N_EXPLAIN = 1000

# One Ray task per visible GPU (4 on the node this notebook was recorded on).
n_workers = torch.cuda.device_count()
if n_workers < 1:
    raise RuntimeError('Shap value calculation requires at least one CUDA device')

for seed in seeds:
    print(f'Calculating shap values for seed {seed}')
    npr.seed(seed)
    random_idx = npr.choice(X_train.shape[0], size=N_BACKGROUND + N_EXPLAIN, replace=False)
    background_idx = random_idx[:N_BACKGROUND]
    explain_idx = random_idx[N_BACKGROUND:]

    model.to(device)
    background = X_train[background_idx].to(device)
    X_explain = X_train[explain_idx].to(device)

    # Worker i explains X_explain[bounds[i]:bounds[i+1]]; the integer arithmetic
    # reproduces the original n//4, 2n//4, 3n//4, n split when n_workers == 4.
    bounds = [i * len(X_explain) // n_workers for i in range(n_workers + 1)]

    t0 = time.time()
    ray.init(num_cpus=10, num_gpus=n_workers)
    try:
        # Defined inside the loop on purpose: the task reads this seed's
        # `background` and `X_explain` through its closure.
        @ray.remote(num_gpus=1)
        def solve_a(model, i1, i2):
            """Compute SHAP values for X_explain[i1:i2] on the GPU assigned to this task."""
            # Ray restricts CUDA_VISIBLE_DEVICES for a num_gpus=1 task, so index 0
            # is the GPU reserved for this worker.
            worker_device = 'cuda:0'
            local_model = copy.deepcopy(model)
            local_model.to(worker_device)
            # NOTE(review): `background` arrives via the closure and is assumed to
            # already live on this worker's cuda:0 — confirm.
            e = shap.DeepExplainer(local_model, background)
            return np.array(e.shap_values(X_explain[i1:i2].to(worker_device)))

        futures = [solve_a.remote(model, lo, hi)
                   for lo, hi in zip(bounds[:-1], bounds[1:])]
        chunk_results = ray.get(futures)
    finally:
        # Always release the Ray instance: without this a failing or interrupted
        # task leaves Ray initialised and the next ray.init() raises.
        ray.shutdown()
    print(f'Calculations took {time.time()-t0} seconds')

    # Axis 1 is the sample axis (the recorded run printed (189, 1000, 6, 23)).
    shap_values = np.concatenate(chunk_results, axis=1)
    print('Shap values shape: ', shap_values.shape)

    # Store the values together with the samples they were computed from;
    # np.savez appends '.npz' to the given path.
    now = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    shap_path = os.path.join(model_path, f'shaps_{now}_seed{seed}')
    np.savez(shap_path,#f'ShapValues/{now}-{model_name}_seed42',
             shap_values=shap_values,
             background=background.cpu().numpy(),
             X_explain=X_explain.cpu().numpy())

Calculating shap values for seed 745


2023-03-20 16:30:05,547	INFO worker.py:1529 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8266 [39m[22m


Calculations took 2131.9812400341034 seconds
Shap values shape:  (189, 1000, 6, 23)
Calculating shap values for seed 3452


2023-03-20 17:05:38,562	INFO worker.py:1529 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8266 [39m[22m
