# POC Explained IA using pytorch and captum

## Load libraries

In [14]:
# Our ML things
import torch
import torch.nn as nn
import torch.nn.functional as F

from captum.attr import IntegratedGradients # Most popular atribution methode

# Visualization
import matplotlib.pyplot as plt

# Utils
import pandas as pd
import numpy as np
import multiprocessing
import random

import datetime
import time

from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score, auc
from torch.utils.data import DataLoader

# Custom
from tool_box.deviceHandler import deviceHandler
from tool_box.model import simpleDenseNN
from tool_box.utilities import createDataLoader, listSplitter, type_converter, secondsConverter

run_timestamp = datetime.datetime.now()

## Set up processing device

In [2]:
device_handler = deviceHandler()

GPU isn't available, fallback to CPU


## Process data

### Load data

In [3]:
titanic_dataset = pd.read_csv('titanic_dataset.csv')

### Simple data processing

In [4]:
# Get dummies for categorical variables
sex_dummies = pd.get_dummies(titanic_dataset.Sex, prefix='Sex')
embarked_dummies = pd.get_dummies(titanic_dataset.Embarked, prefix='Embarked')
pclass_dummies = pd.get_dummies(titanic_dataset.Pclass.astype(str), prefix='Pclass')

# Fill NaNs
titanic_dataset.Age = titanic_dataset.Age.fillna(round(titanic_dataset.Age.mean(), 1))
titanic_dataset.Fare = titanic_dataset.Fare.fillna(round(titanic_dataset.Fare.mean(), 1))

### Create features and target datasets

In [5]:
features_df = pd.concat([titanic_dataset[['Age', 'SibSp', 'Parch', 'Fare']], pclass_dummies, sex_dummies, embarked_dummies], axis=1)
target_df = titanic_dataset.Survived

### Scale data

In [6]:
scaler = StandardScaler()
s_features_df = scaler.fit_transform(features_df)

### Transform into tuple list of pytorch tensors

In [7]:
features_tensor = device_handler.data_to_tensor(s_features_df)
target_tensor = device_handler.data_to_tensor(target_df).reshape(-1, 1)

features_tensor = type_converter(features_tensor, torch.float32)
target_tensor = type_converter(target_tensor, torch.float32)

tuple_lst_data = list(zip(features_tensor, target_tensor))

### Split data into test and training

In [8]:
lst_splitter = listSplitter(0.3, shuffle = True)
test_data, train_data = lst_splitter.split(tuple_lst_data)

### Create Dataloader

In [9]:
dataloader_gen = createDataLoader()

test_loader = dataloader_gen.create(test_data, batch_size=150)
train_loader = dataloader_gen.create(train_data, batch_size=150, shuffle = True)

## Model

### Define model parameters

In [10]:
simple_model = simpleDenseNN(features_tensor.shape[1], features_tensor.shape[1]*2+1, 1)

opt = torch.optim.Adam(simple_model.parameters(), 1e-02)

### Run model

In [17]:
batch_cum = 0 
epoch_amt = 5
start_time_VAL = time.time()
run_results = []
model_DICT = {}

for epoch in range(epoch_amt):
  
  train_loss = []
  train_acc = []
  train_f1 = []
  train_prec = []
  train_rec = []
  train_auc = []
  
  for _i, batch in enumerate(train_loader):
    
    preds = simple_model.training_step(batch)
    
    train_loss.append(preds['loss'])
    train_acc.append(preds['acc'])
    train_f1.append(preds['f1'])
    train_prec.append(preds['prec'])
    train_rec.append(preds['rec'])
    
    preds['loss'].backward()
    opt.step()
    opt.zero_grad()
    
  validation_result = [simple_model.testing_step(batch) for batch in test_loader]
  
  epoch_train_loss = np.mean([val.item() for val in train_loss])
  epoch_train_acc = np.mean([val.item() for val in train_acc])
  epoch_train_f1 = np.mean([val.item() for val in train_f1])
  epoch_train_prec = np.mean([val.item() for val in train_prec])
  epoch_train_rec = np.mean([val.item() for val in train_rec])
  
  epoch_test_loss = np.mean([dic['loss'].item() for dic in validation_result])
  epoch_test_acc = np.mean([dic['acc'].item() for dic in validation_result])
  epoch_test_f1 = np.mean([dic['f1'].item() for dic in validation_result])
  epoch_test_prec = np.mean([dic['prec'].item() for dic in validation_result])
  epoch_test_rec = np.mean([dic['rec'].item() for dic in validation_result])
  
  epoch_results = {'run_id': run_timestamp.strftime('%Y%m%d%H%M%S'), 'calendar_dt': run_timestamp.strftime('%Y-%m-%d'),\
                   'training_cases': len(train_data), 'testing_cases': len(test_data), 'epoch': epoch+1, 'total_epochs': epoch_amt,\
                   'training_run' : {'loss': epoch_train_loss, 'accuracy': epoch_train_acc, 'f1': epoch_train_f1,\
                                     'precision': epoch_train_prec, 'recall': epoch_train_rec},\
                   'validation_run' : {'loss': epoch_test_loss, 'accuracy': epoch_test_acc, 'f1': epoch_test_f1,\
                                       'precision': epoch_test_prec, 'recall': epoch_test_rec}}
  
  run_results.append(epoch_results)
  
  print('{} Message time: Epoch {}/{} processed - {} time passed'.format(datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), epoch+1, epoch_amt, secondsConverter(time.time()-start_time_VAL)))
  print('TRAINING   Loss: {:.5f} | Accuracy: {:.2f}% | F1: {:.2f} | Precision: {:.2f} | Recall: {:.2f}'.format(epoch_train_loss.item(), epoch_train_acc * 100, epoch_train_f1, epoch_train_prec, epoch_train_rec))
                 
  print('VALIDATION Loss: {:.5f} | Accuracy: {:.2f}% | F1: {:.2f} | Precision: {:.2f} | Recall: {:.2f}'.format(epoch_validation_loss.item(), epoch_validation_acc * 100, epoch_validation_f1, epoch_validation_prec, epoch_validation_rec) + '\n')

NameError: name 'secondsConverter' is not defined

In [15]:
validation_result

[{'loss': tensor(0.4541, grad_fn=<BinaryCrossEntropyBackward>),
  'acc': 0.8266666666666667,
  'f1': 0.7936507936507936,
  'prec': 0.847457627118644,
  'rec': 0.746268656716418},
 {'loss': tensor(0.3389, grad_fn=<BinaryCrossEntropyBackward>),
  'acc': 0.8866666666666667,
  'f1': 0.8210526315789473,
  'prec': 0.8478260869565217,
  'rec': 0.7959183673469388},
 {'loss': tensor(0.3002, grad_fn=<BinaryCrossEntropyBackward>),
  'acc': 0.8913043478260869,
  'f1': 0.8214285714285714,
  'prec': 0.92,
  'rec': 0.7419354838709677}]