In [1]:
import pandas as pd
import numpy as np
import random
from rdkit import Chem
from snn_model import get_loss_fn
import torch
from torch.utils.data import TensorDataset, DataLoader
from snntorch import spikegen, surrogate
import matplotlib.pyplot as plt
from utils import load_dataset_df, smile_to_fp_mix, smile_to_fp, data_splitter, get_spiking_net, make_filename
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, f1_score, precision_score
from csnn_model import CSNNet, get_prediction_fn

No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
No normalization for NumAmideBonds. Feature removed!
No normalization for NumAtomStereoCenters. Feature removed!
No normalization for NumBridgeheadAtoms. Feature removed!
No normalization for NumHeterocycles. Feature removed!
No normalization for NumSpiroAtoms. Feature removed!
No normalization for NumUnspecifiedAtomStereoCenters. Feature removed!
No normalization for Phi. Feature removed!
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with transformers dependency. No module named 'transformers'
cannot import name 'HuggingFaceModel' from 'deepchem.models.torch_models' (c:\Users\knsve\Desktop\MEI\Tese\torch\snn_venv\lib\site-packages\deepchem\models\torch_models\__init__.py)
Skipped loading modules with pytorch-geometri

#### Load DataFrame

In [2]:
# Available dataset files; the index below picks the one used in this run.
files = ['tox21.csv', 'sider.csv', 'BBBP.csv']
dt_file = files[1]

# The dataset name (file name without its extension) is reused later to
# choose the target column and to build the results/models paths.
dirname = dt_file.removesuffix('.csv')

# `targets` is indexed in the next cell to choose the label column.
df, targets = load_dataset_df(filename=dt_file)


In [3]:
# Position of the task column inside `targets` for each dataset; any dataset
# that is not listed falls back to its first target.
target_index_by_dataset = {
    'tox21': 7,  # SR-ARE (SR-MMP was noted next to it, presumably as an alternative)
    'sider': 0,  # Hepatobiliary disorders: 1427 samples, 0.52 class ratio
}
target_name = targets[target_index_by_dataset.get(dirname, 0)]

# Keep only the label and SMILES columns and drop rows missing either value.
df = df[[target_name, 'smiles']].dropna()

#### SMILES to Fingerprint

In [4]:
# Supported fingerprints as [name, bit length] pairs.
fp_types = [['morgan', 1024], ['maccs', 167], ['RDKit', 1024], ['pubchem', 881]]
mix = True

# In mixed mode the primary fingerprint is always MACCS and the secondary one
# is Morgan; otherwise the first entry (Morgan) is used on its own.
fp_type, num_bits = fp_types[1] if mix else fp_types[0]
#num_bits = 2048

fp_config = dict(
    fp_type=fp_type,
    num_bits=num_bits,
    radius=2,
    fp_type_2=fp_types[0][0],
    # The secondary fingerprint gets whatever is left of a 1024-bit budget.
    num_bits_2=1024 - num_bits,
    mix=mix,
)

print(fp_type, '-', num_bits)

maccs - 167


In [5]:
dtype = torch.float32
dataset = None
# BBBP is split by scaffold; every other dataset uses a random split.
split = "scaffold" if dirname == 'BBBP' else "random"

# On the scaffold path no TensorDataset is built here and `dataset` stays None.
if split == "random":
    # Pick the featurizer once: mixed (two fingerprints) or single fingerprint.
    featurize = smile_to_fp_mix if fp_config['mix'] else smile_to_fp
    fp_array, target_array = featurize(df, fp_config=fp_config, target_name=target_name)

    # Create Torch Dataset
    fp_tensor = torch.tensor(fp_array, dtype=dtype)
    target_tensor = torch.tensor(target_array, dtype=dtype).long()
    dataset = TensorDataset(fp_tensor, target_tensor)



In [6]:
# `fp_array` is only assigned when the fingerprints were precomputed in the
# previous cell (i.e. `dataset` is not None). On the scaffold-split path it is
# never created, so printing it unconditionally raises NameError on a fresh
# kernel or shows a stale array left over from an earlier run.
if dataset is not None:
    print(fp_array.shape)

(1427, 1024)


#### Loss Function

In [7]:
from sklearn.utils.class_weight import compute_class_weight

# Loss identifiers this notebook can select from; the chosen name is stored
# in train_config['loss_type'] further down.
loss_types = [
    'ce_mem',
    'rate_loss',
    'count_loss',
    'temporal_loss',
    'bce_loss',
]
loss_type = loss_types[2]
print(loss_type)


count_loss


#### Test Loop

In [8]:
net_types = ["SNN", "DSNN", "CSNN"]
net_type = net_types[2]
#spike_grad = surrogate.sigmoid(slope=25)
spike_grad = None
beta = 0.95  # try 0.7

# Mixed fingerprints are always 1024 bits wide; a single fingerprint uses its
# own bit length.
input_size = 1024 if fp_config['mix'] else num_bits

net_config = dict(
    input_size=input_size,
    num_hidden=512,
    num_hidden_l2=256,
    # The second hidden layer is only enabled for the "DSNN" variant.
    use_l2=(net_type == "DSNN"),
    time_steps=10,
    spike_grad=spike_grad,
    beta=beta,
    encoding='rate',
    out_num=2,
)

# More than two output neurons switches on population coding.
pop_coding = net_config['out_num'] > 2

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
iterations = 30  # number of seeds evaluated (seed = index + 1 in the loops below)
batch_size = 16 #16, 8
weight_decay = 0
lr = 1e-4
optim_type = 'Adam'
train_config = {"num_epochs": 1000,
                "batch_size": batch_size,
                "device": device,
                "loss_type": loss_type,
                "loss_fn": None,
                'dtype': dtype,
                'num_steps': net_config['time_steps'],
                'val_net': None,
                'prediction_fn': get_prediction_fn(encoding=net_config['encoding'], pop_coding=pop_coding),
                }
# CSNN runs drop the final, smaller training batch.
drop_last = net_type == "CSNN"
# `device` is a torch.device, and comparing it with the string "cuda" is always
# False; compare its `.type` so pinned memory is actually enabled on GPU runs.
pin_memory = device.type == "cuda"
save = True
# One list per metric, filled in this order by calc_metrics:
# accuracy, AUC, sensitivity, specificity, F1, precision.
results = [[], [], [], [], [], []]

In [10]:
def calc_metrics(metrics_list, all_targets, all_preds):
    """Score one evaluation run and append each metric to its history list.

    Args:
        metrics_list: Sequence of at least six lists, appended to in the
            order accuracy, AUC, sensitivity, specificity, F1, precision.
        all_targets: Ground-truth binary labels.
        all_preds: Predictions for the same samples.

    Prints the accuracy and AUC of the run; returns nothing.

    NOTE(review): roc_auc_score receives the same ``all_preds`` as the
    label-based metrics. If these are hard class labels rather than scores,
    the reported AUC equals the mean of sensitivity and specificity - confirm
    that this is intended.
    """
    accuracy = accuracy_score(all_targets, all_preds)
    auc_roc = roc_auc_score(all_targets, all_preds)

    # Binary confusion matrix flattened in sklearn's order: tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(all_targets, all_preds).ravel()

    scores = (
        accuracy,
        auc_roc,
        tp/(tp + fn),  # sensitivity
        tn/(tn + fp),  # specificity
        f1_score(all_targets, all_preds),
        precision_score(all_targets, all_preds),
    )

    print("acc:", accuracy,"auc:", auc_roc)
    # Append only after every metric was computed, so a failure above leaves
    # the history lists untouched.
    for slot, value in enumerate(scores):
        metrics_list[slot].append(value)
    

In [11]:
import os
import copy

# One state dict per seed, index-aligned with the evaluation loop:
# net_list[i] holds the weights trained with seed i + 1.
net_list = []

net, train_net, val_net, test_net = get_spiking_net(net_type, net_config)
filename = make_filename(dirname, target_name, net_type, fp_config, lr, weight_decay, optim_type, net_config, train_config, net, model = True)
model_name = filename.removesuffix('.csv')

models_path = os.path.join("results", dirname, "models", "")
all_model_names = os.listdir(models_path)
print(models_path)
#print(all_model_names)

missing_checkpoints = []
for seed_idx in range(iterations):
    seed = seed_idx + 1
    string_id = f"seed-{seed}.pth"
    search_name = model_name + string_id
    # Presumably model_name already starts with models_path (model=True), so
    # stripping it leaves the bare file name to look up in the directory listing.
    search_name_no_folder = search_name.removeprefix(models_path)
    if search_name_no_folder in all_model_names:
        # Load onto the CPU so checkpoints saved on a GPU also open on
        # CPU-only machines; load_state_dict copies the values into the
        # model's own parameters afterwards. The freshly loaded dict is not
        # shared with anything, so no defensive copy is needed.
        state_dict = torch.load(search_name, weights_only=True, map_location="cpu")
        net_list.append(state_dict)
    else:
        print(search_name_no_folder)
        missing_checkpoints.append(search_name_no_folder)

# Skipping a missing checkpoint would shift every later entry, pairing seeds
# with weights trained on a different split. Stop here instead.
if missing_checkpoints:
    raise FileNotFoundError(
        f"{len(missing_checkpoints)} of {iterations} checkpoints not found in "
        f"{models_path}: {missing_checkpoints}"
    )

results\sider\models\


In [12]:
# net_list[i] is taken to be the checkpoint trained with seed i + 1. A length
# mismatch means at least one seed would be evaluated with weights trained on
# a different split, so refuse to run rather than report misleading metrics.
if len(net_list) != iterations:
    raise RuntimeError(
        f"Expected {iterations} checkpoints (one per seed) but net_list holds "
        f"{len(net_list)}; seeds and weights would be misaligned."
    )

for seed_idx in range(iterations):
    seed = seed_idx + 1
    print(f"Seed:{seed} -> ",end='', flush=True)
    # NOTE(review): only Python's `random` module is seeded here. If
    # data_splitter or test_net draw from NumPy or torch RNGs, results can
    # differ between runs - confirm.
    random.seed(seed)

    # DATA SPLIT
    train, val, test = data_splitter(df, target_name, split=split, dataset=dataset, fp_config=fp_config, seed=seed, dtype=dtype)
    _, train_label = train[:]
    _, val_label = val[:]
    _, test_label = test[:]
    train_loader = DataLoader(train, batch_size=batch_size, shuffle=True, pin_memory=pin_memory, drop_last=drop_last)
    val_loader = DataLoader(val, batch_size=batch_size, shuffle=False, pin_memory=pin_memory)
    test_loader = DataLoader(test, batch_size=batch_size, shuffle=False, pin_memory=pin_memory)

    # TESTING
    # The same network object is reused for every seed; only its weights change.
    model = net
    model.load_state_dict(net_list[seed_idx])
    model.to(device)
    all_preds, all_targets = test_net(model, device, test_loader, train_config)
    calc_metrics(results, all_preds=all_preds, all_targets=all_targets)
    print()

Seed:1 -> acc: 0.676056338028169 auc: 0.6750248756218906

Seed:2 -> acc: 0.704225352112676 auc: 0.7030241935483871

Seed:3 -> acc: 0.7253521126760564 auc: 0.7253521126760563

Seed:4 -> acc: 0.7323943661971831 auc: 0.7259701492537314

Seed:5 -> acc: 0.7112676056338029 auc: 0.7116339090728608

Seed:6 -> acc: 0.6901408450704225 auc: 0.6859241612070678

Seed:7 -> acc: 0.7253521126760564 auc: 0.7242063492063492

Seed:8 -> acc: 0.6338028169014085 auc: 0.6426829268292682

Seed:9 -> acc: 0.6830985915492958 auc: 0.6959117587532888

Seed:10 -> acc: 0.6549295774647887 auc: 0.6669651741293533

Seed:11 -> acc: 0.7183098591549296 auc: 0.7198009950248755

Seed:12 -> acc: 0.7253521126760564 auc: 0.7294657097288676

Seed:13 -> acc: 0.6830985915492958 auc: 0.6894715692184047

Seed:14 -> acc: 0.7535211267605634 auc: 0.7555092316855271

Seed:15 -> acc: 0.6690140845070423 auc: 0.6728468899521531

Seed:16 -> acc: 0.6971830985915493 auc: 0.6993243243243243

Seed:17 -> acc: 0.704225352112676 auc: 0.7077352472

#### Save Metrics

In [15]:
# Flat layout: [mean_0, std_0, mean_1, std_1, ...] following the order of
# `results` (six metrics -> twelve slots).
metrics_np = np.zeros(12)

for position, values in enumerate(results):
    mean_slot = position * 2
    std_slot = mean_slot + 1
    metrics_np[mean_slot] = np.round(np.mean(values), 3)
    metrics_np[std_slot] = np.round(np.std(values), 3)

# Print Results
acc_mean, acc_std, auc_mean, auc_std, sn_mean, sn_std, sp_mean, sp_std = metrics_np[:8]
print(f"Accuracy:  {acc_mean:.3f} ± {acc_std:.3f}")
print(f"AUC ROC: {auc_mean:.3f} ± {auc_std:.3f}")
print(f"Sensitivity: {sn_mean:.3f} ± {sn_std:.3f}")
print(f"Specificity: {sp_mean:.3f} ± {sp_std:.3f}")


Accuracy:  0.697 ± 0.028
AUC ROC: 0.700 ± 0.026
Sensitivity: 0.649 ± 0.067
Specificity: 0.752 ± 0.061


In [16]:

metric_names = ['Acc', 'AUC', 'Sn', 'Sp', 'F1', 'Precision']

# A single row holding the mean/std pair of every metric.
metrics_np = metrics_np.reshape(1, -1)

# Column labels alternate "Mean <metric>", "Std <metric>" to match the flat
# layout of metrics_np.
columns = [
    label
    for name in metric_names
    for label in (f'Mean {name}', f'Std {name}')
]

df_metrics = pd.DataFrame(metrics_np, columns=columns)
num_hidden = net_config['num_hidden']
time_steps = train_config['num_steps']
num_epochs = train_config['num_epochs']

# Build the results file name from the run configuration and write the
# summary only when saving is enabled.
filename = make_filename(dirname, target_name, net_type, fp_config, lr, weight_decay, optim_type, net_config, train_config, model)
if save:
    df_metrics.to_csv(filename, index=False)

print(filename)

results\sider\Hepatobiliary disorders_CSNN_beta-0.95_maccs_morgan_1024_out-8-8_kernel-3_stride-1_t10_e1000_b16_lr0.0001_count_loss_Adam_wd0.csv
