In [1]:
import pandas as pd
import numpy as np
import random
from rdkit import Chem
from snn_model import get_loss_fn
import torch
from torch.utils.data import TensorDataset, DataLoader
import matplotlib.pyplot as plt
from utils import load_dataset_df, smile_to_fp,smiles_to_descriptor,smiles_to_onehot, smiles_to_onehot_selfies, data_splitter, get_spiking_net, make_filename
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, f1_score, precision_score
from csnn_model import CSNNet, get_prediction_fn

No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with transformers dependency. No module named 'transformers'
cannot import name 'HuggingFaceModel' from 'deepchem.models.torch_models' (c:\Users\knsve\Desktop\MEI\Tese\torch\pt_venv2\lib\site-packages\deepchem\models\torch_models\__init__.py)
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (c:\Users\knsve\Desktop\MEI\Tese\torch\pt_venv2\lib\site-packages\deepchem\models\torch_models\__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


#### Load DataFrame

In [2]:
files = ['tox21.csv','sider.csv', 'BBBP.csv']
dt_file = files[0]
dirname = dt_file.removesuffix('.csv')

df, targets = load_dataset_df(filename=dt_file)


In [3]:
if dirname == 'tox21':
    # SR-ARE
    target_name = targets[7]
    # SR-MMP
elif dirname == 'sider':
    #Hepatobiliary disorders 1427 samples, 0.52 class ratio
    target_name = targets[0]
else:
    target_name = targets[0]
    
df = df[[target_name, 'smiles']].dropna()

#### Molecular Representation

In [4]:
representations = ["fp", "descriptor", "SELFIES-1hot", "SMILES-1hot"]#, "graph-list"]

repr_type = representations[1]

In [5]:
if repr_type == "fp":
    fp_types = [['morgan', 1024], ['maccs', 167], ['RDKit', 1024], ['count_morgan', 1024], ['pubchem', 881]]
    mix = True
    fp_type, num_bits = fp_types[1]
    if mix and fp_type == 'RDKit':
        num_bits = 512
    data_config = {"fp_type": fp_type,
                "num_bits": num_bits,
                "radius": 2,
                "fp_type_2": fp_types[0][0],
                "num_bits_2": 1024 - num_bits,
                "mix": mix,}
    dim_2 = False
    print(fp_type, '-', num_bits)
    if mix: print(data_config['fp_type_2'], '-', data_config['num_bits_2'])
    if dim_2: print("2D FP")

elif repr_type == "descriptor":
    desc_type = ["RDKit", "TODO"]
    data_config = {"desc": desc_type[0],
                   "size": 0,
                }
elif repr_type == "SELFIES-1hot":
    dim_2 = True
    data_config = {}

elif repr_type == "SMILES-1hot":
    dim_2 = True
    data_config = {}

data_config["repr_type"] = repr_type
print(repr_type)

descriptor


In [6]:
dtype = torch.float32
split = "scaffold"
dataset = None

if dirname != 'BBBP':
    split = "random"
    if repr_type == "fp":
        fp_array, target_array = smile_to_fp(df, data_config=data_config, target_name=target_name)
        # Create Torch Dataset
        fp_tensor = torch.tensor(fp_array, dtype=dtype)
        print(fp_tensor.size())
        target_tensor = torch.tensor(target_array, dtype=dtype).long()
        if dim_2:
            fp_tensor = fp_tensor.view(-1, 32, 32)
            print(fp_tensor.size())
        dataset = TensorDataset(fp_tensor, target_tensor)
    elif repr_type == "descriptor":
        desc_array, target_array = smiles_to_descriptor(df, data_config=data_config, target_name=target_name, missing_val=0)
        # Create Torch Dataset
        desc_tensor = torch.tensor(desc_array, dtype=dtype)
        target_tensor = torch.tensor(target_array, dtype=dtype).long()

        dataset = TensorDataset(desc_tensor, target_tensor)
        print(desc_tensor.size())
    elif repr_type == "SELFIES-1hot":
        selfies_array, target_array = smiles_to_onehot_selfies(df, data_config=data_config, target_name=target_name, missing_val=0)
        # Create Torch Dataset
        selfies_tensor = torch.tensor(selfies_array, dtype=dtype)
        target_tensor = torch.tensor(target_array, dtype=dtype).long()

        dataset = TensorDataset(selfies_tensor, target_tensor)
        print(selfies_tensor.size())
    elif repr_type == "SMILES-1hot":
        smiles_array, target_array = smiles_to_onehot(df, data_config=data_config, target_name=target_name, missing_val=0)
        # Create Torch Dataset
        smiles_tensor = torch.tensor(smiles_array, dtype=dtype)
        target_tensor = torch.tensor(target_array, dtype=dtype).long()

        dataset = TensorDataset(smiles_tensor, target_tensor)
        print(smiles_tensor.size())

Inf in descriptor MaxPartialCharge for molecule CC1=C2N=C(C=C3N=C(C(C)=C4[C@@H](CCC(N)=O)[C@](C)(CC(N)=O)[C@](C)([C@@H]5N=C1[C@](C)(CCC(=O)NC[C@@H](C)OP(=O)([O-])O[C@@H]1[C@@H](CO)O[C@H](n6cnc7cc(C)c(C)cc76)[C@@H]1O)[C@H]5CC(N)=O)N4[Co+]C#N)[C@@](C)(CC(N)=O)[C@@H]3CCC(N)=O)C(C)(C)[C@@H]2CCC(N)=O
Inf in descriptor MinPartialCharge for molecule CC1=C2N=C(C=C3N=C(C(C)=C4[C@@H](CCC(N)=O)[C@](C)(CC(N)=O)[C@](C)([C@@H]5N=C1[C@](C)(CCC(=O)NC[C@@H](C)OP(=O)([O-])O[C@@H]1[C@@H](CO)O[C@H](n6cnc7cc(C)c(C)cc76)[C@@H]1O)[C@H]5CC(N)=O)N4[Co+]C#N)[C@@](C)(CC(N)=O)[C@@H]3CCC(N)=O)C(C)(C)[C@@H]2CCC(N)=O
Inf in descriptor MaxAbsPartialCharge for molecule CC1=C2N=C(C=C3N=C(C(C)=C4[C@@H](CCC(N)=O)[C@](C)(CC(N)=O)[C@](C)([C@@H]5N=C1[C@](C)(CCC(=O)NC[C@@H](C)OP(=O)([O-])O[C@@H]1[C@@H](CO)O[C@H](n6cnc7cc(C)c(C)cc76)[C@@H]1O)[C@H]5CC(N)=O)N4[Co+]C#N)[C@@](C)(CC(N)=O)[C@@H]3CCC(N)=O)C(C)(C)[C@@H]2CCC(N)=O
Inf in descriptor MinAbsPartialCharge for molecule CC1=C2N=C(C=C3N=C(C(C)=C4[C@@H](CCC(N)=O)[C@](C)(CC(N)=

#### Loss Function

In [8]:
from sklearn.utils.class_weight import compute_class_weight

loss_types = ['ce_mem', 'rate_loss', 'count_loss', 'temporal_loss', 'bce_loss']
loss_type = loss_types[2]
print(loss_type)


count_loss


#### Test Loop

In [19]:
net_types = ["SNN", "DSNN", "CSNN", "RSNN"]
net_type = net_types[1]
slope = 10
#spike_grad = surrogate.fast_sigmoid(slope=slope)
spike_grad = None
beta = 0.95 
bias = True
net_config = {
            "num_hidden": 512,
            "num_hidden_l2": 256,
            "num_steps": 10,
            "spike_grad": spike_grad,
            "slope": None if not spike_grad else slope, #spike_grad.__closure__[0].cell_contents,
            "beta": beta,
            "encoding": 'rate' if loss_type != 'temporal_loss' else 'ttfs',
            "bias": bias,
            "out_num": 2
            }
if net_type == "CSNN":
    net_config['num_conv'] = 1
    net_config['stride'] = [1 for _ in range(net_config['num_conv'])]
    net_config["pool_size"] = 2
    net_config["conv_kernel"] = 3
    net_config["conv_stride"] = 1
    net_config["conv_groups"] = 1

if repr_type == "fp":
    net_config["input_size"] = 1024 if data_config['mix'] else num_bits
    net_config["2d"] = dim_2

elif repr_type == "descriptor":
    net_config["input_size"] = desc_tensor.shape[1]
    net_config["2d"] = False
    net_config["time_steps"] = 50

if repr_type == "SELFIES-1hot":
    net_config["input_size"] = [desc_tensor.shape[1],desc_tensor.shape[2]] 
    net_config["2d"] = True
if repr_type == "SMILES-1hot":
    net_config["2d"] = True
    net_config["input_size"] = [desc_tensor.shape[1],desc_tensor.shape[2]] 
print(net_type)

DSNN


In [10]:
pop_coding = net_config['out_num'] > 2
lr=1e-4 #1e-6 default for 1000 epochs. csnn requires higher
iterations = 30
weight_decay = 0 # 1e-5
optim_type = 'Adam'
#optim_type = 'SGD'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
batch_size = 16 #16, 8
train_config = {"num_epochs": 50,
                "batch_size": batch_size,
                "device": device,
                "loss_type": loss_type,
                "loss_fn": None,
                'dtype': dtype,
                'num_steps': net_config['num_steps'],
                'val_net': None,
                'prediction_fn': get_prediction_fn(encoding=net_config['encoding'], pop_coding=pop_coding),
                }
drop_last = net_type == "CSNN"
pin_memory = device == "cuda"
save_csv = True
save_models = True
results = [[], [], [], [], [], []]

cpu


In [11]:
def calc_metrics(metrics_list, all_targets, all_preds):

    accuracy = accuracy_score(all_targets, all_preds)
    auc_roc = roc_auc_score(all_targets, all_preds)
    tn, fp, fn, tp = confusion_matrix(all_targets, all_preds).ravel()
    sensitivity = tp/(tp + fn)
    specificity = tn/(tn + fp)
    f1 = f1_score(all_targets, all_preds)
    precision = precision_score(all_targets, all_preds)
    
    print("acc:", accuracy,"auc:", auc_roc)
    metrics_list[0].append(accuracy)
    metrics_list[1].append(auc_roc)
    metrics_list[2].append(sensitivity)
    metrics_list[3].append(specificity)
    metrics_list[4].append(f1)
    metrics_list[5].append(precision)
    

In [20]:
import os
import copy

net_list = []
    
net, train_net, val_net, test_net = get_spiking_net(net_type, net_config)
filename = make_filename(dirname, target_name, net_type, data_config, lr, weight_decay, optim_type, net_config, train_config, net, model = True)

model_name = filename.removesuffix('.csv')

models_path = os.path.join("results", dirname, "models", "")
all_model_names = os.listdir(models_path)
print(models_path)
#print(all_model_names)
for iter in range(iterations):
    seed = int(iter + 1)
    string_id = f"seed-{seed}.pth"
    search_name = model_name + str(string_id) 
    search_name_no_folder = search_name.removeprefix(models_path)
    if search_name_no_folder in all_model_names:
        state_dict = torch.load(search_name, weights_only=True)
        net_list.append(copy.deepcopy(state_dict))
    else: print(search_name_no_folder)

210 512
512 256
256 2
results\tox21\models\
SR-ARE_DSNN_beta-0.95_desc_210_l1512_l2256_t10_e50_b16_lr0.0001_count_loss_Adam_wd0_biasseed-1.pth
SR-ARE_DSNN_beta-0.95_desc_210_l1512_l2256_t10_e50_b16_lr0.0001_count_loss_Adam_wd0_biasseed-2.pth
SR-ARE_DSNN_beta-0.95_desc_210_l1512_l2256_t10_e50_b16_lr0.0001_count_loss_Adam_wd0_biasseed-3.pth
SR-ARE_DSNN_beta-0.95_desc_210_l1512_l2256_t10_e50_b16_lr0.0001_count_loss_Adam_wd0_biasseed-4.pth
SR-ARE_DSNN_beta-0.95_desc_210_l1512_l2256_t10_e50_b16_lr0.0001_count_loss_Adam_wd0_biasseed-5.pth
SR-ARE_DSNN_beta-0.95_desc_210_l1512_l2256_t10_e50_b16_lr0.0001_count_loss_Adam_wd0_biasseed-6.pth
SR-ARE_DSNN_beta-0.95_desc_210_l1512_l2256_t10_e50_b16_lr0.0001_count_loss_Adam_wd0_biasseed-7.pth
SR-ARE_DSNN_beta-0.95_desc_210_l1512_l2256_t10_e50_b16_lr0.0001_count_loss_Adam_wd0_biasseed-8.pth
SR-ARE_DSNN_beta-0.95_desc_210_l1512_l2256_t10_e50_b16_lr0.0001_count_loss_Adam_wd0_biasseed-9.pth
SR-ARE_DSNN_beta-0.95_desc_210_l1512_l2256_t10_e50_b16_lr0.0001_c

In [17]:
import torch.nn as nn
import os
def make_filename(dirname, target, net_type, data_config, lr, wd, optim_type, net_config, train_config, net, model = False):
    results_dir = os.path.join("results", dirname, "")
    if model:
        results_dir = os.path.join(results_dir, "models", "")

    data_str = [] 
    if data_config["repr_type"] == 'descriptor':
        data_str.append("desc")
    elif data_config["repr_type"] == 'fp':
        data_str.append(data_config['fp_type'])
        if data_config['fp_type'] == 'morgan':
            data_str.append(f"r-{data_config['radius']}")
        if data_config['mix']:
            data_str.append(data_config['fp_type_2'])
        if net_config['2d']:
            data_str.append("2D")
            
    input_size = net_config['input_size']
    if isinstance(input_size, int):
        input_size_l = [input_size]
    else: input_size_l = input_size
    params = [
        None if dirname == 'BBBP' else target, 
        net_type, 
        f"beta-{net_config['beta']}",
        *(
            ["desc"] if data_config["repr_type"] == 'descriptor' else
            (
                [data_config['repr_type']] if data_config["repr_type"] != 'fp' else
                [data_config['fp_type']] + 
                (['r-' + f"{data_config['radius']}"] if data_config['fp_type'] == 'morgan' else []) +
                ([data_config['fp_type_2']] if data_config['mix'] else []) +
                (["2D"] if net_config['2d'] else [])
            )
        ),
        *input_size_l,
        None if net_type == "CSNN" else f"l1{net_config['num_hidden']}",
        None if net_type != "DSNN" else f"l2{net_config['num_hidden_l2']}",
        None if net_type != "CSNN" else "out-" + "-".join(str(layer.out_channels) for layer in net.layers if isinstance(layer, (nn.Conv1d, nn.Conv2d))),
        None if net_type != "CSNN" else f"kernel-{net.conv_kernel}",
        None if net_type != "CSNN" else f"stride-{net.conv_stride}",
        f"t{net_config['num_steps']}",
        f"e{train_config['num_epochs']}",
        f"b{train_config['batch_size']}",
        f"lr{lr}",
        train_config['loss_type'],
        optim_type,
        f"wd{wd}",
        None if net_config['spike_grad'] is None else f"sig-{net_config['slope']}",
        "no-bias" if not net_config['bias'] else "bias",
        None if net_config['out_num'] == 2 else f"pop-{net_config['out_num']}",
    ]

    filename = results_dir + "_".join(str(p) for p in params if p is not None) + ".csv"
    return filename

In [21]:
for iter in range(iterations):
    #print(f"Iteration:{iter + 1}/{iterations}")
    seed = iter + 1
    print(f"Seed:{seed} -> ",end='', flush=True)
    random.seed(seed)

    # DATA SPLIT
    train, val, test = data_splitter(df, target_name, split=split, dataset=dataset, data_config=data_config, seed=seed, dtype=dtype)
    _, train_label = train[:]
    _, val_label = val[:]
    _, test_label = test[:]
    train_loader = DataLoader(train, batch_size=batch_size, shuffle=True, pin_memory=pin_memory, drop_last=drop_last)
    val_loader = DataLoader(val, batch_size=batch_size, shuffle=False, pin_memory=pin_memory)
    test_loader = DataLoader(test, batch_size=batch_size, shuffle=False, pin_memory=pin_memory)

    # TESTING
    model = net
    best_test_auc = 0
    best_epoch = 0

    model.load_state_dict(net_list[iter])
    model.to(device)
    all_preds, all_targets = test_net(model, device, test_loader, train_config)
    calc_metrics(results, all_preds=all_preds, all_targets=all_targets)
    print()

Seed:1 -> acc: 0.7221269296740995 auc: 0.4949627823408625

Seed:2 -> acc: 0.7324185248713551 auc: 0.5250150693188668

Seed:3 -> acc: 0.8593481989708405 auc: 0.5050995569835938

Seed:4 -> acc: 0.8078902229845626 auc: 0.49346744934980225

Seed:5 -> acc: 0.8456260720411664 auc: 0.49497991967871485

Seed:6 -> acc: 0.8439108061749572 auc: 0.5080685147622267

Seed:7 -> 

IndexError: list index out of range

#### Save Metrics

In [22]:
metrics_np = np.zeros(12)

for i, metric in enumerate(results):
    metrics_np[i*2] = np.round(np.mean(metric), 3)
    metrics_np[i*2+1] = np.round(np.std(metric), 3)

# Print Results
print(f"Accuracy:  {metrics_np[0]:.3f} ± {metrics_np[1]:.3f}")
print(f"AUC ROC: {metrics_np[2]:.3f} ± {metrics_np[3]:.3f}")
print(f"Sensitivity: {metrics_np[4]:.3f} ± {metrics_np[5]:.3f}")
print(f"Specificity: {metrics_np[6]:.3f} ± {metrics_np[7]:.3f}")


Accuracy:  0.802 ± 0.055
AUC ROC: 0.504 ± 0.011
Sensitivity: 0.073 ± 0.092
Specificity: 0.934 ± 0.080


In [None]:
print(net)

CSNNet(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (layers): ModuleList(
    (0): Conv1d(1, 8, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): Leaky()
    (2): Conv1d(8, 8, kernel_size=(3,), stride=(1,), padding=(1,))
    (3): Leaky()
    (4): Linear(in_features=2048, out_features=2, bias=True)
    (5): Leaky()
  )
  (fc_out): Linear(in_features=2048, out_features=2, bias=True)
  (lif_out): Leaky()
)


In [None]:

metric_names = ['Acc', 'AUC', 'Sn', 'Sp', 'F1', 'Precision']
metrics_np = metrics_np.reshape(1, -1)
columns = []
for name in metric_names:
    columns.extend([f'Mean {name}', f'Std {name}'])


df_metrics = pd.DataFrame(metrics_np, columns=columns)
num_hidden = net_config['num_hidden']
time_steps = train_config['num_steps']
num_epochs = train_config['num_epochs']

save = True
filename = make_filename(dirname, target_name, net_type, data_config, lr, weight_decay, optim_type, net_config, train_config, model)
if save: df_metrics.to_csv(filename, index=False)

print(filename)

results\tox21\SR-ARE_CSNN_beta-0.95_maccs_morgan_[1024]_out-8-8_kernel-3_stride-1_t10_e1000_b16_lr0.0001_ce_mem_Adam_wd0_bias.csv
