In [1]:
import pandas as pd
import numpy as np
import random
from rdkit import Chem
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, random_split
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, f1_score, precision_score
from sklearn.utils.class_weight import compute_class_weight
from mlp import MLP


In [2]:
from utils import load_dataset_df, fp_generator

#### Load DataFrame

In [3]:
files = ['tox21.csv','sider.csv', 'BBBP.csv']
dt_file = files[2]

df, targets = load_dataset_df(filename=dt_file)
print(targets)

target_name = targets[0]
df = df[[target_name, 'smiles']].dropna()


['p_np']


#### SMILE to Fingerprint

In [4]:
fp_types = [['morgan', 1024], ['maccs', 167], ['RDKit', 1024]]
fp_type, num_bits = fp_types[0]
print(fp_type, '-', num_bits)
num_rows = len(df)
fp_array = np.zeros((num_rows, num_bits))
target_array = np.zeros((num_rows, 1))
i = 0

img = None
# Smile to Fingerprint of size {num_bits}
fp_gen = fp_generator(fp_type)
for idx, row in df.iterrows():
    mol = Chem.MolFromSmiles(row['smiles'])
    #TODO: sanitize molecules to remove the warnings (?)
    
    if mol is not None:
        fingerprint = fp_gen(mol)

        fp_array[i] = np.array(fingerprint)
        target_array[i] = row[target_name]
        i += 1
target_array = target_array.ravel()

morgan - 1024


[12:02:37] Explicit valence for atom # 1 N, 4, is greater than permitted
[12:02:37] Explicit valence for atom # 6 N, 4, is greater than permitted
[12:02:37] Explicit valence for atom # 6 N, 4, is greater than permitted
[12:02:38] Explicit valence for atom # 11 N, 4, is greater than permitted
[12:02:38] Explicit valence for atom # 12 N, 4, is greater than permitted
[12:02:38] Explicit valence for atom # 5 N, 4, is greater than permitted
[12:02:38] Explicit valence for atom # 5 N, 4, is greater than permitted
[12:02:38] Explicit valence for atom # 5 N, 4, is greater than permitted
[12:02:38] Explicit valence for atom # 5 N, 4, is greater than permitted
[12:02:38] Explicit valence for atom # 5 N, 4, is greater than permitted
[12:02:38] Explicit valence for atom # 5 N, 4, is greater than permitted


In [5]:
# Create Torch Dataset
dtype = torch.float32
fp_tensor = torch.tensor(fp_array, dtype=dtype)
target_tensor = torch.tensor(target_array, dtype=dtype).long()

dataset = TensorDataset(fp_tensor, target_tensor)

#### Train/Test Split

In [6]:
input_size = num_bits
hidden_size = 2048
output_size = 1
learning_rate = 1e-4
num_epochs = 100
batch_size = 32
device = "cpu"

In [7]:
generator = torch.Generator().manual_seed(1)
train, val,  test = random_split(dataset, [0.8, 0.1, 0.1], generator=generator)

_, train_label = train[:]
_, val_label = val[:]
_, test_label = test[:]

train_loader = DataLoader(train, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(test, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test, batch_size=batch_size, shuffle=True)


#### Loss Function

In [8]:

use_weights = True

#Loss Function
if use_weights:
    class_weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=np.array(train_label))
    class_weights = torch.tensor(class_weights, dtype=torch.float)
    class_weights[0] += 1
    weighted = 'class_weights'
else: 
    class_weights=None
    weighted = ''


In [9]:
random.seed(1)

In [14]:
criterion = nn.BCEWithLogitsLoss(pos_weight=class_weights[0] / class_weights[1])
model = MLP(input_size, hidden_size, output_size)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


In [15]:
model.train()
for epoch in range(num_epochs):
    for data, targets in train_loader:  # Ensure targets are floats for BCEWithLogitsLoss
        targets = targets.float()  # Convert targets to float for binary classification

        # Forward pass
        outputs = model(data).squeeze(1)  # Remove extra dimensions from output
        loss = criterion(outputs, targets)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

KeyboardInterrupt: 

In [None]:
model.eval()
total_loss = 0
all_preds = []
all_targets = []

with torch.no_grad():  # Disable gradient computation
    for data, targets in test_loader:
        targets = targets.float().to(device)
        data = data.to(device)
        outputs = model(data).squeeze(1)
        
        # Apply sigmoid to get probabilities
        probs = torch.sigmoid(outputs)
        predictions = (probs >= 0.5).float()  # Convert to binary predictions
        
        # Calculate loss
        loss = criterion(outputs, targets)
        total_loss += loss.item()
        
        all_preds.extend(predictions.cpu().numpy())
        all_targets.extend(targets.cpu().numpy())

accuracy = accuracy_score(all_targets, all_preds)
auc_roc = roc_auc_score(all_targets, all_preds)
tn, fp, fn, tp = confusion_matrix(all_targets, all_preds).ravel()
sensitivity = tp/(tp + fn)
specificity = tn/(tn + fp)

print(accuracy, auc_roc, sensitivity, specificity)


0.8585365853658536 0.7968000958772771 0.9328859060402684 0.6607142857142857
