In [3]:
#data manipulation
import math
import pandas as pd
import numpy as np

#Pytorch geometric
import torch   
import torch_geometric
from torch import nn, Tensor
from torch_geometric.data import Data
from torch.utils.data import Dataset, DataLoader
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GATConv
from torch.nn import Linear, BCELoss, LSTM, Dropout
import torch_geometric.transforms as T
import torch.nn.functional as F

from torch_geometric.nn import global_mean_pool as gap,  global_max_pool as gmp, global_add_pool as gsp

#Pytorch NLP
from torch.nn import TransformerEncoderLayer,TransformerEncoder, Embedding

#rdkit
from rdkit import Chem                      
from rdkit.Chem import GetAdjacencyMatrix       
from scipy.sparse import coo_matrix
from rdkit.Chem import AllChem
from rdkit import Chem, DataStructs

#matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

#chemprop
import chemprop
from chemprop.args import TrainArgs, PredictArgs
from chemprop.train import cross_validate, run_training, make_predictions

#sklearn
import sklearn
from sklearn.metrics import classification_report, roc_auc_score

#shuffle
from random import shuffle

#for word embeddings
import re
import gensim
from torchtext.vocab import build_vocab_from_iterator

from torch.nn.utils.rnn import pad_sequence

#import the pretrained word-embeddings
from gensim.models import KeyedVectors
wv = KeyedVectors.load("dir_to_wordvectors.kv", mmap='r')

#GPU
import gc

## Utility Function

### Chem

In [5]:
def onek_encoding_unk(value, choices):
    """
    Creates a one-hot encoding with an extra category for uncommon values.

    :param value: The value for which the encoding should be one.
    :param choices: A list of possible values.
    :return: A one-hot encoding of the :code:`value` in a list of length :code:`len(choices) + 1`.
             If :code:`value` is not in :code:`choices`, then the final element in the encoding is -1.
    """
    encoding = [0] * (len(choices) + 1)
    index = choices.index(value) if value in choices else -1
    encoding[index] = 1

    return encoding

In [6]:
class Featurization_parameters:
    """
    A class holding molecule featurization parameters as attributes.
    """
    def __init__(self) -> None:

        # Atom feature sizes
        self.MAX_ATOMIC_NUM = 100
        #for one-hot-encoding
        self.ATOM_FEATURES = {
            'atomic_num': list(range(self.MAX_ATOMIC_NUM)),
            'degree': [0, 1, 2, 3, 4, 5],
            'formal_charge': [-1, -2, 1, 2, 0],
            'chiral_tag': [0, 1, 2, 3],
            'num_Hs': [0, 1, 2, 3, 4],
            'hybridization': [
                Chem.rdchem.HybridizationType.SP,
                Chem.rdchem.HybridizationType.SP2,
                Chem.rdchem.HybridizationType.SP3,
                Chem.rdchem.HybridizationType.SP3D,
                Chem.rdchem.HybridizationType.SP3D2
            ],
        }

        # Distance feature sizes
        self.PATH_DISTANCE_BINS = list(range(10))
        self.THREE_D_DISTANCE_MAX = 20
        self.THREE_D_DISTANCE_STEP = 1
        self.THREE_D_DISTANCE_BINS = list(range(0, self.THREE_D_DISTANCE_MAX + 1, self.THREE_D_DISTANCE_STEP))

        # len(choices) + 1 to include room for uncommon values; + 2 at end for IsAromatic and mass
        self.ATOM_FDIM = sum(len(choices) + 1 for choices in self.ATOM_FEATURES.values()) + 2
        self.EXTRA_ATOM_FDIM = 0
        self.BOND_FDIM = 14
        self.EXTRA_BOND_FDIM = 0
        self.REACTION_MODE = None
        self.EXPLICIT_H = False
        self.REACTION = False

In [7]:
PARAMS = Featurization_parameters()

In [8]:
def atom_features(atom: Chem.rdchem.Atom, functional_groups=None):
    """
    Builds a feature vector for an atom.

    :param atom: An RDKit atom.
    :param functional_groups: A k-hot vector indicating the functional groups the atom belongs to.
    :return: A list containing the atom features.
    """
    if atom is None:
        features = [0] * PARAMS.ATOM_FDIM
    else:
        features = onek_encoding_unk(atom.GetAtomicNum() - 1, PARAMS.ATOM_FEATURES['atomic_num']) + \
            onek_encoding_unk(atom.GetTotalDegree(), PARAMS.ATOM_FEATURES['degree']) + \
            onek_encoding_unk(atom.GetFormalCharge(), PARAMS.ATOM_FEATURES['formal_charge']) + \
            onek_encoding_unk(int(atom.GetChiralTag()), PARAMS.ATOM_FEATURES['chiral_tag']) + \
            onek_encoding_unk(int(atom.GetTotalNumHs()), PARAMS.ATOM_FEATURES['num_Hs']) + \
            onek_encoding_unk(int(atom.GetHybridization()), PARAMS.ATOM_FEATURES['hybridization']) + \
            [1 if atom.GetIsAromatic() else 0] + \
            [atom.GetMass() * 0.01]  # scaled to about the same range as other features
        if functional_groups is not None:
            features += functional_groups
    return features

In [9]:
def bond_features(bond: Chem.rdchem.Bond):
    """
    Builds a feature vector for a bond.

    :param bond: An RDKit bond.
    :return: A list containing the bond features.
    """
    if bond is None:
        fbond = [1] + [0] * (PARAMS.BOND_FDIM - 1)
    else:
        bt = bond.GetBondType()
        fbond = [
            0,  # bond is not None
            bt == Chem.rdchem.BondType.SINGLE,
            bt == Chem.rdchem.BondType.DOUBLE,
            bt == Chem.rdchem.BondType.TRIPLE,
            bt == Chem.rdchem.BondType.AROMATIC,
            (bond.GetIsConjugated() if bt is not None else 0),
            (bond.IsInRing() if bt is not None else 0)
        ]
        fbond += onek_encoding_unk(int(bond.GetStereo()), list(range(6)))
    return fbond

In [10]:
MORGAN_RADIUS = 2
MORGAN_NUM_BITS = 2048
#a vector representation (1x2048) for molecular feature 

In [11]:
def morgan_binary_features_generator(mol,
                                     radius: int = MORGAN_RADIUS,
                                     num_bits: int = MORGAN_NUM_BITS):
    """
    Generates a binary Morgan fingerprint for a molecule.
    :param mol: A molecule (i.e., either a SMILES or an RDKit molecule).
    :param radius: Morgan fingerprint radius.
    :param num_bits: Number of bits in Morgan fingerprint.
    :return: A 1D numpy array containing the binary Morgan fingerprint.
    """
    mol = Chem.MolFromSmiles(mol) if type(mol) == str else mol
    features_vec = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=num_bits)
    features = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(features_vec, features)

    return features


### NLP

In [14]:
def pad_tokens(sentences, pad_token = '<pad>'):
    """
    A function that pads a list of token lists to the maximum length with providedpadding token (pad_token) 
    """
    max_len = max(len(item) for item in sentences)
    padded = [sentence + [pad_token]*(max_len - len(sentence)) for sentence in sentences]
    
    return padded
    

In [15]:
SMI_REGEX_PATTERN = r"""(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|
#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"""

class BasicSmilesTokenizer(object):
    """
    Run basic SMILES tokenization using a regex pattern developed by Schwaller et. al. This tokenizer is to be used
    when a tokenizer that does not require the transformers library by HuggingFace is required.
    Examples
    --------
    >>> from deepchem.feat.smiles_tokenizer import BasicSmilesTokenizer
    >>> tokenizer = BasicSmilesTokenizer()
    >>> print(tokenizer.tokenize("CC(=O)OC1=CC=CC=C1C(=O)O"))
    ['C', 'C', '(', '=', 'O', ')', 'O', 'C', '1', '=', 'C', 'C', '=', 'C', 'C', '=', 'C', '1', 'C', '(', '=', 'O', ')', 'O']
    References
    ----------
    .. [1]  Philippe Schwaller, Teodoro Laino, Théophile Gaudin, Peter Bolgar, Christopher A. Hunter, Costas Bekas, and Alpha A. Lee
            ACS Central Science 2019 5 (9): Molecular Transformer: A Model for Uncertainty-Calibrated Chemical Reaction Prediction
            1572-1583 DOI: 10.1021/acscentsci.9b00576
    """

    def __init__(self, regex_pattern: str = SMI_REGEX_PATTERN):
        """ Constructs a BasicSMILESTokenizer.
        Parameters
        ----------
        regex: string
            SMILES token regex
        """
        self.regex_pattern = regex_pattern
        self.regex = re.compile(self.regex_pattern)

    def tokenize(self, text):
        """ Basic Tokenization of a SMILES.
        """
        tokens = [token for token in self.regex.findall(text)]
        return tokens


In [16]:
tokenizer = BasicSmilesTokenizer()

In [17]:
def numericalize(smile):
    """Converts smile into a flat Tensor with indicies of token in the vocab of our pretrained w2v model."""
    #data = [vocab[item] for item in tokenizer.tokenize(smile)]
    data = [wv.get_index(item) for item in tokenizer.tokenize(smile)] 
    
    return data

In [18]:
class PositionalEncoding(nn.Module):
    """
    A pytroch Module object: 
    The positional embeddings of Transformer model.
    Input:  x: Tensor, shape [seq_len, batch_size, embedding_dim]
    
    Output: Sum of the original x and positional signal
    """
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

### Data processing

In [19]:
def data_process(dataset,batch_size):
    """Convert dataset into dataloader with following attributes
       x: atom feature matrix of molecule
       y: label of molecule
       edge_index: [2, num_edges], dtype=torch.Long, matrix that captures connection among atoms in molecule
       edge_attr: edge featrue matrix of molecule
       smi: a Tensor indicating indicies of tokens in vocab for a SMILES (with paddings)
       
       Input: A dataframe with a column SMILES, batch_size 
       
       Output: A dataloader with attributes above
    """
    SMILES = dataset['SMILES']
    
    data_list = []
    
    for smile in SMILES:
        
        mol = Chem.MolFromSmiles(smile)     
        mol = Chem.AddHs(mol) 
        
        mol_feature = torch.tensor(morgan_binary_features_generator(mol))
        
        #dtype is numpy, list of lists will be produced in batch
        idx_list = numericalize(smile)
        
        xs = []
        for atom in mol.GetAtoms():
            x = atom_features(atom)
            xs.append(x)
            
        x = torch.tensor(xs)
        
        edge_indices, edge_attrs = [], []
        
        for bond in mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()
    
            e = bond_features(bond)

            edge_indices += [[i,j],[j,i]]
            edge_attrs += [e, e]
        
        edge_index = torch.tensor(edge_indices)
        edge_index = edge_index.t().to(torch.long).view(2, -1)
        edge_attr = torch.tensor(edge_attrs).view(-1, 14)
        
        y = torch.tensor(int(dataset.loc[dataset['SMILES'] == smile,'Activity'])) #response variable y

        # add smiles and num_feature as the attributes
        data = Data(x=x, y=y, edge_index=edge_index, edge_attr = edge_attr, emb = idx_list, mol_feature= mol_feature)  
        data_list.append(data)   # store processed data into the list
        
    return DataLoader(data_list,batch_size,shuffle=True)

### Training

In [None]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
            trace_func (function): trace print function.
                            Default: print            
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func
    def __call__(self, val_loss, model):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [20]:
def reset_weights(m):
  '''
    Try resetting model weights to avoid
    weight leakage.
  '''
  for layer in m.children():
    if hasattr(layer, 'reset_parameters'):
        print(f'Reset trainable parameters of layer = {layer}')
        layer.reset_parameters()

In [21]:
def train(epoch,train_loader):
    """
    A function to train model on training set in each epoch.
    1. Optimization/
    2. Compute and store training losses and accuracies in lists train_loss and train_accuracy, respectively, for each epoch. 
    3. Report training losses and accuracies every 10 epochs.
    
    """
    
    model.train()   
    
    running_loss = 0 
    correct = 0
    total = 0
    criterion = BCELoss()
    
    for batch in train_loader:
        
        optimizer.zero_grad()
        
        outputs = model(batch.to(device))
        label = batch.y.view(-1,1).to(device)
        
        loss = criterion(outputs.float(),label.float())
        

        loss.backward()   # Compute the gradient of loss function 
        optimizer.step()  # Update parameters based on gradients.
        
        running_loss += loss.item()
        
        # probability that is larger than 0.5, classify as 1 
        pred = (outputs >= 0.5).float()

        total += label.size(0)
        correct += (pred == label).float().sum()
        
    #compute losses and accuracies for entire epoch
    loss = running_loss/len(train_loader)
    accuracy = 100*correct/total
    
    #to record training losses and accuracies
    train_accuracy.append(accuracy)
    train_loss.append(loss)
    
    #report training losses for every 10 epochs
    if epoch % 10 == 0:
        print('Epoch: '+str(int(epoch)))
        print('Train Loss: %.3f | Accuracy: %.3f'%(loss,accuracy))

In [22]:
def test(epoch,test_loader):
    """
    A function to evaluate model on test set in each epoch.
    
    1. Compute and store test losses and accuracies in lists test_loss and test_accuracy, respectively, for each epoch. 
    2. Report test losses and accuracies every 10 epochs.
    
    """
    model.eval()
    
    running_loss = 0
    correct = 0
    total = 0
    criterion = BCELoss()
    
    with torch.no_grad():
        
        for batch in test_loader:            
            
            outputs = model(batch.to(device))
            label = batch.y.view(-1,1).to(device)

            loss = criterion(outputs.float(), label.float())    
            running_loss += loss.item()
            
            # probability that is larger than 0.5, classify as 1 
            pred = (outputs >= 0.5).float()

            total += label.size(0)
            correct += (pred == label).float().sum()
    
        loss = running_loss/len(test_loader)
        accuracy = 100*correct/total
    
        test_accuracy.append(accuracy)
        test_loss.append(loss)
        if epoch % 10 == 0:
            print('Test Loss: %.3f | Accuracy: %.3f'%(loss,accuracy))

In [23]:
#test_set as a whole loader
def test_metrics(test_loader):
    """
    A function to evaluate performance metrics of model on test set.
    
    Compute and return AUC-ROC and classficiation report(Accuracy, weighted average of F1-score, recall and precision ) of the model

    """
    model.eval()

    with torch.no_grad():
        labels = []
        preds = []
        
        for batch in test_loader:
            
            labels += list(batch.y.view(-1,1).numpy())
            preds += list(model(batch.to(device)).cpu().detach().numpy())
        
        pred_labels = [1 if i > 0.5 else 0 for i in preds]
        
        auc = roc_auc_score(list(labels), list(preds), average='weighted')
        report = classification_report(labels, pred_labels,output_dict=True)
        
        return auc, report
    

In [24]:
def print_metrics(metrics):
    AUC = [] #0
    precision = []
    recall = []
    f1_score = []
    accuracy = []
    for i in metrics:
        AUC.append(i[0])
        precision.append(i[1]['weighted avg']['precision'])
        recall.append(i[1]['weighted avg']['recall'])
        f1_score.append(i[1]['weighted avg']['f1-score'])
        accuracy.append(i[1]['accuracy'])
    
    print('AUC:',np.mean(AUC),'+/-',np.std(AUC))
    print('Accuracy:',np.mean(accuracy),'+/-',np.std(accuracy))
    print('Precision:',np.mean(precision),'+/-',np.std(precision))
    print('Recall:',np.mean(recall),'+/-',np.std(recall))
    print('F1-score:',np.mean(f1_score),'+/-',np.std(f1_score))
    
    

### Combination with 1 

### GAT 

In [23]:
class GAT(torch.nn.Module):
    def __init__(self):
        super(GAT, self).__init__()
    
        #before the Attention Mechanism, parse in a fully connected network and output a 50 dimensional vector 
        self.hidden = 50  
        self.in_head = 5 #repeat the mechanism for 5 times
        
        self.conv1 = GATConv(in_channels = 133, 
                             out_channels = self.hidden,
                             heads=self.in_head, concat=False) #set concat to be False, so it will take average instead
        
        self.linear1 = Linear(self.hidden, 100)
        self.linear2 = Linear(100, 25)
        self.linear3 = Linear(25, 10)
        self.linear4 = Linear(10, 1)

    def forward(self, data):
        
        x, edge_index, batch_index = data.x, data.edge_index, data.batch        
        #extract batched node and edge feature matrices, edge_index matrix, batch_index and token indices tensor 
        
        x = self.conv1(x.float(), edge_index)

        #aggregate the learned local node feature to capture the global property 
        #here, we apply global_mean_pool over the updated node feature
        x = gap(x,batch_index)

        # now train a fully connected graph classification network  
        x = F.relu(self.linear1(x.float()))
        x = F.relu(self.linear2(x))
        x = F.relu(self.linear3(x))
        x = torch.sigmoid(self.linear4(x))
        
        return x

In [27]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [25]:
train_losses = []
train_acc = []
#test_acc = []
test_losses= []
metrics = []
#for each fold

for i in range(5):
    
    gc.collect()
    torch.cuda.empty_cache()
    print('Split '+str(i+1)+' ......')
    train_set = pd.read_csv('dir_to_dataset/5_fold_cv_train_test/train_split'+str(i+1)+'.csv')
    test_set = pd.read_csv('dir_to_dataset/5_fold_cv_train_test/test_split'+str(i+1)+'.csv')
    train_loader = data_process(train_set,32)
    test_loader = data_process(test_set,len(test_set))
    
    model = GAT().float().to(device)
    
    model.apply(reset_weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.00075,weight_decay=0.001)
    
    #for each epoch
    train_loss = []
    test_loss = []
    train_accuracy = []
    test_accuracy = []
    
    early_stopping = EarlyStopping(patience=10, verbose=True)
    #update the model parameters with 100 epochs
    for epoch in range(200):
        train(epoch,train_loader)
        test(epoch,test_loader)
        
        early_stopping(test_loss[-1],model)
        
        if early_stopping.early_stop:
            print('Early stopping')
            break
    
    model.load_state_dict(torch.load('checkpoint.pt'))
    auc, report = test_metrics(test_loader)
    metric = [auc,report]
    
    
    train_losses.append(train_loss)
    train_acc.append(train_accuracy)
    test_losses.append(test_loss)
    metrics.append(metric)


Split 1 ......
Reset trainable parameters of layer = Linear(133, 250, bias=False)
Reset trainable parameters of layer = GATConv(133, 50, heads=5)
Reset trainable parameters of layer = Linear(in_features=50, out_features=100, bias=True)
Reset trainable parameters of layer = Linear(in_features=100, out_features=25, bias=True)
Reset trainable parameters of layer = Linear(in_features=25, out_features=10, bias=True)
Reset trainable parameters of layer = Linear(in_features=10, out_features=1, bias=True)
Epoch: 0
Train Loss: 0.782 | Accuracy: 16.667
Test Loss: 0.765 | Accuracy: 16.667
Validation loss decreased (inf --> 0.765339).  Saving model ...
Validation loss decreased (0.765339 --> 0.714188).  Saving model ...
Validation loss decreased (0.714188 --> 0.567170).  Saving model ...
Validation loss decreased (0.567170 --> 0.449595).  Saving model ...
Validation loss decreased (0.449595 --> 0.448404).  Saving model ...
EarlyStopping counter: 1 out of 10
EarlyStopping counter: 2 out of 10
Valid

Validation loss decreased (0.400426 --> 0.398973).  Saving model ...
Validation loss decreased (0.398973 --> 0.397944).  Saving model ...
Validation loss decreased (0.397944 --> 0.397247).  Saving model ...
EarlyStopping counter: 1 out of 10
Validation loss decreased (0.397247 --> 0.396595).  Saving model ...
Epoch: 120
Train Loss: 0.405 | Accuracy: 83.333
Test Loss: 0.404 | Accuracy: 83.333
EarlyStopping counter: 1 out of 10
EarlyStopping counter: 2 out of 10
EarlyStopping counter: 3 out of 10
Validation loss decreased (0.396595 --> 0.393460).  Saving model ...
Validation loss decreased (0.393460 --> 0.391552).  Saving model ...
Validation loss decreased (0.391552 --> 0.391534).  Saving model ...
EarlyStopping counter: 1 out of 10
Validation loss decreased (0.391534 --> 0.390481).  Saving model ...
EarlyStopping counter: 1 out of 10
Validation loss decreased (0.390481 --> 0.389500).  Saving model ...
Epoch: 130
Train Loss: 0.389 | Accuracy: 83.333
Test Loss: 0.389 | Accuracy: 83.333
V

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Reset trainable parameters of layer = Linear(133, 250, bias=False)
Reset trainable parameters of layer = GATConv(133, 50, heads=5)
Reset trainable parameters of layer = Linear(in_features=50, out_features=100, bias=True)
Reset trainable parameters of layer = Linear(in_features=100, out_features=25, bias=True)
Reset trainable parameters of layer = Linear(in_features=25, out_features=10, bias=True)
Reset trainable parameters of layer = Linear(in_features=10, out_features=1, bias=True)
Epoch: 0
Train Loss: 0.748 | Accuracy: 16.667
Test Loss: 0.729 | Accuracy: 16.667
Validation loss decreased (inf --> 0.729457).  Saving model ...
Validation loss decreased (0.729457 --> 0.679433).  Saving model ...
Validation loss decreased (0.679433 --> 0.536359).  Saving model ...
Validation loss decreased (0.536359 --> 0.450162).  Saving model ...
Validation loss decreased (0.450162 --> 0.446878).  Saving model ...
EarlyStopping counter: 1 out of 10
Validation loss decreased (0.446878 --> 0.446292).  Sav

Validation loss decreased (0.406382 --> 0.405666).  Saving model ...
EarlyStopping counter: 1 out of 10
Validation loss decreased (0.405666 --> 0.403189).  Saving model ...
EarlyStopping counter: 1 out of 10
Validation loss decreased (0.403189 --> 0.401462).  Saving model ...
Validation loss decreased (0.401462 --> 0.400029).  Saving model ...
Epoch: 120
Train Loss: 0.423 | Accuracy: 83.333
Test Loss: 0.402 | Accuracy: 83.333
EarlyStopping counter: 1 out of 10
Validation loss decreased (0.400029 --> 0.398200).  Saving model ...
Validation loss decreased (0.398200 --> 0.397220).  Saving model ...
Validation loss decreased (0.397220 --> 0.396225).  Saving model ...
EarlyStopping counter: 1 out of 10
EarlyStopping counter: 2 out of 10
Validation loss decreased (0.396225 --> 0.392988).  Saving model ...
Validation loss decreased (0.392988 --> 0.391743).  Saving model ...
EarlyStopping counter: 1 out of 10
Validation loss decreased (0.391743 --> 0.389867).  Saving model ...
Epoch: 130
Train

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Reset trainable parameters of layer = Linear(133, 250, bias=False)
Reset trainable parameters of layer = GATConv(133, 50, heads=5)
Reset trainable parameters of layer = Linear(in_features=50, out_features=100, bias=True)
Reset trainable parameters of layer = Linear(in_features=100, out_features=25, bias=True)
Reset trainable parameters of layer = Linear(in_features=25, out_features=10, bias=True)
Reset trainable parameters of layer = Linear(in_features=10, out_features=1, bias=True)
Epoch: 0
Train Loss: 0.653 | Accuracy: 83.333
Test Loss: 0.643 | Accuracy: 83.333
Validation loss decreased (inf --> 0.642873).  Saving model ...
Validation loss decreased (0.642873 --> 0.621836).  Saving model ...
Validation loss decreased (0.621836 --> 0.570106).  Saving model ...
Validation loss decreased (0.570106 --> 0.457728).  Saving model ...
Validation loss decreased (0.457728 --> 0.453337).  Saving model ...
Validation loss decreased (0.453337 --> 0.449198).  Saving model ...
Validation loss decre

Validation loss decreased (0.391767 --> 0.390988).  Saving model ...
EarlyStopping counter: 1 out of 10
Epoch: 120
Train Loss: 0.378 | Accuracy: 83.333
Test Loss: 0.394 | Accuracy: 83.333
EarlyStopping counter: 2 out of 10
Validation loss decreased (0.390988 --> 0.389738).  Saving model ...
EarlyStopping counter: 1 out of 10
EarlyStopping counter: 2 out of 10
EarlyStopping counter: 3 out of 10
Validation loss decreased (0.389738 --> 0.389042).  Saving model ...
EarlyStopping counter: 1 out of 10
EarlyStopping counter: 2 out of 10
EarlyStopping counter: 3 out of 10
EarlyStopping counter: 4 out of 10
Epoch: 130
Train Loss: 0.382 | Accuracy: 83.333
Test Loss: 0.430 | Accuracy: 83.333
EarlyStopping counter: 5 out of 10
Validation loss decreased (0.389042 --> 0.388940).  Saving model ...
EarlyStopping counter: 1 out of 10
EarlyStopping counter: 2 out of 10
EarlyStopping counter: 3 out of 10
Validation loss decreased (0.388940 --> 0.388715).  Saving model ...
Validation loss decreased (0.388

Validation loss decreased (0.444077 --> 0.443693).  Saving model ...
Validation loss decreased (0.443693 --> 0.443317).  Saving model ...
EarlyStopping counter: 1 out of 10
Validation loss decreased (0.443317 --> 0.443081).  Saving model ...
Validation loss decreased (0.443081 --> 0.442551).  Saving model ...
Validation loss decreased (0.442551 --> 0.442219).  Saving model ...
Validation loss decreased (0.442219 --> 0.441029).  Saving model ...
Epoch: 60
Train Loss: 0.438 | Accuracy: 83.333
Test Loss: 0.440 | Accuracy: 83.333
Validation loss decreased (0.441029 --> 0.440475).  Saving model ...
Validation loss decreased (0.440475 --> 0.440166).  Saving model ...
Validation loss decreased (0.440166 --> 0.439148).  Saving model ...
Validation loss decreased (0.439148 --> 0.438231).  Saving model ...
Validation loss decreased (0.438231 --> 0.437097).  Saving model ...
EarlyStopping counter: 1 out of 10
Validation loss decreased (0.437097 --> 0.434500).  Saving model ...
Validation loss dec

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Reset trainable parameters of layer = Linear(133, 250, bias=False)
Reset trainable parameters of layer = GATConv(133, 50, heads=5)
Reset trainable parameters of layer = Linear(in_features=50, out_features=100, bias=True)
Reset trainable parameters of layer = Linear(in_features=100, out_features=25, bias=True)
Reset trainable parameters of layer = Linear(in_features=25, out_features=10, bias=True)
Reset trainable parameters of layer = Linear(in_features=10, out_features=1, bias=True)
Epoch: 0
Train Loss: 0.716 | Accuracy: 16.667
Test Loss: 0.700 | Accuracy: 16.667
Validation loss decreased (inf --> 0.700384).  Saving model ...
Validation loss decreased (0.700384 --> 0.634491).  Saving model ...
Validation loss decreased (0.634491 --> 0.473166).  Saving model ...
Validation loss decreased (0.473166 --> 0.457025).  Saving model ...
Validation loss decreased (0.457025 --> 0.449428).  Saving model ...
Validation loss decreased (0.449428 --> 0.449186).  Saving model ...
Validation loss decre

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
print_metrics(metrics)

AUC: 0.7746527777777776 +/- 0.05371921670898749
Accuracy: 0.8347222222222224 +/- 0.002777777777777768
Precision: 0.717283950617284 +/- 0.04567901234567904
Recall: 0.8347222222222224 +/- 0.002777777777777768
F1-score: 0.7677956030897208 +/- 0.02043969102792631


### MorganFingerprint

In [31]:
class MFP(torch.nn.Module):
    def __init__(self):
        super(MFP, self).__init__()    
        #fully connected layers
        self.linear1 = Linear(2048, 100)
        self.linear2 = Linear(100, 25)
        self.linear3 = Linear(25, 10)
        self.linear4 = Linear(10, 1)

    def forward(self, data):
        
        mol_feature = data.mol_feature
        x = mol_feature.view(data.num_graphs,2048)
        x = F.relu(self.linear1(x.float()))
        x = F.relu(self.linear2(x))
        x = F.relu(self.linear3(x))
        x = torch.sigmoid(self.linear4(x))
        
        return x

In [32]:
train_losses = []
train_acc = []
#test_acc = []
test_losses= []
metrics = []
#for each fold

for i in range(5):
    
    gc.collect()
    torch.cuda.empty_cache()
    print('Split '+str(i+1)+' ......')
    train_set = pd.read_csv('dir_to_dataset/5_fold_cv_train_test/train_split'+str(i+1)+'.csv')
    test_set = pd.read_csv('dir_to_dataset/5_fold_cv_train_test/test_split'+str(i+1)+'.csv')
    train_loader = data_process(train_set,32)
    test_loader = data_process(test_set,len(test_set))
    
    model = MFP().float().to(device)
    
    model.apply(reset_weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001,weight_decay=0.001)
    
    #for each epoch
    train_loss = []
    test_loss = []
    train_accuracy = []
    test_accuracy = []
    
    early_stopping = EarlyStopping(patience=10, verbose=True)
    #update the model parameters with 100 epochs
    for epoch in range(100):
        train(epoch,train_loader)
        test(epoch,test_loader)
        
        early_stopping(test_loss[-1],model)
        
        if early_stopping.early_stop:
            print('Early stopping')
            break
    
    model.load_state_dict(torch.load('checkpoint.pt'))
    auc, report = test_metrics(test_loader)
    metric = [auc,report]
    
    
    train_losses.append(train_loss)
    train_acc.append(train_accuracy)
    test_losses.append(test_loss)
    metrics.append(metric)

Split 1 ......
Reset trainable parameters of layer = Linear(in_features=2048, out_features=100, bias=True)
Reset trainable parameters of layer = Linear(in_features=100, out_features=25, bias=True)
Reset trainable parameters of layer = Linear(in_features=25, out_features=10, bias=True)
Reset trainable parameters of layer = Linear(in_features=10, out_features=1, bias=True)
Epoch: 0
Train Loss: 0.686 | Accuracy: 86.285
Test Loss: 0.683 | Accuracy: 88.889
Validation loss decreased (inf --> 0.682557).  Saving model ...
Validation loss decreased (0.682557 --> 0.670037).  Saving model ...
Validation loss decreased (0.670037 --> 0.648369).  Saving model ...
Validation loss decreased (0.648369 --> 0.637158).  Saving model ...
EarlyStopping counter: 1 out of 10
EarlyStopping counter: 2 out of 10
EarlyStopping counter: 3 out of 10
EarlyStopping counter: 4 out of 10
Validation loss decreased (0.637158 --> 0.634424).  Saving model ...
Validation loss decreased (0.634424 --> 0.625924).  Saving model

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Reset trainable parameters of layer = Linear(in_features=2048, out_features=100, bias=True)
Reset trainable parameters of layer = Linear(in_features=100, out_features=25, bias=True)
Reset trainable parameters of layer = Linear(in_features=25, out_features=10, bias=True)
Reset trainable parameters of layer = Linear(in_features=10, out_features=1, bias=True)
Epoch: 0
Train Loss: 0.610 | Accuracy: 83.333
Test Loss: 0.555 | Accuracy: 83.333
Validation loss decreased (inf --> 0.555117).  Saving model ...
Validation loss decreased (0.555117 --> 0.411761).  Saving model ...
Validation loss decreased (0.411761 --> 0.336788).  Saving model ...
Validation loss decreased (0.336788 --> 0.306883).  Saving model ...
Validation loss decreased (0.306883 --> 0.277751).  Saving model ...
Validation loss decreased (0.277751 --> 0.254807).  Saving model ...
EarlyStopping counter: 1 out of 10
EarlyStopping counter: 2 out of 10
EarlyStopping counter: 3 out of 10
EarlyStopping counter: 4 out of 10
Epoch: 10


In [33]:
print_metrics(metrics)

AUC: 0.8888194444444444 +/- 0.02854360969325686
Accuracy: 0.8972222222222224 +/- 0.03497132864448381
Precision: 0.8665271456368844 +/- 0.08769662743191106
Recall: 0.8972222222222224 +/- 0.03497132864448381
F1-score: 0.877636318589845 +/- 0.06175434808390926


### Transformer

In [26]:
class Transformer(torch.nn.Module):
    def __init__(self):
        super(Transformer, self).__init__()
        
        #vocab_len to decide how many unique 'word', 1024 the dimension of each embeddings
        self.embedding = Embedding.from_pretrained(torch.from_numpy(wv.vectors),freeze=False)
        
        encoder_layer = TransformerEncoderLayer(d_model=1024, nhead=4)
        
        self.encoder = TransformerEncoder(encoder_layer,2)
        self.pos_encoder = PositionalEncoding(1024, 0.1)
        
        #fully connected layers
        self.linear1 = Linear(1024, 100)
        self.linear2 = Linear(100, 25)
        self.linear3 = Linear(25, 10)
        self.linear4 = Linear(10, 1)

    def forward(self, data):
        #list of variable length lists of indices
        wv_index = data.emb
        wv_index = [torch.tensor(item) for item in data.emb]
        
        padded = pad_sequence(wv_index,padding_value = 266)
        # seq_len x batch_size 
        
        fp = self.embedding(padded.to(device))
        #seq_len x batch_size x dim_embedding
        
        fp = self.pos_encoder(fp)
        fp = self.encoder(fp)
        #seq_len x batch_size x dim_embedding
        #learned smile representation 
        
        x = fp.sum(dim=0)
        # batch_size x 512 
        
        # now train a fully connected graph classification network  
        x = F.relu(self.linear1(x.float()))
        x = F.relu(self.linear2(x))
        x = F.relu(self.linear3(x))
        x = torch.sigmoid(self.linear4(x))
        
        return x

In [27]:
train_losses = []
train_acc = []
#test_acc = []
test_losses= []
metrics = []
#for each fold

for i in range(5):
    
    gc.collect()
    torch.cuda.empty_cache()
    print('Split '+str(i+1)+' ......')
    train_set = pd.read_csv('dir_to_dataset/5_fold_cv_train_test/train_split'+str(i+1)+'.csv')
    test_set = pd.read_csv('dir_to_dataset/5_fold_cv_train_test/test_split'+str(i+1)+'.csv')
    train_loader = data_process(train_set,32)
    test_loader = data_process(test_set,len(test_set))
    
    model = Transformer().float().to(device)
    
    model.apply(reset_weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.000075,weight_decay=0.001)
    
    #for each epoch
    train_loss = []
    test_loss = []
    train_accuracy = []
    test_accuracy = []
    
    early_stopping = EarlyStopping(patience=10, verbose=True)
    #update the model parameters with 100 epochs
    for epoch in range(100):
        train(epoch,train_loader)
        test(epoch,test_loader)
        
        early_stopping(test_loss[-1],model)
        
        if early_stopping.early_stop:
            print('Early stopping')
            break
    
    model.load_state_dict(torch.load('checkpoint.pt'))
    auc, report = test_metrics(test_loader)
    metric = [auc,report]
    
    
    train_losses.append(train_loss)
    train_acc.append(train_accuracy)
    test_losses.append(test_loss)
    metrics.append(metric)
    

Split 1 ......
Reset trainable parameters of layer = NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
Reset trainable parameters of layer = Linear(in_features=1024, out_features=2048, bias=True)
Reset trainable parameters of layer = Linear(in_features=2048, out_features=1024, bias=True)
Reset trainable parameters of layer = LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
Reset trainable parameters of layer = LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
Reset trainable parameters of layer = NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
Reset trainable parameters of layer = Linear(in_features=1024, out_features=2048, bias=True)
Reset trainable parameters of layer = Linear(in_features=2048, out_features=1024, bias=True)
Reset trainable parameters of layer = LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
Reset trainable parameters of layer = LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
Reset

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Split 3 ......
Reset trainable parameters of layer = NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
Reset trainable parameters of layer = Linear(in_features=1024, out_features=2048, bias=True)
Reset trainable parameters of layer = Linear(in_features=2048, out_features=1024, bias=True)
Reset trainable parameters of layer = LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
Reset trainable parameters of layer = LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
Reset trainable parameters of layer = NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
Reset trainable parameters of layer = Linear(in_features=1024, out_features=2048, bias=True)
Reset trainable parameters of layer = Linear(in_features=2048, out_features=1024, bias=True)
Reset trainable parameters of layer = LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
Reset trainable parameters of layer = LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
Reset

EarlyStopping counter: 1 out of 10
EarlyStopping counter: 2 out of 10
EarlyStopping counter: 3 out of 10
Epoch: 20
Train Loss: 0.229 | Accuracy: 89.583
Test Loss: 0.369 | Accuracy: 82.639
EarlyStopping counter: 4 out of 10
EarlyStopping counter: 5 out of 10
EarlyStopping counter: 6 out of 10
Validation loss decreased (0.301249 --> 0.279616).  Saving model ...
EarlyStopping counter: 1 out of 10
EarlyStopping counter: 2 out of 10
EarlyStopping counter: 3 out of 10
EarlyStopping counter: 4 out of 10
EarlyStopping counter: 5 out of 10
EarlyStopping counter: 6 out of 10
Epoch: 30
Train Loss: 0.147 | Accuracy: 94.444
Test Loss: 0.359 | Accuracy: 83.333
EarlyStopping counter: 7 out of 10
EarlyStopping counter: 8 out of 10
EarlyStopping counter: 9 out of 10
EarlyStopping counter: 10 out of 10
Early stopping


In [28]:
print_metrics(metrics)

AUC: 0.8544791666666667 +/- 0.026545224047897695
Accuracy: 0.8805555555555555 +/- 0.027568657279554435
Precision: 0.8483710769185524 +/- 0.07920078052845107
Recall: 0.8805555555555555 +/- 0.027568657279554435
F1-score: 0.8601858397143568 +/- 0.05395394315015024


## Combination with 2

### Transformer + GAT


In [29]:
class GAT_Transformer(torch.nn.Module):
    def __init__(self):
        super(GAT_Transformer, self).__init__()
    
        #before the Attention Mechanism, parse in a fully connected network and output a 50 dimensional vector 
        self.hidden = 50  
        self.in_head = 5 #repeat the mechanism for 5 times
        
        self.conv1 = GATConv(in_channels = 133, 
                             out_channels = self.hidden,
                             heads=self.in_head, concat=False) #set concat to be False, so it will take average instead
        
        
        #define a transformer encoder layer to learn about smiles string
        #vocab_len to decide how many unique 'word', 1024 the dimension of each embeddings
        self.embedding = Embedding.from_pretrained(torch.from_numpy(wv.vectors),freeze=False)
        
        encoder_layer = TransformerEncoderLayer(d_model=1024, nhead=2)
        
        self.encoder = TransformerEncoder(encoder_layer,4)
        self.pos_encoder = PositionalEncoding(1024, 0.1)
        
        #fully connected layers
        
        self.linear1 = Linear(self.hidden+1024, 100)
        self.linear2 = Linear(100, 25)
        self.linear3 = Linear(25, 10)
        self.linear4 = Linear(10, 1)

    def forward(self, data):
        
        x, edge_index, batch_index, wv_index = data.x, data.edge_index, data.batch, data.emb        
        #extract batched node and edge feature matrices, edge_index matrix, batch_index and token indices tensor 
        
        wv_index = [torch.tensor(item) for item in wv_index]
        
        padded = pad_sequence(wv_index,padding_value = 266)
        # seq_len x batch_size 
        
        fp = self.embedding(padded.to(device))
        #seq_len x batch_size x dim_embedding
        
        fp = self.pos_encoder(fp)
        fp = self.encoder(fp)
        #seq_len x batch_size x dim_embedding
        #learned smile representation 
        
        fp = fp.sum(dim=0)
        # batch_size x dim_embedding
        
        x = self.conv1(x.float(), edge_index)

        #aggregate the learned local node feature to capture the global property 
        #here, we apply global_mean_pool over the updated node feature
        x = gap(x,batch_index)
        
        #concatenate x and fp
        x = torch.cat([x, fp],dim=1)

        # now train a fully connected graph classification network  
        x = F.relu(self.linear1(x.float()))
        x = F.relu(self.linear2(x))
        x = F.relu(self.linear3(x))
        x = torch.sigmoid(self.linear4(x))
        
        return x

In [30]:
train_losses = []
train_acc = []
#test_acc = []
test_losses= []
metrics = []
#for each fold

for i in range(5):
    
    gc.collect()
    torch.cuda.empty_cache()
    print('Split '+str(i+1)+' ......')
    train_set = pd.read_csv('dir_to_dataset/5_fold_cv_train_test/train_split'+str(i+1)+'.csv')
    test_set = pd.read_csv('dir_to_dataset/5_fold_cv_train_test/test_split'+str(i+1)+'.csv')
    train_loader = data_process(train_set,32)
    test_loader = data_process(test_set,len(test_set))
    
    model = GAT_Transformer().float().to(device)
    
    model.apply(reset_weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.000075,weight_decay=0.001)
    
    #for each epoch
    train_loss = []
    test_loss = []
    train_accuracy = []
    test_accuracy = []
    
    early_stopping = EarlyStopping(patience=10, verbose=True)
    #update the model parameters with 100 epochs
    for epoch in range(100):
        train(epoch,train_loader)
        test(epoch,test_loader)
        
        early_stopping(test_loss[-1],model)
        
        if early_stopping.early_stop:
            print('Early stopping')
            break
    
    model.load_state_dict(torch.load('checkpoint.pt'))
    auc, report = test_metrics(test_loader)
    metric = [auc,report]
    
    
    train_losses.append(train_loss)
    train_acc.append(train_accuracy)
    test_losses.append(test_loss)
    metrics.append(metric)
    
    


Split 1 ......
Reset trainable parameters of layer = Linear(133, 250, bias=False)
Reset trainable parameters of layer = NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
Reset trainable parameters of layer = Linear(in_features=1024, out_features=2048, bias=True)
Reset trainable parameters of layer = Linear(in_features=2048, out_features=1024, bias=True)
Reset trainable parameters of layer = LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
Reset trainable parameters of layer = LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
Reset trainable parameters of layer = NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
Reset trainable parameters of layer = Linear(in_features=1024, out_features=2048, bias=True)
Reset trainable parameters of layer = Linear(in_features=2048, out_features=1024, bias=True)
Reset trainable parameters of layer = LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
Reset trainable parameters of l

Epoch: 0
Train Loss: 0.705 | Accuracy: 78.993
Test Loss: 0.623 | Accuracy: 83.333
Validation loss decreased (inf --> 0.622803).  Saving model ...
Validation loss decreased (0.622803 --> 0.622479).  Saving model ...
Validation loss decreased (0.622479 --> 0.622114).  Saving model ...
Validation loss decreased (0.622114 --> 0.621754).  Saving model ...
Validation loss decreased (0.621754 --> 0.621393).  Saving model ...
Validation loss decreased (0.621393 --> 0.621027).  Saving model ...
Validation loss decreased (0.621027 --> 0.620661).  Saving model ...
Validation loss decreased (0.620661 --> 0.620297).  Saving model ...
Validation loss decreased (0.620297 --> 0.619939).  Saving model ...
Validation loss decreased (0.619939 --> 0.619591).  Saving model ...
Epoch: 10
Train Loss: 0.619 | Accuracy: 83.333
Test Loss: 0.619 | Accuracy: 83.333
Validation loss decreased (0.619591 --> 0.619223).  Saving model ...
Validation loss decreased (0.619223 --> 0.618861).  Saving model ...
Validation l

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Split 4 ......
Reset trainable parameters of layer = Linear(133, 250, bias=False)
Reset trainable parameters of layer = NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
Reset trainable parameters of layer = Linear(in_features=1024, out_features=2048, bias=True)
Reset trainable parameters of layer = Linear(in_features=2048, out_features=1024, bias=True)
Reset trainable parameters of layer = LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
Reset trainable parameters of layer = LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
Reset trainable parameters of layer = NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
Reset trainable parameters of layer = Linear(in_features=1024, out_features=2048, bias=True)
Reset trainable parameters of layer = Linear(in_features=2048, out_features=1024, bias=True)
Reset trainable parameters of layer = LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
Reset trainable parameters of l

Epoch: 30
Train Loss: 0.269 | Accuracy: 86.632
Test Loss: 0.370 | Accuracy: 84.722
EarlyStopping counter: 3 out of 10
EarlyStopping counter: 4 out of 10
EarlyStopping counter: 5 out of 10
EarlyStopping counter: 6 out of 10
EarlyStopping counter: 7 out of 10
EarlyStopping counter: 8 out of 10
EarlyStopping counter: 9 out of 10
EarlyStopping counter: 10 out of 10
Early stopping


In [31]:
print_metrics(metrics)

AUC: 0.7736805555555556 +/- 0.1417788567204679
Accuracy: 0.8597222222222222 +/- 0.025382870670134804
Precision: 0.8383203605501371 +/- 0.07237372594670383
Recall: 0.8597222222222222 +/- 0.025382870670134804
F1-score: 0.8430819951807391 +/- 0.044820889338285944


### Transformer + MorganFingerprint

In [24]:
class Transformer_MFP(torch.nn.Module):
    def __init__(self):
        super(Transformer_MFP, self).__init__()
    
        #before the Attention Mechanism, parse in a fully connected network and output a 50 dimensional vector 
        
        
        #define a transformer encoder layer to learn about smiles string
        #vocab_len to decide how many unique 'word', 1024 the dimension of each embeddings
        self.embedding = Embedding.from_pretrained(torch.from_numpy(wv.vectors),freeze=False)
        
        encoder_layer = TransformerEncoderLayer(d_model=1024, nhead=2)
        
        self.encoder = TransformerEncoder(encoder_layer,4)
        self.pos_encoder = PositionalEncoding(1024, 0.1)
        
        #fully connected layers
        
        self.linear1 = Linear(2048+1024, 100)
        self.linear2 = Linear(100, 25)
        self.linear3 = Linear(25, 10)
        self.linear4 = Linear(10, 1)

    def forward(self, data):
        
        batch_index, wv_index, mol_feature = data.batch, data.emb, data.mol_feature        
        #extract batched node and edge feature matrices, edge_index matrix, batch_index and token indices tensor 
        
        wv_index = [torch.tensor(item) for item in wv_index]
        
        padded = pad_sequence(wv_index,padding_value = 266)
        # seq_len x batch_size 
        
        fp = self.embedding(padded.to(device))
        #seq_len x batch_size x dim_embedding
        
        fp = self.pos_encoder(fp)
        fp = self.encoder(fp)
        #seq_len x batch_size x dim_embedding
        #learned smile representation 
        
        fp = fp.sum(dim=0)
        # batch_size x dim_embedding
        
        mol_feature = mol_feature.view(data.num_graphs,2048)
        #concatenate x and fp
        
        x = torch.cat([mol_feature, fp],dim=1)

        # now train a fully connected graph classification network  
        x = F.relu(self.linear1(x.float()))
        x = F.relu(self.linear2(x))
        x = F.relu(self.linear3(x))
        x = torch.sigmoid(self.linear4(x))
        
        return x

In [25]:
train_losses = []
train_acc = []
#test_acc = []
test_losses= []
metrics = []
#for each fold

for i in range(5):
    
    gc.collect()
    torch.cuda.empty_cache()
    print('Split '+str(i+1)+' ......')
    train_set = pd.read_csv('dir_to_dataset/5_fold_cv_train_test/FYP/train_split'+str(i+1)+'.csv')
    test_set = pd.read_csv('dir_to_dataset/5_fold_cv_train_test/test_split'+str(i+1)+'.csv')
    train_loader = data_process(train_set,32)
    test_loader = data_process(test_set,len(test_set))
    
    model = Transformer_MFP().float().to(device)
    
    model.apply(reset_weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.000075,weight_decay=0.001)
    
    #for each epoch
    train_loss = []
    test_loss = []
    train_accuracy = []
    test_accuracy = []
    
    early_stopping = EarlyStopping(patience=10, verbose=True)
    #update the model parameters with 100 epochs
    for epoch in range(100):
        train(epoch,train_loader)
        test(epoch,test_loader)
        
        early_stopping(test_loss[-1],model)
        
        if early_stopping.early_stop:
            print('Early stopping')
            break
    
    model.load_state_dict(torch.load('checkpoint.pt'))
    auc, report = test_metrics(test_loader)
    metric = [auc,report]
    
    
    train_losses.append(train_loss)
    train_acc.append(train_accuracy)
    test_losses.append(test_loss)
    metrics.append(metric)
    
    


Split 1 ......
Reset trainable parameters of layer = NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
Reset trainable parameters of layer = Linear(in_features=1024, out_features=2048, bias=True)
Reset trainable parameters of layer = Linear(in_features=2048, out_features=1024, bias=True)
Reset trainable parameters of layer = LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
Reset trainable parameters of layer = LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
Reset trainable parameters of layer = NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
Reset trainable parameters of layer = Linear(in_features=1024, out_features=2048, bias=True)
Reset trainable parameters of layer = Linear(in_features=2048, out_features=1024, bias=True)
Reset trainable parameters of layer = LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
Reset trainable parameters of layer = LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
Reset

Epoch: 0
Train Loss: 0.788 | Accuracy: 74.653
Test Loss: 0.431 | Accuracy: 83.333
Validation loss decreased (inf --> 0.430815).  Saving model ...
EarlyStopping counter: 1 out of 10
EarlyStopping counter: 2 out of 10
EarlyStopping counter: 3 out of 10
EarlyStopping counter: 4 out of 10
EarlyStopping counter: 5 out of 10
Validation loss decreased (0.430815 --> 0.398722).  Saving model ...
EarlyStopping counter: 1 out of 10
EarlyStopping counter: 2 out of 10
EarlyStopping counter: 3 out of 10
Epoch: 10
Train Loss: 0.362 | Accuracy: 89.583
Test Loss: 0.384 | Accuracy: 87.500
Validation loss decreased (0.398722 --> 0.383591).  Saving model ...
EarlyStopping counter: 1 out of 10
EarlyStopping counter: 2 out of 10
EarlyStopping counter: 3 out of 10
EarlyStopping counter: 4 out of 10
EarlyStopping counter: 5 out of 10
EarlyStopping counter: 6 out of 10
Validation loss decreased (0.383591 --> 0.342554).  Saving model ...
EarlyStopping counter: 1 out of 10
EarlyStopping counter: 2 out of 10
Epoc

EarlyStopping counter: 1 out of 10
EarlyStopping counter: 2 out of 10
Validation loss decreased (0.320161 --> 0.313186).  Saving model ...
Validation loss decreased (0.313186 --> 0.311319).  Saving model ...
Validation loss decreased (0.311319 --> 0.295916).  Saving model ...
EarlyStopping counter: 1 out of 10
EarlyStopping counter: 2 out of 10
Validation loss decreased (0.295916 --> 0.271519).  Saving model ...
Epoch: 20
Train Loss: 0.221 | Accuracy: 91.319
Test Loss: 0.292 | Accuracy: 90.278
EarlyStopping counter: 1 out of 10
Validation loss decreased (0.271519 --> 0.269694).  Saving model ...
EarlyStopping counter: 1 out of 10
EarlyStopping counter: 2 out of 10
EarlyStopping counter: 3 out of 10
EarlyStopping counter: 4 out of 10
EarlyStopping counter: 5 out of 10
EarlyStopping counter: 6 out of 10
EarlyStopping counter: 7 out of 10
EarlyStopping counter: 8 out of 10
Epoch: 30
Train Loss: 0.179 | Accuracy: 92.708
Test Loss: 0.426 | Accuracy: 86.806
EarlyStopping counter: 9 out of 10

In [26]:
print_metrics(metrics)

AUC: 0.8484722222222223 +/- 0.03343200096122882
Accuracy: 0.8861111111111111 +/- 0.015590239111558069
Precision: 0.8837202520351166 +/- 0.01798267917927629
Recall: 0.8861111111111111 +/- 0.015590239111558069
F1-score: 0.877027331321184 +/- 0.015559003187079682


### GAT + MorganFingerprint

In [27]:
class GAT_MFP(torch.nn.Module):
    def __init__(self):
        super(GAT_MFP, self).__init__()
    
        #before the Attention Mechanism, parse in a fully connected network and output a 50 dimensional vector 
        self.hidden = 50  
        self.in_head = 5 #repeat the mechanism for 5 times
        self.conv1 = GATConv(in_channels = 133, 
                             out_channels = self.hidden, 
                             heads=self.in_head, concat=False) #set concat to be False, so it will take average instead

        
        #fully connected layers
        self.linear1 = Linear(self.hidden+2048,100)
        self.linear2 = Linear(100, 25)
        self.linear3 = Linear(25, 10)
        self.linear4 = Linear(10, 1)

    def forward(self, data):
        
        x, edge_index, batch_index, mol_feature = data.x, data.edge_index, data.batch, data.mol_feature    
        #extract node vectors, edge_index, batch index, and binary morgan fingerprint 
        
        x = self.conv1(x.float(), edge_index)

        #aggregate the learned local node feature to capture the global property 
        #here, we apply global_mean_pool over the updated node feature
        x = gap(x,batch_index)
        #also include some other global information i.e. binary morgan fingerprint
        x = torch.cat([x, mol_feature.reshape(data.num_graphs,2048)],dim=1)

        x = F.relu(self.linear1(x.float()))
        x = F.relu(self.linear2(x))
        x = F.relu(self.linear3(x))
        x = torch.sigmoid(self.linear4(x))
        
        return x

In [28]:
train_losses = []
train_acc = []
#test_acc = []
test_losses= []
metrics = []
#for each fold

for i in range(5):
    
    gc.collect()
    torch.cuda.empty_cache()
    print('Split '+str(i+1)+' ......')
    train_set = pd.read_csv('dir_to_dataset/5_fold_cv_train_test/train_split'+str(i+1)+'.csv')
    test_set = pd.read_csv('dir_to_dataset/5_fold_cv_train_test/test_split'+str(i+1)+'.csv')
    train_loader = data_process(train_set,32)
    test_loader = data_process(test_set,len(test_set))
    
    model = GAT_MFP().float().to(device)
    
    model.apply(reset_weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.000075,weight_decay=0.001)
    
    #for each epoch
    train_loss = []
    test_loss = []
    train_accuracy = []
    test_accuracy = []
    
    early_stopping = EarlyStopping(patience=10, verbose=True)
    #update the model parameters with 100 epochs
    for epoch in range(100):
        train(epoch,train_loader)
        test(epoch,test_loader)
        
        early_stopping(test_loss[-1],model)
        
        if early_stopping.early_stop:
            print('Early stopping')
            break
    
    model.load_state_dict(torch.load('checkpoint.pt'))
    auc, report = test_metrics(test_loader)
    metric = [auc,report]
    
    
    train_losses.append(train_loss)
    train_acc.append(train_accuracy)
    test_losses.append(test_loss)
    metrics.append(metric)
    
    


Split 1 ......
Reset trainable parameters of layer = Linear(133, 250, bias=False)
Reset trainable parameters of layer = GATConv(133, 50, heads=5)
Reset trainable parameters of layer = Linear(in_features=2098, out_features=100, bias=True)
Reset trainable parameters of layer = Linear(in_features=100, out_features=25, bias=True)
Reset trainable parameters of layer = Linear(in_features=25, out_features=10, bias=True)
Reset trainable parameters of layer = Linear(in_features=10, out_features=1, bias=True)
Epoch: 0
Train Loss: 0.600 | Accuracy: 83.333
Test Loss: 0.597 | Accuracy: 83.333
Validation loss decreased (inf --> 0.597306).  Saving model ...
Validation loss decreased (0.597306 --> 0.591037).  Saving model ...
Validation loss decreased (0.591037 --> 0.582555).  Saving model ...
Validation loss decreased (0.582555 --> 0.571094).  Saving model ...
Validation loss decreased (0.571094 --> 0.556010).  Saving model ...
Validation loss decreased (0.556010 --> 0.537784).  Saving model ...
Vali

Validation loss decreased (0.246503 --> 0.244391).  Saving model ...
Validation loss decreased (0.244391 --> 0.241723).  Saving model ...
Validation loss decreased (0.241723 --> 0.239103).  Saving model ...
Validation loss decreased (0.239103 --> 0.238446).  Saving model ...
Validation loss decreased (0.238446 --> 0.235916).  Saving model ...
Epoch: 50
Train Loss: 0.120 | Accuracy: 96.701
Test Loss: 0.234 | Accuracy: 92.361
Validation loss decreased (0.235916 --> 0.233836).  Saving model ...
Validation loss decreased (0.233836 --> 0.231130).  Saving model ...
Validation loss decreased (0.231130 --> 0.230572).  Saving model ...
Validation loss decreased (0.230572 --> 0.228926).  Saving model ...
Validation loss decreased (0.228926 --> 0.227748).  Saving model ...
Validation loss decreased (0.227748 --> 0.226935).  Saving model ...
Validation loss decreased (0.226935 --> 0.224399).  Saving model ...
Validation loss decreased (0.224399 --> 0.223644).  Saving model ...
Validation loss decr

Validation loss decreased (0.455635 --> 0.432241).  Saving model ...
Validation loss decreased (0.432241 --> 0.412974).  Saving model ...
Validation loss decreased (0.412974 --> 0.396554).  Saving model ...
Validation loss decreased (0.396554 --> 0.381492).  Saving model ...
Validation loss decreased (0.381492 --> 0.368605).  Saving model ...
Validation loss decreased (0.368605 --> 0.356482).  Saving model ...
Validation loss decreased (0.356482 --> 0.346767).  Saving model ...
Epoch: 20
Train Loss: 0.308 | Accuracy: 88.715
Test Loss: 0.337 | Accuracy: 87.500
Validation loss decreased (0.346767 --> 0.337409).  Saving model ...
Validation loss decreased (0.337409 --> 0.329441).  Saving model ...
Validation loss decreased (0.329441 --> 0.322673).  Saving model ...
Validation loss decreased (0.322673 --> 0.317241).  Saving model ...
Validation loss decreased (0.317241 --> 0.311371).  Saving model ...
Validation loss decreased (0.311371 --> 0.305603).  Saving model ...
Validation loss decr

Validation loss decreased (0.240895 --> 0.240521).  Saving model ...
EarlyStopping counter: 1 out of 10
Validation loss decreased (0.240521 --> 0.239623).  Saving model ...
Validation loss decreased (0.239623 --> 0.238675).  Saving model ...
Epoch: 60
Train Loss: 0.066 | Accuracy: 98.264
Test Loss: 0.239 | Accuracy: 91.667
EarlyStopping counter: 1 out of 10
Validation loss decreased (0.238675 --> 0.238110).  Saving model ...
Validation loss decreased (0.238110 --> 0.237424).  Saving model ...
EarlyStopping counter: 1 out of 10
Validation loss decreased (0.237424 --> 0.237280).  Saving model ...
EarlyStopping counter: 1 out of 10
EarlyStopping counter: 2 out of 10
EarlyStopping counter: 3 out of 10
EarlyStopping counter: 4 out of 10
EarlyStopping counter: 5 out of 10
Epoch: 70
Train Loss: 0.043 | Accuracy: 99.132
Test Loss: 0.238 | Accuracy: 93.056
EarlyStopping counter: 6 out of 10
EarlyStopping counter: 7 out of 10
EarlyStopping counter: 8 out of 10
EarlyStopping counter: 9 out of 10


In [29]:
print_metrics(metrics)

AUC: 0.905 +/- 0.016831648705582826
Accuracy: 0.9194444444444445 +/- 0.005555555555555585
Precision: 0.9184939410196581 +/- 0.00719648532790548
Recall: 0.9194444444444445 +/- 0.005555555555555585
F1-score: 0.9122381693589142 +/- 0.0058301255005948115


### Combination with all

In [28]:
class All(torch.nn.Module):
    def __init__(self):
        super(All, self).__init__()
    
        #before the Attention Mechanism, parse in a fully connected network and output a 50 dimensional vector 
        self.hidden = 50  
        self.in_head = 5 #repeat the mechanism for 5 times
        self.conv1 = GATConv(in_channels = 133, 
                             out_channels = self.hidden, 
                             heads=self.in_head, concat=False) #set concat to be False, so it will take average instead

        
        #define a transformer encoder layer to learn about smiles string
        #vocab_len to decide how many unique 'word', 1024 the dimension of each embeddings
        self.embedding = Embedding.from_pretrained(torch.from_numpy(wv.vectors),freeze=False)
        
        encoder_layer = TransformerEncoderLayer(d_model=1024, nhead=2)
        
        self.encoder = TransformerEncoder(encoder_layer,4)
        self.pos_encoder = PositionalEncoding(1024, 0.1)
        
        #fully connected layers
        self.linear1 = Linear(self.hidden+2048+1024,100)
        self.linear2 = Linear(100, 25)
        self.linear3 = Linear(25, 10)
        self.linear4 = Linear(10, 1)

    def forward(self, data):
        
        x, edge_index, batch_index, wv_index, mol_feature = data.x, data.edge_index, data.batch, data.emb, data.mol_feature        
        #extract batched node and edge feature matrices, edge_index matrix, batch_index and token indices tensor 
        
        wv_index = [torch.tensor(item) for item in wv_index]
        
        padded = pad_sequence(wv_index,padding_value = 266)
        # seq_len x batch_size 
        
        fp = self.embedding(padded.to(device))
        #seq_len x batch_size x dim_embedding
        
        fp = self.pos_encoder(fp)
        fp = self.encoder(fp)
        #seq_len x batch_size x dim_embedding
        #learned smile representation 
        
        fp = fp.sum(dim=0)
        x = self.conv1(x.float(), edge_index)

        #aggregate the learned local node feature to capture the global property 
        #here, we apply global_mean_pool over the updated node feature
        x = gap(x,batch_index)
        
        #also include some other global information i.e. binary morgan fingerprint
        x = torch.cat([x, mol_feature.reshape(data.num_graphs,2048), fp],dim=1)

        x = F.relu(self.linear1(x.float()))
        x = F.relu(self.linear2(x))
        x = F.relu(self.linear3(x))
        x = torch.sigmoid(self.linear4(x))
        
        return x

In [29]:
train_losses = []
train_acc = []
#test_acc = []
test_losses= []
metrics = []
#for each fold

for i in range(5):
    
    gc.collect()
    torch.cuda.empty_cache()
    print('Split '+str(i+1)+' ......')
    train_set = pd.read_csv('dir_to_dataset/5_fold_cv_train_test/train_split'+str(i+1)+'.csv')
    test_set = pd.read_csv('dir_to_dataset/5_fold_cv_train_test/test_split'+str(i+1)+'.csv')
    train_loader = data_process(train_set,6)
    test_loader = data_process(test_set,len(test_set))
    
    model = All().float().to(device)
    
    model.apply(reset_weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.000075,weight_decay=0.001)
    
    #for each epoch
    train_loss = []
    test_loss = []
    train_accuracy = []
    test_accuracy = []
    
    early_stopping = EarlyStopping(patience=10, verbose=True)
    #update the model parameters with 100 epochs
    for epoch in range(100):
        train(epoch,train_loader)
        test(epoch,test_loader)
        
        early_stopping(test_loss[-1],model)
        
        if early_stopping.early_stop:
            print('Early stopping')
            break
    
    model.load_state_dict(torch.load('checkpoint.pt'))
    auc, report = test_metrics(test_loader)
    metric = [auc,report]
    
    
    train_losses.append(train_loss)
    train_acc.append(train_accuracy)
    test_losses.append(test_loss)
    metrics.append(metric)
    
    


Split 1 ......
Reset trainable parameters of layer = Linear(133, 250, bias=False)
Reset trainable parameters of layer = NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
Reset trainable parameters of layer = Linear(in_features=1024, out_features=2048, bias=True)
Reset trainable parameters of layer = Linear(in_features=2048, out_features=1024, bias=True)
Reset trainable parameters of layer = LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
Reset trainable parameters of layer = LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
Reset trainable parameters of layer = NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
Reset trainable parameters of layer = Linear(in_features=1024, out_features=2048, bias=True)
Reset trainable parameters of layer = Linear(in_features=2048, out_features=1024, bias=True)
Reset trainable parameters of layer = LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
Reset trainable parameters of l

Epoch: 0
Train Loss: 0.523 | Accuracy: 83.681
Test Loss: 0.686 | Accuracy: 84.028
Validation loss decreased (inf --> 0.686269).  Saving model ...
EarlyStopping counter: 1 out of 10
Validation loss decreased (0.686269 --> 0.484493).  Saving model ...
EarlyStopping counter: 1 out of 10
EarlyStopping counter: 2 out of 10
EarlyStopping counter: 3 out of 10
EarlyStopping counter: 4 out of 10
Validation loss decreased (0.484493 --> 0.423754).  Saving model ...
EarlyStopping counter: 1 out of 10
EarlyStopping counter: 2 out of 10
Epoch: 10
Train Loss: 0.312 | Accuracy: 88.542
Test Loss: 0.400 | Accuracy: 85.417
Validation loss decreased (0.423754 --> 0.400499).  Saving model ...
EarlyStopping counter: 1 out of 10
EarlyStopping counter: 2 out of 10
EarlyStopping counter: 3 out of 10
EarlyStopping counter: 4 out of 10
EarlyStopping counter: 5 out of 10
Validation loss decreased (0.400499 --> 0.393127).  Saving model ...
EarlyStopping counter: 1 out of 10
EarlyStopping counter: 2 out of 10
Early

EarlyStopping counter: 3 out of 10
EarlyStopping counter: 4 out of 10
EarlyStopping counter: 5 out of 10
EarlyStopping counter: 6 out of 10
EarlyStopping counter: 7 out of 10
Validation loss decreased (0.333293 --> 0.311097).  Saving model ...
Epoch: 20
Train Loss: 0.278 | Accuracy: 89.583
Test Loss: 0.343 | Accuracy: 88.194
EarlyStopping counter: 1 out of 10
Validation loss decreased (0.311097 --> 0.299175).  Saving model ...
EarlyStopping counter: 1 out of 10
EarlyStopping counter: 2 out of 10
EarlyStopping counter: 3 out of 10
EarlyStopping counter: 4 out of 10
EarlyStopping counter: 5 out of 10
EarlyStopping counter: 6 out of 10
EarlyStopping counter: 7 out of 10
EarlyStopping counter: 8 out of 10
Epoch: 30
Train Loss: 0.229 | Accuracy: 89.931
Test Loss: 0.482 | Accuracy: 80.556
EarlyStopping counter: 9 out of 10
EarlyStopping counter: 10 out of 10
Early stopping


In [30]:
print_metrics(metrics)

AUC: 0.8186805555555555 +/- 0.03564834355405802
Accuracy: 0.8569444444444445 +/- 0.01734721666221776
Precision: 0.8532421823194486 +/- 0.020276658095891555
Recall: 0.8569444444444445 +/- 0.01734721666221776
F1-score: 0.8340719141374023 +/- 0.03695717136018627
