In [307]:
%load_ext autoreload
%autoreload 2

import os
# Expose only GPU 0 to this process. This is set before the project modules
# below are imported, presumably so that any CUDA-backed library they load
# sees just this device -- TODO confirm which import initializes CUDA.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

import random
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole

from utils.load_networkx import networkx_feat
from utils.macro_dataset import MacroDataset
from utils import macro_unsupervised as unsup
from utils.macro_supervised_hyperparameter import MacroSupervised
from utils.macro_attribution import Attribution
from utils import plot

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


**Filepaths**

MON_SMILES : .csv or .txt file of macromolecule monomers
<br>
BOND_SMILES : .csv or .txt file of macromolecule bonds

TXT_DATA_PATH : Path to the dataset directory used for training
<br>
DF_PATH : Path to the file that provides the graph labels during training
<br>
MODEL_PATH : Output directory used during training to save the model, optimizer, and configuration file
<br>
FIG_PATH : Output directory used when saving ROC-AUC or parity plot figures

**Variables**

FEAT : Type of features for macromolecule monomers and bonds -- fingerprints ('fp') or one-hot encodings ('onehot') (default = 'fp')
<br>
FP_RADIUS_MON : Radius of topological exploration for monomer fingerprint (default = 3)
<br>
FP_BITS_MON : Size of fingerprint bit-vector for monomer (default = 128)
<br>
FP_RADIUS_BOND : Radius of topological exploration for bond fingerprint (default = 3)
<br>
FP_BITS_BOND : Size of fingerprint bit-vector for bond (default = 16)

SEED : Random seed for shuffling dataset (default = 108)
<br>
TASK : Learning task (classification, regression) (default = 'classification')
<br>
MODEL : Model architecture for supervised learning (Weave, MPNN, Attentive FP, GCN, GAT) (default = 'Weave')
<br>
LABELNAME : Name of label to classify (default = 'Immunogenic')
<br>
SPLIT : Proportion of the dataset to use for training, validation and test (default = '0.8,0.1,0.1')
<br>
NORM: Normalization method for regression labels -- quantile transform ('qt') or standard scaler ('ss') (default = 'qt')

NUM_EPOCHS: Maximum number of epochs allowed for training (default = 1000)
<br>
NUM_WORKERS: Number of processes for data loading (default = 1)

SAVE_MODEL: Whether to save full model file. Model state dict will be saved automatically (default = False)
<br>
SAVE_OPT: Whether to save optimizer files (default = False)
<br>
SAVE_CONFIG: Whether to save configuration file (default = False)

PLOT_TYPE: Dataset to plot ('val' for validation or 'test' for test)

CUSTOM_PARAMS: Dictionary of hyperparameters for model

In [308]:
# Notebook configuration: every tunable constant used by later cells lives here.

# Descriptor file passed to the graph featurizer.
DESCRIPTORS = 'unique_descriptors.json'

# Learning task definition.
SEED = 112
TASK = 'classification'
MODEL = 'AttentiveFP'
LABELNAME = 'immunogenic'

# Input files and directories.
MON_SMILES = 'tables/SMILES_peptides_monomer.txt'
BOND_SMILES = 'tables/SMILES_peptides_bond.txt'
TXT_DATA_PATH = 'dataset_abridged/classification/'
DF_PATH = 'tables/immuno_peptides.txt'

# Train/validation/test proportions and feature scaling method.
SPLIT = '0.6,0.2,0.2'
SCALER = 'minmax'

# Monomer fingerprint settings. The substructure-display cell at the end of
# the notebook reads these; they were previously undefined, which raised a
# NameError on a fresh kernel. Values are the defaults documented above
# (radius 3, 128 bits).
FP_RADIUS_MON = 3
FP_BITS_MON = 128

# Training budget and data-loading parallelism.
NUM_EPOCHS = 3
NUM_WORKERS = 2

# Models and figures for one trial share a directory keyed by model name and
# epoch count, so the path is built once and reused.
MODEL_PATH = f'past_trials/{MODEL}/{NUM_EPOCHS}_epochs'
FIG_PATH = MODEL_PATH

# Which training artifacts to persist under MODEL_PATH.
SAVE_MODEL = True
SAVE_OPT = True
SAVE_CONFIG = True

# Hyperparameter overrides; empty here, so no overrides are supplied.
CUSTOM_PARAMS = {}

**Load featurized NetworkX graphs**

In [301]:
# Build the featurized NetworkX graphs from the dataset directory, the
# monomer/bond SMILES tables and the descriptor file, using the configured
# split proportions, seed and scaler.
# The recorded output lists peptides that were omitted during loading.
NX_GRAPHS = networkx_feat(
    TXT_DATA_PATH=TXT_DATA_PATH,
    MON_SMILES=MON_SMILES,
    BOND_SMILES=BOND_SMILES,
    DESCRIPTORS=DESCRIPTORS,
    SPLIT=SPLIT,
    SEED=SEED,
    SCALER=SCALER,
)

Omitted Peptide: GID196
Omitted Peptide: UID5671


In [302]:
# plot.graph(NX_GRAPHS['sets']['train']['GID1'])

In [303]:
# plot.graph(NX_GRAPHS['sets']['test']['UID5863'])

**Load DGL dataset with labels and masks**

In [304]:
# Wrap the featurized graphs in the project's dataset object, attaching the
# labels read from DF_PATH for the configured task, label column and model.
dgl_dict = MacroDataset(
    DF_PATH=DF_PATH,
    TASK=TASK,
    LABELNAME=LABELNAME,
    MODEL=MODEL,
    NX_GRAPHS=NX_GRAPHS,
)

## Supervised learning 
Training model and obtaining validation and test metrics

In [305]:
# Configure the supervised trainer. Note that the keyword is literally named
# `MacroDataset` and receives the dataset instance built above.
# The recorded output shows a notice being printed when MODEL_PATH already exists.
macro_supervised = MacroSupervised(
    MacroDataset=dgl_dict,
    MODEL=MODEL,
    NUM_EPOCHS=NUM_EPOCHS,
    NUM_WORKERS=NUM_WORKERS,
    CUSTOM_PARAMS=CUSTOM_PARAMS,
    MODEL_PATH=MODEL_PATH,
    SAVE_MODEL=SAVE_MODEL,
    SAVE_OPT=SAVE_OPT,
    SAVE_CONFIG=SAVE_CONFIG,
)

Directory past_trials/AttentiveFP/3_epochs already exists.


In [306]:
# Run the supervised training workflow. The recorded output shows one counter
# line per epoch (1..NUM_EPOCHS) followed by a single float.
# NOTE(review): the recorded value is negative; confirm which metric main()
# reports, since a negative score is unexpected for a classification task.
macro_supervised.main()

1
2
3


-0.3902439024390244

In [None]:
# ROC-AUC plot for the validation split ('val'); requires main() to have run first
# -- presumably uses the trained model's predictions, confirm in MacroSupervised.
macro_supervised.rocauc_plot('val')

In [None]:
# ROC-AUC plot for the held-out test split ('test'); requires main() to have run first
# -- presumably uses the trained model's predictions, confirm in MacroSupervised.
macro_supervised.rocauc_plot('test')

#### Attribution

In [None]:
# Seed Python's RNG before computing attributions.
random.seed(SEED)

# Explain the first graph stored in the dataset.
input_graph = dgl_dict.graphs[0]

# Attribution helper configured for the current model with integrated gradients.
attribution = Attribution(model_name=MODEL, attribution_type='integrated_gradients')

# Compute attributions for the chosen graph using the trained model instance.
attribution.calculate_attribution(
    model_instance=macro_supervised.model,
    input_graph=input_graph,
)

In [None]:
# Draw the attributions for the explained graph, using the featurized
# NetworkX graphs as the source of the graph structure.
attribution.plot_graph(input_graph=input_graph, NX_GRAPHS=NX_GRAPHS)

In [None]:
# Plot the feature-level attribution for node 0 of the explained graph
# -- presumably one value per input feature of that node; confirm in Attribution.plot_fp.
attribution.plot_fp(node_idx=0)

In [None]:
# Fingerprint bit to visualize.
index = 16

# Look up the 'Xyl' monomer and compute its fingerprint with the configured
# radius and bit-vector size.
# NOTE(review): FP_RADIUS_MON and FP_BITS_MON must be defined in the
# configuration cell before this cell runs.
# `bi` is presumably the RDKit bit-info mapping expected by DrawMorganBit -- confirm.
mol, bi = attribution.display_substructure(
    monomer='Xyl',
    MON_SMILES_PATH=MON_SMILES,
    RADIUS=FP_RADIUS_MON,
    N_BITS=FP_BITS_MON,
)

# Last expression of the cell, so the drawing is rendered as the cell output.
Draw.DrawMorganBit(mol, index, bi)