In [41]:
import os
import warnings

import joblib
import torch

# Report the compute resources this kernel can see.
num_cores = os.cpu_count()
num_GPUs = torch.cuda.device_count()

print(f'# Cores:{num_cores}')
print(f'# GPUs: {num_GPUs}')

# Device indices PyTorch enumerates, shown as a plain list
print(f"Available GPUs: {list(range(num_GPUs))}")

# Falls back to a descriptive message when CUDA_VISIBLE_DEVICES is unset
print('Visible GPUs Indices:', os.environ.get('CUDA_VISIBLE_DEVICES', 'All GPUs are visible'))

# Cores:8
# GPUs: 0
Available GPUs: []
Visible GPUs Indices: 0


In [42]:
# Expose only GPU index 0 to this process.
# NOTE(review): this cell runs after the cell above has already imported torch
# and called torch.cuda.device_count(). CUDA normally reads this variable when
# it is initialised, so confirm that setting it here still takes effect;
# otherwise move the assignment ahead of the first torch.cuda call.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [43]:
%load_ext autoreload
%autoreload 2

from utils.create_dataset_class import DataSet
from utils.multiclass_NN import multiclass_NN
from utils.split_dataset import split
from utils.scale_graph_features import scale

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [44]:
# ---- Data -------------------------------------------------------------------
SEED = 103  # random seed used when splitting into train/val/test sets
db_file = 'data_preprocessing/db.csv'  # CSV passed to split() and DataSet(); contains the LABELNAME column

# Comma-separated split ratios; see the notes in the split cell below.
# NOTE(review): those notes pair a three-value ratio with MIXED=True, but the
# split cell currently passes MIXED=False - confirm which one is intended.
SPLIT_RATIO = '0.4,0.3,0.3'  # Train, Val, Test
# SPLIT_RATIO = '0.7,0.3'  # Train, Val

SMILES = 'data_preprocessing/SMILES.txt'
DESCRIPTORS = 'monomer_data/unique_descriptors.json'

# ---- Task / model -----------------------------------------------------------
LABELNAME = 'binary_class'  # name of the label used in db_file
TASK = 'classification'  # LEAVE AS IS - regression and multiclass classification are not supported yet
MODEL = 'MPNN'  # model type: 'MPNN', 'GAT', 'Weave', 'GCN' or 'AttentiveFP'

# ---- Training ---------------------------------------------------------------
NUM_EPOCHS = 4  # number of epochs to train
NUM_WORKERS = num_cores

# Output directory for the model, loss curves, confusion matrix, etc.
MODEL_PATH = f'past_trials/{MODEL}/{NUM_EPOCHS}_epochs'

# Save flags forwarded to multiclass_NN (presumably model weights, optimizer
# state and run configuration respectively - confirm in utils.multiclass_NN).
SAVE_MODEL = True
SAVE_OPT = True
SAVE_CONFIG = True

# Custom hyperparameter overrides; leave empty to use the values imported from model_hparams.
CUSTOM_PARAMS = {}

In [45]:
# Split dataset into train/val/test sets.
# MIXED=False: use SPLIT_RATIO = '0.XX,0.XX'. Train/val sets are built only from
#              peptides in db_file and the test set contains only polymers.
# MIXED=True:  use SPLIT_RATIO = '0.XX,0.XX,0.XX'. Peptides and polymers are mixed
#              across train/val/test; polymers sampled from the same distribution
#              are assigned to the same set.
MIXED = False

# The two settings above have to agree. Warn instead of raising, because how
# split() reacts to a mismatch is not visible from this notebook.
_num_ratios = len(SPLIT_RATIO.split(','))
_expected_ratios = 3 if MIXED else 2
if _num_ratios != _expected_ratios:
    warnings.warn(
        f"SPLIT_RATIO={SPLIT_RATIO!r} has {_num_ratios} value(s), but MIXED={MIXED} "
        f"is documented to take {_expected_ratios}; check SPLIT_RATIO and MIXED."
    )

split_db = split(db_file, SEED, SPLIT_RATIO, MIXED=MIXED)

In [46]:
# Build the scaled feature set; only the node and edge features of the
# training split are handed to scale().
features = scale(
    split_db['train'],
    SMILES,
    DESCRIPTORS,
)

In [47]:
# Create the dataloader object from the database file, the scaled features and
# the train/val/test split.
dataset = DataSet(
    db_file,
    features,
    split_db,
    LABELNAME,
    TASK,
    MODEL,
)

In [None]:
# Initialize the classifier with the settings from the configuration cell,
# then run it.
test = multiclass_NN(
    dataset,
    MODEL,
    NUM_EPOCHS,
    NUM_WORKERS,
    DESCRIPTORS,
    CUSTOM_PARAMS,
    MODEL_PATH,
    SAVE_MODEL,
    SAVE_OPT,
    SAVE_CONFIG,
)
test.main()

In [15]:
# Dump the feature dictionary into MODEL_PATH (needed for inference).
# Create the directory first so the dump does not fail with FileNotFoundError
# when nothing has created MODEL_PATH yet (e.g. the training cell was not run).
os.makedirs(MODEL_PATH, exist_ok=True)
joblib.dump(features, os.path.join(MODEL_PATH, 'features.pkl'))

['features.pkl']