## Install Libraries

In [2]:
%pip install dgl -f https://data.dgl.ai/wheels/torch-2.3/cu118/repo.html

Looking in indexes: http://mirrors.aliyun.com/pypi/simple
Looking in links: https://data.dgl.ai/wheels/torch-2.3/cu118/repo.html
[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
%pip install dglgo

Looking in indexes: http://mirrors.aliyun.com/pypi/simple
[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
%pip install dgllife

Looking in indexes: http://mirrors.aliyun.com/pypi/simple
[0mNote: you may need to restart the kernel to use updated packages.


In [5]:
%pip install torch==2.3.0 -i https://pypi.tuna.tsinghua.edu.cn/simple/

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple/
[0mNote: you may need to restart the kernel to use updated packages.


In [6]:
%pip install deepchem

Looking in indexes: http://mirrors.aliyun.com/pypi/simple
[0mNote: you may need to restart the kernel to use updated packages.


In [7]:
%pip install tensorflow==2.12.0

Looking in indexes: http://mirrors.aliyun.com/pypi/simple
[0mNote: you may need to restart the kernel to use updated packages.


## Import Statements

In [8]:
import os
import pickle
import statistics
from random import seed
from random import shuffle

import deepchem as dc

import numpy as np
import tensorflow as tf

from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from scipy.stats import pearsonr

# Seed the random number generators this cell can reach so reruns start from the same state.
SEED = 13
seed(SEED)            # Python's `random`; shuffle() in the fold-building cell draws from it
np.random.seed(SEED)  # NumPy's global RNG
# TODO(review): the TensorFlow and torch generators used while training are still unseeded,
# so model results are not bit-reproducible; consider tf.random.set_seed(SEED) as well.

No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
2024-07-29 20:18:50.602052: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with transformers dependency. No module named 'transformers'
cannot import name 'HuggingFaceModel' from 'deepchem.models.torch_models' (/root/miniconda3/envs/py39/lib/python3.9/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/root/miniconda3/envs/py39/lib/python3.9/site-packages/deepchem/models/torch_model

## Path to load/save models and predictions

In [9]:
# Directory under which every model cell pickles its per-fold predictions.
path = "./SpectraPredictions/"
# Create it up front: the model cells only open files under `path` after training has
# finished, so a missing directory would otherwise fail at the very end of a long run.
os.makedirs(path, exist_ok=True)

## Load Dataset into 5 separate folds

In [10]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file; only open a
# dataset pickle that comes from a trusted source.
with open("Cleaner NIST Dataset.pickle", "rb") as f:
    d = pickle.load(f)

smiles = np.asarray(d["smiles"])
sequences = np.asarray(d["sequences"])

# This single carbon node (methane) breaks the MolGraphConv featurizer below, so this compound is manually removed.
# A boolean mask drops every "C" entry and is a no-op when there is none, whereas
# converting np.where(...) to a single int only works when there is exactly one match.
keep = smiles != "C"
smiles = smiles[keep]
sequences = sequences[keep]

# Pair each SMILES string with its sequence and shuffle the pairs (uses the seeded `random`).
dataset = list(zip(smiles, sequences))
shuffle(dataset)

# Count every SMILES once so the single/multiple test below is a dictionary lookup
# instead of a full list scan per molecule.
occurrences = {}
for smiles_string in smiles:
    occurrences[smiles_string] = occurrences.get(smiles_string, 0) + 1

# Extract compounds that occur more than once so that repeats aren't distributed across folds.
single_occurrence_molecules = [x for x in dataset if occurrences[x[0]] <= 1]
multiple_occurrence_molecules = [x for x in dataset if occurrences[x[0]] > 1]

# Create folds: five equally sized slices of the single-occurrence molecules.
fold_size = len(single_occurrence_molecules) // 5
folds = {}
for i in range(1, 6):
    folds[i] = single_occurrence_molecules[((i - 1) * fold_size):(i * fold_size)]

# Whatever the equal slices left over is distributed together with the repeated compounds.
remaining_molecules = multiple_occurrence_molecules + single_occurrence_molecules[(5 * fold_size):]

# Group the remaining molecules by SMILES in order of first appearance (dicts keep
# insertion order), then hand the groups out round-robin to folds 1, 2, ..., 5, 1, ...
# so that all repeat occurrences of a compound always land in the same fold.
groups_by_smiles = {}
for molecule in remaining_molecules:
    groups_by_smiles.setdefault(molecule[0], []).append(molecule)
for group_index, group in enumerate(groups_by_smiles.values()):
    folds[group_index % 5 + 1].extend(group)

# Print the length of each fold.
for i in range(1, 6):
    print(len(folds[i]))

1502
1502
1501
1500
1500


## Create test and train sets

In [11]:
# Helper Functions
def normalize(s):
    """
    Scale a series by the reciprocal of its maximum so its largest value becomes 1.

    Parameters:
        s: iterable of numbers (for example a list or a 1-D array).

    Returns:
        A list with every value multiplied by 1 / max(s). When the maximum is 0
        every value is multiplied by 0 instead. An empty input yields an empty list.
    """
    values = list(s)
    if not values:
        return []
    # Find the maximum of the input series.
    max_val = max(values)
    # Check for a zero maximum before dividing: computing 1 / max_val first raises
    # ZeroDivisionError for plain Python numbers.
    scale = 0 if max_val == 0 else 1 / max_val
    # Rescale every entry.
    return [j * scale for j in values]

def floor_out(x):
    """
    Suppress noise in a spectrum: keep values strictly above 0.01, replace the rest with 0.
    """
    floored = []
    for value in x:
        if value > 0.01:
            floored.append(value)
        else:
            # Values at or below the threshold are zeroed.
            floored.append(0)
    return floored

def normal_many(x):
    """
    Apply normalize followed by floor_out to each series in x and stack the results
    into a single NumPy array.
    """
    processed = []
    for series in x:
        # Normalize first, then apply the noise floor to the rescaled values.
        processed.append(floor_out(normalize(series)))
    return np.array(processed)

# Build the five cross-validation splits.
dataset_splits = {}
for i in range(1, 6):
    # Split i tests on fold i and trains on the other four folds, taken in fold order.
    test = folds[i]
    train = [molecule for x in range(1, 6) if x != i for molecule in folds[x]]

    # Each molecule is a (smiles, sequence) pair; targets are normalized and floored.
    dataset_splits[i] = {
        "test_smiles": [j[0] for j in test],
        "test_y": normal_many([j[1] for j in test]),
        "train_smiles": [j[0] for j in train],
        "train_y": normal_many([j[1] for j in train]),
    }

## Define Loss Functions

In [12]:
def euc_dist_keras(y_true, y_pred):
    """
    Euclidean distance loss: square root of the summed squared differences between
    y_true and y_pred, reduced over the last axis (which is kept with size 1).
    """
    squared_error = K.square(y_true - y_pred)
    summed_error = K.sum(squared_error, axis=-1, keepdims=True)
    return K.sqrt(summed_error)

def pearson_first(y_true, y_pred):
    """
    Compute the Pearson correlation of two series and return only the coefficient,
    i.e. the first element of what scipy's pearsonr returns.
    """
    result = pearsonr(y_true, y_pred)
    return result[0]

def wrapped_pearson_correlation(y_true, y_pred):
    """
    Run pearson_first on the two inputs through tf.py_function and return the
    result as a float32 tensor.
    """
    return tf.py_function(func=pearson_first, inp=[y_true, y_pred], Tout=tf.float32)

## Helper Function for DC featurizers

In [15]:
def clean(arr):
    """
    Convert every entry of arr to float and return the non-NaN values as a list,
    in their original order.
    """
    # Convert everything first so a bad entry fails before any filtering happens.
    as_floats = [float(entry) for entry in arr]
    kept = []
    for value in as_floats:
        if np.isnan(value):
            continue
        kept.append(value)
    return kept

## Run SMILES through DC Featurizer

In [13]:
# Circular fingerprints (radius 2, 1024 bits) are the inputs of the MorganFP/DNN model.
featurizer = dc.feat.CircularFingerprint(radius=2, size=1024, chiral=False, features=False)

for i in range(1, 6):
    split = dataset_splits[i]
    # Featurize the test SMILES first, then the train SMILES, storing them as "<subset>_x".
    for subset in ("test", "train"):
        split[subset + "_x"] = featurizer.featurize(split[subset + "_smiles"])



## Featurizers and Training for the Other Models

In [16]:
# ConvMol featurization: produces the "*_x_graph" inputs read by the GCN model cell.
graph_featurizer = dc.feat.ConvMolFeaturizer()

for i in range(1, 6):
    split = dataset_splits[i]
    test_graphs = graph_featurizer.featurize(split["test_smiles"])
    train_graphs = graph_featurizer.featurize(split["train_smiles"])
    split["test_x_graph"] = test_graphs
    split["train_x_graph"] = train_graphs

In [18]:
# MolGraphConv featurization (with edge features): produces the "*_x_mgc" inputs
# read by the GAT, MPNN and AttentiveFP model cells.
gat = dc.feat.MolGraphConvFeaturizer(use_edges=True)

for i in range(1, 6):
    split = dataset_splits[i]
    # The featurizer output is materialized as a plain list for each subset.
    for subset in ("test", "train"):
        split[subset + "_x_mgc"] = list(gat.featurize(split[subset + "_smiles"]))

In [17]:
# Weave featurization, stored under the "*_x_mpnn" keys.
# NOTE(review): this was labelled as the MPNN model's input, but the MPNN training cell
# reads the "*_x_mgc" features instead, and no visible cell reads "*_x_mpnn" — confirm
# whether this featurization is still needed.
mpnn = dc.feat.WeaveFeaturizer()

for i in range(1, 6):
    split = dataset_splits[i]
    test_weave = mpnn.featurize(split["test_smiles"])
    train_weave = mpnn.featurize(split["train_smiles"])
    split["test_x_mpnn"] = test_weave
    split["train_x_mpnn"] = train_weave

## GCN Model

In [19]:
# GraphConvModel: train and score one model per cross-validation split.
import pickle
for i in range(1, 6):
    split = dataset_splits[i]
    dtrain = dc.data.NumpyDataset(X=split["train_x_graph"], y=split["train_y"])
    dtest = dc.data.NumpyDataset(X=split["test_x_graph"], y=split["test_y"])
    gcnmodel = dc.models.GraphConvModel(
        1586,
        mode='regression',
        dropout=0.1,
        batch_normalize=True,
        dense_layer_size=2048,
        batch_size=64,
        learning_rate=0.001,
        # NOTE(review): confirm GraphConvModel actually consumes `activation_fns`.
        activation_fns=[tf.nn.sigmoid] * 7,
    )
    gcnmodel.fit(dtrain, nb_epoch=100)

    # Collect evaluation metrics: one Pearson correlation per test spectrum, computed
    # between the max-normalized prediction and the target. The printed "R2" values
    # are these correlation coefficients.
    g1predictions = gcnmodel.predict(dtest)
    graph_r2s = [
        wrapped_pearson_correlation(normalize(g1predictions[x]), target)
        for x, target in enumerate(split["test_y"])
    ]
    # NaN correlations add 0 to the total but still count in the denominator.
    total_r2 = sum(0 if np.isnan(score) else score for score in graph_r2s)
    current_fold_loss = round(float(total_r2 / len(graph_r2s)), 5)
    print("R2 Loss for fold", i, ":", current_fold_loss)

    # Summary statistics over the correlations with NaN entries dropped.
    gr2 = clean(list(map(float, graph_r2s)))
    print("I:", i, "Mean", statistics.mean(gr2), "Median", statistics.median(gr2), "STDev", statistics.stdev(gr2))

    # Persist this fold's raw predictions.
    fold_predictions_path = f"{path}GC_{i}_preds.pickle"
    with open(fold_predictions_path, 'wb') as handle:
        pickle.dump(g1predictions, handle)

2024-07-29 20:44:33.595581: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10400 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060, pci bus id: 0000:06:00.0, compute capability: 8.6
2024-07-29 20:44:42.246706: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


R2 Loss for fold 1 : 0.81471
I: 1 Mean 0.8147116061746479 Median 0.8597041368484497 STDev 0.16071329721707991
R2 Loss for fold 2 : 0.80843
I: 2 Mean 0.8084342565750839 Median 0.8530180156230927 STDev 0.16521299660209157
R2 Loss for fold 3 : 0.78578
I: 3 Mean 0.7857767509843789 Median 0.839667558670044 STDev 0.17904297239174083
R2 Loss for fold 4 : 0.77779
I: 4 Mean 0.7777885304385175 Median 0.8338046371936798 STDev 0.20449741978674432
R2 Loss for fold 5 : 0.78421
I: 5 Mean 0.7842058328737815 Median 0.8186803162097931 STDev 0.161426876451146


## GAT Model

In [22]:
# GAT model: train and score one model per cross-validation split.
for i in range(1, 6):
    split = dataset_splits[i]
    dtrain = dc.data.NumpyDataset(X=split["train_x_mgc"], y=split["train_y"])
    dtest = dc.data.NumpyDataset(X=split["test_x_mgc"], y=split["test_y"])
    # NOTE(review): confirm GATModel consumes `batch_normalize` and `dense_layer_size`.
    model = dc.models.GATModel(
        1586,
        mode='regression',
        dropout=0.1,
        graph_attention_layers=[64, 64],
        batch_normalize=True,
        dense_layer_size=2048,
        batch_size=64,
        learning_rate=0.001,
    )
    model.fit(dtrain, nb_epoch=100)
    gat_predictions = model.predict(dtest)

    # One Pearson correlation per test spectrum, between the max-normalized prediction
    # and the target. The printed "R2" values are these correlation coefficients.
    graph_r2s = [
        wrapped_pearson_correlation(normalize(gat_predictions[x]), target)
        for x, target in enumerate(split["test_y"])
    ]
    # NaN correlations add 0 to the total but still count in the denominator.
    total_r2 = sum(0 if np.isnan(score) else score for score in graph_r2s)
    current_fold_loss = round(float(total_r2 / len(graph_r2s)), 5)
    print("R2 Loss for fold", i, ":", current_fold_loss)

    # Summary statistics over the correlations with NaN entries dropped.
    gr2 = clean(list(map(float, graph_r2s)))
    print("I:", i, "Mean", statistics.mean(gr2), "Median", statistics.median(gr2), "STDev", statistics.stdev(gr2))

    # Persist this fold's raw predictions.
    fold_predictions_path = f"{path}GAT_{i}_preds.pickle"
    with open(fold_predictions_path, 'wb') as handle:
        pickle.dump(gat_predictions, handle)

R2 Loss for fold 1 : 0.81801
I: 1 Mean 0.8180109501212399 Median 0.8566862344741821 STDev 0.15036030267346187
R2 Loss for fold 2 : 0.80694
I: 2 Mean 0.8069420624325937 Median 0.8492433726787567 STDev 0.1617374868406156
R2 Loss for fold 3 : 0.81797
I: 3 Mean 0.8179700509815674 Median 0.8523098826408386 STDev 0.14409518932975307
R2 Loss for fold 4 : 0.80932
I: 4 Mean 0.8093209292876223 Median 0.8491477370262146 STDev 0.15864592593350568
R2 Loss for fold 5 : 0.80985
I: 5 Mean 0.8098485268702109 Median 0.8476189374923706 STDev 0.14835246763062696


## MPNN

In [20]:
# MPNN model: train and score one model per cross-validation split.
from deepchem.models.torch_models import MPNNModel
import dgl

for i in range(1, 6):
    split = dataset_splits[i]
    # The MPNN model is fed the MolGraphConv ("*_x_mgc") features.
    dtrain = dc.data.NumpyDataset(X=split["train_x_mgc"], y=split["train_y"])
    dtest = dc.data.NumpyDataset(X=split["test_x_mgc"], y=split["test_y"])
    # NOTE(review): confirm this MPNNModel consumes `batch_normalize`, `dense_layer_size`,
    # `n_pair_feat` and `n_atom_feat`.
    model = MPNNModel(
        1586,
        mode='regression',
        dropout=0.1,
        batch_normalize=True,
        dense_layer_size=2048,
        batch_size=64,
        n_pair_feat=14,
        n_atom_feat=75,
    )
    model.fit(dtrain, nb_epoch=100)

    # Collect evaluation metrics: one Pearson correlation per test spectrum, between the
    # max-normalized prediction and the target. The printed "R2" values are these
    # correlation coefficients.
    g2predictions = model.predict(dtest)
    graph_r2s = [
        wrapped_pearson_correlation(normalize(g2predictions[x]), target)
        for x, target in enumerate(split["test_y"])
    ]
    # NaN correlations add 0 to the total but still count in the denominator.
    total_r2 = sum(0 if np.isnan(score) else score for score in graph_r2s)
    current_fold_loss = round(float(total_r2 / len(graph_r2s)), 5)
    print("R2 Loss for fold", i, ":", current_fold_loss)

    # Summary statistics over the correlations with NaN entries dropped.
    gr2 = clean(list(map(float, graph_r2s)))
    print("I:", i, "Mean", statistics.mean(gr2), "Median", statistics.median(gr2), "STDev", statistics.stdev(gr2))

    # Persist this fold's raw predictions.
    fold_predictions_path = f"{path}MPNN_{i}_preds.pickle"
    with open(fold_predictions_path, 'wb') as handle:
        pickle.dump(g2predictions, handle)

R2 Loss for fold 1 : 0.87266
I: 1 Mean 0.8726625029039824 Median 0.9197896718978882 STDev 0.14751928391923136
R2 Loss for fold 2 : 0.86707
I: 2 Mean 0.8670678569951694 Median 0.9156762361526489 STDev 0.15188804616381624
R2 Loss for fold 3 : 0.87947
I: 3 Mean 0.8794649293736805 Median 0.9230608940124512 STDev 0.13089380823460342
R2 Loss for fold 4 : 0.86566
I: 4 Mean 0.8656610524455707 Median 0.9171813130378723 STDev 0.1563935400089547
R2 Loss for fold 5 : 0.87143
I: 5 Mean 0.8714331456410388 Median 0.9221212565898895 STDev 0.1483586901281833


## AttentiveFP

In [21]:
# AttentiveFP model: train and score one model per cross-validation split.
import deepchem as dc
from deepchem.models import AttentiveFPModel
for i in range(1, 6):
    split = dataset_splits[i]
    dtrain = dc.data.NumpyDataset(X=split["train_x_mgc"], y=split["train_y"])
    dtest = dc.data.NumpyDataset(X=split["test_x_mgc"], y=split["test_y"])
    # NOTE(review): confirm AttentiveFPModel consumes `batch_normalize`,
    # `dense_layer_size` and `activation_fns` ("p" looks like a placeholder value).
    fpmodel = AttentiveFPModel(
        n_tasks=1586,
        mode='regression',
        dropout=0.1,
        batch_normalize=True,
        dense_layer_size=2048,
        batch_size=64,
        learning_rate=0.001,
        activation_fns="p",
    )

    fpmodel.fit(dtrain, nb_epoch=100)
    fp_predictions = fpmodel.predict(dtest)

    # One Pearson correlation per test spectrum, between the max-normalized prediction
    # and the target. The printed "R2" values are these correlation coefficients.
    graph_r2s = [
        wrapped_pearson_correlation(normalize(fp_predictions[x]), target)
        for x, target in enumerate(split["test_y"])
    ]
    # NaN correlations add 0 to the total but still count in the denominator.
    total_r2 = sum(0 if np.isnan(score) else score for score in graph_r2s)
    current_fold_loss = round(float(total_r2 / len(graph_r2s)), 5)
    print("R2 Loss for fold", i, ":", current_fold_loss)

    # Summary statistics over the correlations with NaN entries dropped.
    gr2 = clean(list(map(float, graph_r2s)))
    print("I:", i, "Mean", statistics.mean(gr2), "Median", statistics.median(gr2), "STDev", statistics.stdev(gr2))

    # Persist this fold's raw predictions.
    fold_predictions_path = f"{path}AFP_{i}_preds.pickle"
    with open(fold_predictions_path, 'wb') as handle:
        pickle.dump(fp_predictions, handle)

R2 Loss for fold 1 : 0.89686
I: 1 Mean 0.8968606252823467 Median 0.9430180788040161 STDev 0.1289642558111036
R2 Loss for fold 2 : 0.88614
I: 2 Mean 0.8861444415127767 Median 0.9387686550617218 STDev 0.14821071139578737
R2 Loss for fold 3 : 0.89666
I: 3 Mean 0.8966613921382124 Median 0.9411072134971619 STDev 0.12491222093169739
R2 Loss for fold 4 : 0.88445
I: 4 Mean 0.8844502476416528 Median 0.9401044547557831 STDev 0.1507498102469397
R2 Loss for fold 5 : 0.89102
I: 5 Mean 0.8910170437910905 Median 0.9381065368652344 STDev 0.13914504721551746


## MorganFP/DNN Model

In [23]:
# MorganFP/DNN model: a dense network on 1024-bit fingerprints with a 1586-unit sigmoid
# output, trained and scored once per cross-validation split.
for i in range(1, 6):
    split = dataset_splits[i]

    # Layers are added in the order listed.
    fpmodel = Sequential([
        Dense(4096, input_dim=1024),
        BatchNormalization(),
        Dropout(0.1),
        Dense(2048, activation="relu"),
        BatchNormalization(),
        Dropout(0.1),
        Dense(1024, activation="relu"),
        Dense(1586, activation="sigmoid"),
    ])

    fpmodel.compile(loss=euc_dist_keras, optimizer="Adam")
    fpmodel.fit(split["train_x"], split["train_y"], batch_size=64, epochs=100, verbose=0)

    # Collect evaluation metrics: one Pearson correlation per prediction, between the
    # max-normalized prediction and the target. The printed "R2" values are these
    # correlation coefficients.
    morgan_predictions = fpmodel.predict(split["test_x"])
    fp_r2s = [
        wrapped_pearson_correlation(normalize(predicted), split["test_y"][x])
        for x, predicted in enumerate(morgan_predictions)
    ]
    # NaN correlations add 0 to the total but still count in the denominator.
    total_r2 = sum(0 if np.isnan(score) else score for score in fp_r2s)
    current_fold_loss = round(float(total_r2 / len(fp_r2s)), 5)
    print("R2 Loss for fold", i, ":", current_fold_loss)

    # Mean, median and standard deviation of the correlations with NaN entries dropped.
    clean_fp_r2s = clean(list(map(float, fp_r2s)))
    print("I", i, statistics.mean(clean_fp_r2s), statistics.median(clean_fp_r2s), statistics.stdev(clean_fp_r2s))

    # Persist this fold's raw predictions.
    fold_predictions_path = path + "MFP_" + str(i) + "_preds.pickle"
    with open(fold_predictions_path, 'wb') as handle:
        pickle.dump(morgan_predictions, handle)

2024-07-30 00:18:41.794204: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x7f7cb00154b0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-07-30 00:18:41.794233: I tensorflow/compiler/xla/service/service.cc:177]   StreamExecutor device (0): NVIDIA GeForce RTX 3060, Compute Capability 8.6
2024-07-30 00:18:41.923489: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-07-30 00:18:42.108950: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8902
2024-07-30 00:18:42.845357: I ./tensorflow/compiler/jit/device_compiler.h:180] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


R2 Loss for fold 1 : 0.87613
I 1 0.8761328835729832 0.9292419850826263 0.1470439858590974
R2 Loss for fold 2 : 0.86651
I 2 0.8665077828010979 0.9324342906475067 0.16434536622204693
R2 Loss for fold 3 : 0.87628
I 3 0.876281177431474 0.9309589862823486 0.14218630824217438
R2 Loss for fold 4 : 0.867
I 4 0.8670001057932775 0.9281333982944489 0.160239668420966
R2 Loss for fold 5 : 0.87263
I 5 0.8726316900004943 0.930243730545044 0.15306113348055472
