In [3]:
import deepchem as dc
from deepchem.models import GCNModel
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

### Cleaning Dataset

In [4]:
# Load the filtered source table; the path is relative to this notebook.
raw_csv_path = "../../data/FilteredData.csv"
data_ = pd.read_csv(raw_csv_path)

In [5]:
def SingleAtomRemover(x):
    """Flag structure strings that are exactly one of the listed single atoms.

    Args:
        x: Value to check (the notebook passes entries of the SMILES column).

    Returns:
        1 if ``x`` equals one of the listed single-atom strings, otherwise 0.
    """
    lone_atoms = ("[Kr]", "[Ne]", "C", "[Li]", "O", "[Ar]", "[Xe]")
    return int(x in lone_atoms)

In [6]:
# Keep only the structure and label columns and encode the labels as
# integers (BBB+ -> 1, BBB- -> 0). The non-inplace `replace` returns a fresh
# frame, so nothing is written in place through a selection of `data_`.
data = data_[["Structure", "BBB+/BBB-"]].replace({"BBB+": 1, "BBB-": 0})

In [7]:
# Rename the columns, flag single-atom structures in "Isit", and keep only the
# rows that are not flagged. Each step returns a new frame instead of mutating
# one derived from a selection of the raw table, so the cell can be re-run
# safely and no chained-assignment write occurs.
data = (
    data
    .rename(columns={"Structure": "SMILES", "BBB+/BBB-": "LABELS"})
    .assign(Isit=lambda frame: frame["SMILES"].apply(SingleAtomRemover))
    # Remove single element
    .loc[lambda frame: frame["Isit"] != 1]
)

In [8]:
# Persist the cleaned frame to "data.csv" in the working directory.
data.to_csv("data.csv")

In [9]:
# Reload the cleaned table from the CSV written to the working directory.
clean_data = pd.read_csv("data.csv")

### Converting to DeepChem Format

In [10]:
# Plain Python list of the SMILES column.
smiles = clean_data["SMILES"].tolist()

In [11]:
# Plain Python list of the LABELS column, in the same row order as the frame.
labels = clean_data["LABELS"].tolist()

#### Featurizing the nodes

In [12]:
# DeepChem featurizer with default settings; applied to the SMILES list in the next cell.
featurizer = dc.feat.MolGraphConvFeaturizer()

In [13]:
# Featurize every SMILES string.
# NOTE(review): presumably one entry per input string, in input order — confirm,
# since the labels are paired with X by position when the dataset is built.
X = featurizer.featurize(smiles)

### Converting into a NumPy dataset

In [54]:
# Pair the featurized molecules (X) with their labels (y) in a DeepChem NumpyDataset.
dataset = dc.data.NumpyDataset(X=X, y=labels)

In [55]:
# Splitter used below to divide the dataset into train and test parts.
splitter = dc.splits.RandomStratifiedSplitter()

In [56]:
# Fix the seed of the random split so the train/test partition, and every
# metric reported below, is reproducible after Restart & Run All.
SPLIT_SEED = 42
train_dataset, test_dataset = splitter.train_test_split(dataset, seed=SPLIT_SEED)

### Model implementation

In [87]:
# Build the single-task GCN classifier with "DeepGCN" as its model directory.
# Training happens in a later cell.
model = GCNModel(
    mode='classification',
    n_tasks=1,
    graph_conv_layers=[64, 32],
    batch_size=256,
    learning_rate=0.001,
    model_dir="DeepGCN",
)

In [88]:
# Metric wrappers shared by the validation callbacks and the final evaluation:
# metrics1 = accuracy, metrics2 = F1, metrics3 = ROC-AUC.
metrics1 = dc.metrics.Metric(dc.metrics.accuracy_score)
metrics2 = dc.metrics.Metric(dc.metrics.f1_score)
metrics3 = dc.metrics.Metric(dc.metrics.roc_auc_score)

In [89]:
from deepchem.models.callbacks import ValidationCallback

# Two monitoring callbacks with interval=100: the first scores the test split,
# the second scores the training split, both with the same three metrics.
vc_valid = ValidationCallback(
    test_dataset,
    interval=100,
    metrics=[metrics1, metrics2, metrics3],
)
vc_valid2 = ValidationCallback(
    train_dataset,
    interval=100,
    metrics=[metrics1, metrics2, metrics3],
)

In [90]:
# Train for 300 epochs with both validation callbacks attached; the value
# returned by fit() is shown as the cell output.
model.fit(
    train_dataset,
    nb_epoch=300,
    callbacks=[vc_valid, vc_valid2],
)

Step 100 validation: accuracy_score=0.726727 f1_score=0.805139 roc_auc_score=0.8319
Step 100 validation: accuracy_score=0.756381 f1_score=0.827072 roc_auc_score=0.859379
Step 200 validation: accuracy_score=0.767267 f1_score=0.822857 roc_auc_score=0.844227
Step 200 validation: accuracy_score=0.810435 f1_score=0.851689 roc_auc_score=0.886435
Step 300 validation: accuracy_score=0.755255 f1_score=0.822633 roc_auc_score=0.850896
Step 300 validation: accuracy_score=0.79542 f1_score=0.850562 roc_auc_score=0.903628
Step 400 validation: accuracy_score=0.75976 f1_score=0.821826 roc_auc_score=0.849439
Step 400 validation: accuracy_score=0.823198 f1_score=0.866534 roc_auc_score=0.917966
Step 500 validation: accuracy_score=0.785285 f1_score=0.824969 roc_auc_score=0.85434
Step 500 validation: accuracy_score=0.849474 f1_score=0.872739 roc_auc_score=0.928609
Step 600 validation: accuracy_score=0.776276 f1_score=0.808729 roc_auc_score=0.848446
Step 600 validation: accuracy_score=0.847598 f1_score=0.865

0.060903925895690915

In [91]:
# Score the fitted model on both splits with the same three metrics.
train_score = model.evaluate(train_dataset, [metrics1, metrics2, metrics3])
test_score = model.evaluate(test_dataset, [metrics1, metrics2, metrics3])

for caption, score in (
    ('Training set score:', train_score),
    ('Test set score:', test_score),
):
    print(caption, score)



Training set score: {'accuracy_score': 0.954954954954955, 'f1_score': 0.9642644431209053, 'roc_auc_score': 0.9987441464452959}
Test set score: {'accuracy_score': 0.7837837837837838, 'f1_score': 0.837471783295711, 'roc_auc_score': 0.819963577881841}


In [92]:
# Model predictions for the training split (fed to accuracy_counter below).
train_preds = model.predict(train_dataset)

In [93]:
# Model predictions for the test split (fed to accuracy_counter below).
test_preds = model.predict(test_dataset)

In [94]:
from sklearn.metrics import accuracy_score
def accuracy_counter(y_prob, y_true):
    """Compute accuracy from per-sample score vectors.

    Every score is first binarised (values above 0.5 become 1, values at or
    below 0.5 become 0), then the index of the largest entry of each
    top-level element is taken as the predicted class and compared with
    ``y_true`` via ``accuracy_score``.

    Args:
        y_prob: Array-like of per-sample scores.
            NOTE(review): presumably shaped (n_samples, n_classes) — confirm.
        y_true: Array-like of true class labels, one per sample.

    Returns:
        The value returned by ``accuracy_score`` for the derived predictions.
    """
    scores = np.array(y_prob)
    # Single-expression form of the two-step thresholding: > 0.5 -> 1,
    # <= 0.5 -> 0, anything matching neither comparison is left as is.
    hardened = np.where(scores > 0.5, 1, np.where(scores <= 0.5, 0, scores))

    predicted = list(map(np.argmax, hardened))
    return accuracy_score(np.array(y_true), np.array(predicted))

In [95]:
# Recompute accuracy from the raw predictions for each split and report both.
train_accuracy = accuracy_counter(train_preds, train_dataset.y)
test_accuracy = accuracy_counter(test_preds, test_dataset.y)
print(f"Training accuracy: {train_accuracy}  Test accuracy: {test_accuracy} ")

Training accuracy: 0.954954954954955  Test accuracy: 0.7837837837837838 
