In [47]:
import deepchem as dc
from deepchem.models import GATModel
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

### Cleaning Dataset

In [48]:
# Load the filtered source table from CSV; later cells select the
# "Structure" and "BBB+/BBB-" columns from this frame.
data_ = pd.read_csv("../../data/FilteredData.csv")

In [49]:
def SingleAtomRemover(x):
    """Flag values that are exactly one of the listed single-atom SMILES.

    Returns 1 when ``x`` equals one of the listed strings and 0 otherwise,
    so the result can be stored as an integer marker column.
    """
    single_atom_smiles = ("[Kr]", "[Ne]", "C", "[Li]", "O", "[Ar]", "[Xe]")
    return int(x in single_atom_smiles)

In [50]:
# Keep only the structure and "BBB+/BBB-" label columns and encode the labels
# numerically: "BBB+" -> 1, "BBB-" -> 0.
# replace() is used in its returning form so that nothing is mutated in place
# on a selection taken from data_ (the in-place form can raise pandas'
# chained-assignment warning, which the global warning filter would hide).
data = data_[["Structure","BBB+/BBB-"]].replace({"BBB+":1,"BBB-":0})

In [51]:
# Rename to the column names the later cells read (SMILES / LABELS), flag
# single-atom entries with SingleAtomRemover, then keep only the other rows.
# Each step rebinds `data` to a new frame instead of mutating it in place, which
# avoids chained-assignment warnings on a frame derived from data_.
data = data.rename(columns={"Structure":"SMILES","BBB+/BBB-":"LABELS"})
data = data.assign(Isit=data["SMILES"].apply(SingleAtomRemover))
### Remove single element
# A boolean mask keeps every row that was not flagged; the helper column
# "Isit" is left in the frame, as before.
data = data[data["Isit"] != 1]

In [52]:
# Persist the cleaned frame to data.csv in the current working directory.
# `index` is left at the pandas default, so the row index is written as an
# extra leading column.
data.to_csv(r"data.csv")

In [53]:
# Reload the cleaned table from data.csv; the SMILES and LABELS columns of
# this frame feed the featurization cells below.
clean_data = pd.read_csv(r"data.csv")

### Converting to DeepChem Format

In [54]:
# Plain Python list of the SMILES strings, in row order.
smiles = clean_data["SMILES"].tolist()

In [55]:
# Plain Python list of the encoded labels, aligned with `smiles`.
labels = clean_data["LABELS"].tolist()

#### Featurizing the nodes

In [56]:
# Molecular-graph featurizer, constructed with its default settings
# (no arguments are passed).
featurizer = dc.feat.MolGraphConvFeaturizer()

In [57]:
# Featurize every SMILES string in the list.
# NOTE(review): presumably yields one graph object per input SMILES — confirm
# how entries that fail to parse are represented before training on them.
X = featurizer.featurize(smiles)

### Converting into a NumPy dataset

In [40]:
# Bundle the featurized molecules (X) and their labels (y) into a single
# DeepChem dataset object that the splitter and model consume.
dataset = dc.data.NumpyDataset(X=X, y=labels)

In [41]:
# Random splitter that stratifies on the labels; used by the next cell to
# produce the train/test partition.
splitter = dc.splits.RandomStratifiedSplitter()

In [42]:
# Split into train and test sets using the splitter's default train fraction.
# A fixed seed makes the random partition, and therefore every score reported
# below, reproducible across kernel restarts.
train_dataset, test_dataset = splitter.train_test_split(dataset, seed=42)

In [63]:
# Build the graph-attention classifier (training itself happens in the
# model.fit cell further down).
# Two graph-attention layers of width 64 and 32, one classification task.
model = GATModel(mode='classification',
                 graph_attention_layers=[64, 32],
                 n_tasks=1, batch_size=256, learning_rate=0.001)

In [64]:
from deepchem.models.callbacks import ValidationCallback

# Wrap the three scoring functions as DeepChem metrics, in this order:
# metrics1 = accuracy, metrics2 = F1, metrics3 = ROC-AUC.
metrics1, metrics2, metrics3 = (
    dc.metrics.Metric(score_fn)
    for score_fn in (
        dc.metrics.accuracy_score,
        dc.metrics.f1_score,
        dc.metrics.roc_auc_score,
    )
)

In [65]:
# Two monitoring callbacks that score the same three metrics every 100 steps.
# Despite its name, vc_valid evaluates on the TRAINING set; vc_valid2
# evaluates on the test set. Each step therefore logs two lines, in that order.
# NOTE(review): there is no separate validation split — the test set doubles
# as the monitoring set here.
vc_valid = ValidationCallback(train_dataset, interval=100, metrics=[metrics1,metrics2,metrics3],)
vc_valid2 = ValidationCallback(test_dataset, interval=100, metrics=[metrics1,metrics2,metrics3],)

In [66]:
# Train for 300 epochs on the training split with both monitoring callbacks.
# The call is the cell's last expression, so its return value is displayed
# (presumably the final average training loss — confirm against DeepChem docs).
model.fit(train_dataset, nb_epoch=300,callbacks=[vc_valid,vc_valid2])

Step 100 validation: accuracy_score=0.765015 f1_score=0.829521 roc_auc_score=0.848891
Step 100 validation: accuracy_score=0.732733 f1_score=0.805677 roc_auc_score=0.8128
Step 200 validation: accuracy_score=0.798048 f1_score=0.827675 roc_auc_score=0.876008
Step 200 validation: accuracy_score=0.752252 f1_score=0.780293 roc_auc_score=0.830708
Step 300 validation: accuracy_score=0.814565 f1_score=0.847153 roc_auc_score=0.898662
Step 300 validation: accuracy_score=0.753754 f1_score=0.795511 roc_auc_score=0.836261
Step 400 validation: accuracy_score=0.836712 f1_score=0.869173 roc_auc_score=0.922087
Step 400 validation: accuracy_score=0.768769 f1_score=0.810811 roc_auc_score=0.845485
Step 500 validation: accuracy_score=0.820946 f1_score=0.86297 roc_auc_score=0.892703
Step 500 validation: accuracy_score=0.741742 f1_score=0.800464 roc_auc_score=0.785769
Step 600 validation: accuracy_score=0.859985 f1_score=0.882668 roc_auc_score=0.941591
Step 600 validation: accuracy_score=0.755255 f1_score=0.7

0.03386830568313599

In [58]:
# Raw model outputs for the training split; passed to accuracy_counter below.
train_preds = model.predict(train_dataset)

In [59]:
# Raw model outputs for the test split; passed to accuracy_counter below.
test_preds = model.predict(test_dataset)

In [62]:
# Score both splits with the same three metrics (training split first).
train_score, test_score = (
    model.evaluate(split, [metrics1, metrics2, metrics3])
    for split in (train_dataset, test_dataset)
)

print('Training set score:', train_score)
print('Test set score:', test_score)


Training set score: {'accuracy_score': 0.9557057057057057, 'f1_score': 0.9626818469323214, 'roc_auc_score': 0.9952846364883402}
Test set score: {'accuracy_score': 0.7642642642642643, 'f1_score': 0.8010139416983524, 'roc_auc_score': 0.8357528026110401}


In [60]:
from sklearn.metrics import accuracy_score
def accuracy_counter(y_prob,y_true):
    """Compute classification accuracy from predicted class probabilities.

    Parameters
    ----------
    y_prob : array-like
        Per-sample class scores. For inputs with two or more dimensions, all
        trailing axes are flattened per sample and the index of the largest
        score is the predicted class, so both ``(n_samples, n_classes)`` and
        ``(n_samples, 1, n_classes)`` layouts are accepted. A 1-D input is
        treated as the probability of class 1 and thresholded at 0.5.
    y_true : array-like
        True class labels, either flat or as a column vector.

    Returns
    -------
    float
        Fraction of samples whose predicted class equals the true label.

    Raises
    ------
    ValueError
        If ``y_prob`` is empty or the two inputs hold a different number of
        samples.
    """
    scores = np.asarray(y_prob)
    # Labels may arrive as a column vector of shape (n, 1); flatten them so the
    # element-wise comparison below lines up with the predictions.
    truth = np.asarray(y_true).reshape(-1)

    if scores.ndim == 0 or scores.shape[0] == 0:
        raise ValueError("y_prob must contain at least one sample")

    if scores.ndim == 1:
        # A single probability per sample: class 1 when it exceeds 0.5.
        predicted = (scores > 0.5).astype(int)
    else:
        # Take the arg-max of the raw scores. Thresholding at 0.5 first would
        # zero every entry of a row in which no class exceeds 0.5 and thereby
        # always select class 0 for that row.
        predicted = scores.reshape(scores.shape[0], -1).argmax(axis=1)

    if predicted.shape[0] != truth.shape[0]:
        raise ValueError(
            f"y_prob has {predicted.shape[0]} samples but y_true has {truth.shape[0]}"
        )

    return float(np.mean(predicted == truth))

In [61]:
# Report accuracy recomputed from the raw predictions for both splits, as a
# cross-check against the accuracy_score entries printed by model.evaluate.
print(f"Training accuracy: {accuracy_counter(train_preds,train_dataset.y)}  Test accuracy: {accuracy_counter(test_preds,test_dataset.y)} ",)

Training accuracy: 0.9557057057057057  Test accuracy: 0.7642642642642643 
