In [None]:
from deepchem.feat import ConvMolFeaturizer
from deepchem.utils import download_url
from deepchem.data import DiskDataset
import deepchem as dc
from Bio.PDB import PDBParser, PDBIO
from rdkit import Chem
import numpy as np
import os


In [None]:
# Directory holding the PDB files used to build the training dataset.
# The trailing slash matters: later cells build paths as `dataset_path + name`.
dataset_path = "Full Dataset/"
# os.listdir() returns entries in arbitrary order; sorting keeps the order in
# which files are featurized (and therefore the dataset row order) stable
# across runs and machines.
pdb_files = sorted(os.listdir(dataset_path))

In [None]:
# Check for inhibitors
def check_for_inhibitors(pdb_file):
    """Return 1 if the PDB file contains a non-water hetero residue, else 0.

    Every hetero residue other than water is treated as evidence of a bound
    ligand / inhibitor. Waters are skipped on purpose: Biopython gives them
    the hetero flag ``'W'``, and since most crystal structures contain
    waters, counting them would label nearly every file as positive.

    NOTE(review): ions and buffer molecules still count as hetero residues
    here; confirm whether they should be excluded as well.

    Parameters
    ----------
    pdb_file : str
        Path to the PDB file to inspect.

    Returns
    -------
    int
        1 when at least one non-water hetero residue is found, otherwise 0.
    """
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure('structure', pdb_file)

    for model in structure:
        for chain in model:
            for residue in chain:
                hetero_flag = residue.id[0]
                # ' ' marks a standard residue and 'W' a water; any other
                # flag ('H_<resname>') is a genuine hetero group.
                if hetero_flag not in (' ', 'W'):
                    # One hit is enough, so stop scanning the structure.
                    return 1

    return 0

# check_for_inhibitors(dataset_path + pdb_files[2])

# Shared featurizer instance: the cells below call featurizer.featurize([mol])
# on RDKit molecules loaded from the PDB files.
featurizer = ConvMolFeaturizer()

In [None]:
# features = []
# labels = []

In [None]:
# Start from fresh lists on every run of this cell. Without this the loop
# fails on a fresh kernel (the names are undefined) and on a re-run (the
# conversion at the bottom of this cell leaves both names bound to NumPy
# arrays, which have no .append()).
features = []
labels = []

for pdb_path in pdb_files:
    pdb_file_path = dataset_path + pdb_path
    mol = Chem.MolFromPDBFile(pdb_file_path)
    if mol is None:
        # RDKit could not parse this file; it contributes no sample.
        continue

    try:
        featurized_mol = featurizer.featurize([mol])[0]
        # NOTE(review): DeepChem featurizers appear to log a warning and
        # return an empty array for a molecule they cannot handle instead of
        # raising; treat that as a failure so no empty sample is stored.
        if isinstance(featurized_mol, np.ndarray) and featurized_mol.size == 0:
            raise ValueError("featurizer returned an empty result")
        print(f"Featurized {pdb_path}")
    except Exception as e:
        print(f"Failed to featurize {pdb_path}: {e}")
        continue

    # Compute the label before touching either list so that a failure here
    # can never leave `features` one element longer than `labels`.
    label = check_for_inhibitors(pdb_file_path)
    features.append(featurized_mol)
    labels.append(label)

# Convert labels to numpy array
labels = np.array(labels)

# Object dtype keeps one featurized molecule per array element.
features = np.array(features, dtype=object)


In [None]:
# Sample count followed by label count, one per line; the two should match.
print(len(features), len(labels), sep="\n")

In [None]:
# Build an on-disk DeepChem dataset from the feature and label arrays.
# NOTE(review): no target directory is passed here, so this copy presumably
# lands in a temporary location; the copy that is reopened later is the one
# moved to 'pdb_full_dataset' in the following cell - confirm this cell is
# still needed.
dataset = DiskDataset.from_numpy(features, labels)

In [None]:
# Build the dataset from the arrays and relocate it to 'pdb_full_dataset',
# the path a later cell reopens it from.
# NOTE(review): move() presumably replaces an existing directory with the
# same name - confirm before re-running against data you want to keep.
deepchem_dataset = dc.data.DiskDataset.from_numpy(features, labels)
deepchem_dataset.move('pdb_full_dataset')

In [None]:
from deepchem.data import Dataset

# Reopen the dataset that was persisted under 'pdb_full_dataset'.
dataset = dc.data.DiskDataset('pdb_full_dataset')

# Fixed seed for the random split: without it every run trains and evaluates
# on different molecules, so scores from different runs cannot be compared.
SPLIT_SEED = 42

# Random 80/10/10 split into training, validation and test sets.
splitter = dc.splits.RandomSplitter()
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    dataset,
    frac_train=0.8,
    frac_valid=0.1,
    frac_test=0.1,
    seed=SPLIT_SEED,
)

In [None]:
from deepchem.models import GraphConvModel

# Single-task, two-class graph-convolution classifier; its files are kept
# under "Graph Models/".
model = GraphConvModel(
    n_tasks=1,
    mode='classification',
    n_classes=2,
    batch_size=50,
    learning_rate=0.001,
    model_dir="Graph Models/",
)

# Fit on the training split for 12 epochs.
model.fit(train_dataset, nb_epoch=12)


In [None]:
# ROC-AUC on the train, validation and test splits - OLD MODEL
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)
train_score, valid_score, test_score = [
    model.evaluate(split, [metric])
    for split in (train_dataset, valid_dataset, test_dataset)
]

print("Train AUC: ", train_score)
print("Validation AUC: ", valid_score)
print("Test AUC: ", test_score)


In [None]:
# ROC-AUC on the train, validation and test splits - NEW MODEL
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)
train_score, valid_score, test_score = [
    model.evaluate(split, [metric])
    for split in (train_dataset, valid_dataset, test_dataset)
]

print("Train AUC: ", train_score)
print("Validation AUC: ", valid_score)
print("Test AUC: ", test_score)


In [None]:
# ROC-AUC on the train, validation and test splits - NEW MODEL trained 5 more epochs
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)
train_score, valid_score, test_score = [
    model.evaluate(split, [metric])
    for split in (train_dataset, valid_dataset, test_dataset)
]

print("Train AUC: ", train_score)
print("Validation AUC: ", valid_score)
print("Test AUC: ", test_score)


In [None]:
# Persist the current state of the trained model as a checkpoint.
# NOTE(review): presumably written under the model's model_dir
# ("Graph Models/") - confirm.
model.save_checkpoint()

In [None]:
# Show the directory the model was configured with.
print(model.model_dir)


In [None]:
# Folder with the held-out PDB files and the list of file names inside it.
# NOTE(review): this cell is repeated verbatim further down the notebook;
# one of the two copies can probably be removed.
old_test_path = "Plasmodium/Mixed Final Dataset Test"
old_test_files = os.listdir(old_test_path)

In [None]:
def predict_from_pdb(model, test_dir="Plasmodium/Mixed Final Dataset Test"):
    """Compare the model's predictions with the heteroatom-based labels.

    For every file in ``test_dir`` the structure is loaded with RDKit,
    featurized with the notebook-level ``featurizer``, and passed to
    ``model.predict_on_batch``. The predicted class is compared with the
    label produced by ``check_for_inhibitors`` and a running accuracy is
    printed at the end. Files that cannot be processed are reported and
    left out of the accuracy.

    Parameters
    ----------
    model
        Trained model exposing ``predict_on_batch``.
    test_dir : str, optional
        Folder containing the PDB files to score. Defaults to the folder the
        notebook has always used.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    correct = 0
    total = 0

    for file_name in os.listdir(test_dir):
        print(file_name)

        pdb_file = os.path.join(test_dir, file_name)

        try:
            # Load the molecule for *this* file; relying on a leftover global
            # `mol` would score the same molecule for every file.
            mol = Chem.MolFromPDBFile(pdb_file)
            if mol is None:
                raise ValueError("RDKit could not parse the file")

            features = np.array(featurizer.featurize([mol]), dtype=object)
            predictions = model.predict_on_batch(features)

            # Same decoding as the evaluation loop later in the notebook.
            # NOTE(review): assumes the output holds the class scores of this
            # single sample, so the flat argmax is the class index - confirm.
            label = int(np.argmax(predictions))

            inhibitors_check = 1 if check_for_inhibitors(pdb_file) else 0

            total += 1

            print(f'{label=}, {inhibitors_check=}')

            if label == 1:
                model_inhibitor_check = 1
                print(f"{file_name} contains inhibitors")
            else:
                model_inhibitor_check = 0
                print(f"{file_name} does not contain inhibitors")

            if inhibitors_check == model_inhibitor_check:
                print("CORRECT")
                correct += 1
            else:
                print("WRONG")

        except Exception as e:
            # Best effort: report the file and keep going with the rest.
            print(f"Error processing {pdb_file}: {e}")

    if total:
        print(f"Accuracy: {correct / total * 100:.2f}%")
    else:
        # Empty folder, or every file failed: avoid dividing by zero.
        print("Accuracy: n/a (no files could be scored)")

# Requires a trained model bound to the notebook-level name `model`.
print("MODEL PREDICTIONS")
predict_from_pdb(model)

In [None]:

# for pdb_path in pdb_files:
#     mol = Chem.MolFromPDBFile(dataset_path + pdb_path)
#     if mol is not None:
#         try:
#             featurized_mol = featurizer.featurize([mol])[0]
#             features.append(featurized_mol)
#             print(f"Featurized {pdb_path}")
#         except Exception as e:
#             print(f"Failed to featurize {pdb_path}: {e}")
#             continue
        
#         labels.append(check_for_inhibitors(dataset_path + pdb_path))

# # Convert labels to numpy array
# labels = np.array(labels)

# # Ensure features is a list of arrays, not a single numpy array
# features = np.array(features, dtype=object)


In [None]:
# Folder with the held-out PDB files and the list of file names inside it;
# both names are read by the evaluation loop in the next cell.
# NOTE(review): the same two assignments already appear in an earlier cell.
old_test_path = "Plasmodium/Mixed Final Dataset Test"
old_test_files = os.listdir(old_test_path)

In [None]:
# Counters for the held-out evaluation; files that raise are not counted.
total = 0
correct = 0
wrong = 0

for pdb in old_test_files:
    pdb_file_path = old_test_path + "/" + pdb

    try:
        test_mol = Chem.MolFromPDBFile(pdb_file_path)
        if test_mol is None:
            # Fail with a clear message instead of handing None to the
            # featurizer and surfacing an unrelated downstream error.
            raise ValueError("RDKit could not parse the file")

        features = np.array(featurizer.featurize([test_mol]), dtype=object)
        prediction = model.predict_on_batch(features)

        # NOTE(review): assumes the output holds the class scores of this
        # single sample, so the flat argmax is the class index - confirm.
        prediction_label = np.argmax(prediction)
        ground_truth = check_for_inhibitors(pdb_file_path)

        if prediction_label == ground_truth:
            print(f"Correct prediction for {pdb} || Prediction: {prediction_label} || Ground Truth: {ground_truth}")
            correct += 1
        else:
            print(f"Incorrect prediction for {pdb} || Prediction: {prediction_label} || Ground Truth: {ground_truth}")
            wrong += 1

        total += 1

    except Exception as e:
        # Best effort: report the file and continue with the remaining ones.
        print(f"Error processing {pdb}: {e}")


if total:
    print(f"Accuracy: {correct / total * 100:.2f}%")
else:
    # Empty folder, or every file failed: avoid dividing by zero.
    print("Accuracy: n/a (no files could be scored)")

In [None]:
# Spot-check a single structure. `pdb` is not set in this cell: it is
# whatever file name the evaluation loop in the previous cell left behind,
# so this cell only works after that loop has run.
features = np.array(featurizer.featurize([Chem.MolFromPDBFile(old_test_path + "/" + pdb)]), dtype=object)
prediction = model.predict_on_batch(features)

# Predicted class index, followed by the heteroatom-based label (shown as the
# cell's output value) for comparison.
print(np.argmax(prediction))
check_for_inhibitors(old_test_path + "/" + pdb)