In [None]:
# Import necessary packages

import torch
import torch.nn as nn
import numpy as np
import pickle
import openpyxl
import re

In [None]:
# Mount Google Drive at /content/drive so that files stored under
# '/content/drive/My Drive/' can be read by this notebook.
# The google.colab package is only available inside a Google Colab runtime.

from google.colab import drive
drive.mount('/content/drive')

In [None]:
def predict(net, dis_list):
    """Score every known drug against each disease in ``dis_list``.

    Relies on the notebook-level globals ``nodes_mapping``, ``input_features``,
    ``ax``, ``a2x`` and ``device``.

    Args:
        net: Model called as ``net(X, ax, a2x, batch)`` that returns
            ``(embeddings, logits)``.
        dis_list: Sequence of disease node names present in ``nodes_mapping``.

    Returns:
        A tuple ``(embed, dictionaries_norm)``. ``embed`` is the embedding
        tensor returned by the last forward pass, or ``None`` when
        ``dis_list`` is empty. ``dictionaries_norm`` holds one dict per
        disease mapping a drug name (or the raw node name when the node has
        no entry in the drug dictionary) to its standardized score.
    """
    # Get disease batches and a dictionary of drug information
    dis_batches, drug_dict = get_disease_batches(nodes_mapping, dis_list)
    dis_batches = torch.LongTensor(dis_batches)
    dictionaries_norm = []
    embed = None  # stays None when there is nothing to predict

    # Build the ID -> name lookup once instead of scanning the mapping for
    # every drug; setdefault keeps the first name seen for a given ID.
    id_to_name = {}
    for node_name, node_id in nodes_mapping.items():
        id_to_name.setdefault(node_id, node_name)

    # The model inputs are the same for every disease, so move them once
    features_dev = input_features.to(device)
    ax_dev = ax.to(device)
    a2x_dev = a2x.to(device)

    # Inference only: no autograd graph is needed
    with torch.no_grad():
        # Loop through each batch of diseases
        for batch in dis_batches:
            embed, logits = net(features_dev, ax_dev, a2x_dev, batch.to(device))
            probs = standardize(logits)
            dct_norm = dict()

            # Each batch row is (drug ID, disease ID); pair it with its score
            for (drug_id, _), score in zip(batch.tolist(), probs.tolist()):
                node_name = id_to_name[drug_id]

                # Prefer the human-readable drug name when one is known
                if node_name in drug_dict:
                    dct_norm[drug_dict[node_name][0]] = score
                else:
                    dct_norm[node_name] = score

            # Append the dictionary to the list
            dictionaries_norm.append(dct_norm)

    return embed, dictionaries_norm

def load_variable(filename):
    """Load and return a pickled Python object from ``filename``.

    Args:
        filename: Path to a file containing a pickled object.

    Returns:
        The unpickled object.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the file content is not a valid pickle.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # come from a trusted source.
    # The context manager closes the handle even when unpickling fails.
    with open(filename, 'rb') as handle:
        return pickle.load(handle)

def get_node_name(id):
    """Return the node name mapped to node ID ``id`` in ``nodes_mapping``.

    Args:
        id: Node ID to look up (a value of the global ``nodes_mapping``).

    Returns:
        The first key of ``nodes_mapping`` whose value equals ``id``.

    Raises:
        ValueError: If no node has the given ID.
    """
    # Scan lazily and stop at the first match instead of copying all keys
    # and values into two fresh lists on every call.
    for node_name, node_id in nodes_mapping.items():
        if node_id == id:
            return node_name
    raise ValueError(f"{id!r} is not a known node ID")

def get_node_id(name):
    """Return the node ID mapped to node name ``name`` in ``nodes_mapping``.

    Args:
        name: Node name to look up (a key of the global ``nodes_mapping``).

    Returns:
        The ID stored for ``name``.

    Raises:
        ValueError: If ``name`` is not a key of ``nodes_mapping``. ValueError
            (rather than KeyError) is kept because the previous list-based
            lookup raised it.
    """
    # A direct dictionary lookup replaces the linear search through the keys
    try:
        return nodes_mapping[name]
    except KeyError:
        raise ValueError(f"{name!r} is not a known node name") from None

def load_model_on_cpu(model, path):
    """Load a saved state dictionary from ``path`` into ``model``.

    Tensors are mapped to the CPU while loading, so a checkpoint written on
    a GPU machine can be read without CUDA.

    Args:
        model: ``torch.nn.Module`` whose parameters match the saved state dict.
        path: Location of the file written with ``torch.save(state_dict, ...)``.

    Returns:
        The same ``model`` object, with the loaded weights.
    """
    # weights_only=True restricts unpickling to tensors and plain containers,
    # which is all a state dict needs, and avoids running arbitrary pickled code.
    state_dict = torch.load(path, map_location=torch.device('cpu'), weights_only=True)
    model.load_state_dict(state_dict)
    return model

def get_disease_batches(nodes_mapping, disease_list):
    """Build one list of (drug ID, disease ID) pairs per requested disease.

    Args:
        nodes_mapping: Dict mapping node names to node IDs.
        disease_list: Iterable of disease node names.

    Returns:
        A tuple ``(batches, drug_details)``. ``batches`` contains, for each
        disease in order, a list pairing every selected drug ID with that
        disease's ID. ``drug_details`` is the dictionary returned by
        ``get_drug_name_desc_dict()``.
    """
    drug_details = get_drug_name_desc_dict()

    # A node is treated as a drug when its name matches the compound pattern
    # (case-insensitive) and the drug details dictionary has an entry for it.
    drug_ids = [
        nodes_mapping[node_name]
        for node_name in list(nodes_mapping.keys())
        if re.search(r"Compound+", node_name, re.I) and node_name in drug_details
    ]

    # Pair every drug with each disease in turn
    all_batches = []
    for disease_name in disease_list:
        disease_id = get_node_id(disease_name)
        all_batches.append([(drug_id, disease_id) for drug_id in drug_ids])

    return all_batches, drug_details

def get_drug_name_desc_dict():
    """Read the drug details spreadsheet into a dictionary.

    Every row from 1 to the sheet's ``max_row`` is read from the active
    worksheet, so a header row, if the sheet has one, becomes an entry too.

    Returns:
        Dict mapping the value in column 1 of each row to a 3-tuple holding
        the values of columns 2, 3 and 4 of that row.
    """
    worksheet = openpyxl.load_workbook('/content/drive/My Drive/Drug_details.xlsx').active
    drug_details = {}

    for row_number in range(1, worksheet.max_row + 1):
        # Columns 1-4 of the current row: key first, then the three detail fields
        row_values = [
            worksheet.cell(row=row_number, column=column_number).value
            for column_number in (1, 2, 3, 4)
        ]
        drug_details[row_values[0]] = tuple(row_values[1:])

    return drug_details

def standardize(t):
    """Return ``t`` shifted by its mean and divided by its standard deviation.

    Both statistics are computed over all elements of ``t`` (no dimension is
    passed to the reductions).

    Args:
        t: Input tensor.

    Returns:
        Tensor of the same shape as ``t`` holding ``(t - mean) / std``.
    """
    centered = t - t.mean()
    return centered / t.std()

def get_rank(dct, key):
    """Return the 1-based rank of ``key`` when entries are ordered by value.

    Rank 1 is the entry with the largest value.

    Args:
        dct: Dict mapping keys to comparable scores.
        key: Key whose rank is requested.

    Returns:
        The position of ``key`` in the descending ordering, starting at 1.

    Raises:
        KeyError: If ``key`` is not in ``dct`` (this includes an empty
            ``dct``). Previously a missing key was silently reported with
            the last rank and an empty dict raised UnboundLocalError.
    """
    # Ascending sort followed by a reversal (rather than reverse=True) keeps
    # the relative order of tied scores exactly as before.
    ranked = sorted(dct.items(), key=lambda item: item[1])[::-1]

    for position, (name, _) in enumerate(ranked, start=1):
        if key == name:
            return position
    raise KeyError(key)


In [None]:
# Define activation functions to be used in the network.
# These are module instances created once at notebook level and called like functions.
L_Relu = nn.LeakyReLU(0.2)  # leaky ReLU with a negative slope of 0.2
sig = nn.Sigmoid()
Relu = nn.ReLU()
tanh = nn.Tanh()

class GDRnet(nn.Module):
    """Network that embeds nodes and scores pairs of nodes.

    Three feature matrices are each projected by their own linear layer and
    passed through tanh, concatenated, and combined by a linear layer with a
    leaky ReLU (negative slope 0.2) into one embedding per node. A pair
    ``(i, j)`` is scored as ``dot(emb[i], layer8(emb[j]))``.

    Args:
        input_dim: Number of input features per node (default 400).
        decoder_dim: Size of the node embedding (default 250).
    """

    def __init__(self, input_dim=400, decoder_dim=250):
        super().__init__()
        n_branches = 3  # one projection per input matrix passed to forward()

        # Define linear layers for transformations
        self.theta0 = nn.Linear(input_dim, decoder_dim)
        self.theta1 = nn.Linear(input_dim, decoder_dim)
        self.theta2 = nn.Linear(input_dim, decoder_dim)

        # Layer to combine transformed inputs
        self.combine1 = nn.Linear(decoder_dim * n_branches, decoder_dim)

        # Additional layers for further processing.
        # layer9 is not used by forward()/decoder(), but it is kept so the
        # module's state-dict keys stay the same for saved checkpoints.
        self.layer8 = nn.Linear(decoder_dim, decoder_dim)
        self.layer9 = nn.Linear(decoder_dim, decoder_dim)

    def decoder(self, t, batch):
        """Score each pair of node indices in ``batch``.

        Args:
            t: Node embedding matrix with one row per node.
            batch: Integer pairs of shape ``(n, 2)``; row ``k`` holds the
                indices ``(i, j)`` of the two nodes to score.

        Returns:
            1-D tensor of length ``n`` on ``t``'s device whose ``k``-th entry
            is ``dot(t[i], layer8(t[j]))``.
        """
        pairs = torch.as_tensor(batch, dtype=torch.long, device=t.device)
        if pairs.numel() == 0:
            return t.new_empty(0)

        # All pairs are scored at once; the per-pair Python loop, the
        # dependence on a global `device`, and the scratch attributes
        # (self.t_new / self.c) of the earlier version are gone.
        left = t[pairs[:, 0]]
        right = self.layer8(t[pairs[:, 1]])
        return (left * right).sum(dim=1)

    def forward(self, X, ax, a2x, batch):
        """Compute node embeddings and the scores for ``batch``.

        Args:
            X, ax, a2x: Feature matrices with ``input_dim`` columns and one
                row per node.
            batch: Index pairs passed on to ``decoder``.

        Returns:
            Tuple ``(embeddings, scores)``.
        """
        # Apply transformations to inputs and pass through activation function
        h0 = torch.tanh(self.theta0(X))
        h1 = torch.tanh(self.theta1(ax))
        h2 = torch.tanh(self.theta2(a2x))

        # Concatenate along the feature dimension, then combine with leaky ReLU
        combined = nn.functional.leaky_relu(
            self.combine1(torch.cat((h0, h1, h2), dim=1)), negative_slope=0.2
        )

        # Return the combined tensor and the output from the decoder
        return combined, self.decoder(combined, batch)

In [None]:
# Specify device for computation: GPU when CUDA is available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the preprocessed input features, the node mapping, and the adjacency matrix
# from pickles stored on the mounted Google Drive
input_features = load_variable("/content/drive/My Drive/input_features.p")
nodes_mapping = load_variable("/content/drive/My Drive/nodes_mapping.p")
A_tilda = load_variable("/content/drive/My Drive/A_tilda.p")

# Propagate the features through the adjacency matrix once (ax) and twice (a2x),
# then convert both results to float PyTorch tensors.
# NOTE(review): `*` is a matrix product only if A_tilda is a scipy sparse matrix or
# np.matrix; for a plain ndarray it is element-wise — confirm the pickled type.
ax = A_tilda*np.array(input_features)
a2x = A_tilda*ax
a2x = torch.tensor(a2x,dtype=torch.float)
ax = torch.tensor(ax,dtype=torch.float)

# Initialize model and load pretrained weights (loaded on CPU, then moved to `device`)
empty_model = GDRnet()
net = load_model_on_cpu(empty_model,"/content/drive/My Drive/DR_model").to(device)

  return pickle.load(open(filename,'rb'))
  model.load_state_dict(torch.load(path,map_location=torch.device('cpu')))


In [None]:
# Summarize the node mapping and input features.
# Only the size and the first few entries of the mapping are shown; printing
# every entry floods the cell output.

print("Number of nodes:", len(nodes_mapping))
print("First entries:", list(nodes_mapping.items())[:10])
print(np.shape(input_features))

{'Compound::DB06950': 0, 'Compound::DB03440': 1, 'Compound::DB01905': 2, 'Gene::11004': 3, 'Gene::11475': 4, 'Gene::290317': 5, 'Compound::DB02834': 6, 'Disease::MESH:D003635': 7, 'Disease::MESH:D011254': 8, 'Disease::MESH:D003537': 9, 'Gene::80178': 10, 'Compound::DB14014': 11, 'Gene::22174': 12, 'Gene::240888': 13, 'Gene::drugbank:BE0002832': 14, 'Disease::MESH:C537560': 15, 'Gene::5649': 16, 'Gene::27109': 17, 'Gene::63982': 18, 'Gene::56919': 19, 'Gene::2647': 20, 'Gene::10980': 21, 'Gene::26280': 22, 'Gene::169981': 23, 'Gene::253970': 24, 'Gene::3250': 25, 'Gene::53841': 26, 'Disease::MESH:C567893': 27, 'Gene::342035': 28, 'Gene::60395': 29, 'Gene::346007': 30, 'Gene::10896': 31, 'Disease::MESH:D030401': 32, 'Gene::27178': 33, 'Gene::1112': 34, 'Gene::220979': 35, 'Gene::114803': 36, 'Gene::57228': 37, 'Gene::124565': 38, 'Gene::342945': 39, 'Gene::389827': 40, 'Gene::8777': 41, 'Gene::12442': 42, 'Gene::25796': 43, 'Gene::6541': 44, 'Gene::1652': 45, 'Gene::drugbank:BE0004826': 

In [None]:
# Compute node embeddings and drug predictions for a specific disease.
# Note that Disease::MESH:D008288 is Malaria.

embeddings,drugs = predict(net,["Disease::MESH:D008288"])
len(embeddings)

42493

In [None]:
# Preview the first few (drug, score) pairs for each disease instead of
# printing every entry of every dictionary
for disease_scores in drugs:
    print(list(disease_scores.items())[:10])

[{"4-chloro-N'-[(1E)-(3,5-dibromo-2,4-dihydroxyphenyl)methylidene]benzohydrazide": 0.03680821880698204, 'N-hexadecanoylglycine': -3.0350964069366455, '2-(2-Hydroxy-5-Methoxy-Phenyl)-1h-Benzoimidazole-5-Carboxamidine': -0.3608761727809906, 'IDD552': -0.076129250228405, 'SR-9011': 0.21909406781196594, 'CR665': 0.1676982194185257, '4-methylthio-2-oxobutanoic acid': -1.0212429761886597, "Uridine-5'-diphosphate-mannose": -1.1069368124008179, 'Ciclesonide': 1.170403242111206, 'Busulfan': 1.3068338632583618, 'Acetazolamide': 0.7979269027709961, '1-[4-(AMINOSULFONYL)PHENYL]-1,6-DIHYDROPYRAZOLO[3,4-E]INDAZOLE-3-CARBOXAMIDE': -0.07181438058614731, 'Rolicyclidine': -0.2061707228422165, '2,4-Diamino-5-Methyl-6-[(3,4,5-Trimethoxy-N-Methylanilino)Methyl]Pyrido[2,3-D]Pyrimidine': -0.30941343307495117, '2,3,5,6-Tetrafluoro-4-Methoxy-Benzamide': -0.46775710582733154, 'CHF 4227': -0.43085867166519165, 'Lorpiprazole': 1.091214656829834, 'Ciprofloxacin': 2.0953474044799805, '2-Hydroxy-Tryptophan': -1.1149

In [None]:
# Malaria drug candidates ordered from highest to lowest score, capped at 30,000 entries
list(reversed(sorted(drugs[0].items(), key=lambda pair: pair[1])))[:30000]

[('Tetracycline', 2.9030215740203857),
 ('Clindamycin', 2.861628532409668),
 ('Doxycycline', 2.7223026752471924),
 ('Metronidazole', 2.70158052444458),
 ('Minocycline', 2.687775135040283),
 ('Ivermectin', 2.589362382888794),
 ('Chloroquine', 2.574556589126587),
 ('Rifapentine', 2.5660510063171387),
 ('Erythromycin', 2.538343667984009),
 ('Proguanil', 2.5356032848358154),
 ('Sulfadiazine', 2.512777805328369),
 ('Dapsone', 2.502814531326294),
 ('Clarithromycin', 2.467073678970337),
 ('Rifabutin', 2.4631707668304443),
 ('Trimethoprim', 2.445631980895996),
 ('Primaquine', 2.4132375717163086),
 ('Praziquantel', 2.4030723571777344),
 ('Demeclocycline', 2.401435375213623),
 ('Atovaquone', 2.3816237449645996),
 ('Sulfamethoxazole', 2.368584394454956),
 ('Terbinafine', 2.365410327911377),
 ('Rifaximin', 2.363856315612793),
 ('Rifampicin', 2.3388352394104004),
 ('Loperamide', 2.2780063152313232),
 ('Hydroxychloroquine', 2.2549021244049072),
 ('Telithromycin', 2.2393932342529297),
 ('Ketoconazole

In [None]:
# Import relevant packages
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
import numpy as np

# Hold out part of the samples for testing
def split_data(embeddings, labels, test_ratio=0.2, random_seed=42):
    """Split features and labels into train and test parts.

    Args:
        embeddings: Feature matrix with one row per sample.
        labels: Labels aligned with the rows of ``embeddings``.
        test_ratio: Value passed to ``train_test_split`` as ``test_size``.
        random_seed: Value passed to ``train_test_split`` as ``random_state``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(
        embeddings, labels, test_size=test_ratio, random_state=random_seed
    )
    return tuple(parts)

# Train a logistic regression model with randomized search for hyperparameter tuning
def train_logistic_regression(X_train, y_train):
    """Tune and fit a logistic regression classifier.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The best estimator found by the search (``best_estimator_``).
    """
    # Parameter grid for randomized search
    hyperparameter_grid = {
        'C': [0.1, 1, 10],
        'penalty': ['l2'],
        'solver': ['lbfgs'],
        'max_iter': [1000]
    }

    # The grid is finite, so never request more candidates than it contains;
    # asking for 50 draws from 3 combinations only produced a warning.
    n_combinations = 1
    for candidate_values in hyperparameter_grid.values():
        n_combinations *= len(candidate_values)

    # Base logistic regression model
    logistic_model = LogisticRegression(random_state=42)

    # Randomized search configuration
    random_search = RandomizedSearchCV(
        estimator=logistic_model,
        param_distributions=hyperparameter_grid,
        n_iter=min(50, n_combinations),
        cv=2,
        scoring='accuracy',
        random_state=42
    )

    # Fit the model to training data
    random_search.fit(X_train, y_train)

    # Output the best hyperparameters
    print("Best Hyperparameters:", random_search.best_params_)

    return random_search.best_estimator_

# Report held-out performance of a fitted classifier
def evaluate_model(model, X_test, y_test):
    """Print the accuracy and classification report of ``model`` on test data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test feature matrix.
        y_test: True labels for ``X_test``.
    """
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy:", accuracy)

    report = classification_report(y_test, predictions)
    print("Classification Report:\n", report)

# Turn each score dictionary into one binary label via a threshold on its best score
def generate_labels(drug_data, score_threshold=0.5):
    """Label each score dictionary by whether its highest score reaches a threshold.

    Args:
        drug_data: Iterable of dicts mapping drug names to scores.
        score_threshold: Minimum top score for a label of 1.

    Returns:
        List with one entry per dict: 1 when the dict's largest score is
        ``>= score_threshold``, otherwise 0.
    """
    return [
        1 if max(scores.values()) >= score_threshold else 0
        for scores in drug_data
    ]

# Simulated number of samples
num_samples = 250

# Extract (drug name, score) pairs from the first drug dictionary
items = list(drugs[0].items())
if len(items) < num_samples:
    raise ValueError(
        f"Need at least {num_samples} scored drugs, but only {len(items)} are available"
    )

# Assign binary labels based on scores: 1 for a positive score, otherwise 0
labels = np.array(
    [1 if score > 0 else 0 for _, score in items[:num_samples]], dtype=int
)

# Convert embeddings from PyTorch tensor to NumPy array.
# .cpu() is required when the model ran on a GPU; it is a no-op on CPU.
embeddings_numpy = embeddings.detach().cpu().numpy()

# Standardize the feature matrix.
# NOTE(review): after the transpose each row is one embedding dimension across
# all nodes, not one drug, while `labels` describes the first `num_samples`
# drugs; the row counts only agree because the embedding size equals
# num_samples. Confirm whether the rows were meant to be the embeddings of the
# labelled drugs.
# NOTE(review): the scaler is fitted before the train/test split, so test rows
# influence the scaling statistics.
scaler = StandardScaler()
embeddings_scaled = np.transpose(scaler.fit_transform(embeddings_numpy))

# Check the shapes of labels and embeddings
print("Labels shape:", np.shape(labels))
print("Embeddings shape:", np.shape(embeddings_scaled))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = split_data(embeddings_scaled, labels)

# Train the logistic regression model
trained_model = train_logistic_regression(X_train, y_train)

# Evaluate the model's performance
evaluate_model(trained_model, X_test, y_test)

(250,)
(250, 42493)




Best Hyperparameters: {'solver': 'lbfgs', 'penalty': 'l2', 'max_iter': 1000, 'C': 0.1}
Accuracy: 0.58
Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.50      0.51        22
           1       0.62      0.64      0.63        28

    accuracy                           0.58        50
   macro avg       0.57      0.57      0.57        50
weighted avg       0.58      0.58      0.58        50



In [None]:
# Print AUROC and AUPRC scores for the tuned logistic regression
from sklearn.metrics import average_precision_score, roc_auc_score

# `logistic_model` exists only inside train_logistic_regression; the fitted
# estimator available at notebook level is `trained_model`.
positive_class_proba = trained_model.predict_proba(X_test)[:, 1]
print("Logit AUROC", roc_auc_score(y_test, positive_class_proba))
print("Logit AUPRC", average_precision_score(y_test, positive_class_proba))

Logit AUROC 0.5032467532467533
Logit AUPRC 0.6313018826838195
