In [None]:
import os

os.environ["DGLBACKEND"] = "pytorch"

import dgl
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import sqlite3

In [None]:
# Read the three tables used by this notebook from the local SQLite file.
# `drug` supplies the per-drug name/target/enzyme/pathway columns and `event`
# supplies the name1/name2 interaction pairs consumed by later cells.
conn = sqlite3.connect("event.db")
try:
    df_drug = pd.read_sql('select * from drug;', conn)
    df_event = pd.read_sql('select * from event_number;', conn)
    df_interaction = pd.read_sql('select * from event;', conn)
finally:
    # The frames are fully materialised at this point, so release the file
    # handle even if one of the reads fails.
    conn.close()

In [None]:
# Exclude a fixed list of drugs by name, then renumber the surviving rows from 0.
# NOTE(review): the second entry carries leading spaces; presumably this matches
# the raw value stored in the database -- confirm before normalising it.
drugs_to_remove = ['Fomepizole', '    Lglutamine']
keep_mask = ~df_drug['name'].isin(drugs_to_remove)
df_drug = df_drug[keep_mask].reset_index(drop=True)

In [None]:
def get_interaction_df(df, type):
    """Expand a '|'-separated column into one row per (drug, value) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'name' column and a string column named ``type`` whose
        cells hold values joined by '|'.
    type : str
        Name of the column to split (the parameter name shadows the builtin
        but is kept so existing keyword callers continue to work).

    Returns
    -------
    pandas.DataFrame
        Columns ``['drug', type]`` with a fresh 0..n-1 index; rows keep the
        order of ``df`` and, within a drug, the order of the split values.
        Rows whose ``type`` cell is missing contribute no pairs.
    """
    pairs = pd.DataFrame({'drug': df['name'], type: df[type].str.split('|')})

    # explode() emits one row per list element. A missing cell stays as a
    # single NaN row, which is dropped so it does not become a bogus entity
    # (the previous Python loop raised TypeError on such cells).
    type_df = (
        pairs
        .explode(type)
        .dropna(subset=[type])
        .reset_index(drop=True)
    )

    return type_df

In [None]:
# Take an independent copy of the two name columns: a later cell overwrites
# them with integer ids, and writing into a bare column selection of
# df_interaction triggers pandas' SettingWithCopyWarning.
drug_drug_df = df_interaction[['name1', 'name2']].copy()

# One long-format frame per relation: each row pairs a drug name with a single
# target / enzyme / pathway value taken from the '|'-separated column.
drug_target_df = get_interaction_df(df_drug, 'target')
drug_enzyme_df = get_interaction_df(df_drug, 'enzyme')
drug_pathway_df = get_interaction_df(df_drug, 'pathway')

In [None]:
# Give every drug name seen on either side of an interaction a contiguous
# integer id, numbered in order of first appearance (first column, then second).
unique_drugs = pd.concat([drug_drug_df.iloc[:, 0], drug_drug_df.iloc[:, 1]]).unique()
drug_dict = {name: i for i, name in enumerate(unique_drugs)}

# Replace names with ids on an owned copy and rebind whole columns. Assigning
# through .iloc wrote integers into the existing string-typed columns of what
# may be a slice of df_interaction, which is what pandas warns about.
drug_drug_df = drug_drug_df.copy()
for column in drug_drug_df.columns[:2]:
    drug_drug_df[column] = drug_drug_df[column].map(drug_dict)

In [None]:
def get_source_destination_nodes(df, drug_dict):
    """Convert a (drug name, entity name) frame into parallel integer id lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Column 0 holds drug names, column 1 holds the related entity names.
        The frame is not modified, so the call can safely be repeated.
    drug_dict : dict
        Mapping from drug name to integer node id.

    Returns
    -------
    tuple[list, list]
        ``(source_nodes, destination_nodes)``: drug ids looked up in
        ``drug_dict`` and entity ids numbered 0..m-1 by first appearance in
        column 1.

    Raises
    ------
    ValueError
        If column 0 contains a drug name that is absent from ``drug_dict``;
        such names would otherwise turn into NaN node ids.
    """
    drug_names = df.iloc[:, 0]
    entity_names = df.iloc[:, 1]

    source_ids = drug_names.map(drug_dict)
    unknown = drug_names[source_ids.isna()].unique()
    if len(unknown) > 0:
        raise ValueError(
            f"{len(unknown)} drug name(s) missing from drug_dict, e.g. {list(unknown[:5])}"
        )

    # Local ids for the entity side, numbered in order of first appearance.
    name_dict = {name: i for i, name in enumerate(entity_names.unique())}
    destination_ids = entity_names.map(name_dict)

    source_nodes = source_ids.tolist()
    destination_nodes = destination_ids.tolist()

    return source_nodes, destination_nodes

In [None]:
# Drug-drug edges: take the two columns of drug_drug_df as plain Python lists.
drug_drug_source_nodes, drug_drug_destination_nodes = (
    drug_drug_df[column].tolist() for column in ('name1', 'name2')
)

# Drug-entity edges: sources are looked up in drug_dict, destinations are
# numbered per relation by get_source_destination_nodes.
drug_target_source_nodes, drug_target_destination_nodes = get_source_destination_nodes(
    drug_target_df, drug_dict
)
drug_enzyme_source_nodes, drug_enzyme_destination_nodes = get_source_destination_nodes(
    drug_enzyme_df, drug_dict
)
drug_pathway_source_nodes, drug_pathway_destination_nodes = get_source_destination_nodes(
    drug_pathway_df, drug_dict
)

In [None]:
# Build the heterograph: four node types (drug, target, enzyme, pathway) and
# four relations, each of which has 'drug' as its source node type.
graph_data = {}
graph_data[('drug', 'interacts', 'drug')] = (drug_drug_source_nodes, drug_drug_destination_nodes)
graph_data[('drug', 'affects', 'target')] = (drug_target_source_nodes, drug_target_destination_nodes)
graph_data[('drug', 'regulates', 'enzyme')] = (drug_enzyme_source_nodes, drug_enzyme_destination_nodes)
graph_data[('drug', 'alters', 'pathway')] = (drug_pathway_source_nodes, drug_pathway_destination_nodes)

g = dgl.heterograph(graph_data)
g

In [None]:
# In-degree of every destination node, computed separately for each relation.
in_degrees_drug_drug = g.in_degrees(etype=('drug', 'interacts', 'drug'))
in_degrees_drug_target = g.in_degrees(etype=('drug', 'affects', 'target'))
in_degrees_drug_enzyme = g.in_degrees(etype=('drug', 'regulates', 'enzyme'))
in_degrees_drug_pathway = g.in_degrees(etype=('drug', 'alters', 'pathway'))

# Use the in-degree as a single-column node feature: shape (num_nodes, 1).
degree_by_ntype = {
    'drug': in_degrees_drug_drug,
    'target': in_degrees_drug_target,
    'enzyme': in_degrees_drug_enzyme,
    'pathway': in_degrees_drug_pathway,
}
for ntype, degrees in degree_by_ntype.items():
    g.nodes[ntype].data['feat'] = degrees.unsqueeze(1)

In [None]:
from sklearn.model_selection import train_test_split

edge_types = g.canonical_etypes

# Each entry is kept as a one-element list of (src, dst) tensor pairs.
train_edges = {etype: [] for etype in edge_types}
test_edges = {etype: [] for etype in edge_types}

for etype in edge_types:
    u, v = g.edges(etype=etype)
    eids = np.arange(g.num_edges(etype=etype))
    test_size = int(len(eids) * 0.3)
    train_size = len(eids) - test_size

    # train_test_split shuffles with the fixed random_state on its own. An
    # extra unseeded np.random.permutation beforehand changed the split on
    # every run, so the ids are passed in their natural order.
    train_eids, test_eids = train_test_split(eids, train_size=train_size, test_size=test_size, random_state=42)

    train_edges[etype].append((u[train_eids], v[train_eids]))
    test_edges[etype].append((u[test_eids], v[test_eids]))


def _build_split_graph(edges_by_type):
    """Build a heterograph over all nodes of ``g`` containing only the given edges."""
    split_graph = dgl.heterograph(
        {
            etype: (
                np.concatenate([src for src, _ in edges_by_type[etype]]),
                np.concatenate([dst for _, dst in edges_by_type[etype]]),
            )
            for etype in edge_types
        },
        # Keep the full node sets so node ids and features line up with g.
        num_nodes_dict={ntype: g.num_nodes(ntype) for ntype in g.ntypes},
    )
    # NOTE(review): 'feat' holds in-degrees computed on the full graph g, so
    # these features also count edges that end up in the test split -- confirm
    # this is intended.
    for ntype in g.ntypes:
        split_graph.nodes[ntype].data['feat'] = g.nodes[ntype].data['feat']
    return split_graph


train_g = _build_split_graph(train_edges)
test_g = _build_split_graph(test_edges)

In [None]:
import dgl.function as fn

class HeteroDotProductPredictor(nn.Module):
    """Scores the edges of one relation by the dot product of their endpoint embeddings."""

    def forward(self, graph, h, etype):
        """Return the 'score' edge data computed for ``etype`` on ``graph``.

        ``h`` is assigned to ``graph.ndata['h']`` and is read on both the
        source and the destination side of every edge.
        """
        # Work inside local_scope so the temporary 'h' and 'score' fields are
        # written within the scope rather than set directly on the graph.
        with graph.local_scope():
            graph.ndata['h'] = h
            graph.apply_edges(fn.u_dot_v('h', 'h', 'score'), etype=etype)
            edge_scores = graph.edges[etype].data['score']
        return edge_scores

In [None]:
def construct_negative_graph(graph, k, etype):
    """Build a graph of randomly corrupted edges for the relation ``etype``.

    Every edge of ``etype`` in ``graph`` keeps its source node ``k`` times and
    is paired with a destination drawn uniformly at random from all nodes of
    the destination type. The result has the same node counts as ``graph``
    and contains only the ``etype`` relation.
    """
    _, _, dst_type = etype
    pos_src, _ = graph.edges(etype=etype)

    corrupted_src = pos_src.repeat_interleave(k)
    num_dst_nodes = graph.num_nodes(dst_type)
    random_dst = torch.randint(0, num_dst_nodes, (len(pos_src) * k,))

    node_counts = {ntype: graph.num_nodes(ntype) for ntype in graph.ntypes}
    return dgl.heterograph({etype: (corrupted_src, random_dst)}, num_nodes_dict=node_counts)

In [None]:
import dgl.nn as dglnn

# Define a Heterograph Conv model
class RGCN(nn.Module):
    """Two-layer relational GCN with one GraphConv per relation in each layer.

    Parameters
    ----------
    in_feats : int
        Size of the input node features.
    hid_feats : int
        Size of the hidden representation produced by the first layer.
    out_feats : int
        Size of the output embedding produced by the second layer.
    rel_names : iterable
        Relation names; each gets its own GraphConv in both layers.
    """

    def __init__(self, in_feats, hid_feats, out_feats, rel_names):
        super().__init__()

        # allow_zero_in_degree=True: the train/test graphs keep every node but
        # only a subset of the edges, so some nodes receive no message for a
        # given relation. GraphConv raises on such nodes unless this is set.
        self.conv1 = dglnn.HeteroGraphConv({
            rel: dglnn.GraphConv(in_feats, hid_feats, allow_zero_in_degree=True)
            for rel in rel_names}, aggregate='mean')
        self.conv2 = dglnn.HeteroGraphConv({
            rel: dglnn.GraphConv(hid_feats, out_feats, allow_zero_in_degree=True)
            for rel in rel_names}, aggregate='mean')

    def forward(self, graph, inputs):
        """Run both layers; ``inputs`` and the result are dicts keyed by node type."""
        # inputs are features of nodes
        h = self.conv1(graph, inputs)
        h = {k: F.relu(v) for k, v in h.items()}
        h = self.conv2(graph, h)
        return h

In [None]:
class Model(nn.Module):
    """Link-prediction model: an RGCN encoder followed by a dot-product edge scorer."""

    def __init__(self, in_features, hidden_features, out_features, rel_names):
        super().__init__()
        # Attribute names are part of the module's state_dict keys; keep them.
        self.sage = RGCN(in_features, hidden_features, out_features, rel_names)
        self.pred = HeteroDotProductPredictor()

    def forward(self, g, neg_g, x, etype):
        """Return ``(positive_scores, negative_scores)`` for relation ``etype``.

        Embeddings are computed once on ``g`` from the features ``x`` and are
        then used to score the ``etype`` edges of both ``g`` and ``neg_g``.
        """
        embeddings = self.sage(g, x)
        positive_scores = self.pred(g, embeddings, etype)
        negative_scores = self.pred(neg_g, embeddings, etype)
        return positive_scores, negative_scores

In [None]:
def compute_loss(pos_score, neg_score):
    """Mean hinge (margin) loss with margin 1 between positive and negative scores.

    ``neg_score`` is reshaped to one row per positive edge, so its length must
    be a multiple of ``pos_score.shape[0]``.
    """
    num_pos = pos_score.shape[0]
    negatives_per_edge = neg_score.view(num_pos, -1)
    # Penalise every negative that scores within 1 of its positive edge.
    hinge = torch.clamp(1 - pos_score + negatives_per_edge, min=0)
    return hinge.mean()

# Number of negative edges sampled for each positive edge.
k = 5
model = Model(g.nodes['drug'].data['feat'].shape[1], 20, 5, train_g.etypes)

# Input features for every node type, taken from the training graph.
drug_feats = train_g.nodes['drug'].data['feat']
target_feats = train_g.nodes['target'].data['feat']
enzyme_feats = train_g.nodes['enzyme'].data['feat']
pathway_feats = train_g.nodes['pathway'].data['feat']
node_features = dict(
    drug=drug_feats,
    target=target_feats,
    enzyme=enzyme_feats,
    pathway=pathway_feats,
)

opt = torch.optim.Adam(model.parameters())

# Only the drug-drug relation is used as the supervision signal.
ddi_etype = ('drug', 'interacts', 'drug')

for epoch in range(100):
    # Negatives are re-drawn at the start of every epoch.
    train_neg_g = construct_negative_graph(train_g, k, ddi_etype)
    pos_score, neg_score = model(train_g, train_neg_g, node_features, ddi_etype)
    loss = compute_loss(pos_score, neg_score)

    opt.zero_grad()
    loss.backward()
    opt.step()

    print(loss.item())

In [None]:
from sklearn.metrics import roc_curve, auc, precision_recall_curve
import matplotlib.pyplot as plt

# Score the held-out drug-drug edges and freshly sampled negatives without
# tracking gradients.
# NOTE(review): the embeddings are produced by message passing over test_g,
# i.e. over the same edges that are being scored -- confirm this is intended.
with torch.no_grad():
    test_neg_g = construct_negative_graph(test_g, k, ('drug', 'interacts', 'drug'))
    pos_score, neg_score = model(test_g, test_neg_g, node_features, ('drug', 'interacts', 'drug'))

pos_score = pos_score.numpy()
neg_score = neg_score.numpy()

pos_labels = np.ones(len(pos_score))
neg_labels = np.zeros(len(neg_score))

# Flatten to 1-D so labels and scores have matching shapes for the metrics.
all_scores = np.concatenate([pos_score, neg_score]).ravel()
all_labels = np.concatenate([pos_labels, neg_labels])

fpr, tpr, _ = roc_curve(all_labels, all_scores)

roc_auc = auc(fpr, tpr)
print("AUC:", roc_auc)

precision, recall, _ = precision_recall_curve(all_labels, all_scores)

# F1 is undefined where precision + recall == 0; report 0 there rather than
# dividing 0 by 0, which produced NaN and a RuntimeWarning.
pr_sum = precision + recall
f1 = np.divide(2 * (precision * recall), pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0)

plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.figtext(0.5, -0.02, 'AUC = %0.2f' % roc_auc, wrap=True, horizontalalignment='center', fontsize=12)
plt.show()

# Plot Precision vs. Recall
plt.figure()
plt.plot(recall, precision, color='blue', lw=lw, label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision vs. Recall')
plt.legend(loc="lower left")
plt.show()

# Plot F1 Score vs. Recall
plt.figure()
plt.plot(recall, f1, color='green', lw=lw, linestyle='--', label='F1 Score')
plt.xlabel('Recall')
plt.ylabel('F1 Score')
plt.title('F1 Score vs. Recall')
plt.legend(loc="lower left")
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

def plot_confusion_matrix(y_true, y_pred):
    """Draw a 2x2 confusion-matrix heatmap for binary labels (0 = negative, 1 = positive).

    Parameters
    ----------
    y_true, y_pred : sequence of int
        Ground-truth and predicted labels, each 0 or 1, of equal length.
    """
    # Pin the label order so the matrix is always 2x2 and lines up with the
    # hard-coded tick labels, even when one class is absent from the inputs.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'])
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title('Confusion Matrix')
    plt.show()

# Binarise the scores at a fixed cut-off and plot the resulting confusion matrix.
# NOTE(review): the scores are raw dot products, not probabilities -- confirm
# that 0.5 is a meaningful decision threshold for them.
threshold = 0.5
y_true = [1] * len(pos_score) + [0] * len(neg_score)

stacked_scores = np.concatenate([np.ravel(pos_score), np.ravel(neg_score)])
y_pred = (stacked_scores >= threshold).astype(int).tolist()

plot_confusion_matrix(y_true, y_pred)
