<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-index,-data-and-results" data-toc-modified-id="Load-index,-data-and-results-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load index, data and results</a></span><ul class="toc-item"><li><span><a href="#Id-index-map-loading" data-toc-modified-id="Id-index-map-loading-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Id-index map loading</a></span></li><li><span><a href="#Generate-/-use-polypharmacy-side-effect-id-name-map" data-toc-modified-id="Generate-/-use-polypharmacy-side-effect-id-name-map-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Generate / use polypharmacy side effect id-name map</a></span></li><li><span><a href="#Data-loading-with-selected-d-d-edge-labels" data-toc-modified-id="Data-loading-with-selected-d-d-edge-labels-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Data loading with selected d-d edge labels</a></span></li></ul></li><li><span><a href="#Performance-Comparation" data-toc-modified-id="Performance-Comparation-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Performance Comparation</a></span></li><li><span><a href="#Drug-embeddings" data-toc-modified-id="Drug-embeddings-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Drug embeddings</a></span></li><li><span><a href="#Model-Characteristics-and-Relablility" data-toc-modified-id="Model-Characteristics-and-Relablility-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Model Characteristics and Relablility</a></span><ul class="toc-item"><li><span><a href="#The-side-effects-evaluated-in-Zitnik-et-al.-(2018)" data-toc-modified-id="The-side-effects-evaluated-in-Zitnik-et-al.-(2018)-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>The side effects evaluated in Zitnik et al. (2018)</a></span></li><li><span><a href="#The-side-effects-evaluated-in-our-work" data-toc-modified-id="The-side-effects-evaluated-in-our-work-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>The side effects evaluated in our work</a></span></li></ul></li></ul></div>

In [1]:
from data.utils import load_data_torch, process_prot_edge
from torch_geometric.data import Data
from src.utils import process_edges
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import numpy as np
import csv
import torch
import os

torch.manual_seed(1111)
np.random.seed(1111)

!!! Note: make sure the scipy and torch packages have also been installed properly to run this notebook.

## Load index, data and results

### Id-index map loading

In [2]:
# drug id - index
with open('./data/index_map/drug-map.pkl', 'rb') as f:
    drug_map = pickle.load(f)
inv_drug_map = {v: k for k, v in drug_map.items()}

# combo id - index
with open('./data/index_map/combo_map.pkl', 'rb') as f:
    combo_map = pickle.load(f)
inv_combo_map = {v: k for k, v in combo_map.items()}

### Generate / use polypharmacy side effect id-name map

In [3]:
######################################################
# generate polypharmacy side effect id - name map
# combo_name_map = {}
# with open('../data/index_map/bio-decagon-combo.csv', 'r') as f:
#     reader = csv.reader(f)
#     next(reader)
#     for _, _, id, name in reader:
#         id = int(id.split('C')[-1])
#         combo_name_map[id] = name

# # save map
# with open('../data/index_map/combo-name-map.pkl', 'wb') as f:
#     pickle.dump(combo_name_map, f)

# use map
with open('./data/index_map/combo-name-map.pkl', 'rb') as f:
    combo_name_map = pickle.load(f)
inv_combo_name_map = {v: k for k, v in combo_name_map.items()}

### Data loading with selected d-d edge labels

In [4]:
# selected-drug idx - drug idx
with open('./data/decagon_et.pkl', 'rb') as f:   # the whole dataset
    et_list = pickle.load(f)
inv_et_list = {et_list[i]: i for i in range(len(et_list))}

# load training data
feed_dict = load_data_torch("./data/", et_list, mono=True)
data = Data.from_dict(feed_dict)


loading data
remove  0  isolated drugs:  []
remove finished
1097  polypharmacy side effects
data has been loaded


In [5]:
data.train_idx, data.train_et, data.train_range,data.test_idx, data.test_et, data.test_range = process_edges(data.dd_edge_index)

print(data.test_et.size())

torch.Size([924708])


## Performance Comparation

In [28]:
# model loading
models = {'DR-DF': 'dd-rgcn-dist(16-64-32-16)', 
          'DR-NN': 'dd-rgcn-nn(16-64-32-16)', 
          'PR-HMP-NN': 'pd-32-16-8-16-963', 
          'TIP': 'fm-(32-16)-(16-16-32-32-16)'}

# loading function for recorded test scores druging training
def get_test_out(model_name):
    with open('../out/'+ models[model_name] +'/test_out.pkl', 'rb') as f:
        record = pickle.load(f)
    return record


In [31]:
# get averaged auprc scores for each epoches
scores = dict()
prc_final = dict()
for model in models.keys():
    out = get_test_out(model)
    scores[model] = out
    prc_final[model] = out[99][0]

In [62]:
# print final AUPRC scores  for each models
lines = '---------------------------------------------'
print(lines)
print('|{:10s}|'.format('AUPRC scores for TIP model and its variants'))
print(lines)
formats = '|{:14s}|{:28.3f}|'
for model, scores in prc_final.items():
    print(formats.format(model, scores))
print(lines)


---------------------------------------------
|AUPRC scores for TIP model and its variants|
---------------------------------------------
|DR-DF         |                       0.948|
|DR-NN         |                       0.944|
|PR-HMP-NN     |                       0.746|
|TIP           |                       0.948|
---------------------------------------------


## Drug embeddings 


## Model Characteristics and Relablility 

### The side effects evaluated in Zitnik et al. (2018)
The side effects with the top 10 best performance and 10 best performance, according to averaged auprc scores

In [6]:
# ######################################################
# side effect name - original index reported in decagon
decagon_best_name = ["Mumps", "Carbuncle", "Coccydynia", "Tympanic membrane perfor", "Dyshidrosis", "Spondylosis", "Schizoaffective disorder", "Breast dysplasia", "Ganglion", "Uterine polyp"]
decagon_worst_name = ["Bleeding", "Body temperature increased",  "Emesis", "Renal disorder", "Leucopenia", "Diarrhea", "Icterus", "Nausea", "Itch", "Anaemia"]
decagon_best_org_id = [26780, 7078, 9193, 206504, 32633, 38019, 36337, 16034, 1258666, 156369]
decagon_worst_org_id = [19080, 15967, 42963, 22658, 23530, 11991, 22346, 27497, 33774, 2871]

# get index
decagon_best_idx = [inv_et_list[combo_map[i]] for i in decagon_best_org_id]
decagon_worst_idx = [inv_et_list[combo_map[i]] for i in decagon_worst_org_id]



### The side effects evaluated in our work
The side effects with the top 20 best and 20 worst performance, according to averaged auprc scores.

In [14]:
# ######################################################
# Evaluation
name = 'RGCN-DistMult on d-net'
lines = '-------------------------------------------------------------------------------------------------------'

with open('../out/dd-rgcn-dist/test_record.pkl', 'rb') as f:
    dist_record = pickle.load(f)
auprc = np.array(dist_record[len(dist_record)-1])[0, :]
sorted_idx = np.argsort(auprc, kind='quicksort')

print(lines)
print(' {:37s}   {:6s}| {:45s}  {:6s}'.format('The Highest AUPRC Score', '  Edge', 'The Lowest AUPRC Score', '   Edge'))
print(lines)

for i in range(20):
    print(' {:30s} {:7.4f}  {:6d}| {:38s} {:7.4f}  {:6d}'.format(
        combo_name_map[inv_combo_map[et_list[sorted_idx[-(i+1)]]]], auprc[sorted_idx[-(i+1)]], feed_dict['dd_adj_list'][-(i+1)].nnz,
        combo_name_map[inv_combo_map[et_list[sorted_idx[i]]]], auprc[sorted_idx[i]], feed_dict['dd_adj_list'][i].nnz))
print(lines)

decag_best_in_us = [962 - np.where(sorted_idx == i)[0] for i in decagon_best_idx]
decag_worst_in_us = [np.where(sorted_idx == i)[0] for i in decagon_worst_idx]

-------------------------------------------------------------------------------------------------------
 The Highest AUPRC Score                   Edge| The Lowest AUPRC Score                            Edge
-------------------------------------------------------------------------------------------------------
 cervical vertebral fracture     0.9963     516| Bleeding                                0.8308   12062
 hordeolum                       0.9942     546| agitated                                0.8447   19930
 Mumps                           0.9934     602| hypoglycaemia neonatal                  0.8467   12309
 spondylosis                     0.9931     847| Difficulty breathing                    0.8512   14192
 night cramps                    0.9920     689| thrombocytopenia                        0.8534    7126
 fibrosing alveolitis            0.9913     661| asystole                                0.8562    8621
 diaphragmatic hernia            0.9906     853| Aspartate Amino

## Information Importance Evaluation

In [6]:
# load prediction scores for different trained models on testing dataset
head = ['protein', 'drug', 'protein+drug']
print(os.listdir('qu_out/eva'))
score = []

for pkl in os.listdir('qu_out/eva'):
    with open('./qu_out/eva/{}'.format(pkl), 'rb') as f:
        score.append(pickle.load(f))


['ppm-ggm-distmult.pkl', 'tip-add.pkl', 'tip-cat.pkl', 'dist.pkl']


In [16]:
# side effect info - index and name
side_effect_idx = [inv_combo_map[i] for i in data.test_et.tolist()]
side_effect_name = [combo_name_map[i] for i in side_effect_idx]

In [22]:
# drug info - drug bank index
drug1_cid = [inv_drug_map[i] for i in data.test_idx[0].tolist()]
drug2_cid = [inv_drug_map[i] for i in data.test_idx[1].tolist()]

In [24]:
# table construction
a = pd.DataFrame({'side effect': data.test_et, 
                  'side effect index (C)': side_effect_idx,
                  'side effect name': side_effect_name,
                  'drug1': data.test_idx[0],
                  'drug1 STITCH (CID)': drug1_cid,
                  'drug2': data.test_idx[1], 
                  'drug2 STITCH (CID)': drug2_cid,
                  'protein-based prediction': score[0], 
                  'drug-based prediction': score[3], 
                  'protein&drug-based prediction': score[2]})
a

Unnamed: 0,side effect,side effect index (C),side effect name,drug1,drug1 STITCH (CID),drug2,drug2 STITCH (CID),protein-based prediction,drug-based prediction,protein&drug-based prediction
0,0,151714,hypermagnesemia,1,3345,58,853,0.500000,0.980667,0.603309
1,0,151714,hypermagnesemia,16,5391,350,4171,0.500000,0.855139,0.594710
2,0,151714,hypermagnesemia,28,3446,291,2471,0.500000,0.967403,0.636370
3,0,151714,hypermagnesemia,58,853,126,3016,0.984455,0.500000,0.636280
4,0,151714,hypermagnesemia,58,853,176,3062,0.992585,0.918021,0.573244
...,...,...,...,...,...,...,...,...,...,...
924703,1096,9952,febrile convulsion,428,5291,197,60147,0.500000,0.778182,0.542837
924704,1096,9952,febrile convulsion,204,4679,199,4594,0.500000,0.634582,0.344946
924705,1096,9952,febrile convulsion,315,3961,204,4679,0.500000,0.955419,0.368865
924706,1096,9952,febrile convulsion,246,2806,244,443871,0.500000,0.477580,0.379254


In [26]:
# save to csv file
a.to_csv('evaluation_table.csv', index=False)