### Use Embeddings extracted from SSP for Drugs

#### Import libraries

In [1]:
!pip install progressbar2

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
import pickle
import numpy as np
import torch
from progressbar import ProgressBar

  from pandas.core import (


#### Load the RxNorm-to-DrugBank identifier mapping

In [3]:
# Load the pickled mapping whose keys are later iterated as RxNorm ids and whose
# values are matched against the 'DB' column of the SSP table.
# NOTE(review): pickle.load executes arbitrary code from the file; only use it on
# a trusted, locally produced ./Data/rx_norm_to_db.pkl.
rx_norm_pickle = open('./Data/rx_norm_to_db.pkl', mode='rb')
with rx_norm_pickle as file:
    rx_norm_to_db = pickle.load(file)

#### Load the embeddings file

In [5]:
# Read the DeepDDI drug-similarity table into a DataFrame and preview its first
# five rows. The 'DB' column is what the extraction loop further down matches on.
ssp_embeddings = pd.read_csv(
    filepath_or_buffer="./Data/Embeddings/DeepDDI-drug_similarity.csv",
)
ssp_embeddings.head(n=5)

Unnamed: 0,DB,DB00006,DB00014,DB00027,DB00035,DB00050,DB00080,DB00091,DB00093,DB00104,...,DB13874,DB13878,DB13879,DB13882,DB13908,DB13909,DB13910,DB13911,DB13925,DB13928
0,DB00006,1.0,0.630568,0.694057,0.502846,0.691275,0.600644,0.479255,0.489524,0.436847,...,0.154394,0.387755,0.31134,0.068602,0.164557,0.073107,0.307863,0.069221,0.522109,0.588346
1,DB00014,0.630568,1.0,0.604824,0.642393,0.793826,0.637409,0.473214,0.624837,0.596354,...,0.254937,0.476671,0.408759,0.122622,0.253465,0.108108,0.420712,0.114504,0.632997,0.381986
2,DB00027,0.694057,0.604824,1.0,0.407295,0.679111,0.558298,0.547576,0.423194,0.456389,...,0.170323,0.326409,0.241417,0.075253,0.163209,0.057225,0.325359,0.080863,0.481515,0.570597
3,DB00035,0.502846,0.642393,0.407295,1.0,0.583333,0.591224,0.449689,0.931751,0.723781,...,0.270386,0.521368,0.491582,0.136126,0.294686,0.117949,0.345351,0.12933,0.6975,0.255708
4,DB00050,0.691275,0.793826,0.679111,0.583333,1.0,0.60757,0.445387,0.559113,0.554601,...,0.231788,0.42619,0.34153,0.126923,0.231884,0.094697,0.403008,0.115587,0.594883,0.447619


#### Get each drug's embedding from DeepDDI's Structural Similarity Profile (SSP)

In [6]:
# Build `embeddings`: for every key of `rx_norm_to_db`, take the first row of
# `ssp_embeddings` whose 'DB' value equals the mapped id, drop that row's first
# column, and store the remainder as a float64 torch tensor.
embeddings = {}

# Resolve each 'DB' value to the position of its first row in a single pass,
# instead of evaluating a boolean mask over the whole frame once per drug.
first_row_of = {}
for row_position, ssp_db_id in enumerate(ssp_embeddings['DB']):
    first_row_of.setdefault(ssp_db_id, row_position)

# max(1, ...) keeps the modulo below valid when there are fewer than 100 drugs
# (the plain integer division would yield 0 and `i % 0` raises ZeroDivisionError).
step = max(1, len(rx_norm_to_db) // 100)
bar = ProgressBar(max_value=100)
for i, (rx_norm_id, db_id) in enumerate(rx_norm_to_db.items()):
    if i % step == 0:
        # The number of ticks is ceil(n / step), which can exceed 100; clamp so
        # the bar is never asked to show a value above its max_value.
        bar.update(min(i // step, 100))
    if db_id not in first_row_of:
        # Fail with the offending ids rather than an opaque positional IndexError.
        raise KeyError(
            f"No row with DB == {db_id!r} in ssp_embeddings (RxNorm id {rx_norm_id!r})")
    # `[1:]` skips the first column exactly as before; presumably that is the
    # non-numeric 'DB' identifier column -- confirm against the CSV layout.
    profile = ssp_embeddings.iloc[first_row_of[db_id]][1:].values
    embeddings[rx_norm_id] = torch.from_numpy(np.array(profile, dtype=np.float64))
bar.finish()

100% (100 of 100) |######################| Elapsed Time: 0:00:02 Time:  0:00:02


In [8]:
# Convert every per-drug tensor to a NumPy array, following the key order of
# rx_norm_to_db, and combine the results with np.array.
# NOTE(review): the `768` suffix is not computed here; the row width is whatever
# the SSP vectors stored in `embeddings` have.
embeddings768 = np.array(
    [embeddings[rx_norm_id].detach().numpy() for rx_norm_id in rx_norm_to_db.keys()]
)

#### Reduce the embeddings dimensionality with PCA

In [9]:
import numpy as np
from sklearn.decomposition import PCA

# Number of principal components to keep; also reused in the output file name
# when the reduced embeddings are saved.
size = 128

# Fit PCA on the stacked SSP vectors and project them in one call.
pca = PCA(svd_solver='full', n_components=size)
embeddings_pca = pca.fit_transform(embeddings768)

# Report how many components were kept and the summed explained-variance ratio
# (the second label says "Covariance", but the value is that sum).
components_kept = pca.n_components_
explained_ratio_total = pca.explained_variance_ratio_.sum()
print('Number of components:', components_kept)
print('Covariance covered:', explained_ratio_total)

Number of components: 128
Covariance covered: 0.9959271248546943


In [10]:
# Pair each key of `embeddings` with the PCA-reduced row at the same position:
# the k-th key receives embeddings_pca[k].
embeddings_size_pca = {
    rx_norm_id: embeddings_pca[row]
    for row, rx_norm_id in enumerate(embeddings.keys())
}

In [11]:
# Persist the {RxNorm id -> reduced vector} dictionary; the component count
# (`size`) is embedded in the file name. The written path is echoed afterwards.
output_path = ''.join(('./Data/TWOSIDES-ssp_embeddings_', str(size), '_pca.pkl'))
with open(output_path, mode='wb') as file:
    pickle.dump(embeddings_size_pca, file)
    print(file.name)

./Data/TWOSIDES-ssp_embeddings_128_pca.pkl
