### Classification Network - Second Level - Preparation

In [1]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.losses import BinaryCrossentropy, CategoricalCrossentropy, \
CategoricalHinge, BinaryFocalCrossentropy,\
MeanSquaredError, LogCosh, CosineSimilarity, Huber, MeanSquaredLogarithmicError, MeanAbsoluteError

import pandas as pd
import pickle
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt

  from pandas.core import (


#### Load the dataset

In [2]:
seen = pd.read_csv('./Data/C56_71_TWOSIDES_one_hot_train.csv')
unseen = pd.read_csv('./Data/C56_71_TWOSIDES_one_hot_test.csv')
selected_classes = [
 'class_00', 'class_01', 'class_02', 'class_03', 'class_04', 'class_05', 'class_06', 'class_07', 'class_08', 'class_09',
 'class_10', 'class_11', 'class_12', 'class_13', 'class_14', 'class_15', 'class_16', 'class_17',             'class_19',
 'class_20', 'class_21', 'class_22', 'class_23', 'class_24', 'class_25', 'class_26', 'class_27', 'class_28', 'class_29',
 'class_30', 'class_31', 'class_32', 'class_33', 'class_34', 'class_35', 'class_36', 'class_37', 'class_38', 'class_39',
 'class_40', 'class_41', 'class_42', 'class_43', 'class_44',             'class_46', 'class_47', 'class_48', 'class_49',
 'class_50', 'class_51', 'class_52', 'class_53', 'class_54', 'class_55'
]
unseen_classes = ['class_00', 'class_18', 'class_45']
class_columns = selected_classes
unseen_columns = unseen_classes


In [3]:
print('Seen pairs:', seen.shape[0])
for column in class_columns:
    print('Has/Doesn\'t have', column, seen[seen[column] == 1.0].shape[0], '/', (seen[seen[column] == 0.0].shape[0]-100000))


Seen pairs: 155137
Has/Doesn't have class_00 100000 / -44863
Has/Doesn't have class_01 46981 / 8156
Has/Doesn't have class_02 50429 / 4708
Has/Doesn't have class_03 13274 / 41863
Has/Doesn't have class_04 53738 / 1399
Has/Doesn't have class_05 19342 / 35795
Has/Doesn't have class_06 54162 / 975
Has/Doesn't have class_07 28799 / 26338
Has/Doesn't have class_08 29005 / 26132
Has/Doesn't have class_09 31261 / 23876
Has/Doesn't have class_10 54575 / 562
Has/Doesn't have class_11 51810 / 3327
Has/Doesn't have class_12 47356 / 7781
Has/Doesn't have class_13 52445 / 2692
Has/Doesn't have class_14 20528 / 34609
Has/Doesn't have class_15 52627 / 2510
Has/Doesn't have class_16 45863 / 9274
Has/Doesn't have class_17 40630 / 14507
Has/Doesn't have class_19 17600 / 37537
Has/Doesn't have class_20 53792 / 1345
Has/Doesn't have class_21 44431 / 10706
Has/Doesn't have class_22 13823 / 41314
Has/Doesn't have class_23 52057 / 3080
Has/Doesn't have class_24 42749 / 12388
Has/Doesn't have class_25 48962 /

In [4]:
print('Unseen pairs:', unseen.shape[0])

Unseen pairs: 103560


#### Load the Embeddings

In [5]:
with open('./Data/Embeddings/biobert_embeddings_128_pca.pkl', 'rb') as file:
    biobert = pickle.load(file)    
with open('./Data/Embeddings/chemberta_embeddings_128_pca.pkl', 'rb') as file:
    chemberta = pickle.load(file)    
with open('./Data/Embeddings/kg_bio2rdf_embeddings_128_pca.pkl', 'rb') as file:
    bio2rdf = pickle.load(file)    
with open('./Data/Embeddings/ssp_embeddings_128_pca.pkl', 'rb') as file:
    ssp = pickle.load(file)    

drugs = set(np.concatenate([
    seen['drug1'].unique(), seen['drug2'].unique(), unseen['drug1'].unique(), unseen['drug2'].unique()
]))
embeddings = {}
for rx_norm_id in list(drugs):
    embeddings[rx_norm_id] = np.concatenate([
        biobert[rx_norm_id], chemberta[rx_norm_id], bio2rdf[rx_norm_id], ssp[rx_norm_id]
    ])

seen['embedded'] = seen.apply(lambda x: pd.Series(
    [np.concatenate([embeddings[x['drug1']], embeddings[x['drug2']]])], 
    index=['embedded']), axis=1)
print('Seen shape:', seen.shape)
unseen['embedded'] = unseen.apply(lambda x: pd.Series(
    [np.concatenate([embeddings[x['drug1']], embeddings[x['drug2']]])], 
    index=['embedded']), axis=1)
print('Unseen shape:', unseen.shape)
seen.head()

Seen shape: (155137, 59)
Unseen shape: (103560, 6)


Unnamed: 0,drug1,drug2,class_00,class_01,class_02,class_03,class_04,class_05,class_06,class_07,...,class_47,class_48,class_49,class_50,class_51,class_52,class_53,class_54,class_55,embedded
0,55681,1243041,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[2.168276786804199, 1.0936528444290161, 0.9610..."
1,134547,6313,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,"[2.232297420501709, 0.9437393546104431, 0.0993..."
2,4094,47858,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[-0.5903611183166504, -1.876413106918335, 1.70..."
3,8627,3920,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[-0.37425464391708374, 1.5704320669174194, -0...."
4,797195,41493,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,"[3.0385594367980957, 0.587708055973053, 0.8335..."


#### Load the First Level model and get values

In [19]:
size = len(seen['embedded'][0])
x_seen = np.concatenate(seen['embedded'].values).reshape(-1, size)
x_unseen = np.concatenate(unseen['embedded'].values).reshape(-1, size)
filename = 'TWOSIDES_00.pkl'
with open(filename, 'rb') as file:
    model = pickle.load(file)
y_seen = model.predict(x_seen)
y_unseen = model.predict(x_unseen)

[1m4849/4849[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 4ms/step
[1m3237/3237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4ms/step


#### Store values

In [7]:
y_seen = pd.DataFrame(y_seen)
for column in class_columns:
    y_seen[column] = seen[column]
y_seen.to_csv('./Data/C56_71_TWOSIDES_seen.csv', index=False)
y_unseen = pd.DataFrame(y_unseen)
for column in unseen_columns:
    y_unseen[column] = unseen[column]
y_unseen.to_csv('./Data/C56_71_TWOSIDES_unseen.csv', index=False)