In [150]:
import pandas as pd
import pickle
import requests
import numpy as np
import sys
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import random
from scipy.spatial.distance import cosine
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Concatenate, Input, concatenate, Dropout
from keras.optimizers import Adagrad
from keras import regularizers

In [2]:
# NOTE(review): pickle.load can execute arbitrary code while deserialising; only load
# this file if it comes from a trusted source.
# The context manager closes the file handle as soon as the object is loaded.
with open("../data/all_perturbagen_class.pkl", "rb") as pickle_file:
    perturbagen_class = pickle.load(pickle_file)
# Keys of the loaded mapping, materialised as a list.
perturbagens = list(perturbagen_class.keys())

In [3]:
# Load the expression table from CSV; the following cells read its 'signature' and
# 'atc' columns.
df = pd.read_csv("../data/full_geneexp_phase2_1004.csv")
def getcellline(x):
    """Return the second underscore-delimited field of the string ``x``.

    The value is used to fill the 'celline' column, so this field is presumably the
    cell-line name -- confirm against the signature id format.

    Raises IndexError when ``x`` contains no underscore.
    """
    fields = x.split("_")
    return fields[1]
# Derive one 'celline' value per row from that row's 'signature' string.
df['celline'] = [getcellline(signature) for signature in df['signature']]

In [4]:
# First character of every 'atc' value becomes the level-one label.
df["atc_level_one"] = [atc_code[0] for atc_code in df["atc"]]

# Keep four metadata columns followed by the positional column slice 7:985
# (at most 978 columns; the networks defined later take 978-wide inputs).
metadata_columns = ['target', 'atc', 'celline', 'atc_level_one']
imp_columns = metadata_columns + list(df.columns[7:985])
df = df[imp_columns]

In [5]:
# Preview the first rows of the trimmed frame.
df.head()

Unnamed: 0,target,atc,celline,atc_level_one,780,7849,6193,23,9552,387,...,54681,11000,6915,6253,7264,5467,2767,23038,57048,79716
0,BRD-K70330367,N04BB01,YAPC,N,-1.153825,-0.050292,-0.517787,0.244794,-0.39911,0.317308,...,0.440725,0.115513,-0.282635,0.666874,-0.31993,-0.131123,-0.213662,-1.007887,-0.18283,0.524592
1,BRD-K70330367,N04BB01,YAPC,N,-0.150379,0.557618,-0.106715,-0.680913,-0.356638,0.821385,...,-0.017656,2.434079,-0.580103,-0.610206,0.188383,-0.309202,0.056737,-0.003205,-0.434692,-0.177766
2,BRD-K70330367,N04BB01,HA1E,N,0.03845,0.620755,-0.324466,0.13699,0.14121,-0.112033,...,0.224989,0.076725,-0.925917,0.711872,-0.341481,0.437485,-0.052817,-0.376699,-0.047295,-0.2157
3,BRD-K70330367,N04BB01,PC3,N,2.36355,-0.06165,0.69155,0.52495,-0.593,0.10915,...,0.21265,0.1141,-0.30755,-1.15695,0.50825,-0.2302,-0.314,-1.20845,-0.6305,-0.44765
4,BRD-K70330367,N04BB01,PC3,N,-0.5128,-0.44235,0.6974,0.10815,-0.55235,0.93815,...,0.3585,-0.6418,-0.60315,-0.79245,0.3848,-0.458,-0.0748,-0.19675,-0.66975,0.07485


In [18]:
# Sorted distinct level-one labels present in the frame.
atc_level_vals = np.unique(df["atc_level_one"])

# Names of the seven most frequent 'celline' values, most frequent first.
celline_counts = Counter(df["celline"])
cell_lines = [name for name, _count in celline_counts.most_common(7)]

### Dataset Prep

In [19]:
# Restrict to the selected cell lines. The explicit .copy() makes the filtered frame
# independent of its parent so the column assignment below does not trigger
# pandas' SettingWithCopyWarning.
df = df[df.celline.isin(cell_lines)].copy()

# Replace the level-one labels with integer class codes.
le = LabelEncoder()
le.fit(df['atc_level_one'])

df['atc_level_one'] = le.transform(df['atc_level_one'])

# Split on distinct 'target' values rather than rows, so all rows of one target land on
# the same side. A fixed seed keeps the split (and every accuracy reported later)
# reproducible across kernel restarts.
SPLIT_SEED = 0
pert_train, pert_test = train_test_split(
    np.unique(df.target), test_size=0.3, random_state=SPLIT_SEED)

df_train = df[df.target.isin(pert_train)]
df_test = df[df.target.isin(pert_test)]

# Per-cell-line views of the train and test rows, keyed by cell-line name.
df_cell_line_train = {
    cell_line: df_train[df_train.celline == cell_line] for cell_line in cell_lines
}
df_cell_line_test = {
    cell_line: df_test[df_test.celline == cell_line] for cell_line in cell_lines
}

In [53]:
oe = OneHotEncoder()
# Fit on the integer codes of every level-one label, shaped as one feature column.
encoded_levels = le.transform(atc_level_vals)
oe.fit(encoded_levels.reshape(-1, 1))
# Display the one-hot row produced for class code 0.
oe.transform(np.array([[0]])).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

### Base ANN

In [80]:
class network:
    """Single-hidden-layer softmax classifier built on a Keras ``Sequential`` model.

    Parameters
    ----------
    input_dim : int, default 978
        Width of the input feature vector.
    n_classes : int, default 14
        Number of softmax output units.
    hidden_units : int, default 25
        Width of the SELU hidden layer.

    The compiled Keras model is exposed as ``self.model``.
    """

    def __init__(self, input_dim=978, n_classes=14, hidden_units=25):
        self.model = Sequential()
        self.model.add(Dense(hidden_units, activation='selu', input_dim=input_dim))
        self.model.add(Dense(n_classes, activation='softmax'))
        self.model.compile(optimizer='adagrad',
                           loss='categorical_crossentropy',
                           metrics=['accuracy'])

    def predict(self, X):
        """Return the wrapped model's ``predict`` output for ``X``."""
        return self.model.predict(X)

    def fit(self, X, y, epochs=20):
        """Train the wrapped model on ``X``/``y`` for ``epochs`` epochs.

        Returns whatever the wrapped model's ``fit`` returns (the training history
        for a Keras model), so callers can inspect loss curves.
        """
        return self.model.fit(X, y, epochs=epochs)

In [81]:
# Train one `network` per cell line and print its row-level test accuracy.
network_models = {}

for cell_line in cell_lines:
    train_frame = df_cell_line_train[cell_line]
    test_frame = df_cell_line_test[cell_line]

    # Features are every column from position 4 onwards; targets are one-hot class codes.
    X = train_frame[df.columns[4:]]
    train_codes = np.array(train_frame['atc_level_one']).reshape(-1, 1)
    y = oe.transform(train_codes).toarray()
    print(y.shape)

    network_models[cell_line] = network()
    network_models[cell_line].fit(X, y)

    y_pred = network_models[cell_line].predict(test_frame[df.columns[4:]])
    # One-hot test targets; the accuracy below compares integer codes and does not use them.
    test_codes = np.array(test_frame['atc_level_one']).reshape(-1, 1)
    y_true = oe.transform(test_codes).toarray()
    predicted_codes = np.argmax(y_pred, axis=1)
    print(accuracy_score(test_frame['atc_level_one'], predicted_codes))


(4614, 14)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
0.21924603174603174
(4548, 14)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
0.1942043721403152
(4534, 14)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
0.19553072625698323
(4498, 14)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/

Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
0.20113753877973112
(4374, 14)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
0.14921171171171171
(4374, 14)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
0.16176470588235295


In [83]:
# Majority-vote one label per test perturbagen from the per-cell-line `network` models.
#   pred           : most common label over all of the perturbagen's rows
#   pred_cell_line : most common of the per-cell-line majority labels
# Ties are resolved by Counter.most_common.
feature_cols = df.columns[4:]

pred_cell_line = []
pred = []
for pert in pert_test:
    predictions = []
    all_predictions = []
    for cell_line in cell_lines:
        cell_line_frame = df_cell_line_test[cell_line]
        mini_df = cell_line_frame[cell_line_frame['target'] == pert]
        if mini_df.shape[0] == 0:
            continue
        # One batched predict() per (perturbagen, cell line) instead of one call per
        # row; row order is unchanged, so the votes are collected in the same order.
        probabilities = network_models[cell_line].predict(mini_df[feature_cols])
        answers = list(np.argmax(probabilities, axis=1))
        all_predictions.extend(answers)
        predictions.append(Counter(answers).most_common(1)[0][0])
    pred.append(Counter(all_predictions).most_common(1)[0][0])
    pred_cell_line.append(Counter(predictions).most_common(1)[0][0])

In [84]:
# True class code per test perturbagen: taken from the first of its rows in df_test.
y_true = []
for pert in pert_test:
    pert_rows = df_test[df_test['target'] == pert]
    y_true.append(np.array(pert_rows['atc_level_one'])[0])
# Perturbagen-level accuracy of the all-rows majority vote.
accuracy_score(y_true, pred)

0.3021276595744681

### DenseNet

In [152]:
class denseNetwork:
    """Multi-input Keras classifier that re-injects the raw features at every layer.

    The model starts from one ``Input`` and then, ``d - 1`` times, applies a
    regularised SELU ``Dense(k)`` + ``Dropout`` to the running tensor and concatenates
    the result with a fresh ``Input``. A softmax ``Dense`` layer sits on top. The model
    therefore has ``d`` inputs; ``fit`` and ``predict`` feed the same ``X`` to each.

    Parameters
    ----------
    d : int, default 6
        Number of model inputs (``d - 1`` hidden blocks). Must be at least 1.
    k : int, default 40
        Width of each hidden Dense layer.
    input_dim : int, default 978
        Width of every Input.
    n_classes : int, default 14
        Number of softmax output units.
    dropout_rate : float, default 0.9
        Passed to ``Dropout(rate=...)``. Keras documents ``rate`` as the fraction of
        units to drop. NOTE(review): 0.9 discards 90% of activations while training;
        confirm a keep-probability of 0.9 was not intended.

    Raises
    ------
    ValueError
        If ``d`` is smaller than 1.
    """

    def __init__(self, d=6, k=40, input_dim=978, n_classes=14, dropout_rate=0.9):
        # The model always has at least one Input, while fit/predict feed exactly d
        # copies of X; reject d < 1 up front instead of failing later inside Keras.
        if d < 1:
            raise ValueError("d must be at least 1, got {!r}".format(d))
        self.d = d
        self.k = k

        _input = Input(shape=(input_dim, ))
        Inputs = [_input]
        for _ in range(d-1):
            _dense = Dense(k, activation='selu', kernel_regularizer=regularizers.l2(0.01),
                activity_regularizer=regularizers.l1(0.01))(_input)
            _dense = Dropout(rate=dropout_rate)(_dense)
            # Fresh raw-feature input, concatenated onto the hidden representation.
            _input = Input(shape=(input_dim, ))
            Inputs.append(_input)
            _input = concatenate([_dense, _input])

        merge_output = Dense(n_classes, activation='softmax')(_input)

        self.model = Model(inputs=Inputs, outputs=merge_output)
        ada_grad = Adagrad(lr=0.1, epsilon=1e-08, decay=0.0)
        self.model.compile(optimizer=ada_grad, loss='categorical_crossentropy',
                       metrics=['accuracy'])

    def predict(self, X):
        """Return the model's predictions, feeding ``X`` to each of the ``d`` inputs."""
        return self.model.predict([X for _ in range(self.d)])

    def fit(self, X, y, epochs=20):
        """Train for ``epochs`` epochs, feeding ``X`` to each of the ``d`` inputs.

        Returns whatever the wrapped model's ``fit`` returns (the training history
        for a Keras model).
        """
        return self.model.fit([X for _ in range(self.d)], y, epochs=epochs)

In [154]:
# Train one `denseNetwork` (d=3, k=20) per cell line and print its row-level test accuracy.
denseNetwork_models = {}

for cell_line in cell_lines:
    train_frame = df_cell_line_train[cell_line]
    test_frame = df_cell_line_test[cell_line]

    # Features are every column from position 4 onwards; targets are one-hot class codes.
    X = train_frame[df.columns[4:]]
    train_codes = np.array(train_frame['atc_level_one']).reshape(-1, 1)
    y = oe.transform(train_codes).toarray()
    print(y.shape)

    denseNetwork_models[cell_line] = denseNetwork(3, 20)
    denseNetwork_models[cell_line].fit(X, y)

    y_pred = denseNetwork_models[cell_line].predict(test_frame[df.columns[4:]])
    # One-hot test targets; the accuracy below compares integer codes and does not use them.
    test_codes = np.array(test_frame['atc_level_one']).reshape(-1, 1)
    y_true = oe.transform(test_codes).toarray()
    predicted_codes = np.argmax(y_pred, axis=1)
    print(accuracy_score(test_frame['atc_level_one'], predicted_codes))


(4614, 14)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
0.1949404761904762
(4548, 14)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
0.1784443314692425
(4534, 14)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
0.18638902996444895
(4498, 14)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/2

Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
0.1892450879007239
(4374, 14)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
0.15371621621621623
(4374, 14)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
0.14649321266968326


In [155]:
# Majority-vote one label per test perturbagen from the per-cell-line `denseNetwork` models.
#   pred           : most common label over all of the perturbagen's rows
#   pred_cell_line : most common of the per-cell-line majority labels
# Ties are resolved by Counter.most_common.
feature_cols = df.columns[4:]

pred_cell_line = []
pred = []
for pert in pert_test:
    predictions = []
    all_predictions = []
    for cell_line in cell_lines:
        cell_line_frame = df_cell_line_test[cell_line]
        mini_df = cell_line_frame[cell_line_frame['target'] == pert]
        if mini_df.shape[0] == 0:
            continue
        # One batched predict() per (perturbagen, cell line) instead of one call per
        # row; row order is unchanged, so the votes are collected in the same order.
        probabilities = denseNetwork_models[cell_line].predict(mini_df[feature_cols])
        answers = list(np.argmax(probabilities, axis=1))
        all_predictions.extend(answers)
        predictions.append(Counter(answers).most_common(1)[0][0])
    pred.append(Counter(all_predictions).most_common(1)[0][0])
    pred_cell_line.append(Counter(predictions).most_common(1)[0][0])

In [156]:
# True class code per test perturbagen: taken from the first of its rows in df_test.
y_true = []
for pert in pert_test:
    pert_rows = df_test[df_test['target'] == pert]
    y_true.append(np.array(pert_rows['atc_level_one'])[0])
# Perturbagen-level accuracy of the all-rows majority vote.
accuracy_score(y_true, pred)

0.2936170212765957