In [60]:
import pandas as pd
import pickle
import requests
import numpy as np
import sys
import matplotlib.pyplot as plt
import seaborn as sns

In [61]:
perturbagen_class = pickle.load(open("../data/all_perturbagen_class.pkl", "rb"))

In [62]:
perturbagens = list(perturbagen_class.keys())

In [63]:
def make_df(perturbagen_cells, perturbagen_class, perturbagens=perturbagens):
    all_cells = []
    for perturbagen in perturbagen_cells:
        all_cells = all_cells+list(perturbagen_cells[perturbagen])
    all_cells = np.unique(all_cells)
    print(all_cells)
    print(len(perturbagens))
    data = pd.DataFrame()
    data["perturbagen_id"] = perturbagens
    data["perturbagen_class"] = [perturbagen_class[perturbagen] for perturbagen in perturbagens]
    data["atc_level_one"] = [perturbagen_class[perturbagen][0] for perturbagen in perturbagens]
    
    for perturbagen in perturbagens:
            if len(perturbagen_cells[perturbagen])==0:
                data.drop(data[data['perturbagen_id']==perturbagen].index, inplace = True)
                continue
            for cell_line in perturbagen_cells[perturbagen]:
                if cell_line not in data.columns:
                    data[cell_line] = [0 for a in range(len(data))]
                data.loc[data["perturbagen_id"]==perturbagen, cell_line] = 1

    return data

def plot_graphs(data, top_count=10):
    frequency_tuple = [(np.sum(data[cell_line])/len(data), cell_line) for cell_line in data.columns[3:]]
    frequency_tuple.sort(reverse=True)
    plt.figure(figsize=(20, 6))
    cell_lines = [item[1] for item in frequency_tuple]
    frequency = [item[0] for item in frequency_tuple]
    plt.bar(cell_lines[:top_count], frequency[:top_count])
    plt.show()
    
    atc_level_one = np.unique(data["atc_level_one"])
    freq_cell_lines = cell_lines[:top_count]
    class_vs_cell = []
    for l1_class in atc_level_one:
        class_data = data[data["atc_level_one"]==l1_class]
        freq_cell_data = class_data[freq_cell_lines]
        class_vs_cell.append(freq_cell_data.sum()/len(freq_cell_data))
    
    plt.figure(figsize=(20,10))
    sns.heatmap(class_vs_cell, yticklabels=atc_level_one, xticklabels=freq_cell_lines, cmap="YlGnBu", annot=True)


### Perturbagens from REST

In [64]:
def makeURL(query_pert_id):
    URL = "https://api.clue.io/api/sigs?filter={%22where%22:{%22pert_id%22:%22"+query_pert_id+"%22},%22fields%22:{%22pert_id%22:1,%22cell_id%22:1}}&user_key=74671df08accc74116f393adf8a1d42f"
    return URL

perturbagen_cells_rest = {}

counter = 0
for perturbagen in perturbagens:
    r = requests.get(url = makeURL(perturbagen))
    data = r.json()
    cells = [a["cell_id"] for a in data]
    perturbagen_cells_rest[perturbagen] = cells
    counter+=1
    sys.stdout.write(f"\r{counter}/{len(perturbagens)}")

878/1338

KeyboardInterrupt: 

In [None]:
perturbagen_cells_rest = pickle.load(open("../data/perturbagen_cells_rest.pkl", "rb"))
data_rest = make_df(perturbagen_cells_rest, perturbagen_class)
# plot_graphs(data_rest)

### Perturbagens from Metadata

In [57]:
df = pd.read_csv("../data/full_geneexp_phase2_1004.csv")
def getcellline(x):
    cellline = x.split("_")[1]
    return cellline
df['celline'] = df['signature'].apply(getcellline)
df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,smiles,name,id,inchi_key,atc,780,7849,6193,...,6253,7264,5467,2767,23038,57048,79716,target,signature,celline
0,0,75,NC12CC3CC(CC(C3)C1)C2,amantadine,BRD-K70330367,DKNWSYNQZKUICI-UHFFFAOYSA-N,N04BB01,-1.153825,-0.050292,-0.517787,...,0.666874,-0.319930,-0.131123,-0.213662,-1.007887,-0.182830,0.524592,BRD-K70330367,REP.A022_YAPC_24H:E21,YAPC
1,1,75,NC12CC3CC(CC(C3)C1)C2,amantadine,BRD-K70330367,DKNWSYNQZKUICI-UHFFFAOYSA-N,N04BB01,-0.150379,0.557618,-0.106715,...,-0.610206,0.188383,-0.309202,0.056737,-0.003205,-0.434692,-0.177766,BRD-K70330367,REP.A022_YAPC_24H:E20,YAPC
2,2,75,NC12CC3CC(CC(C3)C1)C2,amantadine,BRD-K70330367,DKNWSYNQZKUICI-UHFFFAOYSA-N,N04BB01,0.038450,0.620755,-0.324466,...,0.711872,-0.341481,0.437485,-0.052817,-0.376699,-0.047295,-0.215700,BRD-K70330367,REP.A022_HA1E_24H:E22,HA1E
3,3,75,NC12CC3CC(CC(C3)C1)C2,amantadine,BRD-K70330367,DKNWSYNQZKUICI-UHFFFAOYSA-N,N04BB01,2.363550,-0.061650,0.691550,...,-1.156950,0.508250,-0.230200,-0.314000,-1.208450,-0.630500,-0.447650,BRD-K70330367,REP.A022_PC3_24H:E22,PC3
4,4,75,NC12CC3CC(CC(C3)C1)C2,amantadine,BRD-K70330367,DKNWSYNQZKUICI-UHFFFAOYSA-N,N04BB01,-0.512800,-0.442350,0.697400,...,-0.792450,0.384800,-0.458000,-0.074800,-0.196750,-0.669750,0.074850,BRD-K70330367,REP.A022_PC3_24H:E21,PC3
5,5,75,NC12CC3CC(CC(C3)C1)C2,amantadine,BRD-K70330367,DKNWSYNQZKUICI-UHFFFAOYSA-N,N04BB01,0.064885,0.255498,0.487935,...,0.012580,0.466549,0.506769,0.252124,-0.926013,-0.038393,0.809444,BRD-K70330367,REP.A022_HA1E_24H:E24,HA1E
6,6,75,NC12CC3CC(CC(C3)C1)C2,amantadine,BRD-K70330367,DKNWSYNQZKUICI-UHFFFAOYSA-N,N04BB01,-0.096850,-1.951950,-0.593000,...,-0.127950,-0.676850,-0.402000,0.091000,0.079100,-0.208850,-0.729450,BRD-K70330367,REP.A022_PC3_24H:E24,PC3
7,7,75,NC12CC3CC(CC(C3)C1)C2,amantadine,BRD-K70330367,DKNWSYNQZKUICI-UHFFFAOYSA-N,N04BB01,0.170652,1.501590,0.020055,...,-0.675660,0.387090,-0.398165,-0.231521,0.122634,0.617264,-0.167454,BRD-K70330367,REP.A022_YAPC_24H:E19,YAPC
8,8,75,NC12CC3CC(CC(C3)C1)C2,amantadine,BRD-K70330367,DKNWSYNQZKUICI-UHFFFAOYSA-N,N04BB01,-1.641350,-0.886200,0.873150,...,0.045100,1.384750,-1.069100,-0.110050,-0.089550,-0.299000,1.350000,BRD-K70330367,REP.A022_PC3_24H:E19,PC3
9,9,75,NC12CC3CC(CC(C3)C1)C2,amantadine,BRD-K70330367,DKNWSYNQZKUICI-UHFFFAOYSA-N,N04BB01,-1.035050,0.361950,0.028750,...,-0.057650,-0.084950,-0.189050,0.015000,0.757350,2.744350,0.447750,BRD-K70330367,REP.A022_PC3_24H:E20,PC3


In [65]:
perturbagen_cells_meta = dict()
perturbagen_class_meta = dict()
for i in np.unique(df['target']):
    l = list(np.unique(df[df['target']==i]['celline'].tolist()))
    perturbagen_cells_meta[i] = l
    atc = np.unique(df[df['target']==i]['atc'])[0]
    perturbagen_class_meta[i] = atc
#     print(atc)

D01AC16
N04AA02
G03AC10
N01BB01
C08CA08
N02AX05
N07BC04
A03FA05
N01BB03
L02BA03
C07AB02
J05AX09
R06AB04
N06AX12
L01CD02
C07AB09
C07AG01
C07AA17
A07XA04
A01AB22
C01BC04
C08DA01
B01AE03
J05AE09
C07AA05
A04AA02
C07AG02
C10AA03
N05BA06
J01DE01
A03FA02
L04AX06
A10BG01
C05AA06
C02CA04
N06AB07
N06AX17
A02BX14
C01CE04
B01AC10
G01AF15
V04CJ02
R03CC12
C01EB16
L04AX04
N03AD02
J01CG02
G03CA04
G03CA03
N06AA06
C08EX02
N06AX03
A04AA01
P01AX06
C02BB01
S01BA08
M03BB02
P02BA01
C08CA01
A10BX06
A02BC02
J02AC02
C04AX28
J01MA01
B01AA03
L02BG01
L01XE10
D07AD01
C01BC03
J01MA09
L02BG02
J05AE01
C08CA04
M02AX06
N04BD01
L01CD01
D06BB05
C07AB04
L02BB03
C01BA03
D06AX12
C08CA02
N06AB03
M03BA03
L01XX09
D01AC07
C07AA07
L01CB01
J05AF01
D01AC12
N04AA03
M01AE07
A01AC02
L01XE27
D07AC11
A08AA03
A01AC01
G03CA07
N02BG03
D01AC08
A02BC04
G03AC01
M01AB15
J01MA15
L04AA31
G01AF06
C02DG01
M01AE10
N05CA01
A02BX06
S01EE03
P01AX05
R07AB01
N06AB04
A10BG03
D11AX22
N06DA02
J01DI03
D07AC07
B01AA04
B01AF01
C01CA19
C09AA09
N06AX16
D01AC03


In [72]:
data_meta = make_df(perturbagen_cells_meta, perturbagen_class_meta, perturbagen_cells_meta.keys())
data_meta

['A375' 'A549' 'ASC' 'ASC.C' 'BT20' 'CD34' 'HA1E' 'HCC515' 'HELA' 'HEPG2'
 'HME1' 'HS578T' 'HT29' 'HUES3' 'HUVEC' 'JURKAT' 'LNCAP' 'MCF10A' 'MCF7'
 'MDAMB231' 'MNEU.E' 'NEU' 'NPC' 'NPC.CAS9' 'NPC.TAK' 'PC3' 'SKBR3' 'SKL'
 'SKL.C' 'YAPC']
782


Unnamed: 0,perturbagen_id,perturbagen_class,atc_level_one,A375,HA1E,HELA,HT29,MCF7,PC3,YAPC,...,NPC.CAS9,NPC.TAK,SKL,SKL.C,BT20,HS578T,LNCAP,MCF10A,MDAMB231,SKBR3
0,BRD-A00218260,D01AC16,D,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,BRD-A00546892,N04AA02,N,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,BRD-A00938334,G03AC10,G,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,BRD-A01636364,N01BB01,N,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,BRD-A02006392,C08CA08,C,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
5,BRD-A02710418,N02AX05,N,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
6,BRD-A02990301,N07BC04,N,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
7,BRD-A03061970,A03FA05,A,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
8,BRD-A03216249,N01BB03,N,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
9,BRD-A03249105,L02BA03,L,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [71]:
plot_graphs(data_rest)

TypeError: unsupported operand type(s) for -: 'str' and 'float'