# Exploration of L1000 metadata

In [1]:
import pandas as pd
import os
import numpy as np
from ai.causalcell.datasets.l1000_dataset import get_fingerprint
os.chdir('/Users/paul/PycharmProjects/causal_cell_embedding/')



## Online ressource

Check this [ggdoc](https://docs.google.com/document/d/1q2gciWRhVCAAnlvF2iRLuJ7whrGP6QjpsCMq1yWz7dU/edit#heading=h.pg4vm8v5m1i8) to know more about metadata

In [2]:
paths_to_L1000_files = {
    "phase1": {
        "path_to_dir": "Data/L1000_PhaseI",
        "path_to_data": "Data/L1000_PhaseI/GSE92742_Broad_LINCS/GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_"
            "n473647x12328.gctx",
        "path_to_sig_info": "Data/L1000_PhaseI/GSE92742_Broad_LINCS/GSE92742_Broad_LINCS_sig_info.txt",
        "path_to_gene_info": "Data/L1000_PhaseI/GSE92742_Broad_LINCS/GSE92742_Broad_LINCS_gene_info.txt",
        "path_to_pert_info": "Data/L1000_PhaseI/GSE92742_Broad_LINCS/GSE92742_Broad_LINCS_pert_info.txt",
        "path_to_cell_info": "Data/L1000_PhaseI/GSE92742_Broad_LINCS/GSE92742_Broad_LINCS_cell_info.txt"
    }, "phase2": {
        "path_to_dir": "Data/L1000_PhaseII",
        "path_to_data": "Data/L1000_PhaseII/GSE70138_Broad_LINCS/Level5_COMPZ_n118050x12328_2017-03-06.gctx",
        "path_to_sig_info": "Data/L1000_PhaseII/GSE70138_Broad_LINCS/sig_info_2017-03-06.txt",
        "path_to_gene_info": "Data/L1000_PhaseII/GSE70138_Broad_LINCS/gene_info_2017-03-06.txt",
        "path_to_pert_info": "Data/L1000_PhaseII/GSE70138_Broad_LINCS/pert_info_2017-03-06.txt",
        "path_to_cell_info": "Data/L1000_PhaseII/GSE70138_Broad_LINCS/cell_info_2017-04-28.txt"
    }}

# Phase 2

In [4]:
phase="phase2"
radius=2
nBits=1024

In [4]:
assert phase in ["phase1", "phase2"]

# Data path
path_to_data = paths_to_L1000_files[phase]["path_to_data"]

# Read metadata
sig_info = pd.read_csv(paths_to_L1000_files[phase]["path_to_sig_info"], sep="\t", index_col="sig_id",
                            usecols=["sig_id", "pert_id", "cell_id", "pert_idose", "pert_itime"])
pert_info = pd.read_csv(paths_to_L1000_files[phase]["path_to_pert_info"], sep="\t",
                             index_col="pert_id")
cell_info = pd.read_csv(paths_to_L1000_files[phase]["path_to_cell_info"], sep="\t", index_col="cell_id")
gene_info = pd.read_csv(paths_to_L1000_files[phase]["path_to_gene_info"], sep="\t")

# Get list of landmark genes
landmark_gene_list = gene_info[gene_info['pr_is_lm'] == 1]["pr_gene_id"].astype(str)

# Load fingerprints
fps = pert_info.apply(lambda row: get_fingerprint(row["canonical_smiles"], radius, nBits), axis=1)
pert_info["fps"] = fps

### Signature info

In [5]:
sig_info

Unnamed: 0_level_0,pert_id,cell_id,pert_idose,pert_itime
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LJP005_A375_24H:A03,DMSO,A375,-666,24 h
LJP005_A375_24H:A04,DMSO,A375,-666,24 h
LJP005_A375_24H:A05,DMSO,A375,-666,24 h
LJP005_A375_24H:A06,DMSO,A375,-666,24 h
LJP005_A375_24H:A07,BRD-K76908866,A375,10.0 um,24 h
...,...,...,...,...
XPR002_YAPC.311_96H:G22,BRDN0001054782,YAPC.311,-666,96 h
XPR002_YAPC.311_96H:G23,BRDN0001055014,YAPC.311,-666,96 h
XPR002_YAPC.311_96H:J16,BRDN0000585515,YAPC.311,-666,96 h
XPR002_YAPC.311_96H:M15,BRDN0001054777,YAPC.311,-666,96 h


In [6]:
sig_info['pert_idose'].value_counts().index

Index(['10.0 um', '0.04 um', '0.37 um', '1.11 um', '0.12 um', '3.33 um',
       '-666', '20.0 um', '1.0 um', '2.0 um', '5.0 um', '3.0 um', '0.5 um',
       '0.1 um', '0.01 um', '0.05 um', '0.11 um', '0.02 um', '25.0 um',
       '0.33 um', '0.0 um', '3.5 um', '0.25 um', '0.4 um', '6.0 um', '1.5 um',
       '12.0 um', '0.07 um', '0.03 um', '0.06 um', '0.14 um', '1.12 um',
       '0.75 um', '5.5 um', '1.6 um', '3.85 um', '3.36 um', '3.73 um',
       '1.28 um', '4.17 um', '2.7 um', '0.43 um', '0.9 um', '0.53 um',
       '0.35 um', '0.67 um', '0.21 um', '0.46 um', '0.08 um', '1.51 um',
       '0.13 um', '10.72 um', '3.19 um', '0.3 um', '4.8 um', '11.55 um',
       '0.22 um', '5.77 um', '0.18 um', '6.79 um', '0.64 um', '11.2 um',
       '0.41 um', '1.24 um', '10.07 um', '1.06 um', '10.05 um', '1.92 um',
       '0.17 um', '0.15 um', '1.39 um', '9.58 um', '2.26 um', '4.52 um',
       '1.19 um', '3.57 um', '8.11 um', '3.35 um', '8.0 um', '4.0 um',
       '1.7 um', '2.5 um', '4.5 um', '40.0 um']

In [7]:
def get_concentration(s):
    if s == '-666':
        return -1.
    return float(s[:-3])

In [8]:
sig_info['pert_idose'].apply(get_concentration).value_counts().index

Float64Index([ 10.0,  0.04,  0.37,  1.11,  0.12,  3.33,  -1.0,  20.0,   1.0,
                2.0,   5.0,   3.0,   0.5,   0.1,  0.01,  0.05,  0.33,   0.0,
               25.0,  0.11,  0.02,  0.25,   3.5,   0.4,  12.0,   1.5,   6.0,
               0.06,  0.14,  0.07,  1.12,  0.03,  0.75,   5.5,  0.15,  11.2,
               0.22,  5.77,  0.43,  0.13,  1.19,  4.17, 10.07,  3.36,   2.7,
               1.39,  0.46,  0.08,  0.53,  1.51,  2.26,  3.57,  1.28,  4.52,
               3.35,   1.6,  9.58,  3.85,  0.64,  1.92,  0.35, 10.05,  8.11,
                4.8,   0.3, 10.72,  0.18,  0.67,  3.19,  1.24,  0.17,  3.73,
               0.41,  0.21, 11.55,   0.9,  6.79,  1.06,   8.0,   2.5,   4.5,
                4.0,   1.7,  40.0],
             dtype='float64')

In [9]:
sig_info['pert_itime'].value_counts().index

Index(['24 h', '3 h', '96 h', '6 h'], dtype='object')

In [10]:
sig_info['pert_itime'].apply(lambda s: float(s[:-2])).value_counts()

24.0    107440
3.0       5313
96.0      4179
6.0       1118
Name: pert_itime, dtype: int64

In [11]:
sig_info['pert_itime_value'] = sig_info['pert_itime'].apply(lambda s: float(s[:-2]))

In [12]:
sig_info

Unnamed: 0_level_0,pert_id,cell_id,pert_idose,pert_itime,pert_itime_value
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LJP005_A375_24H:A03,DMSO,A375,-666,24 h,24.0
LJP005_A375_24H:A04,DMSO,A375,-666,24 h,24.0
LJP005_A375_24H:A05,DMSO,A375,-666,24 h,24.0
LJP005_A375_24H:A06,DMSO,A375,-666,24 h,24.0
LJP005_A375_24H:A07,BRD-K76908866,A375,10.0 um,24 h,24.0
...,...,...,...,...,...
XPR002_YAPC.311_96H:G22,BRDN0001054782,YAPC.311,-666,96 h,96.0
XPR002_YAPC.311_96H:G23,BRDN0001055014,YAPC.311,-666,96 h,96.0
XPR002_YAPC.311_96H:J16,BRDN0000585515,YAPC.311,-666,96 h,96.0
XPR002_YAPC.311_96H:M15,BRDN0001054777,YAPC.311,-666,96 h,96.0


In [13]:
sig_fps = pert_info.loc[list(sig_info['pert_id'])]['fps']
sig_fps.index = sig_info.index
sig_info['fps'] = sig_fps

In [14]:
sig_info

Unnamed: 0_level_0,pert_id,cell_id,pert_idose,pert_itime,pert_itime_value,fps
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LJP005_A375_24H:A03,DMSO,A375,-666,24 h,24.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
LJP005_A375_24H:A04,DMSO,A375,-666,24 h,24.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
LJP005_A375_24H:A05,DMSO,A375,-666,24 h,24.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
LJP005_A375_24H:A06,DMSO,A375,-666,24 h,24.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
LJP005_A375_24H:A07,BRD-K76908866,A375,10.0 um,24 h,24.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...
XPR002_YAPC.311_96H:G22,BRDN0001054782,YAPC.311,-666,96 h,96.0,
XPR002_YAPC.311_96H:G23,BRDN0001055014,YAPC.311,-666,96 h,96.0,
XPR002_YAPC.311_96H:J16,BRDN0000585515,YAPC.311,-666,96 h,96.0,
XPR002_YAPC.311_96H:M15,BRDN0001054777,YAPC.311,-666,96 h,96.0,


### Perturbation info

In [15]:
pert_info

Unnamed: 0_level_0,canonical_smiles,inchi_key,pert_iname,pert_type,fps
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BRD-K70792160,CCN(CC)CCCCN1c2ccccc2Oc2ccc(Cl)cc12,GYBXAGDWMCJZJK-UHFFFAOYSA-N,10-DEBC,trt_cp,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
BRD-K68552125,CCCCCCCCCCCCCC(=O)O[C@@H]1[C@@H](C)[C@]2(O)[C@...,PHEDXBVPIONUQT-RGYGYFBISA-N,phorbol-myristate-acetate,trt_cp,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
BRD-K92301463,CCCCC(C)(C)[C@H](O)\C=C\[C@H]1[C@H](O)CC(=O)[C...,QAOBBBBDJSWHMU-WMBBNPMCSA-N,"16,16-dimethylprostaglandin-e2",trt_cp,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
BRD-A29731977,CCCCCC(=O)O[C@@]1(CCC2C3CCC4=CC(=O)CC[C@]4(C)C...,DOMWKUIIPQCAJU-JKPPDDDBSA-N,17-hydroxyprogesterone-caproate,trt_cp,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
BRD-K07954936,OC(=O)CCCC[C@@H]1SC[C@@H]2NC(=N)N[C@H]12,WWVANQJRLPIHNS-ZKWXMUAHSA-N,2-iminobiotin,trt_cp,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...
BRDN0000585417,-666,-666,LACZ,ctl_vector,
BRDN0000562990,-666,-666,EGFP,ctl_vector,
BRDN0000585533,-666,-666,LUCIFERASE,ctl_vector,
BRDN0000563287,-666,-666,EGFP,ctl_vector,


In [16]:
(pert_info['canonical_smiles'] == '-666').value_counts()

False    1797
True      373
Name: canonical_smiles, dtype: int64

Check that DMSO has a SMILES

In [17]:
pert_info.loc['DMSO']

canonical_smiles                                              CS(=O)C
inchi_key                                 IAZDPXIOMUYVGZ-UHFFFAOYSA-N
pert_iname                                                       DMSO
pert_type                                                 ctl_vehicle
fps                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: DMSO, dtype: object

In [18]:
np.nonzero(pert_info.loc['DMSO']['fps'])

(array([ 33, 241, 427, 552, 589, 650]),)

Analyse examples that do not have a SMILES

In [19]:
pert_info[pert_info['canonical_smiles'] == '-666']

Unnamed: 0_level_0,canonical_smiles,inchi_key,pert_iname,pert_type,fps
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BRDN0001054785,-666,-666,AKT2,trt_xpr,
BRDN0001055003,-666,-666,CAMK1,trt_xpr,
BRDN0001054852,-666,-666,KRAS,trt_xpr,
BRDN0001054858,-666,-666,CRKL,trt_xpr,
BRDN0001055155,-666,-666,IGF2BP2,trt_xpr,
...,...,...,...,...,...
BRDN0000585417,-666,-666,LACZ,ctl_vector,
BRDN0000562990,-666,-666,EGFP,ctl_vector,
BRDN0000585533,-666,-666,LUCIFERASE,ctl_vector,
BRDN0000563287,-666,-666,EGFP,ctl_vector,


In [20]:
pert_info[pert_info['canonical_smiles'] == '-666']['pert_type'].value_counts()

trt_xpr       353
ctl_vector     19
ctl_untrt       1
Name: pert_type, dtype: int64

In [21]:
pert_info[pert_info['canonical_smiles'] != '-666']['pert_type'].value_counts()

trt_cp         1796
ctl_vehicle       1
Name: pert_type, dtype: int64

In [22]:
pert_info[pert_info['pert_type'] == 'ctl_vector']

Unnamed: 0_level_0,canonical_smiles,inchi_key,pert_iname,pert_type,fps
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BRDN0000585635,-666,-666,LUCIFERASE,ctl_vector,
BRDN0000562876,-666,-666,EGFP,ctl_vector,
BRDN0000563002,-666,-666,EGFP,ctl_vector,
BRDN0000562868,-666,-666,EGFP,ctl_vector,
BRDN0000585768,-666,-666,LUCIFERASE,ctl_vector,
BRDN0000585800,-666,-666,LACZ,ctl_vector,
BRDN0000563266,-666,-666,EGFP,ctl_vector,
BRDN0000561170,-666,-666,EGFP,ctl_vector,
BRDN0000562787,-666,-666,EGFP,ctl_vector,
BRDN0000585793,-666,-666,LACZ,ctl_vector,


In [23]:
pert_info[pert_info['pert_type'] == 'ctl_untrt']

Unnamed: 0_level_0,canonical_smiles,inchi_key,pert_iname,pert_type,fps
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CMAP-000,-666,-666,UnTrt,ctl_untrt,


In [24]:
pert_info[pert_info['pert_type'] == 'trt_xpr']

Unnamed: 0_level_0,canonical_smiles,inchi_key,pert_iname,pert_type,fps
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BRDN0001054785,-666,-666,AKT2,trt_xpr,
BRDN0001055003,-666,-666,CAMK1,trt_xpr,
BRDN0001054852,-666,-666,KRAS,trt_xpr,
BRDN0001054858,-666,-666,CRKL,trt_xpr,
BRDN0001055155,-666,-666,IGF2BP2,trt_xpr,
...,...,...,...,...,...
BRDN0000733244,-666,-666,ZEB1,trt_xpr,
BRDN0000733763,-666,-666,BCL2L1,trt_xpr,
BRDN0001054822,-666,-666,BRAF,trt_xpr,
BRDN0001054938,-666,-666,AURKB,trt_xpr,


Note: trt_xpr corresponds to CRISPR for LoF (Loss of Function)

### Cell infos

In [25]:
cell_info

Unnamed: 0_level_0,cell_type,base_cell_id,precursor_cell_id,modification,sample_type,primary_site,subtype,original_growth_pattern,provider_catalog_id,original_source_vendor,donor_age,donor_sex,donor_ethnicity
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
A375,cell line,A375,-666,-666,tumor,skin,malignant melanoma,adherent,CRL-1619,ATCC,54,F,-666
A375.311,cell line,A375,A375,genetically modified to stably express Cas9 pr...,tumor,skin,malignant melanoma,adherent,CRL-1619,ATCC,54,F,-666
A549,cell line,A549,-666,-666,tumor,lung,non small cell lung cancer| carcinoma,adherent,CCL-185,ATCC,58,M,Caucasian
A549.311,cell line,A549,A549,genetically modified to stably express Cas9 p...,tumor,lung,non small cell lung cancer| carcinoma,adherent,CCL-185,ATCC,58,M,Caucasian
A673,cell line,A673,-666,-666,tumor,bone,ewing's sarcoma,adherent,CRL-1598,ATCC,-666,F,-666
...,...,...,...,...,...,...,...,...,...,...,...,...,...
CD34,primary,CD34,-666,-666,normal,bone,bone marrow,suspension,-666,-666,-666,-666,-666
PHH,primary,PHH,-666,-666,primary,liver,normal primary liver,-666,-666,CellzDirect,-666,-666,-666
SKB,primary,SKB,-666,-666,normal,muscle,myoblast,-666,CC-2580,Lonza,-666,-666,-666
SKL,primary,SKL,-666,-666,primary,muscle,normal primary skeletal muscle cells,adherent,CC-2561,LONZA,-666,-666,-666


In [26]:
cell_info.precursor_cell_id.value_counts()

-666    84
PC3      2
HA1E     2
NPC      2
MCF7     2
A549     1
NEU      1
A375     1
YAPC     1
HELA     1
HT29     1
Name: precursor_cell_id, dtype: int64

In [27]:
cell_info.cell_type.value_counts()

cell line         83
primary            7
differentiated     6
ESC                1
iPSC               1
Name: cell_type, dtype: int64

In [28]:
cell_info.base_cell_id.value_counts()

HA1E       3
MCF7       3
NPC        3
PC3        3
A549       2
          ..
OV7        1
NCIH716    1
SHSY5Y     1
SNUC5      1
SKB        1
Name: base_cell_id, Length: 82, dtype: int64

In [29]:
cell_info.modification.value_counts()

-666                                                                        72
genetically modified to stably express Cas9 protein                         10
immortalized normal                                                          7
differentiated from iPSC, but not terminally differentiated                  2
terminally differentiated to be neurons                                      1
genetically  modified to stably express Cas9 protein                         1
differentiated from ESC to be motor neurons                                  1
bone marrow cells that were immortalized                                     1
hTERT-immortalized normal kidney cells immunoselected for DBA-positivity     1
NEU exposed to KCl (potassium chloride) solution to activate neurons         1
NPC that were genetically modified to stably express Cas9 protein            1
Name: modification, dtype: int64

In [30]:
cell_info.sample_type.value_counts()

tumor      70
normal     19
primary     8
-666        1
Name: sample_type, dtype: int64

In [31]:
cell_info.primary_site.value_counts()

large intestine                       17
lung                                  13
breast                                 9
haematopoietic and lymphoid tissue     8
ovary                                  6
kidney                                 6
skin                                   6
prostate                               5
bone                                   4
-666                                   4
central nervous system                 3
muscle                                 3
liver                                  3
endometrium                            3
adipose                                2
pancreas                               2
blood                                  1
stomach                                1
vascular system                        1
autonomic ganglia                      1
Name: primary_site, dtype: int64

In [32]:
cell_info.subtype.value_counts()

adenocarcinoma                                          11
colorectal adenocarcinoma                               10
carcinoma                                                7
-666                                                     6
colorectal carcinoma                                     5
malignant melanoma                                       4
non small cell lung cancer| adenocarcinoma               3
normal stem fibroblast-derived iPScs                     3
normal kidney                                            3
pancreatic carcinoma                                     2
acute myeloid leukemia (AML)                             2
non small cell lung cancer| large cell carcinoma         2
epithelial                                               2
hepatocellular carcinoma                                 2
normal primary skeletal muscle cells                     2
non small cell lung cancer| carcinoma                    2
bone marrow                                             

In [33]:
cell_info.original_growth_pattern.value_counts()

adherent      73
suspension    13
mix            7
-666           5
Name: original_growth_pattern, dtype: int64

### Gene information

In [34]:
gene_info

Unnamed: 0,pr_gene_id,pr_gene_symbol,pr_gene_title,pr_is_lm,pr_is_bing
0,780,DDR1,discoidin domain receptor tyrosine kinase 1,1,1
1,7849,PAX8,paired box 8,1,1
2,2978,GUCA1A,guanylate cyclase activator 1A,0,0
3,2049,EPHB3,EPH receptor B3,0,1
4,2101,ESRRA,estrogen related receptor alpha,0,1
...,...,...,...,...,...
12323,4034,LRCH4,leucine-rich repeats and calponin homology (CH...,0,1
12324,399664,MEX3D,mex-3 RNA binding family member D,0,1
12325,54869,EPS8L1,EPS8 like 1,0,1
12326,90379,DCAF15,DDB1 and CUL4 associated factor 15,0,1


# Phase 1

In [35]:
phase="phase1"
radius=2
nBits=1024

In [36]:
assert phase in ["phase1", "phase2"]

# Data path
path_to_data = paths_to_L1000_files[phase]["path_to_data"]

# Read metadata
sig_info = pd.read_csv(paths_to_L1000_files[phase]["path_to_sig_info"], sep="\t", index_col="sig_id",
                            usecols=["sig_id", "pert_id", "cell_id", "pert_idose", "pert_itime"])
pert_info = pd.read_csv(paths_to_L1000_files[phase]["path_to_pert_info"], sep="\t",
                             index_col="pert_id")
cell_info = pd.read_csv(paths_to_L1000_files[phase]["path_to_cell_info"], sep="\t", index_col="cell_id")
gene_info = pd.read_csv(paths_to_L1000_files[phase]["path_to_gene_info"], sep="\t")

# Get list of landmark genes
landmark_gene_list = gene_info[gene_info['pr_is_lm'] == 1]["pr_gene_id"].astype(str)

# Load fingerprints
fps = pert_info.apply(lambda row: get_fingerprint(row["canonical_smiles"], radius, nBits), axis=1)
pert_info["fps"] = fps

### Signature info

In [37]:
sig_info

Unnamed: 0_level_0,pert_id,cell_id,pert_idose,pert_itime
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AML001_CD34_24H:A05,DMSO,CD34,0.1 %,24 h
AML001_CD34_24H:A06,DMSO,CD34,0.1 %,24 h
AML001_CD34_24H:B05,DMSO,CD34,0.1 %,24 h
AML001_CD34_24H:B06,DMSO,CD34,0.1 %,24 h
AML001_CD34_24H:BRD-A03772856:0.37037,BRD-A03772856,CD34,500 nM,24 h
...,...,...,...,...
TAK004_U2OS_96H:TRCN0000370007:1,TRCN0000370007,U2OS,1 µL,96 h
TAK004_U2OS_96H:TRCN0000370678:1,TRCN0000370678,U2OS,1 µL,96 h
TAK004_U2OS_96H:TRCN0000370697:1,TRCN0000370697,U2OS,1 µL,96 h
TAK004_U2OS_96H:TRCN0000370751:1,TRCN0000370751,U2OS,1 µL,96 h


In [38]:
sig_info['pert_idose'].value_counts()

10 µM                  122797
2 µL                    96847
1 µL                    58578
5 µM                    46744
1.5 µL                  37836
                        ...  
45 ng/mL                   11
300 ng|300 ng               9
0 ng/mL                     8
150 ng                      6
-666 -666|-666 -666         1
Name: pert_idose, Length: 89, dtype: int64

In [39]:
sig_info['pert_idose'].value_counts().values

array([122797,  96847,  58578,  46744,  37836,  21078,  17085,  11135,
         8863,   6860,   5609,   4832,   4303,   2832,   2418,   2184,
         1788,   1554,   1387,   1345,   1228,   1228,   1217,   1202,
         1169,    969,    854,    854,    853,    852,    831,    746,
          589,    551,    456,    448,    342,    293,    285,    190,
          188,    187,    175,    133,    131,    120,    114,     95,
           78,     76,     76,     57,     57,     47,     44,     44,
           44,     44,     38,     38,     38,     38,     38,     38,
           32,     32,     32,     32,     32,     19,     19,     19,
           19,     19,     19,     19,     19,     19,     19,     19,
           19,     19,     19,     16,     11,      9,      8,      6,
            1])

In [40]:
sig_info['pert_idose'].value_counts().index

Index(['10 µM', '2 µL', '1 µL', '5 µM', '1.5 µL', '5 µL', '-666', '6 µL',
       '3 µL', '500 nM', '1 µM', '100 nM', '3 µM', '0.1 %', '10 µL', '4 µL',
       '40 µM', '100 µM', '10 nM', '200 ng', '100 ng/µL', '1 ng/µL', '1 nM',
       '20 µM', '80 µM', '100 ng/mL', '0.12 µM', '0.37 µM', '1.11 µM',
       '3.33 µM', '0.04 µM', '50 ng/mL', '25 ng/mL', '20 ng/mL', '10 ng/mL',
       '30 µM', '200 ng/mL', '90 µM', '15 ng/mL', '1000 ng/mL', '5 ng/mL',
       '1 ng/mL', '60 µM', '400 ng/mL', '50 µM', '70 µM', '250 ng/mL',
       '150 ng/mL', '300 ng', '2 ng/mL', '500 ng/mL', '80 ng/mL', '40 ng/mL',
       '20 µL', '3 ng/µL', '300 ng/µL', '0.1 ng/µL', '10 ng/µL', '30 ng/mL',
       '5000 ng/mL', '10000 ng/mL', '0.1 ng/mL', '2000 ng/mL', '800 ng/mL',
       '0.41 µM', '33.33 µM', '3.7 µM', '1.23 µM', '11.11 µM', '3000 ng/mL',
       '100000 ng/mL', '16 ng/mL', '0.15 ng/mL', '200000 ng/mL', '2500 ng/mL',
       '0.5 ng/mL', '50000 ng/mL', '8300 ng/mL', '1.65 ng/mL', '0.03 ng/mL',
       '0.2 ng

In [41]:
sig_info[sig_info['pert_idose'] == '2 µL']

Unnamed: 0_level_0,pert_id,cell_id,pert_idose,pert_itime
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CGS001_HCC515_96H:AARS:2,CGS001-16,HCC515,2 µL,96 h
CGS001_HCC515_96H:ABAT:2,CGS001-18,HCC515,2 µL,96 h
CGS001_HCC515_96H:ABCB1:2,CGS001-5243,HCC515,2 µL,96 h
CGS001_HCC515_96H:ABCB4:2,CGS001-5244,HCC515,2 µL,96 h
CGS001_HCC515_96H:ABCB5:2,CGS001-340273,HCC515,2 µL,96 h
...,...,...,...,...
TAK003_PC3_96H:TRCN0000437559:-666,TRCN0000437559,PC3,2 µL,96 h
TAK003_PC3_96H:TRCN0000438344:-666,TRCN0000438344,PC3,2 µL,96 h
TAK003_PC3_96H:TRCN0000439146:-666,TRCN0000439146,PC3,2 µL,96 h
TAK003_PC3_96H:TRCN0000442221:-666,TRCN0000442221,PC3,2 µL,96 h


In [42]:
pert_info.loc[list(sig_info[sig_info['pert_idose'] == '2 µL']['pert_id'])]

Unnamed: 0_level_0,pert_iname,pert_type,is_touchstone,inchi_key_prefix,inchi_key,canonical_smiles,pubchem_cid,fps
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CGS001-16,AARS,trt_sh.cgs,1,-666,-666,-666,-666,
CGS001-18,ABAT,trt_sh.cgs,1,-666,-666,-666,-666,
CGS001-5243,ABCB1,trt_sh.cgs,1,-666,-666,-666,-666,
CGS001-5244,ABCB4,trt_sh.cgs,1,-666,-666,-666,-666,
CGS001-340273,ABCB5,trt_sh.cgs,1,-666,-666,-666,-666,
...,...,...,...,...,...,...,...,...
TRCN0000437559,HIST1H1C,trt_sh,0,-666,-666,-666,-666,
TRCN0000438344,CD79B,trt_sh,0,-666,-666,-666,-666,
TRCN0000439146,HIST1H1B,trt_sh,0,-666,-666,-666,-666,
TRCN0000442221,CD79B,trt_sh,0,-666,-666,-666,-666,


Table of perturbations that have a dose of '2uL'

In [43]:
pert_info.loc[list(sig_info[sig_info['pert_idose'] == '2 µL']['pert_id'])]['canonical_smiles'].value_counts()

-666    96847
Name: canonical_smiles, dtype: int64

In [44]:
list_of_pertid_in_um = np.unique(list(sig_info[sig_info['pert_idose'].isin(['10 µM', '5 µM', '500 nM', '1 µM', '100 nM', '3 µM',
       '40 µM', '100 µM', '10 nM', '1 nM',
       '20 µM', '80 µM', '0.37 µM', '0.12 µM', '1.11 µM',
       '3.33 µM', '0.04 µM',
       '30 µM', '90 µM', '60 µM', '50 µM', '70 µM', 
       '0.41 µM', '11.11 µM', '1.23 µM', '3.7 µM', '33.33 µM'])]['pert_id']))

In [45]:
pert_info.loc[list_of_pertid_in_um]['pert_type'].value_counts()

trt_cp    20412
Name: pert_type, dtype: int64

In [46]:
def get_concentration(s):
    if s.endswith('µM') or s.endswith('um'):
        return float(s[:-3])
    if s.endswith('nM'):
        return 0.001*float(s[:-3])
    return -1


def get_time(s):
    return float(s[:-2])


In [47]:
sig_info['pert_idose_value'] = sig_info['pert_idose'].apply(get_concentration)
sig_info['pert_idose'].apply(get_concentration).value_counts()

-1.000      268614
 10.000     122797
 5.000       46744
 0.500        6860
 1.000        5609
 0.100        4832
 3.000        4303
 40.000       1788
 100.000      1554
 0.010        1387
 0.001        1217
 20.000       1202
 80.000       1169
 0.120         854
 0.370         854
 1.110         853
 3.330         852
 0.040         831
 30.000        448
 90.000        293
 60.000        175
 50.000        131
 70.000        120
 0.410          32
 1.230          32
 11.110         32
 3.700          32
 33.330         32
Name: pert_idose, dtype: int64

In [48]:
sig_info['pert_itime'].value_counts().index

Index(['96 h', '24 h', '6 h', '120 h', '144 h', '4 h', '2 h', '48 h', '168 h',
       '1 h', '3 h', '72 h'],
      dtype='object')

In [49]:
sig_info['pert_itime_value'] = sig_info['pert_itime'].apply(get_time)
sig_info['pert_itime'].apply(get_time).value_counts()

96.0     194826
24.0     119148
6.0      102306
120.0     28429
144.0     18489
4.0        2972
2.0        2663
48.0       2076
168.0       813
1.0         736
3.0         612
72.0        577
Name: pert_itime, dtype: int64

In [50]:
# Get all sig_info pert_ids 
pert_id_of_sig = sig_info['pert_id']
# Drop rows that are not in pert_info
pert_id_of_sig = pert_id_of_sig.drop(sig_info[~sig_info['pert_id'].isin(pert_info.index)].index)
# print(pert_id_of_sig.shape)

# Add the corresponding fingerprints to sig_info
sig_fps = pert_info.loc[pert_id_of_sig]['fps']
sig_fps.index = pert_id_of_sig.index
sig_fps = sig_fps.reindex(sig_info.index, fill_value=None)
sig_info['fps'] = sig_fps

In [51]:
sig_info

Unnamed: 0_level_0,pert_id,cell_id,pert_idose,pert_itime,pert_idose_value,pert_itime_value,fps
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AML001_CD34_24H:A05,DMSO,CD34,0.1 %,24 h,-1.0,24.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
AML001_CD34_24H:A06,DMSO,CD34,0.1 %,24 h,-1.0,24.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
AML001_CD34_24H:B05,DMSO,CD34,0.1 %,24 h,-1.0,24.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
AML001_CD34_24H:B06,DMSO,CD34,0.1 %,24 h,-1.0,24.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
AML001_CD34_24H:BRD-A03772856:0.37037,BRD-A03772856,CD34,500 nM,24 h,0.5,24.0,"[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...
TAK004_U2OS_96H:TRCN0000370007:1,TRCN0000370007,U2OS,1 µL,96 h,-1.0,96.0,
TAK004_U2OS_96H:TRCN0000370678:1,TRCN0000370678,U2OS,1 µL,96 h,-1.0,96.0,
TAK004_U2OS_96H:TRCN0000370697:1,TRCN0000370697,U2OS,1 µL,96 h,-1.0,96.0,
TAK004_U2OS_96H:TRCN0000370751:1,TRCN0000370751,U2OS,1 µL,96 h,-1.0,96.0,


In [52]:
(~sig_info['pert_id'].isin(pert_info.index)).index

Index(['AML001_CD34_24H:A05', 'AML001_CD34_24H:A06', 'AML001_CD34_24H:B05',
       'AML001_CD34_24H:B06', 'AML001_CD34_24H:BRD-A03772856:0.37037',
       'AML001_CD34_24H:BRD-A03772856:1.11111',
       'AML001_CD34_24H:BRD-A03772856:10',
       'AML001_CD34_24H:BRD-A03772856:3.33333',
       'AML001_CD34_24H:BRD-A19037878:1.11111',
       'AML001_CD34_24H:BRD-A19037878:10',
       ...
       'TAK004_U2OS_96H:TRCN0000364646:1', 'TAK004_U2OS_96H:TRCN0000364647:1',
       'TAK004_U2OS_96H:TRCN0000369292:1', 'TAK004_U2OS_96H:TRCN0000369366:1',
       'TAK004_U2OS_96H:TRCN0000370006:1', 'TAK004_U2OS_96H:TRCN0000370007:1',
       'TAK004_U2OS_96H:TRCN0000370678:1', 'TAK004_U2OS_96H:TRCN0000370697:1',
       'TAK004_U2OS_96H:TRCN0000370751:1', 'TAK004_U2OS_96H:TRCN0000381509:1'],
      dtype='object', name='sig_id', length=473647)

### Perturbation info

In [53]:
pert_info

Unnamed: 0_level_0,pert_iname,pert_type,is_touchstone,inchi_key_prefix,inchi_key,canonical_smiles,pubchem_cid,fps
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
56582,AKT2,trt_oe,0,-666,-666,-666,-666,
5981,HSF1,trt_oe,0,-666,-666,-666,-666,
7150,NFE2L2,trt_oe,0,-666,-666,-666,-666,
ABL1_G2A,ABL1,trt_oe.mut,0,-666,-666,-666,-666,
ABL1_T315I,ABL1,trt_oe.mut,0,-666,-666,-666,-666,
...,...,...,...,...,...,...,...,...
ccsbBroad304_99991,LUCIFERASE,ctl_vector,0,-666,-666,-666,-666,
ccsbBroad304_99994,lacZ,ctl_vector,0,-666,-666,-666,-666,
ccsbBroad304_99997,eGFP,ctl_vector,0,-666,-666,-666,-666,
dsRED,DSRED,trt_oe,0,-666,-666,-666,-666,


In [54]:
(pert_info['canonical_smiles'] == '-666').value_counts()

True     31032
False    20351
Name: canonical_smiles, dtype: int64

In [55]:
len(pert_info['pert_iname'].unique())

28957

Check that DMSO has a SMILES

In [56]:
pert_info.loc['DMSO']

pert_iname                                                       DMSO
pert_type                                                 ctl_vehicle
is_touchstone                                                       1
inchi_key_prefix                                       IAZDPXIOMUYVGZ
inchi_key                                 IAZDPXIOMUYVGZ-UHFFFAOYSA-N
canonical_smiles                                              CS(=O)C
pubchem_cid                                                      -666
fps                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: DMSO, dtype: object

In [57]:
np.nonzero(pert_info.loc['DMSO']['fps'])

(array([ 33, 241, 427, 552, 589, 650]),)

Analyse examples that do not have a SMILES

In [58]:
pert_info[pert_info['canonical_smiles'] == '-666']

Unnamed: 0_level_0,pert_iname,pert_type,is_touchstone,inchi_key_prefix,inchi_key,canonical_smiles,pubchem_cid,fps
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
56582,AKT2,trt_oe,0,-666,-666,-666,-666,
5981,HSF1,trt_oe,0,-666,-666,-666,-666,
7150,NFE2L2,trt_oe,0,-666,-666,-666,-666,
ABL1_G2A,ABL1,trt_oe.mut,0,-666,-666,-666,-666,
ABL1_T315I,ABL1,trt_oe.mut,0,-666,-666,-666,-666,
...,...,...,...,...,...,...,...,...
ccsbBroad304_99991,LUCIFERASE,ctl_vector,0,-666,-666,-666,-666,
ccsbBroad304_99994,lacZ,ctl_vector,0,-666,-666,-666,-666,
ccsbBroad304_99997,eGFP,ctl_vector,0,-666,-666,-666,-666,
dsRED,DSRED,trt_oe,0,-666,-666,-666,-666,


In [59]:
pert_info[pert_info['canonical_smiles'] == '-666']['pert_type'].value_counts()

trt_sh             18493
trt_sh.cgs          4345
trt_sh.css          3807
trt_oe              3492
trt_lig              622
trt_oe.mut           135
trt_cp                63
ctl_vector            61
ctl_vector.cns         8
ctl_vehicle            2
ctl_vehicle.cns        2
ctl_untrt.cns          1
ctl_untrt              1
Name: pert_type, dtype: int64

In [60]:
pert_info[pert_info['canonical_smiles'] != '-666']['pert_type'].value_counts()

trt_cp         20350
ctl_vehicle        1
Name: pert_type, dtype: int64

In [61]:
pert_info[pert_info['pert_type'] == 'trt_sh']

Unnamed: 0_level_0,pert_iname,pert_type,is_touchstone,inchi_key_prefix,inchi_key,canonical_smiles,pubchem_cid,fps
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CMAP-ERGK-001,ERG,trt_sh,0,-666,-666,-666,-666,
CMAP-ERGK001,ERG,trt_sh,0,-666,-666,-666,-666,
CMAP-T2DKEIF2AK3,EIF2AK3,trt_sh,0,-666,-666,-666,-666,
CMAP-T2DKLRPPRC,LRPPRC,trt_sh,0,-666,-666,-666,-666,
CMAP-T2DKTCF7L2,TCF7L2,trt_sh,0,-666,-666,-666,-666,
...,...,...,...,...,...,...,...,...
TRCN0000443169,HORMAD2,trt_sh,0,-666,-666,-666,-666,
TRCN0000444370,HIST1H1B,trt_sh,0,-666,-666,-666,-666,
TRCN0000447230,LIN28A,trt_sh,1,-666,-666,-666,-666,
TRCN0000448055,SNAI1,trt_sh,0,-666,-666,-666,-666,


In [62]:
pert_info[pert_info['pert_type'] == 'ctl_vector']

Unnamed: 0_level_0,pert_iname,pert_type,is_touchstone,inchi_key_prefix,inchi_key,canonical_smiles,pubchem_cid,fps
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CMAP-ERGK-002,GFP,ctl_vector,0,-666,-666,-666,-666,
CMAP-HSF-GFP,GFP,ctl_vector,0,-666,-666,-666,-666,
CMAP-T2DKGFP231,GFP,ctl_vector,0,-666,-666,-666,-666,
CMAP-T2DKGFP437,GFP,ctl_vector,0,-666,-666,-666,-666,
CMAP-T2DKGFP587,GFP,ctl_vector,0,-666,-666,-666,-666,
...,...,...,...,...,...,...,...,...
ccsbBroad304_99985,BFP,ctl_vector,0,-666,-666,-666,-666,
ccsbBroad304_99988,HcRed,ctl_vector,0,-666,-666,-666,-666,
ccsbBroad304_99991,LUCIFERASE,ctl_vector,0,-666,-666,-666,-666,
ccsbBroad304_99994,lacZ,ctl_vector,0,-666,-666,-666,-666,


In [63]:
pert_info[pert_info['pert_type'] == 'ctl_untrt']

Unnamed: 0_level_0,pert_iname,pert_type,is_touchstone,inchi_key_prefix,inchi_key,canonical_smiles,pubchem_cid,fps
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CMAP-000,UnTrt,ctl_untrt,0,-666,-666,-666,-666,


### Cell infos

In [64]:
cell_info['cell_id'] = cell_info.index

In [65]:
cell_info

Unnamed: 0_level_0,cell_type,base_cell_id,precursor_cell_id,modification,sample_type,primary_site,subtype,original_growth_pattern,provider_catalog_id,original_source_vendor,donor_age,donor_sex,donor_ethnicity,cell_id
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
A375,cell line,A375,-666,-666,tumor,skin,malignant melanoma,adherent,CRL-1619,ATCC,54,F,-666,A375
A375.311,cell line,A375,A375,genetically modified to stably express Cas9 pr...,tumor,skin,malignant melanoma,adherent,CRL-1619,ATCC,54,F,-666,A375.311
A549,cell line,A549,-666,-666,tumor,lung,non small cell lung cancer| carcinoma,adherent,CCL-185,ATCC,58,M,Caucasian,A549
A549.311,cell line,A549,A549,genetically modified to stably express Cas9 p...,tumor,lung,non small cell lung cancer| carcinoma,adherent,CCL-185,ATCC,58,M,Caucasian,A549.311
A673,cell line,A673,-666,-666,tumor,bone,ewing's sarcoma,adherent,CRL-1598,ATCC,-666,F,-666,A673
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CD34,primary,CD34,-666,-666,normal,bone,bone marrow,suspension,-666,-666,-666,-666,-666,CD34
PHH,primary,PHH,-666,-666,primary,liver,normal primary liver,-666,-666,CellzDirect,-666,-666,-666,PHH
SKB,primary,SKB,-666,-666,normal,muscle,myoblast,-666,CC-2580,Lonza,-666,-666,-666,SKB
SKL,primary,SKL,-666,-666,primary,muscle,normal primary skeletal muscle cells,adherent,CC-2561,LONZA,-666,-666,-666,SKL


In [66]:
cell_info.precursor_cell_id.value_counts()

-666    84
PC3      2
HA1E     2
NPC      2
MCF7     2
A549     1
NEU      1
A375     1
YAPC     1
HELA     1
HT29     1
Name: precursor_cell_id, dtype: int64

In [67]:
cell_info.cell_type.value_counts()

cell line         83
primary            7
differentiated     6
ESC                1
iPSC               1
Name: cell_type, dtype: int64

In [68]:
cell_info.base_cell_id.value_counts()

HA1E       3
MCF7       3
NPC        3
PC3        3
A549       2
          ..
OV7        1
NCIH716    1
SHSY5Y     1
SNUC5      1
SKB        1
Name: base_cell_id, Length: 82, dtype: int64

In [69]:
cell_info.modification.value_counts()

-666                                                                        72
genetically modified to stably express Cas9 protein                         10
immortalized normal                                                          7
differentiated from iPSC, but not terminally differentiated                  2
terminally differentiated to be neurons                                      1
genetically  modified to stably express Cas9 protein                         1
differentiated from ESC to be motor neurons                                  1
bone marrow cells that were immortalized                                     1
hTERT-immortalized normal kidney cells immunoselected for DBA-positivity     1
NEU exposed to KCl (potassium chloride) solution to activate neurons         1
NPC that were genetically modified to stably express Cas9 protein            1
Name: modification, dtype: int64

In [70]:
cell_info.sample_type.value_counts()

tumor      70
normal     19
primary     8
-666        1
Name: sample_type, dtype: int64

In [71]:
cell_info.primary_site.value_counts()

large intestine                       17
lung                                  13
breast                                 9
haematopoietic and lymphoid tissue     8
ovary                                  6
kidney                                 6
skin                                   6
prostate                               5
bone                                   4
-666                                   4
central nervous system                 3
muscle                                 3
liver                                  3
endometrium                            3
adipose                                2
pancreas                               2
blood                                  1
stomach                                1
vascular system                        1
autonomic ganglia                      1
Name: primary_site, dtype: int64

In [72]:
cell_info.subtype.value_counts()

adenocarcinoma                                          11
colorectal adenocarcinoma                               10
carcinoma                                                7
-666                                                     6
colorectal carcinoma                                     5
malignant melanoma                                       4
non small cell lung cancer| adenocarcinoma               3
normal stem fibroblast-derived iPScs                     3
normal kidney                                            3
pancreatic carcinoma                                     2
acute myeloid leukemia (AML)                             2
non small cell lung cancer| large cell carcinoma         2
epithelial                                               2
hepatocellular carcinoma                                 2
normal primary skeletal muscle cells                     2
non small cell lung cancer| carcinoma                    2
bone marrow                                             

In [73]:
cell_info.original_growth_pattern.value_counts()

adherent      73
suspension    13
mix            7
-666           5
Name: original_growth_pattern, dtype: int64

In [74]:
cell_id_of_sig = sig_info['cell_id']
# Drop rows that are not in cell_info
cell_id_of_sig = cell_id_of_sig.drop(sig_info[~sig_info['cell_id'].isin(cell_info.index)].index)
# Add the corresponding cell information to sig_info
sig_info_of_cell = cell_info.loc[cell_id_of_sig][['primary_site', 'original_growth_pattern', 
                                                  'subtype', 'sample_type', 'cell_type', 'cell_id']]
sig_info_of_cell = pd.get_dummies(sig_info_of_cell) # Get one hot encodings
sig_info_of_cell.index = cell_id_of_sig.index
print(sig_info_of_cell.shape, sig_info.shape)
sig_info_of_cell = sig_info_of_cell.reindex(sig_info.index, fill_value=0)
pd.concat([sig_info, sig_info_of_cell], axis=1, sort=False)

(473269, 146) (473647, 7)


Unnamed: 0_level_0,pert_id,cell_id,pert_idose,pert_itime,pert_idose_value,pert_itime_value,fps,primary_site_-666,primary_site_adipose,primary_site_autonomic ganglia,...,cell_id_SW620,cell_id_SW948,cell_id_T3M10,cell_id_THP1,cell_id_TYKNU,cell_id_U266,cell_id_U2OS,cell_id_U937,cell_id_VCAP,cell_id_WSUDLCL2
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AML001_CD34_24H:A05,DMSO,CD34,0.1 %,24 h,-1.0,24.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,...,0,0,0,0,0,0,0,0,0,0
AML001_CD34_24H:A06,DMSO,CD34,0.1 %,24 h,-1.0,24.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,...,0,0,0,0,0,0,0,0,0,0
AML001_CD34_24H:B05,DMSO,CD34,0.1 %,24 h,-1.0,24.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,...,0,0,0,0,0,0,0,0,0,0
AML001_CD34_24H:B06,DMSO,CD34,0.1 %,24 h,-1.0,24.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,...,0,0,0,0,0,0,0,0,0,0
AML001_CD34_24H:BRD-A03772856:0.37037,BRD-A03772856,CD34,500 nM,24 h,0.5,24.0,"[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TAK004_U2OS_96H:TRCN0000370007:1,TRCN0000370007,U2OS,1 µL,96 h,-1.0,96.0,,0,0,0,...,0,0,0,0,0,0,1,0,0,0
TAK004_U2OS_96H:TRCN0000370678:1,TRCN0000370678,U2OS,1 µL,96 h,-1.0,96.0,,0,0,0,...,0,0,0,0,0,0,1,0,0,0
TAK004_U2OS_96H:TRCN0000370697:1,TRCN0000370697,U2OS,1 µL,96 h,-1.0,96.0,,0,0,0,...,0,0,0,0,0,0,1,0,0,0
TAK004_U2OS_96H:TRCN0000370751:1,TRCN0000370751,U2OS,1 µL,96 h,-1.0,96.0,,0,0,0,...,0,0,0,0,0,0,1,0,0,0


### Gene information

In [75]:
gene_info

Unnamed: 0,pr_gene_id,pr_gene_symbol,pr_gene_title,pr_is_lm,pr_is_bing
0,780,DDR1,discoidin domain receptor tyrosine kinase 1,1,1
1,7849,PAX8,paired box 8,1,1
2,2978,GUCA1A,guanylate cyclase activator 1A,0,0
3,2049,EPHB3,EPH receptor B3,0,1
4,2101,ESRRA,estrogen related receptor alpha,0,1
...,...,...,...,...,...
12323,4034,LRCH4,leucine-rich repeats and calponin homology (CH...,0,1
12324,399664,MEX3D,mex-3 RNA binding family member D,0,1
12325,54869,EPS8L1,EPS8 like 1,0,1
12326,90379,DCAF15,DDB1 and CUL4 associated factor 15,0,1


# Merging both phases

In [5]:
phase = "phase1"
# Data path
path_to_data_1 = paths_to_L1000_files[phase]["path_to_data"]

# Read metadata
sig_info_1 = pd.read_csv(paths_to_L1000_files[phase]["path_to_sig_info"], sep="\t", index_col="sig_id",
                            usecols=["sig_id", "pert_id", "cell_id", "pert_idose", "pert_itime"])
pert_info_1 = pd.read_csv(paths_to_L1000_files[phase]["path_to_pert_info"], sep="\t",
                             index_col="pert_id", usecols=["pert_id", "canonical_smiles"])
cell_info_1 = pd.read_csv(paths_to_L1000_files[phase]["path_to_cell_info"], sep="\t", index_col="cell_id")
gene_info_1 = pd.read_csv(paths_to_L1000_files[phase]["path_to_gene_info"], sep="\t")

# Get list of landmark genes
landmark_gene_list_1 = gene_info_1[gene_info_1['pr_is_lm'] == 1]["pr_gene_id"].astype(str)

# Load fingerprints
# fps_1 = pert_info_1.apply(lambda row: get_fingerprint(row["canonical_smiles"], radius, nBits), axis=1)
# pert_info_1["fps"] = fps_1

phase = "phase2"
# Data path
path_to_data_2 = paths_to_L1000_files[phase]["path_to_data"]

# Read metadata
sig_info_2 = pd.read_csv(paths_to_L1000_files[phase]["path_to_sig_info"], sep="\t", index_col="sig_id",
                            usecols=["sig_id", "pert_id", "cell_id", "pert_idose", "pert_itime"])
pert_info_2 = pd.read_csv(paths_to_L1000_files[phase]["path_to_pert_info"], sep="\t",
                             index_col="pert_id", usecols=["pert_id", "canonical_smiles"])
cell_info_2 = pd.read_csv(paths_to_L1000_files[phase]["path_to_cell_info"], sep="\t", index_col="cell_id")
gene_info_2 = pd.read_csv(paths_to_L1000_files[phase]["path_to_gene_info"], sep="\t")

# Get list of landmark genes
landmark_gene_list_2 = gene_info_2[gene_info_2['pr_is_lm'] == 1]["pr_gene_id"].astype(str)

# Load fingerprints
# fps_2 = pert_info_2.apply(lambda row: get_fingerprint(row["canonical_smiles"], radius, nBits), axis=1)
# pert_info_2["fps"] = fps_2

In [6]:
gene_info_2.equals(gene_info_1)

True

In [7]:
cell_info_2.equals(cell_info_1)

True

In [8]:
pert_info_1.equals(pert_info_2)

False

In [9]:
landmark_gene_list_1.equals(landmark_gene_list_2)

True

In [10]:
pert_info_1.index

Index(['56582', '5981', '7150', 'ABL1_G2A', 'ABL1_T315I', 'ACHE',
       'ACVR1_G328R', 'ACVR1_G356D', 'ACVR1_R206H', 'ACVRL1_C344P',
       ...
       'ccsbBroad304_15726', 'ccsbBroad304_16045', 'ccsbBroad304_16132',
       'ccsbBroad304_99985', 'ccsbBroad304_99988', 'ccsbBroad304_99991',
       'ccsbBroad304_99994', 'ccsbBroad304_99997', 'dsRED', 'renilla'],
      dtype='object', name='pert_id', length=51383)

In [11]:
pert_info_2.index

Index(['BRD-K70792160', 'BRD-K68552125', 'BRD-K92301463', 'BRD-A29731977',
       'BRD-K07954936', 'BRD-K44408410', 'BRD-A20131130', 'BRD-K08703257',
       'BRD-A79431551', 'BRD-K37846922',
       ...
       'BRDN0000585793', 'BRDN0000585515', 'BRDN0000562855', 'BRDN0000562867',
       'BRDN0000562805', 'BRDN0000585417', 'BRDN0000562990', 'BRDN0000585533',
       'BRDN0000563287', 'BRDN0000562919'],
      dtype='object', name='pert_id', length=2170)

In [12]:
pert_intersection = list(set(pert_info_1.index).intersection(pert_info_2.index))
print(len(pert_intersection))

912


In [13]:
pert_info_1.loc[pert_intersection].equals(pert_info_2.loc[pert_intersection])

True

In [17]:
# comp_pert_2 = pert_info_2[["canonical_smiles", "pert_iname", "pert_type"]].loc[pert_intersection]

In [18]:
# comp_pert_1 = pert_info_1[["canonical_smiles", "pert_iname", "pert_type"]].loc[pert_intersection]

In [20]:
# comp_pert_1.equals(comp_pert_2)

In [24]:
# np.where(comp_pert_1 != comp_pert_2)

In [25]:
# np.array(comp_pert_1)

In [26]:
# np.array(comp_pert_2)

only the pert_inames change

In [27]:
sig_intersection = list(set(sig_info_1.index).intersection(sig_info_2.index))

In [28]:
sig_intersection

[]

No signature present in both datasets

In [29]:
pd.concat([sig_info_1, sig_info_2])

Unnamed: 0_level_0,pert_id,cell_id,pert_idose,pert_itime
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AML001_CD34_24H:A05,DMSO,CD34,0.1 %,24 h
AML001_CD34_24H:A06,DMSO,CD34,0.1 %,24 h
AML001_CD34_24H:B05,DMSO,CD34,0.1 %,24 h
AML001_CD34_24H:B06,DMSO,CD34,0.1 %,24 h
AML001_CD34_24H:BRD-A03772856:0.37037,BRD-A03772856,CD34,500 nM,24 h
...,...,...,...,...
XPR002_YAPC.311_96H:G22,BRDN0001054782,YAPC.311,-666,96 h
XPR002_YAPC.311_96H:G23,BRDN0001055014,YAPC.311,-666,96 h
XPR002_YAPC.311_96H:J16,BRDN0000585515,YAPC.311,-666,96 h
XPR002_YAPC.311_96H:M15,BRDN0001054777,YAPC.311,-666,96 h


In [30]:
sig_info_1

Unnamed: 0_level_0,pert_id,cell_id,pert_idose,pert_itime
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AML001_CD34_24H:A05,DMSO,CD34,0.1 %,24 h
AML001_CD34_24H:A06,DMSO,CD34,0.1 %,24 h
AML001_CD34_24H:B05,DMSO,CD34,0.1 %,24 h
AML001_CD34_24H:B06,DMSO,CD34,0.1 %,24 h
AML001_CD34_24H:BRD-A03772856:0.37037,BRD-A03772856,CD34,500 nM,24 h
...,...,...,...,...
TAK004_U2OS_96H:TRCN0000370007:1,TRCN0000370007,U2OS,1 µL,96 h
TAK004_U2OS_96H:TRCN0000370678:1,TRCN0000370678,U2OS,1 µL,96 h
TAK004_U2OS_96H:TRCN0000370697:1,TRCN0000370697,U2OS,1 µL,96 h
TAK004_U2OS_96H:TRCN0000370751:1,TRCN0000370751,U2OS,1 µL,96 h


In [31]:
pert_info_1

Unnamed: 0_level_0,canonical_smiles
pert_id,Unnamed: 1_level_1
56582,-666
5981,-666
7150,-666
ABL1_G2A,-666
ABL1_T315I,-666
...,...
ccsbBroad304_99991,-666
ccsbBroad304_99994,-666
ccsbBroad304_99997,-666
dsRED,-666


In [32]:
pert_info_2

Unnamed: 0_level_0,canonical_smiles
pert_id,Unnamed: 1_level_1
BRD-K70792160,CCN(CC)CCCCN1c2ccccc2Oc2ccc(Cl)cc12
BRD-K68552125,CCCCCCCCCCCCCC(=O)O[C@@H]1[C@@H](C)[C@]2(O)[C@...
BRD-K92301463,CCCCC(C)(C)[C@H](O)\C=C\[C@H]1[C@H](O)CC(=O)[C...
BRD-A29731977,CCCCCC(=O)O[C@@]1(CCC2C3CCC4=CC(=O)CC[C@]4(C)C...
BRD-K07954936,OC(=O)CCCC[C@@H]1SC[C@@H]2NC(=N)N[C@H]12
...,...
BRDN0000585417,-666
BRDN0000562990,-666
BRDN0000585533,-666
BRDN0000563287,-666


In [33]:
pert_concat = pd.concat([pert_info_1, pert_info_2])
pert_concat = pert_concat.loc[~pert_concat.index.duplicated(keep='first')]
pert_concat

Unnamed: 0_level_0,canonical_smiles
pert_id,Unnamed: 1_level_1
56582,-666
5981,-666
7150,-666
ABL1_G2A,-666
ABL1_T315I,-666
...,...
BRDN0000585417,-666
BRDN0000562990,-666
BRDN0000585533,-666
BRDN0000563287,-666


In [34]:
pert_info_1.shape[0] + pert_info_2.shape[0] -len(pert_intersection)

52641

In [3]:
phase = "phase1"
path_to_data = paths_to_L1000_files[phase]["path_to_data"]
# Load all data
df_path = os.path.join(paths_to_L1000_files[phase]["path_to_dir"], "dataframe.pkl")
data_1 = pd.read_pickle(df_path)

phase = "phase2"
path_to_data = paths_to_L1000_files[phase]["path_to_data"]
# Load all data
df_path = os.path.join(paths_to_L1000_files[phase]["path_to_dir"], "dataframe.pkl")
data_2 = pd.read_pickle(df_path)

In [4]:
data_1

Unnamed: 0_level_0,gene_expr
cid,Unnamed: 1_level_1
CPC005_A375_6H:BRD-A85280935-003-01-7:10,"[0.77376896, -0.81846803, 0.18957229, -0.14603..."
CPC005_A375_6H:BRD-A07824748-001-02-6:10,"[-0.64558613, -0.8107487, 0.45906025, -0.22467..."
CPC004_A375_6H:BRD-K20482099-001-01-1:10,"[-5.4496655, 2.393775, 1.2797899, 2.167868, 2...."
CPC005_A375_6H:BRD-K62929068-001-03-3:10,"[0.19340771, -0.58224326, -0.17897698, -1.1820..."
CPC005_A375_6H:BRD-K43405658-001-01-8:10,"[1.0062981, 0.455536, 0.63173795, -0.93641376,..."
...,...
PCLB003_PC3_24H:BRD-A75409952-001-01-6:0.12,"[1.3442287, 0.58684695, 0.056336567, 2.275817,..."
PCLB003_PC3_24H:BRD-A75409952-001-01-6:0.04,"[1.5823274, 0.4941511, 0.44259453, -1.0504124,..."
PCLB003_PC3_24H:BRD-K42573370-001-01-1:10,"[-1.8632, -2.2225, -0.3203, 4.0302, -4.1008, -..."
PCLB003_PC3_24H:BRD-K53665955-001-01-4:0.04,"[-0.13275003, 1.9477, 0.60395, 4.72655, -0.544..."


In [5]:
data_2

Unnamed: 0_level_0,gene_expr
cid,Unnamed: 1_level_1
REP.A001_A375_24H:A03,"[4.2641425, 0.057249196, -2.1393342, -0.221784..."
REP.A001_A375_24H:A04,"[-0.3822108, 0.30431318, -0.9959235, -0.670833..."
REP.A001_A375_24H:A05,"[-0.57171094, -0.75499886, -0.71010953, 0.4288..."
REP.A001_A375_24H:A06,"[0.5843761, -0.58997315, -0.026397973, -0.0652..."
REP.A001_A375_24H:A07,"[0.6583478, -0.2268537, -1.143599, 0.34242612,..."
...,...
LJP007_SKL_24H:O13,"[4.4395, 10.0, 1.6831, 4.5242, -2.6985, 1.6431..."
LJP007_SKL_24H:O14,"[6.175, 2.8649, 1.8397, 4.3375, -1.0555, 0.963..."
LJP007_SKL_24H:O24,"[8.0582, 0.4905, 3.3238, 3.6885, -1.5548, -4.0..."
LJP007_SKL_24H:P24,"[10.0, 9.1524, -1.2545, -4.9315, 0.9744, -3.12..."


In [38]:
set(data_1.index).intersection(data_2.index)

set()

In [39]:
len(set(data_1.columns).intersection(data_2.columns))

978

In [40]:
pd.concat([data_1, data_2], sort=False)

Unnamed: 0_level_0,5720,466,6009,2309,387,3553,427,5898,23365,6657,...,9738,6793,7358,58472,50865,23200,51293,10962,10153,874
cid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CPC005_A375_6H:BRD-A85280935-003-01-7:10,0.773769,-0.818468,0.189572,-0.146031,-0.654002,0.206183,0.626013,-0.260590,0.369380,0.328593,...,-1.191830,-0.061229,-0.505697,0.339170,-0.133465,0.651272,0.196972,0.000822,-0.118087,-0.540854
CPC005_A375_6H:BRD-A07824748-001-02-6:10,-0.645586,-0.810749,0.459060,-0.224676,-0.335681,2.804548,-0.125799,0.304977,-0.895473,3.438638,...,-1.836194,1.507439,-0.571273,1.174893,4.993051,0.908271,-2.183036,-1.593815,-0.200885,-1.985022
CPC004_A375_6H:BRD-K20482099-001-01-1:10,-5.449666,2.393775,1.279790,2.167868,2.333199,1.649395,2.004065,0.466132,2.341779,2.535294,...,-0.352764,2.288361,-0.370153,4.419082,-1.994474,-0.844823,1.017626,-5.340797,-2.409926,2.094843
CPC005_A375_6H:BRD-K62929068-001-03-3:10,0.193408,-0.582243,-0.178977,-1.182025,-1.012651,1.753655,-0.988824,-0.217892,0.807503,-1.357967,...,0.394868,0.418696,-0.712170,-0.002733,0.433721,0.516296,-1.434045,0.340841,0.074960,-5.317751
CPC005_A375_6H:BRD-K43405658-001-01-8:10,1.006298,0.455536,0.631738,-0.936414,-1.213203,1.662564,-0.239190,0.079852,-0.992458,0.676782,...,-1.938225,3.251216,-0.498275,3.756618,0.284775,0.785232,-0.346764,-1.431072,0.186842,0.186379
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LJP007_SKL_24H:O13,2.253500,-9.957100,-3.923000,-2.047700,1.643100,4.245200,1.556700,-3.169900,-2.201900,2.682000,...,-0.586300,1.410900,-2.538600,2.534800,-2.044000,2.143000,3.889800,-4.905300,-1.310100,-2.536600
LJP007_SKL_24H:O14,2.389000,-6.052200,-4.456400,-3.286600,0.963200,2.774700,1.556700,-3.828400,-2.165100,2.369900,...,-0.319500,1.604600,-4.498700,-0.685100,-5.743500,1.820100,3.649200,-3.636500,-2.545800,-2.401500
LJP007_SKL_24H:O24,-2.444300,-0.150300,-0.941800,0.264600,-4.049600,2.436500,-5.328400,1.210300,0.562100,-0.614100,...,0.088200,1.270000,-1.307400,0.510100,-5.238400,-3.759700,0.241800,-4.689700,1.412600,0.655800
LJP007_SKL_24H:P24,2.310400,-2.382000,-10.000000,3.467200,-3.125800,5.853200,-10.000000,-4.087800,2.970700,2.361100,...,10.000000,10.000000,-0.348000,-7.336400,-9.948600,-2.987100,4.624600,-5.629200,8.663500,3.915500
