In [56]:
import pandas as pd
import numpy as np
import codecs
from rdkit import Chem
from rdkit.Chem.SaltRemover import SaltRemover
from subword_nmt.apply_bpe import BPE

# Preprocess Omics Data 

In [30]:
cell_list = pd.read_csv('./raw_data/cell_info.csv')['depmap_id'].values
gene_list = pd.read_csv('./raw_data/gene_4079_info.csv')['entrez_id'].values
cell_list

array(['ACH-000425', 'ACH-000615', 'ACH-000579', 'ACH-000750',
       'ACH-000730', 'ACH-001190', 'ACH-000788', 'ACH-000147',
       'ACH-000651', 'ACH-000810', 'ACH-000644', 'ACH-000348',
       'ACH-000450', 'ACH-000968', 'ACH-000552', 'ACH-000900',
       'ACH-000971', 'ACH-000572', 'ACH-000811', 'ACH-000401',
       'ACH-000463', 'ACH-000614', 'ACH-000008', 'ACH-000019',
       'ACH-000001', 'ACH-000849', 'ACH-000304', 'ACH-000219',
       'ACH-000555', 'ACH-000915', 'ACH-000288', 'ACH-000148',
       'ACH-000367', 'ACH-000376', 'ACH-000681', 'ACH-000987',
       'ACH-000046', 'ACH-000997', 'ACH-000768', 'ACH-001151',
       'ACH-000655', 'ACH-000232', 'ACH-000969', 'ACH-000090',
       'ACH-000966', 'ACH-000343', 'ACH-000696', 'ACH-000273',
       'ACH-000649', 'ACH-000477', 'ACH-000617', 'ACH-000433',
       'ACH-000428', 'ACH-000551', 'ACH-000338', 'ACH-000825',
       'ACH-000861', 'ACH-000706', 'ACH-000504', 'ACH-000817',
       'ACH-001685', 'ACH-000702', 'ACH-000052', 'ACH-0

In [49]:
# 'tanh_norm'
def normalization(data): 
    std1 = np.nanstd(data, axis=0)  
    data = np.ascontiguousarray(data)
    means1 = np.mean(data, axis=0) 

    data = (data-means1)/std1
    data = np.tanh(data)

    data[np.isnan(data)] = 0
    
    return data

In [47]:
def process_gene(df):

    for i in gene_list.astype('str'):
        if i not in df.columns:
            df[i] = 0.
    
    mean_arr = df.mean()
    for j in cell_list:
        if j not in df.index:
            df.loc[j,:] = mean_arr

    df = df[gene_list.astype('str')]
    df = df.loc[cell_list,:]
    df_arr = normalization(df.values)
    
    print(df_arr.shape)
    return df_arr 

In [31]:
exp_raw = pd.read_csv('./raw_data/omics_data/cell_exp_raw.csv', index_col=0)
mut_raw = pd.read_csv('./raw_data/omics_data/cell_mut_raw.csv', index_col=0)
cn_raw = pd.read_csv('./raw_data/omics_data/cell_cn_raw.csv', index_col=0)
eff_raw = pd.read_csv('./raw_data/omics_data/cell_eff_raw.csv', index_col=0)
dep_raw = pd.read_csv('./raw_data/omics_data/cell_dep_raw.csv', index_col=0)
met_raw = pd.read_csv('./raw_data/omics_data/cell_met_raw.csv', index_col=0)
exp_raw

Unnamed: 0_level_0,7105,64102,8813,57147,55732,2268,3075,2519,2729,4800,...,89839,1124.1,54816.1,11046.1,114483834,548644,114483833,647264,106865373,102724657
DepMap_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACH-000001,5.823495,0.000000,7.369292,2.100978,4.221877,0.042644,0.910733,5.672991,4.676380,4.013462,...,1.545968,0.189034,0.275007,0.000000,0.545968,4.424922,0.000000,0.000000,0.124328,0.000000
ACH-000002,0.189034,0.000000,5.633431,1.263034,3.152183,4.189034,0.163499,4.161888,4.139961,2.799087,...,1.761285,0.000000,0.056584,0.000000,0.799087,3.748461,0.000000,0.000000,0.189034,0.000000
ACH-000003,6.035624,0.084064,7.744767,1.851999,3.895303,0.000000,0.056584,6.597978,4.734981,4.141596,...,1.895303,0.344828,0.042644,0.028569,2.914565,5.350144,0.111031,0.000000,0.070389,0.000000
ACH-000004,2.599318,0.000000,5.310340,2.467279,3.926948,0.918386,4.957915,3.912650,4.878725,4.971314,...,2.704872,0.286881,0.070389,0.454176,2.257011,6.182692,0.000000,0.000000,0.782409,0.000000
ACH-000005,3.051372,0.000000,5.754888,2.969012,5.322289,0.214125,5.762880,4.152995,5.532629,5.330558,...,2.580145,0.454176,0.137504,0.084064,1.560715,5.469886,0.000000,0.000000,0.042644,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ACH-002508,5.556736,0.000000,6.861955,2.220330,2.843984,0.014355,4.788164,5.529509,4.036503,3.463361,...,1.485427,0.000000,0.176323,0.000000,2.056584,3.777157,0.000000,0.000000,0.286881,0.000000
ACH-002509,5.548128,0.344828,6.132166,3.249445,4.156235,0.000000,1.981853,5.844737,3.471187,4.622345,...,0.948601,0.014355,0.275007,0.000000,0.807355,5.042644,0.000000,0.000000,1.422233,0.000000
ACH-002510,4.052242,0.000000,6.295539,1.922198,2.364572,0.000000,1.495695,6.338424,3.768714,4.216455,...,0.176323,0.028569,0.042644,0.028569,2.456806,5.454505,0.000000,0.042644,1.454176,0.042644
ACH-002511,3.727920,0.000000,7.306335,2.833902,3.942984,0.400538,1.843984,4.697107,4.454176,4.144046,...,0.632268,0.028569,0.111031,0.028569,1.570463,4.254745,0.000000,0.000000,0.855990,0.000000


In [51]:
exp_norm = process_gene(exp_raw)
mut_norm = process_gene(mut_raw)
cn_norm = process_gene(cn_raw)
eff_norm = process_gene(eff_raw)
dep_norm = process_gene(dep_raw)
met_norm = process_gene(met_raw)
exp_norm

  data = (data-means1)/std1
  data = (data-means1)/std1


(170, 4079)
(170, 4079)
(170, 4079)
(170, 4079)
(170, 4079)
(170, 4079)


  data = (data-means1)/std1
  data = (data-means1)/std1
  data = (data-means1)/std1


array([[-0.65929888, -0.29371101, -0.3456667 , ..., -0.32113319,
        -0.20751589, -0.17686495],
       [-0.35259911, -0.55027678,  0.86243642, ..., -0.32113319,
        -0.20751589, -0.17686495],
       [ 0.81905023, -0.77037037,  0.87643153, ..., -0.32113319,
        -0.20751589, -0.17686495],
       ...,
       [ 0.63450427, -0.49849807, -0.88172769, ..., -0.32113319,
         0.17637874, -0.17686495],
       [-0.91198369,  0.03851141, -0.44252208, ..., -0.32113319,
         0.51036901, -0.17686495],
       [ 0.80072262, -0.16558187,  0.58852235, ...,  0.74586177,
         0.17637874, -0.17686495]])

In [53]:
# stack six-omics
exp_mut_cn_eff_dep_met = np.dstack((exp_norm,mut_norm,cn_norm,eff_norm,dep_norm,met_norm))
exp_mut_cn_eff_dep_met

array([[[-0.65929888, -0.21863508, -0.5794201 , -0.99297606,
          0.68736301, -0.49532403],
        [-0.29371101, -0.25712316, -0.60552954,  0.49569812,
         -0.37193214, -0.23229456],
        [-0.3456667 , -0.3014243 , -0.03800188, -0.22589936,
         -0.02315676, -0.35504196],
        ...,
        [-0.32113319,  0.        ,  0.34150311,  0.        ,
          0.        ,  0.        ],
        [-0.20751589,  0.        ,  0.10391545,  0.        ,
          0.        ,  0.        ],
        [-0.17686495,  0.        ,  0.81880147,  0.        ,
          0.        ,  0.        ]],

       [[-0.35259911, -0.21863508, -0.25785485, -0.06317779,
         -0.11917116, -0.49532403],
        [-0.55027678, -0.25712316, -0.49403855,  0.02321648,
          0.01061242, -0.32063781],
        [ 0.86243642, -0.3014243 , -0.6364502 ,  0.08373165,
         -0.05021293, -0.03857797],
        ...,
        [-0.32113319,  0.        ,  0.68043079,  0.        ,
          0.        ,  0.        ],
  

In [55]:
# 
exp_mut_cn_eff_dep_met_dict = dict(zip(cell_list,exp_mut_cn_eff_dep_met))
exp_mut_cn_eff_dep_met_dict

{'ACH-000425': array([[-0.65929888, -0.21863508, -0.5794201 , -0.99297606,  0.68736301,
         -0.49532403],
        [-0.29371101, -0.25712316, -0.60552954,  0.49569812, -0.37193214,
         -0.23229456],
        [-0.3456667 , -0.3014243 , -0.03800188, -0.22589936, -0.02315676,
         -0.35504196],
        ...,
        [-0.32113319,  0.        ,  0.34150311,  0.        ,  0.        ,
          0.        ],
        [-0.20751589,  0.        ,  0.10391545,  0.        ,  0.        ,
          0.        ],
        [-0.17686495,  0.        ,  0.81880147,  0.        ,  0.        ,
          0.        ]]),
 'ACH-000615': array([[-0.35259911, -0.21863508, -0.25785485, -0.06317779, -0.11917116,
         -0.49532403],
        [-0.55027678, -0.25712316, -0.49403855,  0.02321648,  0.01061242,
         -0.32063781],
        [ 0.86243642, -0.3014243 , -0.6364502 ,  0.08373165, -0.05021293,
         -0.03857797],
        ...,
        [-0.32113319,  0.        ,  0.68043079,  0.        ,  0.       

In [None]:
# np.save('./0_cell_data/4079g/170_cellGraphs_exp_mut_cn_eff_dep_met_4079_genes_norm.npy',exp_mut_cn_eff_dep_met_dict)

# Generate Drug Substructure Encoding

In [57]:
def process_smi(smi):

    if ';' in smi:
        smi = smi.split(';')[0].split(';')[0].split(';')[0]
    m = Chem.MolFromSmiles(smi)
    remover = SaltRemover()

    try:
        moll, deleted = remover.StripMolWithDeleted(m)
        Chem.AddHs(moll)
    except Exception as e:
        print('except:',e)

    canonical_smi = Chem.MolToSmiles(moll)
    return canonical_smi

In [23]:
def drug2emb_encoder(smile):
        
        vocab_path = './raw_data/drug_substructure/drug_codes.txt'
        sub2idx = np.load('./raw_data/drug_substructure/sub2idx.npy',allow_pickle=True).item()     
        bpe_codes_drug = codecs.open(vocab_path)
        dbpe = BPE(bpe_codes_drug, merges=-1, separator='')

        max_d = 50
        try:
            t1 = dbpe.process_line(smile).split()  
        except:
            print('this molecular is error:')
            print(smile)

        try:
            i1 = np.asarray([sub2idx[i] for i in t1])  
        except:
            i1 = np.array([0])

        l = len(i1)
        if l < max_d:
            i = np.pad(i1, (0, max_d - l), 'constant', constant_values=0)
            input_mask = ([1] * l) + ([0] * (max_d - l))
        else:
            i = i1[:max_d]
            input_mask = [1] * max_d

        return i, np.asarray(input_mask)

In [10]:
# Obtain SMILES of drugs and convert them to canonical SMILES using the Rdkit package
drug_smiles = pd.read_csv('./raw_data/drug_info.csv',index_col=0)['canonical_smi'].values
drug_smiles = [process_smi(smi) for smi in drug_smiles] 
drug_smiles = np.unique(drug_smiles)
drug_smiles

array(['C#CCC(Cc1cnc2nc(N)nc(N)c2n1)c1ccc(C(=O)NC(CCC(=O)O)C(=O)O)cc1',
       'C#Cc1cccc(Nc2ncnc3cc(OCCOC)c(OCCOC)cc23)c1',
       'C1CN1c1nc(N2CC2)nc(N2CC2)n1',
       'C=C(c1ccc(C(=O)O)cc1)c1cc2c(cc1C)C(C)(C)CCC2(C)C',
       'C=C1C(=O)C23C(O)C1CCC2C12COC3(O)C(O)C1C(C)(C)CCC2O',
       'C=C1C(=O)OC(CCCCCCCC)C1C(=O)O',
       'C=C1CC2C(CCC3(C)C(=O)CCC23)C2(C)C=CC(=O)C=C12',
       'C=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1C#CC(C)(C)N1CCN(C)CC1',
       'C=CCNC1=C2CC(C)CC(OC)C(O)C(C)C=C(C)C(OC(N)=O)C(OC)C=CC=C(C)C(=O)NC(=CC1=O)C2=O',
       'C=CCn1c(=O)c2cnc(Nc3ccc(N4CCN(C)CC4)cc3)nc2n1-c1cccc(C(C)(C)O)n1',
       'C=Cc1c(C)c2cc3nc(cc4[nH]c(cc5nc(cc1[nH]2)C(C)=C5CCC(=O)O)c(CCC(=O)OC)c4C)C1(C)C3=CC=C(C(=O)OC)C1C(=O)OC',
       'CC(=Cc1csc(C)n1)C1CC2OC2(C)CCCC(C)C(O)C(C)C(=O)C(C)(C)C(O)CC(=O)N1',
       'CC(=Cc1csc(C)n1)C1CC2OC2CCCC(C)C(O)C(C)C(=O)C(C)(C)C(O)CC(=O)O1',
       'CC(=NNC(=S)N1CCC1)c1ccccn1',
       'CC(=O)C1(O)Cc2c(O)c3c(c(O)c2C(OC2CC(N)C(O)C(C)O2)C1)C(=O)c1ccccc1C3=O',
  

In [25]:
drugSmile_drugSubEmbed = { k:drug2emb_encoder(k) for k in drug_smiles}
drugSmile_drugSubEmbed

{'C#CCC(Cc1cnc2nc(N)nc(N)c2n1)c1ccc(C(=O)NC(CCC(=O)O)C(=O)O)cc1': (array([ 43,  17, 599, 370, 603, 110, 386, 102, 593, 247, 391,  80, 250,
          66, 923,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0])),
 'C#Cc1cccc(Nc2ncnc3cc(OCCOC)c(OCCOC)cc23)c1': (array([  43,  424,  203, 1606,  304,  119,  699,  119,  382,   38,  794,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0]),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
# np.save('./1_drug_data/drugSmile_drugSubEmbed.npy',drugSmile_drugSubEmbed)