In [1]:

import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, GRU, LSTM, Embedding, Dropout, BatchNormalization
from tensorflow.keras.preprocessing import sequence
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import KFold

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json
import itertools
from rdkit import Chem
from rdkit.Chem import AllChem

import torch
import torch.nn as nn
from keras.models import Sequential
from keras.layers import LSTM, Dense

da = pd.read_parquet('data/de_train.parquet')




In [2]:
### do the NLP and the gene selections
def generate_nlp(s):
    chars = [char for char in s]
    molecule = Chem.MolFromSmiles(s)

    # 计算分子量
    mw = AllChem.CalcExactMolWt(molecule)
    
    # 基本统计
    total_chars = len(chars)
    count_B = chars.count('B')
    count_C = chars.count('C')
    count_c = chars.count('c')
    count_N = chars.count('N')
    count_n = chars.count('n')
    count_O = chars.count('O')

    count_F = chars.count('F')
    count_P = chars.count('P')
    count_S = chars.count('S')
    count_Cl = chars.count('Cl')
    count_Br = chars.count('Br')
    
    count_numbers = sum(chars.count(str(digit)) for digit in range(1, 10))
#     count_1 = chars.count('1')
#     count_2 = chars.count('2')
    
    count_chiral_S = chars.count('@@')
    count_chiral_R = chars.count('@') - count_chiral_S
    
    count_open_bracket = chars.count('(')
    
    count_open_s_bracket = chars.count('[')

    
    count_double = chars.count('=')
    count_triple = chars.count('#')
    count_aromatic = chars.count(':')


    
    # 结构特征
    in_bracket = ''.join(chars).split('(')[1].split(')')[0] if '(' in chars and ')' in chars else ''
    chars_in_bracket = len(in_bracket)
    max_consecutive_C = max(len(list(g)) for k, g in itertools.groupby(chars) if k == 'C')
    
    # Calculating num_rings and num_substructures
    num_rings = count_numbers // 2

    
    # Creating a pandas Series with all counts and structural features
    data = {
        'SMILES' : s,
        'weight': mw,
        'total_chars': total_chars,
        'count_B': count_B,
        'count_C': count_C,
        'count_c': count_c,
        'count_N': count_N,
        'count_n': count_n,
        'count_O': count_O,

        'count_F': count_F,
        'count_P': count_P,
        'count_S': count_S,
        'count_Cl': count_Cl,
        'count_Br': count_Br,
        
        'count_numbers': count_numbers,

        'count_chiral_S': count_chiral_S,
        'count_chiral_R': count_chiral_R,
        
        'count_open_bracket': count_open_bracket,

        'count_open_s_bracket': count_open_s_bracket,

        'count_double': count_double,
        'count_triple': count_triple,
        'count_aromatic': count_aromatic,
        
        'chars_in_bracket': chars_in_bracket,
        'max_consecutive_C': max_consecutive_C,
        'num_rings': num_rings
    }
    
    return pd.Series(data)

def read_gene(p):
    with open(p, 'r') as file:
        data = json.load(file)
        print("File successfully read.")

    return(pd.DataFrame(data).Gene)

In [3]:
### read in data
da = pd.read_parquet('data/de_train.parquet')
drug_smile = np.unique(da.SMILES)
chem_train = np.load("smile_output/chemberta_train.npy")
chem_test = np.load("smile_output/chemberta_test.npy")

In [4]:
NK_file_path = "data/cell_type_category_rna_NK-cells_Cell.json"
T_file_path = "data/cell_type_category_rna_T-cells_Cell.json"
B_file_path = "data/cell_type_category_rna_B-cells_Cell.json"
Myeloid_file_path = "data/cell_type_category_rna_monocytes_Cell.json"
columns = da.columns

NK_gene = read_gene(NK_file_path)
T_gene = read_gene(T_file_path)
B_gene = read_gene(B_file_path)
Myeloid_gene = read_gene(Myeloid_file_path)
gene = columns[5:]

T_genes = [g for g in T_gene if g in gene]
B_genes = [g for g in B_gene if g in gene]
NK_genes = [g for g in NK_gene if g in gene]
Myeloid_genes = [g for g in Myeloid_gene if g in gene]


File successfully read.
File successfully read.
File successfully read.
File successfully read.


In [5]:
np.unique(da.cell_type)

array(['B cells', 'Myeloid cells', 'NK cells', 'T cells CD4+',
       'T cells CD8+', 'T regulatory cells'], dtype=object)

In [7]:
all_genes = set(T_genes) | set(B_genes) | set(NK_genes) | set(Myeloid_genes)

cell_data = da[da['cell_type'] == 'B cells']
B_mean = cell_data[B_genes].mean(axis=0)

cell_data = da[da['cell_type'] == 'NK cells']
NK_mean = cell_data[NK_genes].mean(axis=0)

cell_data = da[da['cell_type'] == 'Myeloid cells']
Myeloid_mean = cell_data[Myeloid_genes].mean(axis=0)

cell_data = da[da['cell_type'] == 'T regulatory cells']
TR_mean = cell_data[T_genes].mean(axis=0)

cell_data = da[da['cell_type'] == 'T cells CD4+']
T4_mean = cell_data[T_genes].mean(axis=0)

cell_data = da[da['cell_type'] == 'T cells CD4+']
T8_mean = cell_data[T_genes].mean(axis=0)

cell_types = ['B cells', 'Myeloid cells', 'NK cells', 'T cells CD4+', 'T cells CD8+', 'T regulatory cells']
cell_gene_df = pd.DataFrame(index=cell_types, columns=all_genes)

cell_gene_df.index.name = 'cell_type'
# cell_gene_df = cell_gene_df.fillna(0)

In [11]:
cell_gene_df.loc['B cells', B_genes] = B_mean
cell_gene_df.loc['Myeloid cells', Myeloid_genes] = Myeloid_mean
cell_gene_df.loc['NK cells', B_genes] = NK_mean
cell_gene_df.loc['T cells CD4+', B_genes] = T4_mean
cell_gene_df.loc['T cells CD8+', B_genes] = T8_mean
cell_gene_df.loc['T regulatory cells', B_genes] = TR_mean
cell_gene_df = cell_gene_df.fillna(0)

In [12]:
cell_gene_df

Unnamed: 0_level_0,TRAV36DV7,LRRFIP1,TBK1,TRAV12-2,ADAM8,WAS,HSH2D,ADGRE2,SIRPB1,NAGA,...,SNX18,RNASE6,BLM,NLRC5,AXIN1,LYAR,CCDC69,MAP2K1,APOBR,VMO1
cell_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B cells,0,0.0,0.0,0,0,-3.260323,-1.523301,0.0,0.0,0.0,...,0.0,-1.759516,0.0,0,0,0,0.0,0.0,0,0.0
Myeloid cells,0,0.634658,0.164204,0,0,-3.297664,0.0,-1.170606,-0.359801,-5.421365,...,0.754875,0.0,0.565769,0,0,0,0.790919,-0.88294,0,-0.439066
NK cells,0,0.0,0.0,0,0,-0.788506,-0.305869,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0.0,0.0,0,0.0
T cells CD4+,0,0.0,0.0,0,0,-1.17643,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0.0,0.0,0,0.0
T cells CD8+,0,0.0,0.0,0,0,-1.17643,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0.0,0.0,0,0.0
T regulatory cells,0,0.0,0.0,0,0,-0.264482,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0.0,0.0,0,0.0


In [None]:
# cell_gene_df.to_csv("cell_gene_mean.csv")

In [14]:
stat_smile = pd.DataFrame()

# Convert the list to a pandas Series
drug_series = pd.Series(drug_smile)

# Apply the generate_nlp function to each element and store the result in the dataframe
stat_smile = drug_series.apply(generate_nlp).reset_index(drop=True)
stat_smile

Unnamed: 0,SMILES,weight,total_chars,count_B,count_C,count_c,count_N,count_n,count_O,count_F,...,count_chiral_S,count_chiral_R,count_open_bracket,count_open_s_bracket,count_double,count_triple,count_aromatic,chars_in_bracket,max_consecutive_C,num_rings
0,C/C(=C\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[...,506.281443,99,0,24,3,1,1,5,0,...,0,11,11,7,3,0,0,8,3,3
1,C1CCC(C(CC2CCCCN2)C2CCCCC2)CC1,277.276950,30,0,19,0,1,0,0,0,...,0,0,2,0,0,0,0,1,5,3
2,C=C1/C(=C\C=C2/CCC[C@]3(C)[C@@H]([C@H](C)CC[C@...,416.329045,88,0,27,0,0,0,3,0,...,0,12,7,7,3,0,0,16,3,3
3,C=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1OCCCN...,485.162996,53,0,11,14,3,2,3,1,...,0,0,4,0,2,0,0,2,3,4
4,C=CC(=O)Nc1cccc(Nc2nc(Nc3ccc(OCCOC)cc3)ncc2F)c1,423.170668,47,0,6,16,3,2,3,1,...,0,0,4,0,2,0,0,2,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,O=C1c2cccc3cccc(c23)C(=O)N1CCCCCC(O)=NO,326.126657,39,0,8,10,2,0,4,0,...,0,0,3,0,3,0,0,3,6,3
142,OC1(c2ccc(Cl)c(C(F)(F)F)c2)CCN(CCCC(c2ccc(F)cc...,523.170133,63,0,11,18,1,0,1,5,...,0,0,9,0,0,0,0,5,4,4
143,OCCCNc1cc(-c2ccnc(Nc3cccc(Cl)c3)n2)ccn1,355.119988,39,0,4,15,2,3,1,0,...,0,0,3,0,0,0,0,7,3,3
144,c1cc(OCCN2CCCCC2)cc(-c2[nH]nc3ccc(-c4nc[nH]n4)...,388.201159,53,0,7,15,1,5,1,0,...,0,0,3,2,0,0,0,11,5,5


In [23]:
X = da.loc[:,columns[0:4]]
X = pd.merge(X, stat_smile, on='SMILES', how='inner')
X = pd.merge(X, cell_gene_df, on='cell_type', how='inner')
X_drug_gene = X.drop(columns=['cell_type','sm_lincs_id', 'SMILES', 'sm_name'])
X_drug_gene.shape

(614, 1471)

In [24]:
X

Unnamed: 0,cell_type,sm_name,sm_lincs_id,SMILES,weight,total_chars,count_B,count_C,count_c,count_N,...,SNX18,RNASE6,BLM,NLRC5,AXIN1,LYAR,CCDC69,MAP2K1,APOBR,VMO1
0,NK cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,344.108026,38,0,2,21,0,...,0.000000,0.0,0.000000,0,0,0,0.000000,0.00000,0,0.000000
1,NK cells,Mometasone Furoate,LSM-3349,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...,520.141944,101,0,25,4,0,...,0.000000,0.0,0.000000,0,0,0,0.000000,0.00000,0,0.000000
2,NK cells,Idelalisib,LSM-1205,CC[C@H](Nc1ncnc2[nH]cnc12)c1nc2cccc(F)c2c(=O)n...,415.155686,56,0,3,19,1,...,0.000000,0.0,0.000000,0,0,0,0.000000,0.00000,0,0.000000
3,NK cells,Vandetanib,LSM-1199,COc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCC1CCN(C)CC1,474.106666,45,1,8,14,2,...,0.000000,0.0,0.000000,0,0,0,0.000000,0.00000,0,0.000000
4,NK cells,Bosutinib,LSM-1190,COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc...,529.164745,59,0,13,15,4,...,0.000000,0.0,0.000000,0,0,0,0.000000,0.00000,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,Myeloid cells,Penfluridol,LSM-2334,OC1(c2ccc(Cl)c(C(F)(F)F)c2)CCN(CCCC(c2ccc(F)cc...,523.170133,63,0,11,18,1,...,0.754875,0.0,0.565769,0,0,0,0.790919,-0.88294,0,-0.439066
610,Myeloid cells,Dactolisib,LSM-4255,Cn1c(=O)n(-c2ccc(C(C)(C)C#N)cc2)c2c3cc(-c4cnc5...,469.190260,64,0,5,25,1,...,0.754875,0.0,0.565769,0,0,0,0.790919,-0.88294,0,-0.439066
611,Myeloid cells,O-Demethylated Adapalene,LSM-6237,O=C(O)c1ccc2cc(-c3ccc(O)c(C45CC6CC(CC(C6)C4)C5...,398.188195,56,0,11,16,0,...,0.754875,0.0,0.565769,0,0,0,0.790919,-0.88294,0,-0.439066
612,Myeloid cells,Oprozomib (ONX 0912),LSM-45496,COC[C@H](NC(=O)c1cnc(C)s1)C(=O)N[C@@H](COC)C(=...,532.199170,83,0,16,9,3,...,0.754875,0.0,0.565769,0,0,0,0.790919,-0.88294,0,-0.439066


In [34]:
submission = pd.read_csv('data/sample_submission.csv') #  提交
id_pair = pd.read_csv('data/id_map.csv') #  每个id对应的药物与细胞类型
id_col = ["cell_type", "sm_name"]
X = da.loc[:,columns[0:4]]
X = pd.merge(X, stat_smile, on='SMILES', how='inner')
drug_info = X.drop(columns=["cell_type", "sm_lincs_id", "SMILES"]).drop_duplicates()

merged_df = pd.merge(id_pair, drug_info, on='sm_name', how='inner')

final_merged_df = pd.merge(merged_df, cell_gene_df, on='cell_type', how='inner')


X_test = final_merged_df.drop(columns = ["id", "cell_type", "sm_name"])

In [37]:
X_drug_gene.to_csv("Xtrain_nlp_gene_mean.csv")

In [39]:
X_drug_gene

Unnamed: 0,weight,total_chars,count_B,count_C,count_c,count_N,count_n,count_O,count_F,count_P,...,SNX18,RNASE6,BLM,NLRC5,AXIN1,LYAR,CCDC69,MAP2K1,APOBR,VMO1
0,344.108026,38,0,2,21,0,2,0,0,0,...,0.000000,0.0,0.000000,0,0,0,0.000000,0.00000,0,0.000000
1,520.141944,101,0,25,4,0,0,5,0,0,...,0.000000,0.0,0.000000,0,0,0,0.000000,0.00000,0,0.000000
2,415.155686,56,0,3,19,1,6,1,1,0,...,0.000000,0.0,0.000000,0,0,0,0.000000,0.00000,0,0.000000
3,474.106666,45,1,8,14,2,2,2,1,0,...,0.000000,0.0,0.000000,0,0,0,0.000000,0.00000,0,0.000000
4,529.164745,59,0,13,15,4,1,3,0,0,...,0.000000,0.0,0.000000,0,0,0,0.000000,0.00000,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,523.170133,63,0,11,18,1,0,1,5,0,...,0.754875,0.0,0.565769,0,0,0,0.790919,-0.88294,0,-0.439066
610,469.190260,64,0,5,25,1,4,1,0,0,...,0.754875,0.0,0.565769,0,0,0,0.790919,-0.88294,0,-0.439066
611,398.188195,56,0,11,16,0,0,3,0,0,...,0.754875,0.0,0.565769,0,0,0,0.790919,-0.88294,0,-0.439066
612,532.199170,83,0,16,9,3,1,7,0,0,...,0.754875,0.0,0.565769,0,0,0,0.790919,-0.88294,0,-0.439066


In [38]:
X_test.to_csv("Xtest_nlp_gene_mean.csv")