This code section is for trying to import the data from GDSC with the list of drug to individual cell line. then convert the name of chemical to SELFIES and SMILES encoding.

In [1]:
import pandas as pd 
import numpy as np 
import selfies as sf 
import pubchempy as pcp 

In [None]:
dataA549_path = 'dataset/A549_drugzscore.csv'
dataA549 = pd.read_csv(dataA549_path)
dataA549 = dataA549.drop(columns=['Count']) #remove the column 'Count'
print(dataA549.head(5))

     ID                          Drug Name               Targets   Z Score
0  1026                       Tanespimycin                 HSP90 -2.986095
1  1425                              BPTES                   GLS -2.534759
2  1368                           BAY-MPS1                  MPS1 -2.209878
3  1369  BAY-MPS-combo-1 (paclitaxel 5 uM)  TTK and microtubules -2.091753
4  1370  BAY-MPS-combo 2 (paclitaxel 1 uM)  TTK and microtubules -2.087849


In [10]:
# Add a new column for SMILES codes
dataA549['SMILES'] = dataA549['Drug Name'].apply(lambda drug: pcp.get_compounds(drug, 'name')[0].canonical_smiles if pcp.get_compounds(drug, 'name') else None)

print(dataA549.head(5))

     ID                          Drug Name               Targets   Z Score  \
0  1026                       Tanespimycin                 HSP90 -2.986095   
1  1425                              BPTES                   GLS -2.534759   
2  1368                           BAY-MPS1                  MPS1 -2.209878   
3  1369  BAY-MPS-combo-1 (paclitaxel 5 uM)  TTK and microtubules -2.091753   
4  1370  BAY-MPS-combo 2 (paclitaxel 1 uM)  TTK and microtubules -2.087849   

                                              SMILES  
0  CC1CC(C(C(C=C(C(C(C=CC=C(C(=O)NC2=CC(=O)C(=C(C...  
1  C1=CC=C(C=C1)CC(=O)NC2=NN=C(S2)CCSCCC3=NN=C(S3...  
2                                               None  
3                                               None  
4                                               None  


In [13]:
print(dataA549.head(10))
print("rows : ", dataA549.shape[0])
print("columns : ", dataA549.shape[1])

     ID                          Drug Name                         Targets  \
0  1026                       Tanespimycin                           HSP90   
1  1425                              BPTES                             GLS   
2  1368                           BAY-MPS1                            MPS1   
3  1369  BAY-MPS-combo-1 (paclitaxel 5 uM)            TTK and microtubules   
4  1370  BAY-MPS-combo 2 (paclitaxel 1 uM)            TTK and microtubules   
5  1014                        Refametinib                      MEK1, MEK2   
6  1461                          TANK_1366  Tankyrase 1/2 (PARP5a, PARP5b)   
7  1498                        Selumetinib                      MEK1, MEK2   
8  1458                          PARP_9495             PARP1, PARP2, PARP7   
9  1038                             NU7441                           DNAPK   

    Z Score                                             SMILES  
0 -2.986095  CC1CC(C(C(C=C(C(C(C=CC=C(C(=O)NC2=CC(=O)C(=C(C...  
1 -2.534759

In [14]:
#preprocess the data by dropping the SMILES with none
dataA549_clean = dataA549.dropna(subset=['SMILES'])
print(dataA549_clean.head(5))
print("rows : ", dataA549_clean.shape[0])
print("columns : ", dataA549_clean.shape[1])

     ID     Drug Name     Targets   Z Score  \
0  1026  Tanespimycin       HSP90 -2.986095   
1  1425         BPTES         GLS -2.534759   
5  1014   Refametinib  MEK1, MEK2 -1.983500   
7  1498   Selumetinib  MEK1, MEK2 -1.810198   
9  1038        NU7441       DNAPK -1.734860   

                                              SMILES  
0  CC1CC(C(C(C=C(C(C(C=CC=C(C(=O)NC2=CC(=O)C(=C(C...  
1  C1=CC=C(C=C1)CC(=O)NC2=NN=C(S2)CCSCCC3=NN=C(S3...  
5  COC1=CC(=C(C(=C1NS(=O)(=O)C2(CC2)CC(CO)O)NC3=C...  
7  CN1C=NC2=C1C=C(C(=C2F)NC3=C(C=C(C=C3)Br)Cl)C(=...  
9  C1COCCN1C2=CC(=O)C3=C(O2)C(=CC=C3)C4=CC=CC5=C4...  
rows :  282
columns :  5


In [16]:
#encode SMILES to SELFIES
dataA549_clean['SELFIES'] = dataA549_clean['SMILES'].apply(lambda x: sf.encoder(x) if x else None)
print(dataA549_clean.head(5))

     ID     Drug Name     Targets   Z Score  \
0  1026  Tanespimycin       HSP90 -2.986095   
1  1425         BPTES         GLS -2.534759   
5  1014   Refametinib  MEK1, MEK2 -1.983500   
7  1498   Selumetinib  MEK1, MEK2 -1.810198   
9  1038        NU7441       DNAPK -1.734860   

                                              SMILES  \
0  CC1CC(C(C(C=C(C(C(C=CC=C(C(=O)NC2=CC(=O)C(=C(C...   
1  C1=CC=C(C=C1)CC(=O)NC2=NN=C(S2)CCSCCC3=NN=C(S3...   
5  COC1=CC(=C(C(=C1NS(=O)(=O)C2(CC2)CC(CO)O)NC3=C...   
7  CN1C=NC2=C1C=C(C(=C2F)NC3=C(C=C(C=C3)Br)Cl)C(=...   
9  C1COCCN1C2=CC(=O)C3=C(O2)C(=CC=C3)C4=CC=CC5=C4...   

                                             SELFIES  
0  [C][C][C][C][Branch2][=Branch1][=Branch1][C][B...  
1  [C][=C][C][=C][Branch1][Branch1][C][=C][Ring1]...  
5  [C][O][C][=C][C][=Branch2][Branch1][C][=C][Bra...  
7  [C][N][C][=N][C][=C][Ring1][Branch1][C][=C][Br...  
9  [C][C][O][C][C][N][Ring1][=Branch1][C][=C][C][...  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataA549_clean['SELFIES'] = dataA549_clean['SMILES'].apply(lambda x: sf.encoder(x) if x else None)


In [17]:
none_count = dataA549_clean['SELFIES'].isna().sum()
print("Number of None values in SELFIES column:", none_count)

Number of None values in SELFIES column: 0


In [None]:
# note: success encoding SMILES and SELFIES from the database CCL with many drug compounds. (source GDSC)
'''
- https://aspuru.substack.com/p/molecular-graph-representations-and
- https://github.com/aspuru-guzik-group/selfies
- https://github.com/gadsbyfly/PyBioMed/blob/master/PyBioMed/doc/User_guide.rst
'''