In [49]:
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
import numpy as np
from tqdm import tqdm

In [50]:
# Relative paths of the raw training and test CSV files loaded in the next cell.
train_file='../dataset/trainset.csv'
test_file='../dataset/testset.csv'

In [51]:
# Load both splits into DataFrames; later cells derive cleaned copies from these.
train_df=pd.read_csv(train_file)
test_df=pd.read_csv(test_file)

# View the basics of the dataset

In [52]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,SMILES,Label
0,CN(C)C(=N)N=C(N)N,Negative
1,COC(=O)C=CC(O)=O,Negative
2,OC(=O)C=CC1=CC=CC=C1,Negative
3,CC(C)C1=CC(O)=C(C)C=C1,Negative
4,COC1=CC=C(C=C1)C(O)=O,Negative


In [53]:
# Preview the first rows of the test split.
test_df.head()

Unnamed: 0,SMILES,Label
0,CNC(=O)C1=CC=CC=C1SC2=CC3=C(C=C2)C(=NN3)C=CC4=...,Positive
1,CN1C=NC(=C1SC2=NC=NC3=C2NC=N3)[N+](=O)[O-],Positive
2,CN(C)CCC(C1=CC=CC2=CC=CC=C21)(C(C3=CC=CC=C3)C4...,Positive
3,CC(C)(C)C1=CC=C(C=C1)S(=O)(=O)NC2=C(C(=NC(=N2)...,Positive
4,CN1CCN(CC1)CCCOC2=C(C=C3C(=C2)N=CC(=C3NC4=CC(=...,Positive


In [54]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits an extra "None" line after
# each summary.
train_df.info()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1253 entries, 0 to 1252
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   SMILES  1253 non-null   object
 1   Label   1253 non-null   object
dtypes: object(2)
memory usage: 19.7+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   SMILES  208 non-null    object
 1   Label   208 non-null    object
dtypes: object(2)
memory usage: 3.4+ KB
None


# Check for and remove duplicate rows

In [55]:
def chaeck_remove_duplicates(df):
    """Report repeated SMILES strings and drop all but their first occurrence.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``SMILES`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame without the repeated rows (surviving rows keep their
        original index labels), or ``df`` itself, unchanged, when no SMILES
        value occurs more than once.
    """
    # keep=False flags every member of a duplicate group, so the report below
    # can list each repeated string once via unique().
    repeated_mask = df.duplicated('SMILES', keep=False)

    if not repeated_mask.any():
        print("No duplicated SMILES found.")
        return df

    print("Duplicated SMILES found:")
    print(df[repeated_mask]['SMILES'].unique())
    return df.drop_duplicates('SMILES', keep='first')

In [56]:
# Drop rows whose raw SMILES string repeats within a split (first occurrence kept).
train_cleaned_df = chaeck_remove_duplicates(train_df)
test_cleaned_df = chaeck_remove_duplicates(test_df)

No duplicated SMILES found.
No duplicated SMILES found.


# Check the label distribution

In [57]:
# Class balance of each split after exact-string duplicate removal.
train_label_distribution = train_cleaned_df['Label'].value_counts()
test_label_distribution = test_cleaned_df['Label'].value_counts()
print(f"trainset:\n{train_label_distribution}")
print(f"testset:\n{test_label_distribution}")

trainset:
Positive    636
Negative    617
Name: Label, dtype: int64
testset:
Negative    114
Positive     94
Name: Label, dtype: int64


# Check whether the SMILES are canonical

In [58]:
def check_smiles_canonicalization_df(df):
    """Canonicalize the ``SMILES`` column with RDKit and drop unparsable rows.

    Each SMILES string is parsed with ``Chem.MolFromSmiles`` and rewritten
    with ``Chem.MolToSmiles(..., isomericSmiles=True)``. Rows whose string
    cannot be parsed are removed. Three counters are printed: strings that
    were already in canonical form, strings that were rewritten, and strings
    that were invalid.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``SMILES`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index, the invalid rows removed and the
        ``SMILES`` column replaced by the canonical strings.
    """
    n_already_canonical = 0
    n_rewritten = 0
    unparsable_labels = []
    canonical_strings = []

    for row_label, raw_smiles in df["SMILES"].items():
        parsed = Chem.MolFromSmiles(raw_smiles)
        if not parsed:
            # Unparsable input: remember the row so it can be dropped below.
            unparsable_labels.append(row_label)
            continue

        canonical = Chem.MolToSmiles(parsed, isomericSmiles=True)
        if raw_smiles == canonical:
            n_already_canonical += 1
        else:
            n_rewritten += 1
        canonical_strings.append(canonical)

    # After the reset the surviving rows are numbered 0..n-1, which lines up
    # with the default index of the Series built from the collected strings.
    df_clean = df.drop(index=unparsable_labels).reset_index(drop=True)
    df_clean["SMILES"] = pd.Series(canonical_strings)

    print(f"Canonical SMILES count: {n_already_canonical}")
    print(f"Non-canonical SMILES count: {n_rewritten}")
    print(f"Invalid SMILES count: {len(unparsable_labels)}")

    return df_clean


In [59]:
# Replace every SMILES with its RDKit canonical form and drop rows RDKit cannot parse.
train_canonicalization_df = check_smiles_canonicalization_df(train_cleaned_df)
test_canonicalization_df = check_smiles_canonicalization_df(test_cleaned_df)

Canonical SMILES count: 114
Non-canonical SMILES count: 1139
Invalid SMILES count: 0
Canonical SMILES count: 8
Non-canonical SMILES count: 200
Invalid SMILES count: 0


# Check whether the canonicalized data contain duplicates

In [60]:
# Different input strings can canonicalize to the same SMILES, so the duplicate
# check is repeated on the canonical strings (first occurrence kept).
# NOTE(review): if duplicates ever carry different labels, keep='first' picks one silently — confirm.
train_final_df = chaeck_remove_duplicates(train_canonicalization_df)
test_final_df = chaeck_remove_duplicates(test_canonicalization_df)

No duplicated SMILES found.
Duplicated SMILES found:
['Cn1c(=O)c2[nH]cnc2n(C)c1=O' 'N=c1nc(N2CCCCC2)cc(N)n1O'
 'CNC1CCc2[nH]c3ccc(C(N)=O)cc3c2C1']


# Check whether each SMILES can generate a 3D conformation


In [61]:
def generate_3d_coordinates(smiles, random_seed=None, max_iters=10000):
    """Build an MMFF-optimized 3D conformer for a SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES string to parse with RDKit.
    random_seed : int, optional
        Seed written to the ETKDG embedding parameters. ``None`` (the default)
        leaves RDKit's own default in place, i.e. the previous, unseeded
        behaviour. Pass an integer to make the embedding reproducible.
    max_iters : int, optional
        Iteration budget handed to ``AllChem.MMFFOptimizeMolecule``.

    Returns
    -------
    tuple
        ``(mol_with_h, True)`` with the hydrogen-added, embedded and optimized
        molecule on success; ``(None, False)`` if the SMILES cannot be parsed,
        the embedding does not return 0, or the MMFF optimization does not
        return 0.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None, False

    mol_with_h = Chem.AddHs(mol)

    params = AllChem.ETKDG()
    if random_seed is not None:
        params.randomSeed = random_seed

    # EmbedMolecule returns the id of the new conformer; for a molecule with
    # no prior conformers that id is 0, anything else is treated as failure.
    if AllChem.EmbedMolecule(mol_with_h, params) != 0:
        return None, False

    # Only a return code of 0 counts as success; any other code is treated as
    # a failed optimization.
    if AllChem.MMFFOptimizeMolecule(mol_with_h, maxIters=max_iters) != 0:
        return None, False

    return mol_with_h, True

In [62]:
def process_smiles_and_remove_failures(smiles_df):
    """Keep only the rows whose SMILES yields a 3D conformer.

    Every row's ``SMILES`` value is passed to ``generate_3d_coordinates``.
    A row is discarded when that call reports failure or raises; in the
    latter case the offending SMILES and the error message are printed.
    Progress is shown with a tqdm bar.

    Parameters
    ----------
    smiles_df : pandas.DataFrame
        Frame containing a ``SMILES`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame holding the surviving rows with a fresh 0..n-1 index.
    """
    rows_to_drop = []
    progress = tqdm(
        smiles_df.iterrows(),
        total=smiles_df.shape[0],
        desc="Processing SMILES",
    )

    for row_label, record in progress:
        try:
            _, embedded_ok = generate_3d_coordinates(record['SMILES'])
        except Exception as err:
            # Best effort: a molecule that raises is logged and treated as failed.
            print(f"Error processing SMILES at index {row_label}: {record['SMILES']}")
            print(f"Error message: {str(err)}")
            embedded_ok = False

        if not embedded_ok:
            rows_to_drop.append(row_label)

    return smiles_df.drop(rows_to_drop).reset_index(drop=True)


In [69]:
# Drop test rows whose SMILES fails 3D embedding or MMFF optimization.
all_test_df = process_smiles_and_remove_failures(test_final_df) 

Processing SMILES: 100%|█████████████████████████████████████████████████████████████| 205/205 [00:17<00:00, 11.63it/s]


In [70]:
# Same 3D-conformer filter for the training rows (first pass; it is repeated in a later cell).
all_train_df = process_smiles_and_remove_failures(train_final_df) 

Processing SMILES:   8%|████▉                                                       | 104/1253 [00:05<02:14,  8.51it/s][08:42:37] UFFTYPER: Unrecognized charge state for atom: 1
Processing SMILES: 100%|███████████████████████████████████████████████████████████| 1253/1253 [02:57<00:00,  7.07it/s]


In [73]:
# DataFrame.to_csv() returns None when a path is given, so the result is not
# bound to a name (the old `removed_df = ...` only ever held None).
all_train_df.to_csv('../dataset/removed_df.csv', index=False)

In [74]:
# The number of surviving rows changed between passes in the recorded run
# (1253 -> 1244 -> 1242), presumably because the conformer embedding is
# unseeded, so the filter is repeated a fixed number of times.
# Each pass starts from the previous pass's in-memory result rather than
# re-reading the CSV that this cell also overwrites; re-running the cell
# therefore always starts from `all_train_df`, not from the file's current state.
canonical_file = '../dataset/removed_df.csv'
removed_df = all_train_df
len_removed_df = []
for _ in range(10):
    removed_df = process_smiles_and_remove_failures(removed_df)
    len_removed_df.append(len(removed_df))
    # Checkpoint after every pass; one pass took about three minutes in the recorded run.
    removed_df.to_csv(canonical_file, index=False)

Processing SMILES: 100%|███████████████████████████████████████████████████████████| 1244/1244 [03:01<00:00,  6.86it/s]
Processing SMILES: 100%|███████████████████████████████████████████████████████████| 1242/1242 [03:04<00:00,  6.73it/s]
Processing SMILES: 100%|███████████████████████████████████████████████████████████| 1242/1242 [02:58<00:00,  6.95it/s]
Processing SMILES: 100%|███████████████████████████████████████████████████████████| 1242/1242 [03:00<00:00,  6.87it/s]
Processing SMILES: 100%|███████████████████████████████████████████████████████████| 1242/1242 [02:59<00:00,  6.93it/s]
Processing SMILES: 100%|███████████████████████████████████████████████████████████| 1242/1242 [03:04<00:00,  6.73it/s]
Processing SMILES: 100%|███████████████████████████████████████████████████████████| 1242/1242 [03:05<00:00,  6.68it/s]
Processing SMILES: 100%|███████████████████████████████████████████████████████████| 1242/1242 [02:58<00:00,  6.94it/s]
Processing SMILES: 100%|████████████████

In [75]:
# Number of rows remaining after each filtering pass.
print(len_removed_df)

[1242, 1242, 1242, 1242, 1242, 1242, 1242, 1242, 1242, 1242]


# Final check

In [76]:
# The frame left after the repeated conformer filter is the final training set.
final_canonical_train_df = removed_df
# DataFrame.info() prints its own summary and returns None, so it is called
# directly; wrapping it in print() emits an extra "None" line.
final_canonical_train_df.info()
all_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1242 entries, 0 to 1241
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   SMILES  1242 non-null   object
 1   Label   1242 non-null   object
dtypes: object(2)
memory usage: 19.5+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   SMILES  205 non-null    object
 1   Label   205 non-null    object
dtypes: object(2)
memory usage: 3.3+ KB
None


In [77]:
# Class balance of the final splits, for comparison with the counts printed before filtering.
print(f"trainset:\n{final_canonical_train_df['Label'].value_counts()}")
print(f"testset:\n{all_test_df['Label'].value_counts()}")

trainset:
Positive    635
Negative    607
Name: Label, dtype: int64
testset:
Negative    111
Positive     94
Name: Label, dtype: int64


# Write to CSV

In [78]:
# Persist the filtered, canonicalized splits; index=False keeps the row index out of the files.
final_canonical_train_df.to_csv('../dataset/final_canonical_trainset.csv', index=False)
all_test_df.to_csv('../dataset/canonical_offlinetestset.csv', index=False)