In [1]:
from rdkit import Chem
import pandas as pd
import numpy as np

In [2]:
# Relative paths to the training and test CSV files read in the next cell.
train_file='../dataset/trainset.csv'
test_file='../dataset/testset.csv'

In [3]:
# Read each split into its own DataFrame.
train_pd=pd.read_csv(train_file)
test_pd=pd.read_csv(test_file)

# View the basics of the dataset

In [4]:
# Preview the first rows of the training set.
train_pd.head()

Unnamed: 0,ID,SMILES,Label
0,1,CN(C)C(=N)N=C(N)N,Negative
1,2,COC(=O)C=CC(O)=O,Negative
2,3,OC(=O)C=CC1=CC=CC=C1,Negative
3,4,CC(C)C1=CC(O)=C(C)C=C1,Negative
4,5,COC1=CC=C(C=C1)C(O)=O,Negative


In [5]:
# Preview the first rows of the test set.
test_pd.head()

Unnamed: 0,ID,SMILES,Label
0,1,CNC(=O)C1=CC=CC=C1SC2=CC3=C(C=C2)C(=NN3)C=CC4=...,Positive
1,2,CN1C=NC(=C1SC2=NC=NC3=C2NC=N3)[N+](=O)[O-],Positive
2,3,CN(C)CCC(C1=CC=CC2=CC=CC=C21)(C(C3=CC=CC=C3)C4...,Positive
3,4,CC(C)(C)C1=CC=C(C=C1)S(=O)(=O)NC2=C(C(=NC(=N2)...,Positive
4,5,CN1CCN(CC1)CCCOC2=C(C=C3C(=C2)N=CC(=C3NC4=CC(=...,Positive


In [6]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
train_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1253 entries, 0 to 1252
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      1253 non-null   int64 
 1   SMILES  1253 non-null   object
 2   Label   1253 non-null   object
dtypes: int64(1), object(2)
memory usage: 29.5+ KB
None


In [7]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
test_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      208 non-null    int64 
 1   SMILES  208 non-null    object
 2   Label   208 non-null    object
dtypes: int64(1), object(2)
memory usage: 5.0+ KB
None


# Check whether any compounds are duplicated

In [8]:
def check_duplicates(smiles_series):
    """Count the entries of ``smiles_series`` that occur more than once.

    Every member of a repeated group is counted (``keep=False``), so a value
    that appears twice contributes 2 to the total, not 1.
    """
    repeated_mask = smiles_series.duplicated(keep=False)
    return repeated_mask.sum()

In [9]:
# Count, separately for each split, the rows whose SMILES string occurs more than once.
duplicates_train_count=check_duplicates(train_pd['SMILES'])
duplicates_test_count=check_duplicates(test_pd['SMILES'])
print(f"trainset duplicate count:\n{duplicates_train_count}")
print(f"testset duplicate count:\n{duplicates_test_count}")

trainset duplicate count:
0
testset duplicate count:
0


# View label distribution

In [10]:
# Tally how many rows carry each label value in the training and test sets.
train_counts = train_pd['Label'].value_counts()
test_counts = test_pd['Label'].value_counts()
print(f"trainset:\n{train_counts}")
print(f"testset:\n{test_counts}")

trainset:
Positive    636
Negative    617
Name: Label, dtype: int64
testset:
Negative    114
Positive     94
Name: Label, dtype: int64


# Check whether the SMILES are canonical

In [11]:
def check_smiles_canonicalization(smiles_list):
    """Classify SMILES strings as canonical, non-canonical or invalid.

    Each entry is parsed with RDKit and written back out as a canonical
    (isomeric) SMILES. An entry counts as canonical when it is already
    character-for-character equal to that output, as non-canonical when it
    parses but differs, and as invalid when RDKit cannot parse it.

    Parameters
    ----------
    smiles_list : pandas.Series or iterable of str
        SMILES strings to check.

    Returns
    -------
    canonical_count : int
        Number of entries already in canonical form.
    non_canonical_count : int
        Number of parseable entries that differ from their canonical form.
    invalid_count : int
        Number of entries RDKit could not parse.
    canonicalized_smiles_series : pandas.Series
        Canonical SMILES of the parseable entries only. Each value keeps the
        index label (Series input) or position (any other iterable) of the
        entry it was derived from, so the result can still be matched back to
        the input rows when invalid entries have been left out.
    """
    # Pair every entry with a label identifying where it came from.
    if isinstance(smiles_list, pd.Series):
        labelled_smiles = smiles_list.items()
    else:
        labelled_smiles = enumerate(smiles_list)

    canonicalized_smiles = []
    kept_labels = []
    canonical_count = 0
    non_canonical_count = 0
    invalid_count = 0
    for label, smiles in labelled_smiles:
        molecule = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals a parse failure by returning None.
        if molecule is None:
            invalid_count += 1
            continue
        canonical_smiles = Chem.MolToSmiles(molecule, isomericSmiles=True)
        if smiles == canonical_smiles:
            canonical_count += 1
        else:
            non_canonical_count += 1
        canonicalized_smiles.append(canonical_smiles)
        kept_labels.append(label)

    # Re-use the source labels instead of a fresh 0..k-1 index; otherwise every
    # value after a skipped (invalid) entry would shift onto the wrong row.
    canonicalized_smiles_series = pd.Series(canonicalized_smiles, index=kept_labels)

    return canonical_count,non_canonical_count,invalid_count,canonicalized_smiles_series

In [12]:
# Run the canonicalization check on the training SMILES and report the three tallies.
canonical_count,non_canonical_count,invalid_count,canonicalized_smiles_series = check_smiles_canonicalization(train_pd['SMILES'])
print(f"count of canonical SMILES: {canonical_count}")
print(f"count of non_canonical SMILES: {non_canonical_count}")
print(f"count of invalid SMILES: {invalid_count}")

count of canonical SMILES: 114
count of non_canonical SMILES: 1139
count of invalid SMILES: 0


# View whether the canonicalized data are duplicated

In [13]:
# Repeat the duplicate check on the canonicalized training SMILES.
duplicates_count=check_duplicates(canonicalized_smiles_series)
print(f"canonical trainset duplicate count:\n{duplicates_count}")

canonical trainset duplicate count:
0


# Generate a new DataFrame with the canonical SMILES

In [14]:
# The canonicalization step leaves out SMILES that RDKit cannot parse, so the
# series can be shorter than the label column. Pairing the two in that state
# would attach labels to the wrong compounds, so stop before writing the file.
assert len(canonicalized_smiles_series) == len(train_pd), (
    "canonicalized SMILES do not line up with the training labels"
)
canonical_df = pd.DataFrame({'SMILES': canonicalized_smiles_series, 'Label': train_pd['Label']})
canonical_df.to_csv('../dataset/canonical_trainset.csv', index=False)