In [5]:
from rdkit import Chem
import numpy as np
import pandas as pd
from joblib import Parallel, delayed

def make_smile_canonical(smile):
    """Return the RDKit canonical SMILES for ``smile``, or ``np.nan`` on failure.

    Canonicalising maps different spellings of the same structure to one
    string so duplicates can be detected; for example '*C=C(*)C' and
    '*C(=C*)C' describe the same structure.

    Parameters
    ----------
    smile : str
        SMILES string to canonicalise.

    Returns
    -------
    str or float
        The canonical SMILES, or ``np.nan`` when the input is not text,
        cannot be parsed by RDKit, or RDKit raises during conversion.
    """
    # Missing values (None / NaN) and other non-text inputs can never be
    # parsed, so short-circuit instead of relying on RDKit raising.
    if not isinstance(smile, (str, bytes)):
        return np.nan
    try:
        mol = Chem.MolFromSmiles(smile)
        # RDKit signals an unparseable SMILES by returning None, not by raising.
        if mol is None:
            return np.nan
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        # Best-effort: one bad molecule must not abort the whole batch.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so a long run can still be interrupted.
        return np.nan

def process_chunk(df_chunk: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame equal to ``df_chunk`` plus a ``canonical_smiles`` column.

    The new column holds ``make_smile_canonical`` applied to every value of
    the ``smiles`` column. The input frame itself is not modified, which
    avoids side effects on data shared with the caller.
    """
    canonical = df_chunk["smiles"].map(make_smile_canonical)
    # assign() builds a new DataFrame rather than writing into df_chunk.
    return df_chunk.assign(canonical_smiles=canonical)

In [6]:
# Read only the 'smiles' column from the cleaned ZINC parquet file.
ZINC_df = pd.read_parquet(
    path='../data/raw/ZINC_cleaned.parquet',
    columns=['smiles'],
)
ZINC_df

Unnamed: 0,smiles
0,O=C1NCCN1[C@@H]1CCC[NH+](Cc2cccc(O)c2)C1
1,CC(=O)N1c2ccc(S(=O)(=O)N3CCCC3)cc2C[C@@H]1C(=O...
2,Cc1nc(-c2ccc(NC(=O)[C@H]3C[C@H]4CC[C@@H]3O4)cc...
3,COc1ccc(C(=O)[C@H](C)Sc2nc(-c3cccs3)n[n-]2)cc1OC
4,CCOC(=O)c1sc(NC(=O)CCCS(=O)(=O)c2ccc(F)cc2)nc1...
...,...
22992516,O=S(=O)([O-])C1=CC2=NC(c3ccccc3)=N[C@H]2C=C1
22992517,CN(Cc1nccs1)C(=O)[C@@H]1CSc2ccccc21
22992518,CCCN(CC(F)F)C(=O)N[C@H](C)c1nc(-c2cccc(F)c2)no1
22992519,Cc1cc(NC(=O)c2ccc3[nH]c[nH+]c3c2)ncc1Br


In [7]:
n_jobs = 16

# Cut the frame into n_jobs contiguous positional slices. Calling
# np.array_split directly on a DataFrame goes through the deprecated
# DataFrame.swapaxes and emits a FutureWarning on recent pandas versions.
n_rows = len(ZINC_df)
bounds = np.linspace(0, n_rows, n_jobs + 1, dtype=int)
chunks = [ZINC_df.iloc[start:stop] for start, stop in zip(bounds[:-1], bounds[1:])]

# One task per chunk, executed by loky worker processes. batch_size is left
# at joblib's default: it counts tasks per dispatch (not rows), so a value of
# 10_000 has no useful meaning when there are only n_jobs tasks.
processed_chunks = Parallel(n_jobs=n_jobs, backend="loky")(
    delayed(process_chunk)(chk) for chk in chunks
)

# Chunks come back in submission order, so concatenation restores row order.
ZINC_df = pd.concat(processed_chunks, ignore_index=True)
assert len(ZINC_df) == n_rows, "row count changed while chunking/concatenating"

  return bound(*args, **kwds)


In [11]:
# Write ZINC_df to a snappy-compressed parquet file via pyarrow;
# index=False leaves the DataFrame index out of the written file.
ZINC_df.to_parquet(
    '../data/pretrain/ZINC_canonical_cleaned.parquet',
    engine='pyarrow',
    compression='snappy',
    index=False
) # <-- modified part

In [12]:
# Preview the first rows to compare 'smiles' with 'canonical_smiles'.
ZINC_df.head()

Unnamed: 0,smiles,canonical_smiles
0,O=C1NCCN1[C@@H]1CCC[NH+](Cc2cccc(O)c2)C1,O=C1NCCN1[C@@H]1CCC[NH+](Cc2cccc(O)c2)C1
1,CC(=O)N1c2ccc(S(=O)(=O)N3CCCC3)cc2C[C@@H]1C(=O...,CC(=O)N1c2ccc(S(=O)(=O)N3CCCC3)cc2C[C@@H]1C(=O...
2,Cc1nc(-c2ccc(NC(=O)[C@H]3C[C@H]4CC[C@@H]3O4)cc...,Cc1nc(-c2ccc(NC(=O)[C@H]3C[C@H]4CC[C@@H]3O4)cc...
3,COc1ccc(C(=O)[C@H](C)Sc2nc(-c3cccs3)n[n-]2)cc1OC,COc1ccc(C(=O)[C@H](C)Sc2nc(-c3cccs3)n[n-]2)cc1OC
4,CCOC(=O)c1sc(NC(=O)CCCS(=O)(=O)c2ccc(F)cc2)nc1...,CCOC(=O)c1sc(NC(=O)CCCS(=O)(=O)c2ccc(F)cc2)nc1...
