In [3]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors as rdMD


# Load the training table that holds the SMILES strings.
# Forward slashes are used because a backslash-separated path only resolves
# on Windows, whereas forward slashes are accepted on Windows and POSIX alike.
df = pd.read_csv("data/preprocessed/train_extracted_SMILES.csv")

# Extract features from SMILES
def extract_features(smiles):
    """Compute a fixed set of RDKit descriptors for one SMILES string.

    Args:
        smiles: SMILES text describing a single molecule.

    Returns:
        A dict mapping descriptor name to its computed value, or ``None``
        when ``smiles`` is not a string (for example a missing CSV cell that
        pandas loaded as NaN) or when RDKit cannot parse it.
    """
    # Missing cells arrive as float NaN / None. MolFromSmiles raises on
    # non-string input rather than returning None, so handle them here and
    # report them the same way as any other unparsable entry.
    if not isinstance(smiles, str):
        return None

    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None

    return {
        'MolecularWeight': Descriptors.MolWt(molecule),
        'LogP': Descriptors.MolLogP(molecule),
        'NumHDonors': Descriptors.NumHDonors(molecule),
        'NumHAcceptors': Descriptors.NumHAcceptors(molecule),
        'NumRotatableBonds': Descriptors.NumRotatableBonds(molecule),
        'TPSA': Descriptors.TPSA(molecule),
        'NumRings': Descriptors.RingCount(molecule),
        'NumAliphaticRings': Descriptors.NumAliphaticRings(molecule),
        'NumAromaticRings': Descriptors.NumAromaticRings(molecule),
        'NumSaturatedRings': Descriptors.NumSaturatedRings(molecule),
        'NumHeteroatoms': Descriptors.NumHeteroatoms(molecule),
        'NumHeavyAtoms': Descriptors.HeavyAtomCount(molecule),
        'FractionCSP3': Descriptors.FractionCSP3(molecule),
        'MolMR': Descriptors.MolMR(molecule),
        'NumAmideBonds': rdMD.CalcNumAmideBonds(molecule),
        # NOTE: the two descriptors below were already disabled in this file
        # and are kept disabled.
        # 'BalabanJ': rdMD.CalcBalabanJ(molecule),
        # 'BertzCT': rdMD.CalcBertzCT(molecule),
        'HallKierAlpha': rdMD.CalcHallKierAlpha(molecule),
    }


# Compute the descriptor dict for every SMILES string; extract_features
# returns None for entries it cannot turn into a molecule.
df_rdkit = df['Smiles'].apply(extract_features)
# Swap each None for an empty dict so an unparsable row becomes an all-NaN
# row instead of breaking DataFrame construction, and reuse df's index so the
# column-wise concat below lines up row for row.
df_rdkit = pd.DataFrame(
    [row if row is not None else {} for row in df_rdkit],
    index=df.index,
)
# Append the descriptor columns to the original table and save the result.
df_target = pd.concat([df, df_rdkit], axis=1)
df_target.to_csv('data/preprocessed/train_rdkit.csv', index=False)