In [None]:
import ast
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from rdkit.Chem import Descriptors

from fg_funcs import mol_to_fingerprint, safe_mol_from_smiles, fg_to_array, fp_to_array

In [None]:
# Read the dataset CSV into `chembl`, failing early with a clear message
# when the file is not where we expect it.
data_path = 'data/chembl_35_fg_scaf.csv'
if not os.path.exists(data_path):
    raise FileNotFoundError(f"Dataset not found at {data_path}")
chembl = pd.read_csv(data_path)

In [None]:
# Parse each stringified `fgs` entry into a Python object (expected: a list).
# ast.literal_eval only accepts Python literals, so unlike eval() it cannot
# execute code embedded in the CSV; a malformed entry raises
# ValueError/SyntaxError instead of being run.
# Non-string values (e.g. NaN, or already-parsed lists on a re-run) pass through.
chembl['fgs'] = chembl['fgs'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Drop rows whose fgs entry is missing or empty
# (.str.len() returns the element count for list values)
chembl = chembl[chembl['fgs'].notna() & (chembl['fgs'].str.len() > 0)]

In [None]:
# Flatten the per-molecule functional-group lists into one long list
# (entries that are not lists are skipped), then report the 20 most
# frequent functional groups, most common first.
fgs_list = [fg for fgs in chembl['fgs'] if isinstance(fgs, list) for fg in fgs]
top_fg_counts = pd.Series(fgs_list).value_counts().head(20)
sorted_fgs = top_fg_counts.index.tolist()

print(sorted_fgs)


In [None]:
# Occurrence count of every functional group, computed once and reused below
fg_occurrences = pd.Series(fgs_list).value_counts()
print(fg_occurrences)

# Number of distinct functional groups that occur fewer than 100 times.
# NOTE(review): the previous comment described "more than 1000 occurrences",
# but the expression has always counted "< 100" -- confirm which is intended.
print((fg_occurrences < 100).sum())


In [None]:
# Functional groups used for curation; a molecule is kept when its fgs
# entry contains at least one of them.
curation_fgs = ['[R][NH][R]', 'O=[C](O)[R]', 'C=C', '[NH2][Car]']
has_curation_fg = chembl['fgs'].apply(lambda x: any(fg in x for fg in curation_fgs))
chembl_curated = chembl.loc[has_curation_fg]
chembl_curated.shape

In [None]:
# Convert SMILES to RDKit Mol objects via the project helper
# (presumably yields None for SMILES that cannot be parsed -- see fg_funcs).
# `chembl_curated` was produced by boolean indexing, so writing a column into
# it directly triggers pandas' SettingWithCopyWarning; assign() returns a new
# frame with the extra column instead.
chembl_curated = chembl_curated.assign(
    mol=chembl_curated['smiles'].apply(safe_mol_from_smiles)
)
chembl_curated.head()

In [None]:
# Get the molecular weight of each molecule; rows without a usable Mol get
# None (stored as a missing value), which the later weight filter excludes.
# assign() is used instead of in-place column assignment so that no
# SettingWithCopyWarning is raised when `chembl_curated` is a filtered slice.
chembl_curated = chembl_curated.assign(
    mol_weight=chembl_curated['mol'].apply(lambda x: Descriptors.MolWt(x) if x else None)
)
chembl_curated.head()

In [None]:
# Keep molecules weighing at most 500; rows with a missing weight compare
# False and are therefore dropped as well.
within_weight_limit = chembl_curated['mol_weight'].le(500)
filtered = chembl_curated[within_weight_limit]

# Export the curated table without the Mol objects.
# NOTE(review): this CSV is written to the working directory, whereas the
# input CSV and the output pickle live under data/ -- confirm this is intended.
export_columns = ['smiles', 'inchikey', 'fgs', 'scaffolds', 'mol_weight']
filtered[export_columns].to_csv('chembl_35_fg_scaf_curated.csv', index=False)

In [None]:
# Create fingerprints and functional group arrays, keep only molecules that
# carry exactly one of the curated functional groups, then draw a
# reproducible random sample.
FP_LENGTH = 2048    # zero-vector fallback length; presumably the fingerprint size produced by mol_to_fingerprint -- confirm
N_SAMPLES = 100000  # upper bound on the number of sampled molecules

# Build the new columns as standalone Series and attach them with a single
# assign(): `filtered` comes from boolean indexing, so writing columns into
# it directly triggers pandas' SettingWithCopyWarning.
fingerprints = filtered['mol'].apply(mol_to_fingerprint)

# Missing fingerprints (None) and any non-ndarray result fall back to zeros
fingerprint_arrays = (
    fingerprints
    .apply(lambda x: fp_to_array(x) if x is not None else None)
    .apply(lambda x: x if isinstance(x, np.ndarray) else np.zeros((FP_LENGTH,), dtype=int))
)

fg_arrays = (
    filtered['fgs']
    .apply(lambda x: fg_to_array(x, curation_fgs))
    # Non-ndarray results fall back to an all-zero vector
    .apply(lambda x: x if isinstance(x, np.ndarray) else np.zeros((len(curation_fgs),), dtype=int))
    # Zero out molecules with more than 1 of the functional groups so that
    # the all-zero filter below removes them
    .apply(lambda x: x if np.sum(x) <= 1 else np.zeros((len(curation_fgs),), dtype=int))
)

filtered = filtered.assign(
    fingerprint=fingerprints,
    fingerprint_array=fingerprint_arrays,
    fg_array=fg_arrays,
)

# Remove any rows where fg_array is all zeros
filtered = filtered[filtered['fg_array'].apply(lambda x: np.any(x))]

# Sample up to 100,000 molecules. DataFrame.sample raises ValueError when n
# exceeds the number of rows (replace=False), so cap n at what is available.
sampled = filtered.sample(n=min(N_SAMPLES, len(filtered)), random_state=42)

In [None]:
# Persist the sampled DataFrame as a pickle and report where it was written
output_file = 'data/chembl_35_fg_scaf_curated.pkl'
with open(output_file, mode='wb') as pkl_handle:
    pickle.dump(sampled, pkl_handle)
print(f"Processed data saved to {output_file}")