<h1> Classify molecular SMILES into small molecules (SMs), peptides, macrocycles, etc. </h1>

<h3>Classification Categories</h3>
<I> Small molecules <br> Canonical peptides <br> Noncanonical peptides <br> Cyclic peptides (lariat or circular) <br> Natural products <br> Glycans <br> RIPS  </I>

<br><br><br>

In [4]:
import sys, csv
import numpy as np
import pandas as pd
import seaborn as sns
print(sys.executable)
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import sqlite3
import os 
from pathlib import Path
from tqdm.notebook import tqdm
from tqdm import tqdm
from mol_utils import MolecularClassifier
from rdkit import Chem
import pyarrow
import fastparquet

from mol_utils import classify_smiles

/fsx/alex/MolecularTypeClassifier-1/.pixi/envs/default/bin/python


In [7]:
import warnings
# Suppress every warning for the rest of the session: no category or module
# filter is passed, so the 'ignore' action applies to all of them.
warnings.filterwarnings('ignore')


ERROR! Session/line number was not unique in database. History logging moved to new session 94


<br><br>

---------------------------------- Important Functions --------------------------------------------------------

In [5]:
# Shared MolecularClassifier instance; the helpers below call its
# extract_features() method once per successfully parsed molecule.

classifier = MolecularClassifier()

def _mol_from_smiles(smiles: str):
    """Safe RDKit SMILES → Mol, handling bad/missing inputs."""
    if not isinstance(smiles, str) or not smiles.strip():
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # last-ditch attempt without sanitize; then sanitize manually
        try:
            mol = Chem.MolFromSmiles(smiles, sanitize=False)
            if mol is not None:
                Chem.SanitizeMol(mol)
        except Exception:
            return None
    return mol

def _features_for_smiles(smiles: str):
    """
    Featurize a single SMILES string and return the result as a dict.

    Whatever ``classifier.extract_features`` returns is normalized:
    dicts pass through, Series become dicts, sequences become
    ``feat_<i>`` entries, and any other object is stored under the
    single key ``"feature"``. An empty dict is returned when the SMILES
    cannot be parsed or featurization raises, so callers can still build
    one record per row.
    """
    molecule = _mol_from_smiles(smiles)
    if molecule is None:
        return {}

    try:
        raw = classifier.extract_features(molecule)
        if isinstance(raw, dict):
            record = raw
        elif isinstance(raw, pd.Series):
            record = raw.to_dict()
        elif isinstance(raw, (list, tuple)):
            record = {f"feat_{idx}": value for idx, value in enumerate(raw)}
        else:
            # Opaque result object: keep it whole in a single column.
            record = {"feature": raw}
    except Exception:
        return {}
    return record


"""
    Function: Extract molecular features from input SMILES.

    Parameters
    ----------
    input_df(ex: df_MCRs) : pd.DataFrame 
        Input dataframe containing a SMILES column.
    smiles_col : str, default "canonical_smiles"
        Name of the column in df_MCRs that holds SMILES strings.
    prefix : str | None, default None
        Optional prefix to add to the generated feature columns (e.g., "feat_").
    show_progress : bool, default True
        Whether to display a tqdm progress bar while featurizing unique SMILES.
    verbose : bool, default True
        Whether to print a short summary and show head().

    Returns
    -------
    dataframe(ex: df_enriched) : pd.DataFrame
        The input dataframe concatenated with the extracted feature columns.
    features_df : pd.DataFrame
        The standalone features dataframe (one row per input row).
    """

def enrich_mcrs_with_features(
    input_df: pd.DataFrame,
    smiles_col: str = "canonical_smiles",
    prefix: str | None = None,
    show_progress: bool = True,
    verbose: bool = True,
):
    
    from tqdm.auto import tqdm as _tqdm  # local import to avoid hard dependency if not needed

    if smiles_col not in input_df.columns:
        raise KeyError(f"Column '{smiles_col}' not found in input_df.")

    # --- Speed-up: cache features for unique SMILES so duplicates aren’t recomputed ---
    unique_smiles = (
        input_df[smiles_col]
        .dropna()
        .astype(str)
        .unique()
    )

    _progress = _tqdm if show_progress else (lambda x, **k: x)
    feat_cache: dict[str, dict] = {}

    for s in _progress(unique_smiles, desc="Featurizing unique SMILES", ncols=80):
        feat_cache[s] = _features_for_smiles(s)

    # Map cached dicts back to each row, then expand dicts into columns
    features_list = [
        feat_cache.get(str(s), {}) if pd.notna(s) else {}
        for s in input_df[smiles_col]
    ]
    features_df = pd.DataFrame(features_list)

    # Optional: add a prefix to avoid any name collisions with existing columns
    if prefix:
        features_df = features_df.add_prefix(prefix)

    # Final enriched dataframe
    df_enriched = pd.concat(
        [input_df.reset_index(drop=True), features_df.reset_index(drop=True)],
        axis=1
    )

    if verbose:
        print(f"✅ Added {features_df.shape[1]} feature columns to {len(df_enriched)} rows.")
        try:
            display(df_enriched.head())
        except NameError:
            # display() may not exist outside notebooks; safe to skip
            pass

    return df_enriched, features_df





#Format the features extracted -> After you extract features, restructure the output into a dataframe (one feature per column)
import re
import ast

# Number formats as emitted by repr(): plain integers, and floats with an
# optional fractional part and/or exponent (e.g. "12.5", "1e-05", ".5").
_INT_LITERAL = re.compile(r"[-+]?\d+")
_FLOAT_LITERAL = re.compile(r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?")
_SPECIAL_FLOATS = frozenset({"nan", "inf", "+inf", "-inf"})


def _split_top_level_fields(text: str) -> list[str]:
    """Split *text* on commas that are outside brackets and quoted strings."""
    fields = []
    depth = 0          # nesting level of (), [] and {}
    quote = None       # the quote character we are currently inside, if any
    escaped = False    # True when the previous character was a backslash in a string
    start = 0
    for pos, ch in enumerate(text):
        if quote is not None:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == quote:
                quote = None
        elif ch in "'\"":
            quote = ch
        elif ch in "([{":
            depth += 1
        elif ch in ")]}":
            depth = max(depth - 1, 0)
        elif ch == "," and depth == 0:
            fields.append(text[start:pos])
            start = pos + 1
    fields.append(text[start:])
    return fields


def _coerce_scalar(text: str):
    """Return *text* as int or float when it is a numeric literal, else unchanged."""
    if _INT_LITERAL.fullmatch(text):
        return int(text)
    if _FLOAT_LITERAL.fullmatch(text) or text in _SPECIAL_FLOATS:
        return float(text)
    return text


def parse_molecular_feature_string(feature_str: str) -> pd.DataFrame:
    """
    Parse a string like:
      MolecularFeatures(molecular_weight=1023.636..., ..., pattern_counts={'AA_PHENYLALANINE': 1, ...})
    into a one-row DataFrame with columns = feature names, values = parsed values.

    Numeric values become int/float (including exponent-only floats such as
    ``1e-05`` and ``nan``/``inf``); every other value is kept as its raw text.
    The ``pattern_counts`` dict is expanded into one ``pattern_<name>`` column
    per entry, placed after the regular feature columns. If that dict cannot
    be parsed it is dropped. Fields are split only on top-level commas, so
    values containing commas (lists, tuples, dicts, quoted strings) stay
    intact and fields that follow ``pattern_counts`` are not lost.

    Raises
    ------
    ValueError
        If ``feature_str`` is not a non-empty string.
    """
    if not isinstance(feature_str, str) or not feature_str.strip():
        raise ValueError("Input must be a non-empty string")

    body = feature_str.strip()

    # Remove wrapper
    wrapper = "MolecularFeatures("
    if body.startswith(wrapper) and body.endswith(")"):
        body = body[len(wrapper):-1]

    features = {}
    pattern_counts = {}
    for field in _split_top_level_fields(body):
        key, sep, raw_value = field.partition("=")
        if not sep:
            continue
        key = key.strip()
        raw_value = raw_value.strip()

        if key == "pattern_counts" and raw_value.startswith("{"):
            try:
                parsed = ast.literal_eval(raw_value)
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                parsed = None
            # Anything that is not a dict (e.g. a set literal) is discarded.
            if isinstance(parsed, dict):
                pattern_counts = parsed
            continue

        features[key] = _coerce_scalar(raw_value)

    # Add pattern_counts entries as separate columns
    for pattern_name, count in pattern_counts.items():
        features[f"pattern_{pattern_name}"] = count

    return pd.DataFrame([features])


<br>
----------------------------------------------------------------------------------------------------------------------
<br><br>

<br>

## I. Extract molecular features

In [6]:
# Import positive controls
positive_ctrl_molecules = pd.read_csv("Pos.ctrl_Molecules_merged.csv")
# A bare len(...) mid-cell is evaluated and discarded (only the last
# expression of a cell is echoed), so print the row count explicitly.
print(f"Loaded {len(positive_ctrl_molecules)} positive-control molecules")
positive_ctrl_molecules.head(5)

Unnamed: 0,Class,SMILES
0,sm,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...
1,sm,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...
2,sm,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...
3,sm,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...
4,sm,CC(C)C[C@H](NC(=O)[C@@H](CCCNC(N)=O)NC(=O)[C@H...


In [None]:
# Extract features from molecules.
# The call returns two frames: the input table with feature columns appended,
# and the feature columns on their own (one row per input row).
df_Pos_ctrl_enriched, Pos_ctrl_features_df = enrich_mcrs_with_features(
    positive_ctrl_molecules,
    smiles_col="SMILES",
    show_progress=True,
    verbose=True
)


Featurizing unique SMILES:   0%|                     | 0/216982 [00:00<?, ?it/s]

[01:44:57] Explicit valence for atom # 13 Cl, 5, is greater than permitted
[01:44:57] Explicit valence for atom # 13 Cl, 5, is greater than permitted
[01:44:57] SMILES Parse Error: syntax error while parsing: OS(O)(O)C1=CC=C(C=C1)C-1=C2\\C=CC(=N2)\\C(=C2/N\\C(\\C=C2)=C(/C2=N/C(/C=C2)=C(\\C2=CC=C\\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\\C1=CC=C(C=C1)S(O)(O)[O-]
[01:44:57] SMILES Parse Error: check for mistakes around position 90:
[01:44:57] /C=C2)=C(\\C2=CC=C\\-1N2)C1=CC=C(C=C1)S(O
[01:44:57] ~~~~~~~~~~~~~~~~~~~~^
[01:44:57] SMILES Parse Error: extra open parentheses while parsing: OS(O)(O)C1=CC=C(C=C1)C-1=C2\\C=CC(=N2)\\C(=C2/N\\C(\\C=C2)=C(/C2=N/C(/C=C2)=C(\\C2=CC=C\\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\\C1=CC=C(C=C1)S(O)(O)[O-]
[01:44:57] SMILES Parse Error: check for mistakes around position 42:
[01:44:57] C-1=C2\\C=CC(=N2)\\C(=C2/N\\C(\\C=C2)=C(/
[01:44:57] ~~~~~~~~~~~~~~~~~~~~^
[01:44:57] SMILES Parse Error: extra open parentheses while pa

In [None]:
# Clean up features output: turn each entry of the "feature" column into a
# one-row DataFrame of named feature columns, then stitch the rows together.
parsed_dfs = []

for feature_str in df_Pos_ctrl_enriched["feature"]:
    try:
        # str() is applied first, so non-string entries are parsed from
        # their string representation.
        parsed_df = parse_molecular_feature_string(str(feature_str))
        parsed_dfs.append(parsed_df)
    except Exception:
        # If parsing fails (e.g. missing or malformed entry), append a
        # one-row, zero-column frame so row alignment is preserved.
        parsed_dfs.append(pd.DataFrame([{}]))


# Concatenate all parsed rows into one DataFrame
# NOTE(review): this builds one DataFrame per input row before concatenating;
# likely slow on large inputs — consider collecting dicts instead.
parsed_features_df = pd.concat(parsed_dfs, ignore_index=True)

# Merge with the original dataframe (column-wise, aligned by row position)
out_df_Pos_ctrl_enriched = pd.concat(
    [df_Pos_ctrl_enriched.reset_index(drop=True), parsed_features_df.reset_index(drop=True)],
    axis=1
)

out_df_Pos_ctrl_enriched.head(5)

#out_df_Pos_ctrl_enriched.to_csv("PositiveCtrl_data_withFeatures", index=False);

In [None]:
# Write the enriched table to disk; index=False leaves the row index out of the file.
# NOTE(review): the target name has no ".csv" extension — confirm this is intended.
out_df_Pos_ctrl_enriched.to_csv("PositiveCtrl_data_withFeatures", index=False);

<br><br>

## II. Build random forest

In [None]:
#Keep random 10% of the data as heldout