In [1]:
# RDKit
from rdkit import Chem, RDLogger
from rdkit.Chem import Descriptors, AllChem
from rdkit.Chem.MolStandardize import rdMolStandardize

# Utilitários
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
from IPython.display import clear_output
import joblib

# Scikit-learn
from sklearn.model_selection import StratifiedKFold, cross_validate,StratifiedGroupKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import NuSVC
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
from sklearn.decomposition import PCA


# XGBoost
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB

# Imports base
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier, PassiveAggressiveClassifier, Perceptron
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, BaggingClassifier, HistGradientBoostingClassifier
)
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

# Modelos externos
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

## Limpeza de Dados

In [2]:
# Disable informational messages from RDKit
RDLogger.DisableLog('rdApp.info')

def is_valid_smiles(smiles):
    """
    Checks whether a SMILES string is valid.

    Parameters:
        smiles (str): The SMILES representation of the molecule.

    Returns:
        bool: True if the SMILES is valid, False otherwise.
    """
    # Return False if SMILES is None, not a string, or empty/whitespace only
    if pd.isna(smiles) or not isinstance(smiles, str) or smiles.strip() == "":
        return False

    # Remove leading/trailing spaces
    smiles = smiles.strip()

    try:
        # First, try parsing the SMILES with normal sanitization
        mol = Chem.MolFromSmiles(smiles, sanitize=True)
        if mol is not None:
            return True
    except Exception:
        # Ignore errors here and try without sanitization
        pass  

    try:
        # If normal sanitization fails, parse without sanitization
        mol = Chem.MolFromSmiles(smiles, sanitize=False)
        if mol is not None:
            # Attempt manual sanitization and catch possible errors
            Chem.SanitizeMol(mol, catchErrors=True)
            return True
    except Exception:
        pass

    # If all parsing attempts fail, return False
    return False

# Descritores

In [3]:
# Disable informational RDKit logs
RDLogger.DisableLog('rdApp.info')

# Preload RDKit uncharger object for performance
_uncharger = rdMolStandardize.Uncharger()

# --- Molecule Neutralization ---
def neutralize_molecule(mol):
    """
    Neutralizes a molecule using RDKit's standardization tools.

    Parameters:
        mol (rdkit.Chem.Mol): RDKit molecule object.

    Returns:
        rdkit.Chem.Mol or None: Neutralized molecule, or original molecule if neutralization fails.
    """
    if mol is None:
        return None
    try:
        # Cleanup handles salting, normalization, and common charges
        mol = rdMolStandardize.Cleanup(mol)
        mol = _uncharger.Uncharge(mol)
        return mol
    except Exception:
        return mol  # Return original molecule if neutralization fails


# --- SMILES to Molecule Conversion ---
def smiles_to_mol(smiles, neutralize=True):
    """
    Converts a SMILES string to an RDKit molecule object, with optional neutralization.

    Parameters:
        smiles (str): SMILES string.
        neutralize (bool): Whether to neutralize the molecule after parsing.

    Returns:
        rdkit.Chem.Mol or None: RDKit molecule object, or None if parsing fails.
    """
    if pd.isna(smiles) or smiles.strip() == "":
        return None

    try:
        mol = Chem.MolFromSmiles(smiles, sanitize=True)
    except Exception:
        mol = Chem.MolFromSmiles(smiles, sanitize=False)
        if mol is not None:
            Chem.SanitizeMol(mol, catchErrors=True)

    if neutralize:
        mol = neutralize_molecule(mol)

    return mol


# --- Precompiled Descriptor List ---
_DESC_FUNCS = [(name, fn) for name, fn in Descriptors.descList]

def compute_descriptors(mol):
    """
    Computes molecular descriptors for a given RDKit molecule.

    Parameters:
        mol (rdkit.Chem.Mol): RDKit molecule object.

    Returns:
        dict: Dictionary of descriptor names and values, NaN if calculation fails.
    """
    if mol is None:
        return {f"desc_{name}": np.nan for name, _ in _DESC_FUNCS}

    out = {}
    for name, fn in _DESC_FUNCS:
        try:
            out[f"desc_{name}"] = fn(mol)
        except Exception:
            out[f"desc_{name}"] = np.nan
    return out


# --- Descriptor Calculation (RDKit only) ---
def compute_rdkit_descriptors(
    df,
    smiles_col='SMILES',
    neutralize=True
):
    """
    Computes RDKit molecular descriptors for a DataFrame containing SMILES.

    Parameters:
        df (pd.DataFrame): DataFrame contendo uma coluna com SMILES.
        smiles_col (str): Nome da coluna de SMILES no DataFrame.
        neutralize (bool): Se True, neutraliza as moléculas antes de calcular descritores.

    Returns:
        pd.DataFrame: DataFrame original + colunas de descritores (prefixo "desc_").
    """

    # Filtra a lista de descritores para remover os fr_*
    _DESC_FUNCS_NO_FR = [(name, fn) for name, fn in _DESC_FUNCS if not name.startswith('fr_')]

    # Preparar dicionário de resultados (apenas não-fr)
    descriptor_results = {f"desc_{name}": [] for name, _ in _DESC_FUNCS_NO_FR}

    # Processar cada molécula
    for smiles in tqdm(df[smiles_col], desc="Calculando descritores", unit="mol"):
        mol = smiles_to_mol(smiles, neutralize=neutralize)

        # compute_descriptors pode computar todos; aqui só puxamos os que mantivemos
        descs = compute_descriptors(mol)
        for k in descriptor_results.keys():  # k já vem como "desc_<nome>"
            descriptor_results[k].append(descs.get(k, np.nan))

    # Criar DataFrame com os descritores
    df_descs = pd.DataFrame(descriptor_results, index=df.index)

    # Concatenar com o original (mantendo índice)
    df_final = pd.concat([df.reset_index(drop=False), df_descs], axis=1)

    return df_final

In [4]:
df_vivo= pd.read_csv('in vivo + cpdb.csv')
df_vivo.shape

(19883, 9)

In [5]:
df_vivo['SMILES_valido'] = df_vivo['SMILES'].apply(is_valid_smiles)
df_vivo['SMILES_valido'].value_counts()



SMILES_valido
True    19883
Name: count, dtype: int64

In [6]:
# Drop unnecessary columns
df_vivo.drop(columns=['Chemical', 'Identificador', 'SMILES_valido','species','strain','Male','Female'], inplace=True)

# Convert text columns to lowercase
df_vivo['Results'] = df_vivo['Results'].str.lower()
df_vivo['Type'] = df_vivo['Type'].str.lower()

# Remove duplicate rows if any
df_vivo.drop_duplicates(inplace=True)

df_vivo.shape

(3617, 3)

In [7]:
# Identify SMILES (in the filtered DataFrame) that have more than one 'Results' value
smiles_multiple_results = df_vivo.groupby("SMILES")["Results"].nunique()
smiles_multiple_results = smiles_multiple_results[smiles_multiple_results > 1].index

# Remove SMILES that have more than one result
df_final_vivo = df_vivo[~df_vivo["SMILES"].isin(smiles_multiple_results)]

print("Total de compostos não divergentes:", len(df_final_vivo))

df_organicos = df_final_vivo[df_final_vivo["SMILES"].str.contains(r"C(?![a-z])", regex=True, na=False)]

print("Total de compostos orgânicos:", len(df_organicos))

# Drop 'Type' column as it is no longer needed
df_final_vivo = df_organicos.drop(columns='Type')

# Reset index for clean ordering
df_final_vivo.reset_index(drop=True, inplace=True)

# Show final shape
df_final_vivo.shape

Total de compostos não divergentes: 2227
Total de compostos orgânicos: 2090


(2090, 2)

In [8]:
# Exemplo de uso
df_descritores_vivo = compute_rdkit_descriptors(
    df= df_final_vivo,
    smiles_col='SMILES')
df_descritores_vivo.shape

Calculando descritores: 100%|██████████| 2090/2090 [00:09<00:00, 214.65mol/s]


(2090, 135)

In [9]:
# Define features (X) and target (y)
# Remove non-numeric and non-feature columns
X = df_descritores_vivo.drop(columns=['SMILES', 'Results'])
X = X.fillna(0)  # Replace missing values with 0

# Map target column 'Results' to binary format: positive -> 1, negative -> 0
y = df_descritores_vivo['Results'].astype(str).str.lower().map({'positive': 1, 'negative': 0})

# Normalize features (optional, but recommended for some models)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [10]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# =========================
# Model Catalog
# =========================
model_zoo = {
    # Linear regressions / linear classifiers
    "LogisticRegression": LogisticRegression(max_iter=1000, n_jobs=-1),
    "RidgeClassifier": RidgeClassifier(),
    "SGDClassifier": SGDClassifier(max_iter=1000, tol=1e-3, n_jobs=-1),
    "PassiveAggressive": PassiveAggressiveClassifier(max_iter=1000, random_state=42),
    "Perceptron": Perceptron(max_iter=1000, tol=1e-3, random_state=42),

    # Support Vector Machines (SVM)
    "LinearSVC": LinearSVC(),
    "SVC": SVC(),
    "NuSVC": NuSVC(),

    # Nearest neighbors
    "KNeighbors": KNeighborsClassifier(),

    # Decision trees and tree ensembles
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "ExtraTree": ExtraTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42, n_estimators=300, n_jobs=-1),
    "ExtraTrees": ExtraTreesClassifier(random_state=42, n_estimators=300, n_jobs=-1),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "HistGradientBoosting": HistGradientBoostingClassifier(random_state=42),

    # Other ensemble methods
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "Bagging": BaggingClassifier(random_state=42, n_jobs=-1),

    # Naive Bayes
    "GaussianNB": GaussianNB(),
    "BernoulliNB": BernoulliNB(),

    # Discriminant analysis
    "LDA": LinearDiscriminantAnalysis(),
    "QDA": QuadraticDiscriminantAnalysis(),

    # Simple neural networks
    "MLPClassifier": MLPClassifier(max_iter=1000, random_state=42),

    # External gradient boosting models
    "XGBoost": XGBClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        use_label_encoder=False,
        eval_metric="logloss",
        n_jobs=-1
    ),
    "LightGBM": LGBMClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=-1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    ),
    "CatBoost": CatBoostClassifier(
        iterations=500,
        learning_rate=0.05,
        depth=6,
        verbose=False,
        random_state=42
    )
}

# =========================
# BALANCEADORES
# =========================
balancers = {
    "original": None,  # sem sampling
    "under": RandomUnderSampler(random_state=42),
    "over": RandomOverSampler(random_state=42),
    "smote": SMOTE(random_state=42),
}

# =========================
# PRÉ-PROCESSAMENTO DO X E y
# =========================
X = df_descritores_vivo.drop(columns=['SMILES', 'Results']).fillna(0)

y = (
    df_descritores_vivo['Results']
    .astype(str)
    .str.lower()
    .map({'positive': 1, 'negative': 0})
)

# =========================
# CROSS-VALIDATION E MÉTRICAS
# =========================
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

scoring = {
    "accuracy": "accuracy",
    "balanced_accuracy": "balanced_accuracy",
    "f1": "f1",
    "roc_auc": "roc_auc",
    "precision": "precision",
    "recall": "recall",
}

# garantir pasta
import os
os.makedirs("Resultados", exist_ok=True)
os.makedirs("Resultados_folds", exist_ok=True)

# =========================
# LOOP PRINCIPAL: sampling → modelos → resultados
# =========================
for balance_name, sampler in balancers.items():

    print(f"\n=== Rodando balanceamento: {balance_name} ===")

    pipelines = {}

    # Construção dos pipelines
    for name, clf in model_zoo.items():

        if sampler is None:
            pipe = Pipeline(steps=[
                ("scaler", StandardScaler()),
                ("clf", clf)
            ])
        else:
            pipe = ImbPipeline(steps=[
                ("scaler", StandardScaler()),
                ("sampler", sampler),
                ("clf", clf)
            ])

        pipelines[name] = pipe

    # =====================
    # 1) MÉDIAS E STD
    # =====================
    summary_rows = []

    # =====================
    # 2) VALORES POR FOLD
    # =====================
    fold_rows = []

    for name, pipe in pipelines.items():
        cvres = cross_validate(
            pipe, X, y,
            cv=cv,
            scoring=scoring,
            n_jobs=-1,
            return_train_score=False
        )

        # ---------- salvar *fold a fold* ----------
        for i in range(cv.get_n_splits()):
            fold_rows.append({
                "Model": name,
                "Fold": i + 1,
                "Accuracy": cvres["test_accuracy"][i],
                "BalancedAcc": cvres["test_balanced_accuracy"][i],
                "F1": cvres["test_f1"][i],
                "ROC_AUC": cvres["test_roc_auc"][i],
                "Precision": cvres["test_precision"][i],
                "Recall": cvres["test_recall"][i],
                "FitTime": cvres["fit_time"][i],
                "ScoreTime": cvres["score_time"][i],
            })

        # ---------- salvar *média e desvio padrão* ----------
        summary_rows.append({
            "Model": name,
            "Accuracy_mean": np.mean(cvres["test_accuracy"]),
            "Accuracy_std":  np.std(cvres["test_accuracy"], ddof=1),
            "BalAcc_mean":   np.mean(cvres["test_balanced_accuracy"]),
            "BalAcc_std":    np.std(cvres["test_balanced_accuracy"], ddof=1),
            "F1_mean":       np.mean(cvres["test_f1"]),
            "F1_std":        np.std(cvres["test_f1"], ddof=1),
            "ROC_AUC_mean":  np.mean(cvres["test_roc_auc"]),
            "ROC_AUC_std":   np.std(cvres["test_roc_auc"], ddof=1),
            "Precision_mean":np.mean(cvres["test_precision"]),
            "Precision_std": np.std(cvres["test_precision"], ddof=1),
            "Recall_mean":   np.mean(cvres["test_recall"]),
            "Recall_std":    np.std(cvres["test_recall"], ddof=1),
            "FitTime_mean":  np.mean(cvres["fit_time"]),
            "FitTime_std":   np.std(cvres["fit_time"], ddof=1),
            "ScoreTime_mean":np.mean(cvres["score_time"]),
            "ScoreTime_std": np.std(cvres["score_time"], ddof=1),
        })

    # ============================
    # Salvar RESUMO (médias/std)
    # ============================
    final_summary = (
        pd.DataFrame(summary_rows)
        .sort_values(by="F1_mean", ascending=False)
        .reset_index(drop=True)
    )

    path_summary = f"Resultados/Desc_{balance_name}.csv"
    final_summary.to_csv(path_summary, index=False)

    print(f"[OK] Salvo resumo: {path_summary}")
    print(final_summary.head(5))

    # ============================
    # Salvar FOLDS (raw)
    # ============================
    df_folds = pd.DataFrame(fold_rows)
    path_folds = f"Resultados_folds/Desc_{balance_name}.csv"
    df_folds.to_csv(path_folds, index=False)

    print(f"[OK] Salvo folds: {path_folds}")



=== Rodando balanceamento: original ===
[OK] Salvo resumo: Resultados/Desc_original.csv
                  Model  Accuracy_mean  Accuracy_std  BalAcc_mean  BalAcc_std  \
0              LightGBM       0.804306      0.025958     0.783372    0.030348   
1              CatBoost       0.806220      0.019304     0.781964    0.021114   
2               XGBoost       0.801914      0.027826     0.780547    0.029565   
3  HistGradientBoosting       0.797129      0.022690     0.776666    0.024669   
4          RandomForest       0.803828      0.024605     0.774204    0.028174   

    F1_mean    F1_std  ROC_AUC_mean  ROC_AUC_std  Precision_mean  \
0  0.730602  0.040430      0.867104     0.017179        0.775583   
1  0.728532  0.028365      0.866925     0.019549        0.790380   
2  0.727456  0.038682      0.869307     0.022311        0.773344   
3  0.722613  0.032433      0.864834     0.018192        0.762635   
4  0.716269  0.039513      0.862159     0.013093        0.806543   

   Precision_st