In [None]:
# Reinstall NumPy pinned to version 1.26.4, bypassing the pip cache.
!pip uninstall -y numpy
!pip install numpy==1.26.4 --no-cache-dir --force-reinstall
import os
os.kill(os.getpid(), 9)  # Sends signal 9 to the current process; this restarts the Colab kernel


In [1]:
!pip install rdkit-pypi
!pip install lazypredict scikit-learn

Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5
Collecting lazypredict
  Downloading lazypredict-0.2.13-py2.py3-none-any.whl.metadata (12 kB)
Downloading lazypredict-0.2.13-py2.py3-none-any.whl (12 kB)
Installing collected packages: lazypredict
Successfully installed lazypredict-0.2.13


In [11]:
# RDKit
from rdkit import Chem, RDLogger
from rdkit.Chem import Descriptors, AllChem

# Utilitários
import os
import re
from datetime import datetime
from tqdm import tqdm
import numpy as np
import pandas as pd

# Scikit-learn
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import NuSVC
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score, roc_auc_score

# XGBoost
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# LazyPredict
from lazypredict.Supervised import LazyClassifier

In [31]:
# Load the dataset from CSV; later cells read its 'SMILES' and 'Results' columns.
df = pd.read_csv('/content/df_final.csv')
df.shape

(3886, 2)

# Descritores + Estrutura de alerta

In [13]:
# Turn off RDKit logging for every channel matching 'rdApp.*'.
RDLogger.DisableLog('rdApp.*')

# Função auxiliar para neutralizar SMILES com cargas
def neutralizar_smiles(smiles):
    """
    Strip the formal charge from single-element bracket atoms in a SMILES string.

    Bracket atoms made of an element symbol followed by at most one sign and
    optional digits (e.g. [Na+], [Fe+3], [Cl-]) are rewritten as the bare
    bracketed symbol ([Na], [Fe], [Cl]). Any other bracket expression, such as
    [NH4+] or [Cu++], is left as it is.

    Returns an empty string when `smiles` is missing according to `pd.isna`.
    """
    if not pd.isna(smiles):
        atomo_carregado = re.compile(r'\[([A-Z][a-z]?)[+-]?[0-9]*\]')
        return atomo_carregado.sub(r'[\1]', smiles)
    return ""

# Função para calcular descritores moleculares
def calcular_descritores(smiles):
    """
    Compute every descriptor listed in `Descriptors.descList` for one SMILES.

    Returns a dict mapping descriptor name to its value, or an empty dict when
    RDKit cannot parse the SMILES (`Chem.MolFromSmiles` returns None).
    """
    molecula = Chem.MolFromSmiles(smiles)
    valores = {}
    if molecula is not None:
        for nome, funcao in Descriptors.descList:
            valores[nome] = funcao(molecula)
    return valores

# Função principal com controle de neutralização
def verificar_subestruturas_e_descritores(
    df, df_estruturas,
    smiles_col='SMILES',
    estrutura_smiles_col='SMILES',
    neutralizar=True
):
    """
    Flag alert substructures and compute RDKit descriptors for every compound.

    Parameters:
        df: DataFrame with one compound per row.
        df_estruturas: DataFrame listing the substructures. It is not
            modified; header whitespace is stripped on an internal copy.
        smiles_col: name of the SMILES column in `df`.
        estrutura_smiles_col: name of the SMILES column in `df_estruturas`
            (looked up after the header whitespace has been stripped).
        neutralizar: if True, both the compound and the substructure SMILES
            go through `neutralizar_smiles` before being parsed.

    Returns:
        A new DataFrame made of the columns of `df` (index reset), then one
        column per entry of `Descriptors.descList`, then one 0/1 column per
        parsable substructure, named after its (processed) SMILES. Compounds
        that cannot be parsed get None for every descriptor and 0 for every
        substructure.
    """

    # Work on a copy so the caller's frame keeps its headers and columns.
    estruturas = df_estruturas.copy()
    estruturas.columns = estruturas.columns.str.strip()

    smiles_estruturas = estruturas[estrutura_smiles_col]
    if neutralizar:
        smiles_estruturas = smiles_estruturas.apply(neutralizar_smiles)

    # Parse each substructure once; entries that are not strings or that RDKit
    # rejects are skipped.
    padroes = {}
    for sub_smiles in smiles_estruturas:
        padrao = Chem.MolFromSmiles(sub_smiles) if isinstance(sub_smiles, str) else None
        if padrao is not None:
            padroes[sub_smiles] = padrao

    subestrutura_resultados = {sub_smiles: [] for sub_smiles in padroes}
    descritores_resultados = {desc[0]: [] for desc in Descriptors.descList}

    for smiles in tqdm(df[smiles_col], desc="Processando moléculas", unit="molécula"):
        smiles_proc = neutralizar_smiles(smiles) if neutralizar else smiles
        # A non-string value (e.g. NaN when neutralizar=False) is treated like
        # an unparsable SMILES instead of raising inside RDKit.
        mol = Chem.MolFromSmiles(smiles_proc) if isinstance(smiles_proc, str) else None

        for sub_smiles, padrao in padroes.items():
            subestrutura_resultados[sub_smiles].append(int(mol.HasSubstructMatch(padrao)) if mol else 0)

        # Reuse the molecule parsed above rather than parsing the SMILES again.
        if mol is not None:
            descritores = {nome: funcao(mol) for nome, funcao in Descriptors.descList}
        else:
            descritores = {}
        for desc_nome in descritores_resultados:
            descritores_resultados[desc_nome].append(descritores.get(desc_nome, None))

    df_subs = pd.DataFrame(subestrutura_resultados)
    df_descs = pd.DataFrame(descritores_resultados)
    df_final = pd.concat([df.reset_index(drop=True), df_descs, df_subs], axis=1)

    return df_final

In [14]:
# Load the table of alert substructures from CSV.
df_estruturas = pd.read_csv('/content/Estruturas de alerta.csv')

In [32]:
# Exemplo de uso
df_processado = verificar_subestruturas_e_descritores(
    df=df,
    df_estruturas=df_estruturas,
    smiles_col='SMILES',
    estrutura_smiles_col='SMILES'
)
df_processado.shape

Processando moléculas: 100%|██████████| 3886/3886 [00:50<00:00, 76.22molécula/s]


(3886, 368)

# Classificação

In [35]:
# Carregar os dados
df = df_processado.copy()

# Definir as features (X) e o alvo (y)
X = df.drop(columns=['SMILES', 'Results'])  # Remove colunas não numéricas
y = df['Results']

# Normalizar os dados (opcional, mas recomendado para alguns modelos)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Configurar o StratifiedKFold (10 folds)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Criar dicionário para armazenar os resultados
results = []

# Iterar sobre os folds
for train_idx, test_idx in cv.split(X_scaled, y):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Inicializar o LazyClassifier
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

    # Treinar e testar os modelos
    models, predictions = clf.fit(X_train, X_test, y_train, y_test)

    # Salvar os resultados do fold
    results.append(models)

final_results = (pd.concat(results).groupby(level=0).agg(['mean', 'std']).sort_values(by=('F1 Score', 'mean'), ascending=False)
)

 97%|█████████▋| 31/32 [03:36<00:01,  1.05s/it]

[LightGBM] [Info] Number of positive: 1788, number of negative: 1709
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006663 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22719
[LightGBM] [Info] Number of data points in the train set: 3497, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.511295 -> initscore=0.045189
[LightGBM] [Info] Start training from score 0.045189


100%|██████████| 32/32 [03:38<00:00,  6.83s/it]
 97%|█████████▋| 31/32 [03:13<00:00,  1.12it/s]

[LightGBM] [Info] Number of positive: 1788, number of negative: 1709
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006000 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22745
[LightGBM] [Info] Number of data points in the train set: 3497, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.511295 -> initscore=0.045189
[LightGBM] [Info] Start training from score 0.045189


100%|██████████| 32/32 [03:14<00:00,  6.08s/it]
 97%|█████████▋| 31/32 [03:27<00:00,  1.16it/s]

[LightGBM] [Info] Number of positive: 1788, number of negative: 1709
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006646 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22704
[LightGBM] [Info] Number of data points in the train set: 3497, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.511295 -> initscore=0.045189
[LightGBM] [Info] Start training from score 0.045189


100%|██████████| 32/32 [03:28<00:00,  6.52s/it]
 97%|█████████▋| 31/32 [03:15<00:00,  1.25it/s]

[LightGBM] [Info] Number of positive: 1788, number of negative: 1709
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005757 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22726
[LightGBM] [Info] Number of data points in the train set: 3497, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.511295 -> initscore=0.045189
[LightGBM] [Info] Start training from score 0.045189


100%|██████████| 32/32 [03:17<00:00,  6.17s/it]
 97%|█████████▋| 31/32 [03:11<00:01,  1.05s/it]

[LightGBM] [Info] Number of positive: 1788, number of negative: 1709
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005886 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22708
[LightGBM] [Info] Number of data points in the train set: 3497, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.511295 -> initscore=0.045189
[LightGBM] [Info] Start training from score 0.045189


100%|██████████| 32/32 [03:12<00:00,  6.03s/it]
 97%|█████████▋| 31/32 [03:20<00:00,  1.10it/s]

[LightGBM] [Info] Number of positive: 1788, number of negative: 1709
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005781 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22663
[LightGBM] [Info] Number of data points in the train set: 3497, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.511295 -> initscore=0.045189
[LightGBM] [Info] Start training from score 0.045189


100%|██████████| 32/32 [03:22<00:00,  6.32s/it]
 97%|█████████▋| 31/32 [03:27<00:01,  1.11s/it]

[LightGBM] [Info] Number of positive: 1789, number of negative: 1709
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005633 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22719
[LightGBM] [Info] Number of data points in the train set: 3498, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.511435 -> initscore=0.045748
[LightGBM] [Info] Start training from score 0.045748


100%|██████████| 32/32 [03:28<00:00,  6.53s/it]
 97%|█████████▋| 31/32 [03:05<00:01,  1.05s/it]

[LightGBM] [Info] Number of positive: 1789, number of negative: 1709
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005899 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22677
[LightGBM] [Info] Number of data points in the train set: 3498, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.511435 -> initscore=0.045748
[LightGBM] [Info] Start training from score 0.045748


100%|██████████| 32/32 [03:06<00:00,  5.84s/it]
 97%|█████████▋| 31/32 [03:12<00:00,  1.25it/s]

[LightGBM] [Info] Number of positive: 1789, number of negative: 1709
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005650 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22697
[LightGBM] [Info] Number of data points in the train set: 3498, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.511435 -> initscore=0.045748
[LightGBM] [Info] Start training from score 0.045748


100%|██████████| 32/32 [03:14<00:00,  6.07s/it]
 97%|█████████▋| 31/32 [03:22<00:01,  1.07s/it]

[LightGBM] [Info] Number of positive: 1788, number of negative: 1710
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005830 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22698
[LightGBM] [Info] Number of data points in the train set: 3498, number of used features: 252
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.511149 -> initscore=0.044604
[LightGBM] [Info] Start training from score 0.044604


100%|██████████| 32/32 [03:24<00:00,  6.38s/it]


In [None]:
final_results

Unnamed: 0_level_0,Accuracy,Accuracy,Balanced Accuracy,Balanced Accuracy,ROC AUC,ROC AUC,F1 Score,F1 Score,Time Taken,Time Taken
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
ExtraTreesClassifier,0.74,0.02,0.74,0.02,0.74,0.02,0.74,0.02,1.51,0.22
RandomForestClassifier,0.73,0.02,0.73,0.02,0.73,0.02,0.73,0.02,2.01,0.05
NuSVC,0.73,0.03,0.72,0.03,0.72,0.03,0.73,0.03,3.14,0.41
SVC,0.73,0.02,0.72,0.02,0.72,0.02,0.72,0.02,2.81,0.36
LGBMClassifier,0.72,0.02,0.72,0.02,0.72,0.02,0.72,0.02,1.49,0.07
XGBClassifier,0.72,0.03,0.72,0.03,0.72,0.03,0.72,0.03,2.45,0.8
BaggingClassifier,0.71,0.02,0.71,0.02,0.71,0.02,0.71,0.02,3.14,0.09
LogisticRegression,0.71,0.02,0.71,0.02,0.71,0.02,0.71,0.02,0.43,0.17
LinearSVC,0.71,0.02,0.71,0.02,0.71,0.02,0.71,0.02,38.84,5.56
LinearDiscriminantAnalysis,0.71,0.02,0.7,0.02,0.7,0.02,0.7,0.02,0.48,0.22


In [None]:
# Save the aggregated per-model results to CSV.
final_results.to_csv('results_both_descritores_lazy.csv')

In [38]:
# Validação cruzada
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Métricas
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'roc_auc': 'roc_auc'
}

# Caminho do arquivo para salvar resultados
caminho_csv = "gridsearch_resultados_both_descritores_n_normalizado.csv"
primeira_execucao = not os.path.exists(caminho_csv)

modelos = {
    "LightGBM": {
        "pipeline": Pipeline([
           # ("scaler", StandardScaler()),
            ("model", LGBMClassifier(random_state=42, verbose=-1))  # verbose corrigido
        ]),
        "param_grid": {
            "model__n_estimators": [50, 100, 200],
            "model__max_depth": [3, 6, 9],
        }
    },
    "RandomForest": {
        "pipeline": Pipeline([
            #("scaler", StandardScaler()),
            ("model", RandomForestClassifier(random_state=42))
        ]),
        "param_grid": {
            "model__n_estimators": [50, 100, 200],
            "model__max_depth": [3, 6, 9],
            "model__min_samples_split": [2, 5],
            "model__min_samples_leaf": [1, 2]
        }
    },
    "ExtraTrees": {
        "pipeline": Pipeline([
           # ("scaler", StandardScaler()),
            ("model", ExtraTreesClassifier(random_state=42))
        ]),
        "param_grid": {
            "model__n_estimators": [50, 100, 200],
            "model__max_depth": [3, 6, 9],
            "model__min_samples_split": [2, 5],
            "model__min_samples_leaf": [1, 2]
        }
    }
}

# Loop de treino e salvamento incremental
for nome_modelo, config in modelos.items():
    print(f"\n🔍 Treinando modelo: {nome_modelo}")

    grid = GridSearchCV(
        estimator=config["pipeline"],
        param_grid=config["param_grid"],
        scoring=scoring,
        refit="f1",
        cv=cv,
        verbose=1,
        n_jobs=-1,
        return_train_score=False
    )

    if isinstance(X, pd.DataFrame):
     X.columns = [f"f{i}" for i in range(X.shape[1])]

    grid.fit(X, y)

    # Criar DataFrame dos resultados
    df_resultado = pd.DataFrame(grid.cv_results_)
    df_resultado["modelo"] = nome_modelo

      # Filtrar colunas relevantes
    colunas_mostrar = [ 'modelo',
          'mean_test_accuracy', 'std_test_accuracy',
          'mean_test_precision', 'std_test_precision',
          'mean_test_recall', 'std_test_recall',
          'mean_test_f1', 'std_test_f1',
          'mean_test_roc_auc', 'std_test_roc_auc',
          'params'
      ]

    df_filtrado = df_resultado[colunas_mostrar]

    # Adicionar ao CSV incrementalmente
    df_filtrado.to_csv(caminho_csv, mode='a', header=primeira_execucao, index=False)
    primeira_execucao = False  # Para evitar cabeçalho duplicado nas próximas iterações

    print(f"✅ {nome_modelo} finalizado. Melhor F1: {grid.best_score_:.4f}")
    print(f"Melhores parâmetros: {grid.best_params_}")



🔍 Treinando modelo: LightGBM
Fitting 10 folds for each of 9 candidates, totalling 90 fits
✅ LightGBM finalizado. Melhor F1: 0.7432
Melhores parâmetros: {'model__max_depth': 9, 'model__n_estimators': 50}

🔍 Treinando modelo: RandomForest
Fitting 10 folds for each of 36 candidates, totalling 360 fits
✅ RandomForest finalizado. Melhor F1: 0.7399
Melhores parâmetros: {'model__max_depth': 9, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 200}

🔍 Treinando modelo: ExtraTrees
Fitting 10 folds for each of 36 candidates, totalling 360 fits
✅ ExtraTrees finalizado. Melhor F1: 0.6991
Melhores parâmetros: {'model__max_depth': 9, 'model__min_samples_leaf': 2, 'model__min_samples_split': 2, 'model__n_estimators': 50}


In [None]:
# Reload the saved grid-search results and rank them by mean F1, then mean accuracy.
df_resultados = pd.read_csv(caminho_csv)
df_resultados.sort_values(by=['mean_test_f1', 'mean_test_accuracy'], ascending=False)

# Avaliação do melhor modelo

In [None]:
# Pipeline
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LGBMClassifier(
    learning_rate=0.05,
    max_depth=6,
    n_estimators=200,
    num_leaves=15,
    random_state=42,
    verbose=-1))
])

# Validação cruzada
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Scorers para ambas as classes
scoring = {
    'accuracy': 'accuracy',
    'roc_auc': 'roc_auc',

    'precision_pos1': make_scorer(precision_score, average='binary', pos_label=1),
    'recall_pos1': make_scorer(recall_score, average='binary', pos_label=1),
    'f1_pos1': make_scorer(f1_score, average='binary', pos_label=1),

    'precision_pos0': make_scorer(precision_score, average='binary', pos_label=0),
    'recall_pos0': make_scorer(recall_score, average='binary', pos_label=0),
    'f1_pos0': make_scorer(f1_score, average='binary', pos_label=0),
}

# Avaliação
resultados = cross_validate(
    pipeline, X, y,
    scoring=scoring,
    cv=cv,
    return_train_score=False
)

# Impressão dos resultados
print("🔍 Resultados médios na validação cruzada:\n")

# Separando por classe
for label in ['pos1', 'pos0']:
    print(f"📊 Classe positiva: y = {label[-1]}")
    for metric in ['precision', 'recall', 'f1']:
        key = f'test_{metric}_{label}'
        mean = resultados[key].mean()
        std = resultados[key].std()
        print(f"{metric:<10}: {mean:.4f} ± {std:.4f}")
    print()

# Accuracy e AUC gerais
for metric in ['accuracy', 'roc_auc']:
    mean = resultados[f'test_{metric}'].mean()
    std = resultados[f'test_{metric}'].std()
    print(f"{metric:<10}: {mean:.4f} ± {std:.4f}")


🔍 Resultados médios na validação cruzada:

📊 Classe positiva: y = 1
precision : 0.7217 ± 0.0237
recall    : 0.7423 ± 0.0289
f1        : 0.7318 ± 0.0247

📊 Classe positiva: y = 0
precision : 0.7223 ± 0.0275
recall    : 0.7004 ± 0.0280
f1        : 0.7110 ± 0.0260

accuracy  : 0.7218 ± 0.0250
roc_auc   : 0.8041 ± 0.0251


# In vivo

In [39]:
# Load the in vivo dataset and reload the alert-substructure table.
df_vivo= pd.read_csv('/content/df_final_vivo.csv')
df_estruturas = pd.read_csv('/content/Estruturas de alerta.csv')
df_vivo.shape

(2223, 2)

In [21]:
# Turn off RDKit logging for every channel matching 'rdApp.*'.
RDLogger.DisableLog('rdApp.*')

def calcular_descritores(smiles):
    """Return a dict of every RDKit descriptor in `Descriptors.descList` for one SMILES.

    An empty dict is returned when the SMILES cannot be parsed.
    """
    molecula = Chem.MolFromSmiles(smiles)
    valores = {}
    if molecula is not None:
        for nome, funcao in Descriptors.descList:
            valores[nome] = funcao(molecula)
    return valores

def verificar_subestruturas_e_descritores(df, df_estruturas, smiles_col='SMILES', estrutura_smiles_col='SMILES', estrutura_nome_col='Estrutura de Alerta'):
    """
    Flag named alert substructures and compute RDKit descriptors for every molecule.

    NOTE(review): this notebook defines this function name in more than one
    cell; the definition executed last replaces the earlier ones.

    Parameters:
        df: DataFrame containing a SMILES column.
        df_estruturas: DataFrame with the substructure patterns and their names.
        smiles_col: name of the SMILES column in `df`.
        estrutura_smiles_col: name of the column in `df_estruturas` holding the
            patterns; each one is parsed with `Chem.MolFromSmarts`.
        estrutura_nome_col: name of the column in `df_estruturas` holding the
            substructure names, used as output column names.

    Returns:
        A new DataFrame made of the columns of `df` (index reset), then one
        column per entry of `Descriptors.descList`, then one 0/1 column per
        parsable pattern. Molecules that cannot be parsed get None for every
        descriptor and 0 for every substructure.
    """

    # Parse each pattern once; entries that are not strings or that RDKit
    # rejects are skipped.
    padroes = {}
    for nome, smarts in zip(df_estruturas[estrutura_nome_col], df_estruturas[estrutura_smiles_col]):
        padrao = Chem.MolFromSmarts(smarts) if isinstance(smarts, str) else None
        if padrao is not None:
            padroes[nome] = padrao

    # Result columns, filled row by row
    subestrutura_resultados = {nome: [] for nome in padroes}
    descritores_resultados = {desc[0]: [] for desc in Descriptors.descList}

    for smiles in tqdm(df[smiles_col], desc="Processando moléculas", unit="molécula"):
        # A non-string value (e.g. NaN) is treated like an unparsable SMILES
        # instead of raising inside RDKit.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None

        # Substructure flags
        for nome, padrao in padroes.items():
            subestrutura_resultados[nome].append(int(mol.HasSubstructMatch(padrao)) if mol else 0)

        # Descriptors, computed on the molecule parsed above rather than
        # parsing the SMILES a second time
        if mol is not None:
            descritores = {desc_nome: funcao(mol) for desc_nome, funcao in Descriptors.descList}
        else:
            descritores = {}
        for desc_nome in descritores_resultados:
            descritores_resultados[desc_nome].append(descritores.get(desc_nome, None))

    # Assemble the output: original columns, descriptors, substructure flags
    df_subs = pd.DataFrame(subestrutura_resultados)
    df_descs = pd.DataFrame(descritores_resultados)
    df = pd.concat([df.reset_index(drop=True), df_descs, df_subs], axis=1)

    return df

In [40]:
# Exemplo de uso
df_descritores_vivo = verificar_subestruturas_e_descritores(df_vivo, df_estruturas)
df_descritores_vivo.shape

Processando moléculas: 100%|██████████| 2223/2223 [00:31<00:00, 70.64molécula/s]


(2223, 368)

In [41]:
# Definir as features (X) e o alvo (y)
X = df_descritores_vivo.drop(columns=['SMILES', 'Results'])  # Remove colunas não numéricas
y = df_descritores_vivo['Results']

In [24]:
# Normalizar os dados (opcional, mas recomendado para alguns modelos)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Configurar o StratifiedKFold (10 folds)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Criar dicionário para armazenar os resultados
results = []

# Iterar sobre os folds
for train_idx, test_idx in cv.split(X_scaled, y):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Inicializar o LazyClassifier
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

    # Treinar e testar os modelos
    models, predictions = clf.fit(X_train, X_test, y_train, y_test)

    # Salvar os resultados do fold
    results.append(models)

final_results_vivo = (pd.concat(results).groupby(level=0).agg(['mean', 'std']).sort_values(by=('F1 Score', 'mean'), ascending=False)
)

100%|██████████| 32/32 [01:06<00:00,  2.07s/it]
100%|██████████| 32/32 [01:07<00:00,  2.12s/it]
100%|██████████| 32/32 [01:03<00:00,  1.99s/it]
100%|██████████| 32/32 [01:05<00:00,  2.06s/it]
100%|██████████| 32/32 [01:04<00:00,  2.01s/it]
100%|██████████| 32/32 [01:01<00:00,  1.92s/it]
100%|██████████| 32/32 [01:00<00:00,  1.88s/it]
100%|██████████| 32/32 [01:05<00:00,  2.06s/it]
100%|██████████| 32/32 [01:03<00:00,  1.99s/it]
100%|██████████| 32/32 [01:05<00:00,  2.05s/it]


In [None]:
# Save the aggregated per-model results for the in vivo data to CSV.
final_results_vivo.to_csv('final_results_vivo_lazy.csv')

In [None]:
final_results_vivo

Unnamed: 0_level_0,Accuracy,Accuracy,Balanced Accuracy,Balanced Accuracy,ROC AUC,ROC AUC,F1 Score,F1 Score,Time Taken,Time Taken
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
ExtraTreesClassifier,0.8,0.02,0.77,0.02,0.77,0.02,0.8,0.02,0.83,0.16
RandomForestClassifier,0.8,0.02,0.76,0.02,0.76,0.02,0.79,0.02,1.35,0.18
XGBClassifier,0.79,0.03,0.77,0.03,0.77,0.03,0.79,0.03,2.14,0.87
LGBMClassifier,0.79,0.02,0.77,0.02,0.77,0.02,0.79,0.02,1.36,0.21
NuSVC,0.79,0.02,0.76,0.03,0.76,0.03,0.79,0.03,0.77,0.03
KNeighborsClassifier,0.79,0.02,0.76,0.02,0.76,0.02,0.79,0.02,0.07,0.02
SVC,0.78,0.02,0.74,0.03,0.74,0.03,0.77,0.03,0.75,0.1
BaggingClassifier,0.77,0.02,0.74,0.02,0.74,0.02,0.76,0.02,2.0,0.07
RidgeClassifierCV,0.77,0.03,0.74,0.03,0.74,0.03,0.76,0.03,0.37,0.21
RidgeClassifier,0.76,0.03,0.73,0.03,0.73,0.03,0.76,0.03,0.16,0.04


In [42]:
# Validação cruzada
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Métricas
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'roc_auc': 'roc_auc'
}

# Caminho do arquivo para salvar resultados
caminho_csv = "gridsearch_resultados_vivo_descritores_n_normalizado.csv"
primeira_execucao = not os.path.exists(caminho_csv)

modelos = {
    "LightGBM": {
        "pipeline": Pipeline([
           ("scaler", StandardScaler()),
            ("model", LGBMClassifier(random_state=42, verbose=-1))  # verbose corrigido
        ]),
        "param_grid": {
            "model__n_estimators": [50, 100, 200],
            "model__max_depth": [3, 6, 9]
        }
    },
    "RandomForest": {
        "pipeline": Pipeline([
            ("scaler", StandardScaler()),
            ("model", RandomForestClassifier(random_state=42))
        ]),
        "param_grid": {
            "model__n_estimators": [50, 100, 200],
            "model__max_depth": [3, 6, 9],
            "model__min_samples_split": [2, 5],
            "model__min_samples_leaf": [1, 2]
        }
    },
    "ExtraTrees": {
        "pipeline": Pipeline([
            ("scaler", StandardScaler()),
            ("model", ExtraTreesClassifier(random_state=42))
        ]),
        "param_grid": {
            "model__n_estimators": [50, 100, 200],
            "model__max_depth": [3, 6, 9],
            "model__min_samples_split": [2, 5],
            "model__min_samples_leaf": [1, 2]
        }
    }
}

# Loop de treino e salvamento incremental
for nome_modelo, config in modelos.items():
    print(f"\n🔍 Treinando modelo: {nome_modelo}")

    grid = GridSearchCV(
        estimator=config["pipeline"],
        param_grid=config["param_grid"],
        scoring=scoring,
        refit="f1",
        cv=cv,
        verbose=1,
        n_jobs=-1,
        return_train_score=False
    )

    if isinstance(X, pd.DataFrame):
     X.columns = [f"f{i}" for i in range(X.shape[1])]

    grid.fit(X, y)

    # Criar DataFrame dos resultados
    df_resultado = pd.DataFrame(grid.cv_results_)
    df_resultado["modelo"] = nome_modelo

            # Filtrar colunas relevantes
    colunas_mostrar = [
                'mean_test_accuracy', 'std_test_accuracy',
                'mean_test_precision', 'std_test_precision',
                'mean_test_recall', 'std_test_recall',
                'mean_test_f1', 'std_test_f1',
                'mean_test_roc_auc', 'std_test_roc_auc',
                'params'
            ]
    df_filtrado = df_resultado[colunas_mostrar]

    # Adicionar ao CSV incrementalmente
    df_filtrado.to_csv(caminho_csv, mode='a', header=primeira_execucao, index=False)
    primeira_execucao = False  # Para evitar cabeçalho duplicado nas próximas iterações

    print(f"✅ {nome_modelo} finalizado. Melhor F1: {grid.best_score_:.4f}")
    print(f"Melhores parâmetros: {grid.best_params_}")

print('treinamento concluido')


🔍 Treinando modelo: LightGBM
Fitting 3 folds for each of 9 candidates, totalling 27 fits
✅ LightGBM finalizado. Melhor F1: 0.6920
Melhores parâmetros: {'model__max_depth': 6, 'model__n_estimators': 200}

🔍 Treinando modelo: RandomForest
Fitting 3 folds for each of 36 candidates, totalling 108 fits
✅ RandomForest finalizado. Melhor F1: 0.6299
Melhores parâmetros: {'model__max_depth': 9, 'model__min_samples_leaf': 1, 'model__min_samples_split': 5, 'model__n_estimators': 200}

🔍 Treinando modelo: ExtraTrees
Fitting 3 folds for each of 36 candidates, totalling 108 fits
✅ ExtraTrees finalizado. Melhor F1: 0.5480
Melhores parâmetros: {'model__max_depth': 9, 'model__min_samples_leaf': 1, 'model__min_samples_split': 5, 'model__n_estimators': 50}
treinamento concluido


In [None]:
# Reload the saved grid-search results and rank them by mean F1, then mean accuracy.
df_resultados = pd.read_csv(caminho_csv)
df_resultados.sort_values(by=['mean_test_f1', 'mean_test_accuracy'], ascending=False)

Unnamed: 0,mean_test_accuracy,std_test_accuracy,mean_test_precision,std_test_precision,mean_test_recall,std_test_recall,mean_test_f1,std_test_f1,mean_test_roc_auc,std_test_roc_auc,params
87,0.79,0.00,0.75,0.02,0.67,0.01,0.71,0.00,0.83,0.01,"{'model__gamma': 'auto', 'model__kernel': 'rbf..."
81,0.79,0.01,0.75,0.02,0.67,0.01,0.71,0.01,0.83,0.01,"{'model__gamma': 'scale', 'model__kernel': 'rb..."
90,0.79,0.02,0.77,0.03,0.65,0.04,0.70,0.03,0.81,0.03,"{'model__gamma': 'auto', 'model__kernel': 'pol..."
84,0.79,0.02,0.77,0.03,0.64,0.04,0.70,0.03,0.81,0.03,"{'model__gamma': 'scale', 'model__kernel': 'po..."
8,0.78,0.01,0.75,0.03,0.65,0.02,0.70,0.00,0.83,0.01,"{'model__max_depth': 9, 'model__n_estimators':..."
...,...,...,...,...,...,...,...,...,...,...,...
50,0.68,0.00,0.87,0.04,0.20,0.02,0.32,0.02,0.76,0.01,"{'model__max_depth': 3, 'model__min_samples_le..."
53,0.68,0.00,0.87,0.04,0.20,0.02,0.32,0.02,0.76,0.01,"{'model__max_depth': 3, 'model__min_samples_le..."
56,0.68,0.00,0.87,0.04,0.20,0.02,0.32,0.02,0.76,0.01,"{'model__max_depth': 3, 'model__min_samples_le..."
86,0.63,0.00,0.76,0.14,0.03,0.01,0.05,0.02,0.76,0.02,"{'model__gamma': 'scale', 'model__kernel': 'po..."


## Melhor resultado

In [None]:
# Pipeline
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LGBMClassifier(
    learning_rate=0.05,
    max_depth=6,
    n_estimators=200,
    num_leaves=15,
    random_state=42,
    verbose=-1))
])

# Validação cruzada
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Scorers para ambas as classes
scoring = {
    'accuracy': 'accuracy',
    'roc_auc': 'roc_auc',

    'precision_pos1': make_scorer(precision_score, average='binary', pos_label=1),
    'recall_pos1': make_scorer(recall_score, average='binary', pos_label=1),
    'f1_pos1': make_scorer(f1_score, average='binary', pos_label=1),

    'precision_pos0': make_scorer(precision_score, average='binary', pos_label=0),
    'recall_pos0': make_scorer(recall_score, average='binary', pos_label=0),
    'f1_pos0': make_scorer(f1_score, average='binary', pos_label=0),
}

# Avaliação
resultados = cross_validate(
    pipeline, X, y,
    scoring=scoring,
    cv=cv,
    return_train_score=False
)

# Impressão dos resultados
print("🔍 Resultados médios na validação cruzada:\n")

# Separando por classe
for label in ['pos1', 'pos0']:
    print(f"📊 Classe positiva: y = {label[-1]}")
    for metric in ['precision', 'recall', 'f1']:
        key = f'test_{metric}_{label}'
        mean = resultados[key].mean()
        std = resultados[key].std()
        print(f"{metric:<10}: {mean:.4f} ± {std:.4f}")
    print()

# Accuracy e AUC gerais
for metric in ['accuracy', 'roc_auc']:
    mean = resultados[f'test_{metric}'].mean()
    std = resultados[f'test_{metric}'].std()
    print(f"{metric:<10}: {mean:.4f} ± {std:.4f}")


🔍 Resultados médios na validação cruzada:

📊 Classe positiva: y = 1
precision : 0.7708 ± 0.0366
recall    : 0.6503 ± 0.0482
f1        : 0.7036 ± 0.0257

📊 Classe positiva: y = 0
precision : 0.8044 ± 0.0198
recall    : 0.8795 ± 0.0282
f1        : 0.8397 ± 0.0116

accuracy  : 0.7922 ± 0.0148
roc_auc   : 0.8485 ± 0.0201


# In vitro

In [43]:
# Load the in vitro dataset and reload the alert-substructure table.
df_vitro = pd.read_csv('/content/df_final_vitro.csv')
df_estruturas = pd.read_csv('/content/Estruturas de alerta.csv')
df_vitro.shape

(3062, 2)

In [27]:
# Turn off RDKit logging for every channel matching 'rdApp.*'.
RDLogger.DisableLog('rdApp.*')

def calcular_descritores(smiles):
    """Compute every RDKit descriptor in ``Descriptors.descList`` for one SMILES.

    Args:
        smiles: SMILES string of the molecule. Non-string values (``None``,
            the float NaN pandas uses for missing cells, numbers) are treated
            as an invalid molecule instead of being handed to RDKit.

    Returns:
        dict mapping descriptor name to its computed value, or an empty dict
        when ``smiles`` is not a string or RDKit cannot parse it.
    """
    # Guard first: a missing cell in a DataFrame column arrives as float NaN,
    # which the RDKit SMILES parser rejects with an exception rather than None.
    if not isinstance(smiles, str):
        return {}

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return {}  # unparsable SMILES

    return {nome: funcao(mol) for nome, funcao in Descriptors.descList}

def verificar_subestruturas_e_descritores(df, df_estruturas, smiles_col='SMILES', estrutura_smiles_col='SMILES', estrutura_nome_col='Estrutura de Alerta'):
    """
    Flag alert substructures and compute molecular descriptors for every molecule in ``df``.

    Parameters:
        df: DataFrame with one column of molecule SMILES.
        df_estruturas: DataFrame listing the alert substructures and their names.
        smiles_col: Name of the column in ``df`` holding the molecule SMILES.
        estrutura_smiles_col: Name of the column in ``df_estruturas`` holding the
            substructure patterns (parsed with ``Chem.MolFromSmarts``).
        estrutura_nome_col: Name of the column in ``df_estruturas`` holding the
            substructure names; each name becomes a 0/1 column in the result.

    Returns:
        A new DataFrame: ``df`` with its index reset, followed by one column per
        RDKit descriptor and then one 0/1 column per alert substructure.
        Molecules whose SMILES is missing or cannot be parsed get ``None`` for
        every descriptor and 0 for every substructure.
    """

    # Compile every alert pattern exactly once. Entries that are not strings
    # (e.g. empty cells) or that RDKit cannot parse are skipped.
    padroes = {}
    for nome, smarts in zip(df_estruturas[estrutura_nome_col], df_estruturas[estrutura_smiles_col]):
        padrao = Chem.MolFromSmarts(smarts) if isinstance(smarts, str) else None
        if padrao is not None:
            padroes[nome] = padrao

    # One result list per output column, filled row by row below
    subestrutura_resultados = {nome: [] for nome in padroes}
    descritores_resultados = {desc[0]: [] for desc in Descriptors.descList}

    # Walk the SMILES column with a tqdm progress bar
    for smiles in tqdm(df[smiles_col], desc="Processando moléculas", unit="molécula"):
        # Non-string cells are treated as an invalid molecule instead of crashing the parser
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None

        # Substructure flags: 1 when the pattern matches, 0 otherwise (or when the molecule is invalid)
        for nome, padrao in padroes.items():
            encontrado = mol is not None and mol.HasSubstructMatch(padrao)
            subestrutura_resultados[nome].append(int(encontrado))

        # Descriptors: skip the helper for invalid molecules so every column gets None
        descritores = calcular_descritores(smiles) if mol is not None else {}
        for desc_nome, valores in descritores_resultados.items():
            valores.append(descritores.get(desc_nome, None))

    # Assemble the output: original columns, then descriptors, then substructure flags.
    # The index of df is reset so the positional concat lines up row by row.
    df_subs = pd.DataFrame(subestrutura_resultados)
    df_descs = pd.DataFrame(descritores_resultados)
    df = pd.concat([df.reset_index(drop=True), df_descs, df_subs], axis=1)

    return df

In [44]:
# Build the in vitro feature table: descriptors plus structural-alert flags
df_descritores_vitro = verificar_subestruturas_e_descritores(
    df=df_vitro,
    df_estruturas=df_estruturas,
)
df_descritores_vitro.shape

Processando moléculas: 100%|██████████| 3062/3062 [00:37<00:00, 81.07molécula/s]


(3062, 368)

In [49]:
# List every column name of the in vitro feature table
df_descritores_vitro.columns.tolist()

['SMILES',
 'Results',
 'MaxEStateIndex',
 'MinEStateIndex',
 'MaxAbsEStateIndex',
 'MinAbsEStateIndex',
 'qed',
 'MolWt',
 'HeavyAtomMolWt',
 'ExactMolWt',
 'NumValenceElectrons',
 'NumRadicalElectrons',
 'MaxPartialCharge',
 'MinPartialCharge',
 'MaxAbsPartialCharge',
 'MinAbsPartialCharge',
 'FpDensityMorgan1',
 'FpDensityMorgan2',
 'FpDensityMorgan3',
 'BCUT2D_MWHI',
 'BCUT2D_MWLOW',
 'BCUT2D_CHGHI',
 'BCUT2D_CHGLO',
 'BCUT2D_LOGPHI',
 'BCUT2D_LOGPLOW',
 'BCUT2D_MRHI',
 'BCUT2D_MRLOW',
 'BalabanJ',
 'BertzCT',
 'Chi0',
 'Chi0n',
 'Chi0v',
 'Chi1',
 'Chi1n',
 'Chi1v',
 'Chi2n',
 'Chi2v',
 'Chi3n',
 'Chi3v',
 'Chi4n',
 'Chi4v',
 'HallKierAlpha',
 'Ipc',
 'Kappa1',
 'Kappa2',
 'Kappa3',
 'LabuteASA',
 'PEOE_VSA1',
 'PEOE_VSA10',
 'PEOE_VSA11',
 'PEOE_VSA12',
 'PEOE_VSA13',
 'PEOE_VSA14',
 'PEOE_VSA2',
 'PEOE_VSA3',
 'PEOE_VSA4',
 'PEOE_VSA5',
 'PEOE_VSA6',
 'PEOE_VSA7',
 'PEOE_VSA8',
 'PEOE_VSA9',
 'SMR_VSA1',
 'SMR_VSA10',
 'SMR_VSA2',
 'SMR_VSA3',
 'SMR_VSA4',
 'SMR_VSA5',
 'SMR_VSA

In [45]:
# Target (y): the 'Results' column
y = df_descritores_vitro['Results']

# Features (X): everything except the SMILES text and the target itself
X = df_descritores_vitro.drop(['SMILES', 'Results'], axis=1)

In [None]:
# 10-fold stratified CV with a fixed seed so every run sees the same folds
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# One LazyPredict leaderboard (DataFrame indexed by model name) per fold
results = []

for train_idx, test_idx in cv.split(X, y):
    # Fit the scaler on the training fold only: scaling the whole dataset before
    # splitting would let the held-out fold's mean/variance leak into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X.iloc[train_idx])
    X_test = scaler.transform(X.iloc[test_idx])

    # cv.split yields positional indices, so rows are selected by position (.iloc)
    # rather than by index label.
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Train and score every LazyPredict model on this fold
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
    models, predictions = clf.fit(X_train, X_test, y_train, y_test)

    results.append(models)

# Stack the per-fold leaderboards, aggregate each metric per model (mean and std
# across folds) and rank by mean F1.
final_results_vitro = (
    pd.concat(results)
    .groupby(level=0)
    .agg(['mean', 'std'])
    .sort_values(by=('F1 Score', 'mean'), ascending=False)
)

100%|██████████| 32/32 [01:52<00:00,  3.53s/it]
100%|██████████| 32/32 [01:54<00:00,  3.56s/it]
100%|██████████| 32/32 [01:55<00:00,  3.60s/it]
100%|██████████| 32/32 [02:16<00:00,  4.27s/it]
100%|██████████| 32/32 [01:58<00:00,  3.70s/it]
100%|██████████| 32/32 [01:53<00:00,  3.56s/it]
100%|██████████| 32/32 [01:53<00:00,  3.55s/it]
100%|██████████| 32/32 [02:10<00:00,  4.09s/it]
100%|██████████| 32/32 [02:01<00:00,  3.79s/it]
100%|██████████| 32/32 [02:00<00:00,  3.76s/it]


In [None]:
# Save the aggregated LazyPredict table (per-model mean/std) to CSV in the working directory
final_results_vitro.to_csv('final_results_vitro_lazy.csv')

In [None]:
# Display the aggregated per-model results table
final_results_vitro

Unnamed: 0_level_0,Accuracy,Accuracy,Balanced Accuracy,Balanced Accuracy,ROC AUC,ROC AUC,F1 Score,F1 Score,Time Taken,Time Taken
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
ExtraTreesClassifier,0.77,0.02,0.75,0.02,0.75,0.02,0.76,0.02,1.12,0.07
XGBClassifier,0.76,0.02,0.74,0.02,0.74,0.02,0.76,0.02,1.74,0.09
RandomForestClassifier,0.76,0.03,0.73,0.03,0.73,0.03,0.76,0.03,2.0,0.22
NuSVC,0.76,0.03,0.73,0.03,0.73,0.03,0.75,0.03,1.87,0.27
LGBMClassifier,0.75,0.02,0.73,0.03,0.73,0.03,0.75,0.02,1.52,0.26
SVC,0.75,0.02,0.71,0.03,0.71,0.03,0.74,0.03,1.73,0.34
KNeighborsClassifier,0.74,0.02,0.71,0.02,0.71,0.02,0.73,0.02,0.12,0.04
LinearDiscriminantAnalysis,0.73,0.02,0.7,0.02,0.7,0.02,0.73,0.02,0.4,0.15
RidgeClassifier,0.73,0.02,0.7,0.02,0.7,0.02,0.73,0.02,0.17,0.07
RidgeClassifierCV,0.73,0.02,0.7,0.02,0.7,0.02,0.73,0.02,0.46,0.33


In [46]:
# Cross-validation: 3 stratified folds with a fixed seed
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Metrics computed for every candidate; 'f1' is the one used to pick the best (see refit below)
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'roc_auc': 'roc_auc'
}

# CSV file that accumulates the grid-search results
caminho_csv = "gridsearch_resultados_vitro_descritores.csv"

# Columns exported for each candidate. 'modelo' is included so every row can be
# traced back to the estimator that produced it.
colunas_mostrar = [
    'modelo',
    'mean_test_accuracy', 'std_test_accuracy',
    'mean_test_precision', 'std_test_precision',
    'mean_test_recall', 'std_test_recall',
    'mean_test_f1', 'std_test_f1',
    'mean_test_roc_auc', 'std_test_roc_auc',
    'params'
]

# A missing or zero-byte file needs a header; otherwise rows are appended using the
# header already in the file, so files written with an older column layout stay valid.
primeira_execucao = not os.path.exists(caminho_csv) or os.path.getsize(caminho_csv) == 0
if primeira_execucao:
    colunas_csv = colunas_mostrar
else:
    colunas_csv = list(pd.read_csv(caminho_csv, nrows=0).columns)

modelos = {
    "LightGBM": {
        "pipeline": Pipeline([
            ("scaler", StandardScaler()),
            ("model", LGBMClassifier(random_state=42, verbose=-1))
        ]),
        "param_grid": {
            "model__n_estimators": [50, 100, 200],
            "model__max_depth": [3, 6, 9]
        }
    },
    "RandomForest": {
        "pipeline": Pipeline([
            ("scaler", StandardScaler()),
            ("model", RandomForestClassifier(random_state=42))
        ]),
        "param_grid": {
            "model__n_estimators": [50, 100, 200],
            "model__max_depth": [3, 6, 9],
            "model__min_samples_split": [2, 5],
            "model__min_samples_leaf": [1, 2]
        }
    },
    "ExtraTrees": {
        "pipeline": Pipeline([
            ("scaler", StandardScaler()),
            ("model", ExtraTreesClassifier(random_state=42))
        ]),
        "param_grid": {
            "model__n_estimators": [50, 100, 200],
            "model__max_depth": [3, 6, 9],
            "model__min_samples_split": [2, 5],
            "model__min_samples_leaf": [1, 2]
        }
    }
}

# Rename the feature columns to f0..fN once, before the loop (it does not depend on
# the model). The rename is kept in place because later cells reuse X.
# NOTE(review): presumably done to avoid problematic characters in the original
# descriptor / alert names — confirm.
if isinstance(X, pd.DataFrame):
    X.columns = [f"f{i}" for i in range(X.shape[1])]

# Train each model and append its results to the CSV as soon as it finishes
for nome_modelo, config in modelos.items():
    print(f"\n🔍 Treinando modelo: {nome_modelo}")

    grid = GridSearchCV(
        estimator=config["pipeline"],
        param_grid=config["param_grid"],
        scoring=scoring,
        refit="f1",
        cv=cv,
        verbose=1,
        n_jobs=-1,
        return_train_score=False
    )

    grid.fit(X, y)

    # One row per candidate, tagged with the model name
    df_resultado = pd.DataFrame(grid.cv_results_)
    df_resultado["modelo"] = nome_modelo

    # Keep only the exported columns, in the order of the CSV header
    df_filtrado = df_resultado.reindex(columns=colunas_csv)

    # Incremental append; the header is written only when the file is new
    df_filtrado.to_csv(caminho_csv, mode='a', header=primeira_execucao, index=False)
    primeira_execucao = False  # avoid a duplicated header on the next iterations

    print(f"✅ {nome_modelo} finalizado. Melhor F1: {grid.best_score_:.4f}")
    print(f"Melhores parâmetros: {grid.best_params_}")

print('treinamento concluido')


🔍 Treinando modelo: LightGBM
Fitting 3 folds for each of 9 candidates, totalling 27 fits
✅ LightGBM finalizado. Melhor F1: 0.8009
Melhores parâmetros: {'model__max_depth': 9, 'model__n_estimators': 50}

🔍 Treinando modelo: RandomForest
Fitting 3 folds for each of 36 candidates, totalling 108 fits
✅ RandomForest finalizado. Melhor F1: 0.8052
Melhores parâmetros: {'model__max_depth': 9, 'model__min_samples_leaf': 2, 'model__min_samples_split': 5, 'model__n_estimators': 50}

🔍 Treinando modelo: ExtraTrees
Fitting 3 folds for each of 36 candidates, totalling 108 fits
✅ ExtraTrees finalizado. Melhor F1: 0.7945
Melhores parâmetros: {'model__max_depth': 9, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 200}
treinamento concluido


In [None]:
# Reload the accumulated grid-search results
df_resultados = pd.read_csv(caminho_csv)

# Rank candidates by mean F1, breaking ties by mean accuracy (both descending)
df_resultados.sort_values(
    by=['mean_test_f1', 'mean_test_accuracy'],
    ascending=[False, False],
)

Unnamed: 0,mean_test_accuracy,std_test_accuracy,mean_test_precision,std_test_precision,mean_test_recall,std_test_recall,mean_test_f1,std_test_f1,mean_test_roc_auc,std_test_roc_auc,params
44,0.73,0.01,0.73,0.01,0.90,0.01,0.81,0.00,0.80,0.02,"{'model__max_depth': 9, 'model__min_samples_le..."
82,0.75,0.02,0.76,0.02,0.85,0.01,0.81,0.01,0.81,0.02,"{'model__gamma': 'scale', 'model__kernel': 'rb..."
43,0.73,0.01,0.73,0.01,0.90,0.00,0.81,0.01,0.80,0.02,"{'model__max_depth': 9, 'model__min_samples_le..."
39,0.73,0.02,0.73,0.01,0.90,0.00,0.81,0.01,0.79,0.02,"{'model__max_depth': 9, 'model__min_samples_le..."
5,0.75,0.01,0.77,0.01,0.84,0.00,0.81,0.00,0.81,0.01,"{'model__max_depth': 6, 'model__n_estimators':..."
...,...,...,...,...,...,...,...,...,...,...,...
48,0.62,0.00,0.62,0.00,1.00,0.00,0.76,0.00,0.71,0.01,"{'model__max_depth': 3, 'model__min_samples_le..."
51,0.62,0.00,0.62,0.00,0.99,0.00,0.76,0.00,0.71,0.01,"{'model__max_depth': 3, 'model__min_samples_le..."
54,0.62,0.00,0.62,0.00,0.99,0.00,0.76,0.00,0.71,0.01,"{'model__max_depth': 3, 'model__min_samples_le..."
86,0.62,0.01,0.62,0.00,0.99,0.01,0.76,0.00,0.73,0.03,"{'model__gamma': 'scale', 'model__kernel': 'po..."


## Melhor resultado in vitro

In [None]:
# Scaling followed by a LightGBM classifier with fixed hyperparameters
pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    (
        "model",
        LGBMClassifier(
            learning_rate=0.05,
            max_depth=6,
            n_estimators=200,
            num_leaves=15,
            random_state=42,
            verbose=-1,
        ),
    ),
])

# 10-fold stratified cross-validation; the fixed seed keeps the folds reproducible
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Overall metrics first, then precision / recall / F1 computed twice:
# once treating class 1 as the positive label and once treating class 0 as positive.
scoring = {'accuracy': 'accuracy', 'roc_auc': 'roc_auc'}
for pos_label in (1, 0):
    for metric_name, metric_fn in (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    ):
        scoring[f'{metric_name}_pos{pos_label}'] = make_scorer(
            metric_fn, average='binary', pos_label=pos_label
        )

# Evaluate the pipeline on every fold (test scores only)
resultados = cross_validate(pipeline, X, y, scoring=scoring, cv=cv, return_train_score=False)

# Report mean ± std across folds
print("🔍 Resultados médios na validação cruzada:\n")

# Per-class block: the last character of the label is the class treated as positive
for label in ('pos1', 'pos0'):
    print(f"📊 Classe positiva: y = {label[-1]}")
    for metric in ('precision', 'recall', 'f1'):
        fold_scores = resultados[f'test_{metric}_{label}']
        mean, std = fold_scores.mean(), fold_scores.std()
        print(f"{metric:<10}: {mean:.4f} ± {std:.4f}")
    print()

# Class-independent metrics
for metric in ('accuracy', 'roc_auc'):
    fold_scores = resultados[f'test_{metric}']
    mean, std = fold_scores.mean(), fold_scores.std()
    print(f"{metric:<10}: {mean:.4f} ± {std:.4f}")


🔍 Resultados médios na validação cruzada:

📊 Classe positiva: y = 1
precision : 0.7692 ± 0.0228
recall    : 0.8541 ± 0.0148
f1        : 0.8092 ± 0.0144

📊 Classe positiva: y = 0
precision : 0.7185 ± 0.0263
recall    : 0.5921 ± 0.0524
f1        : 0.6483 ± 0.0390

accuracy  : 0.7528 ± 0.0214
roc_auc   : 0.8145 ± 0.0227
