In [None]:
# Remove the preinstalled NumPy and force a clean install of the pinned 1.26.4
# build, then kill the current process so the following cells start on a fresh
# kernel that imports the newly installed version.
# NOTE(review): presumably required for binary compatibility with the other
# packages installed below — confirm.
!pip uninstall -y numpy
!pip install numpy==1.26.4 --no-cache-dir --force-reinstall
import os
os.kill(os.getpid(), 9)  # Restarts the kernel


In [None]:
!pip install rdkit-pypi

In [None]:
!pip install --upgrade numpy gensim --force-reinstall

In [None]:
!pip install lazypredict

In [None]:
!pip install --upgrade threadpoolctl



In [None]:
import os

# Manipulação de dados
import pandas as pd
import numpy as np

# RDKit
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors, Descriptors
from rdkit import RDLogger

from tqdm import tqdm

# Gensim
from gensim.models import Word2Vec

# Scikit-learn
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_validate
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import NuSVC
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# XGBoost
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


# LazyPredict
from lazypredict.Supervised import LazyClassifier

# Utilitários
from itertools import product


# Dataset conjunto

In [None]:
# Load the molecule table (SMILES strings live in the 'SMILES' column) and the
# table of alert structures, then keep the SMILES as a plain Python list.
df = pd.read_csv("/content/df_final.csv")
df_estruturas = pd.read_csv("/content/Estruturas de alerta.csv")
smiles_list = df['SMILES'].tolist()

In [None]:
RDLogger.DisableLog('rdApp.*')

def calcular_descritores(smiles):
    """Compute every descriptor in RDKit's ``Descriptors.descList`` for one SMILES.

    Returns a dict mapping descriptor name to its value, or an empty dict
    when RDKit cannot parse the SMILES.
    """
    molecula = Chem.MolFromSmiles(smiles)
    if molecula is None:
        # Invalid SMILES: an empty mapping signals "no descriptors available".
        return {}

    valores = {}
    for nome, funcao in Descriptors.descList:
        valores[nome] = funcao(molecula)
    return valores

def verificar_subestruturas_e_descritores(df, df_estruturas, smiles_col='SMILES', estrutura_smiles_col='SMILES', estrutura_nome_col='Estrutura de Alerta'):
    """
    Flag alert substructures and compute RDKit descriptors for every molecule in ``df``.

    Each molecule is parsed once and each alert pattern is compiled once.
    Rows whose SMILES is missing (not a string) or cannot be parsed get 0 for
    every substructure column and None (NaN) for every descriptor column.

    Parameters:
        df: DataFrame with one column of molecule SMILES.
        df_estruturas: DataFrame listing the alert substructures and their names.
        smiles_col: Column of ``df`` holding the molecule SMILES.
        estrutura_smiles_col: Column of ``df_estruturas`` holding the pattern
            strings (they are parsed with ``Chem.MolFromSmarts``).
        estrutura_nome_col: Column of ``df_estruturas`` holding the pattern names.

    Returns:
        A new DataFrame: ``df`` with its index reset, followed by the
        descriptor columns and then the 0/1 substructure columns.
    """

    # Compile every alert pattern exactly once; entries that are missing or
    # that RDKit cannot parse are dropped.
    padroes = {}
    for nome, smarts in zip(df_estruturas[estrutura_nome_col], df_estruturas[estrutura_smiles_col]):
        padrao = Chem.MolFromSmarts(smarts) if isinstance(smarts, str) else None
        if padrao is not None:
            padroes[nome] = padrao

    # Column-wise accumulators: one list per output column.
    subestrutura_resultados = {nome: [] for nome in padroes}
    descritores_resultados = {nome: [] for nome, _ in Descriptors.descList}

    for smiles in tqdm(df[smiles_col], desc="Processando moléculas", unit="molécula"):
        # Non-string entries (e.g. NaN from an empty CSV field) are treated
        # like an unparseable SMILES instead of being handed to RDKit.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None

        # Substructure flags
        for nome, padrao in padroes.items():
            presente = int(mol.HasSubstructMatch(padrao)) if mol is not None else 0
            subestrutura_resultados[nome].append(presente)

        # Molecular descriptors, computed from the molecule parsed above
        if mol is None:
            descritores = {}
        else:
            descritores = {nome: funcao(mol) for nome, funcao in Descriptors.descList}
        for desc_nome in descritores_resultados:
            descritores_resultados[desc_nome].append(descritores.get(desc_nome, None))

    # Assemble the result: original columns, then descriptors, then flags.
    df_subs = pd.DataFrame(subestrutura_resultados)
    df_descs = pd.DataFrame(descritores_resultados)
    df = pd.concat([df.reset_index(drop=True), df_descs, df_subs], axis=1)

    return df

In [None]:
# Build the feature table (original columns + descriptors + substructure
# flags) and show its shape.
df_descritores = verificar_subestruturas_e_descritores(df, df_estruturas)
df_descritores.shape

Processando moléculas: 100%|██████████| 3886/3886 [00:45<00:00, 85.52molécula/s]


(3886, 368)

In [None]:
def smiles_to_tokens(smiles):
    """Turn a SMILES string into a list of atom-environment tokens.

    Every atom produces one token ``Symbol(N1,N2,...)``: its element symbol
    followed by the element symbols of its neighbours, in the order returned
    by ``GetNeighbors()``.

    Returns an empty list when ``smiles`` is not a string (e.g. a NaN read
    from a CSV) or when RDKit cannot parse it.
    """
    if not isinstance(smiles, str):
        # Missing values are handled like an unparseable SMILES rather than
        # being passed to RDKit.
        return []

    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return []

    tokens = []
    for atom in mol.GetAtoms():
        vizinhos = ",".join(nbr.GetSymbol() for nbr in atom.GetNeighbors())
        tokens.append(f'{atom.GetSymbol()}({vizinhos})')

    return tokens

# Tokenize every molecule; each token list is later passed to Word2Vec as one sentence.
tokenized = [smiles_to_tokens(s) for s in smiles_list]

In [None]:
# Train the Word2Vec model on the token lists.
# NOTE(review): with workers=4 the training is presumably not bit-for-bit
# reproducible between runs — confirm against the gensim documentation.
model = Word2Vec(
    sentences=tokenized,
    vector_size=128,
    window=5,
    sg=1,  # skip-gram
    min_count=1,
    epochs=30,
    workers=4
)


def get_mol_vector(tokens, model):
    """Embed one molecule as the concatenation mean | sum | max of its token vectors.

    Tokens absent from ``model.wv`` are ignored. When no token is known the
    result is a zero vector of length ``3 * model.vector_size``.
    """
    conhecidos = [tok for tok in tokens if tok in model.wv]
    if len(conhecidos) == 0:
        return np.zeros(model.vector_size * 3)

    matriz = np.array([model.wv[tok] for tok in conhecidos])
    agregados = (matriz.mean(axis=0), matriz.sum(axis=0), matriz.max(axis=0))
    return np.concatenate(agregados)

# Embed every molecule and wrap the resulting matrix in a DataFrame
# (one row per entry of `tokenized`, default integer column labels).
mol_vectors = np.array([get_mol_vector(tokens, model) for tokens in tokenized])
df_Vetores = pd.DataFrame(mol_vectors)

In [None]:
# Put the embedding columns next to the descriptor/substructure table (column-wise concat, aligned on the index).
df_final = pd.concat([df_descritores, df_Vetores], axis=1)

In [None]:
# Feature matrix = every column except the SMILES text and the label;
# labels are integer-encoded from the 'Results' column.
le = LabelEncoder()
X = df_final.drop(columns=['SMILES', 'Results'])
X.columns = X.columns.astype(str)  # <- Fixes the error (NOTE(review): presumably scikit-learn rejecting mixed int/str column names — confirm)
y = le.fit_transform(df_final['Results'])

# Standardize the features (optional, but recommended for some models).
# NOTE(review): the scaler is fitted on ALL rows, so any cross-validation run
# on X_scaled uses statistics that include its own test folds.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Stratified 10-fold cross-validation (shuffled, fixed seed).
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# One LazyClassifier score table per fold.
results = []

# Unscaled feature matrix, rebuilt from df_final (same columns as `X`).
# Scaling is done inside each fold, fitted on the training rows only, so the
# test fold never contributes to the mean/variance used to standardize it.
# Using the globally pre-scaled X_scaled here would leak test-fold statistics.
X_raw = df_final.drop(columns=['SMILES', 'Results']).to_numpy(dtype=float)

for train_idx, test_idx in cv.split(X_raw, y):
    fold_scaler = StandardScaler().fit(X_raw[train_idx])
    X_train = fold_scaler.transform(X_raw[train_idx])
    X_test = fold_scaler.transform(X_raw[test_idx])
    y_train, y_test = y[train_idx], y[test_idx]

    # Fit and score every LazyPredict classifier on this fold.
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
    models, predictions = clf.fit(X_train, X_test, y_train, y_test)

    # Keep the per-model score table of this fold.
    results.append(models)

# Mean and standard deviation of every metric per model across folds,
# ranked by mean F1.
predict_results_both = (
    pd.concat(results)
    .groupby(level=0)
    .agg(['mean', 'std'])
    .sort_values(by=('F1 Score', 'mean'), ascending=False)
)

100%|██████████| 32/32 [09:49<00:00, 18.42s/it]
100%|██████████| 32/32 [09:24<00:00, 17.63s/it]
100%|██████████| 32/32 [09:06<00:00, 17.07s/it]
100%|██████████| 32/32 [09:33<00:00, 17.91s/it]
100%|██████████| 32/32 [08:58<00:00, 16.82s/it]
100%|██████████| 32/32 [08:50<00:00, 16.58s/it]
100%|██████████| 32/32 [09:32<00:00, 17.90s/it]
100%|██████████| 32/32 [09:26<00:00, 17.71s/it]
100%|██████████| 32/32 [08:46<00:00, 16.46s/it]
100%|██████████| 32/32 [08:05<00:00, 15.17s/it]


In [None]:
predict_results_both.to_csv('results_both_V&D_lazy.csv')

In [None]:
predict_results_both

Unnamed: 0_level_0,Accuracy,Accuracy,Balanced Accuracy,Balanced Accuracy,ROC AUC,ROC AUC,F1 Score,F1 Score,Time Taken,Time Taken
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
ExtraTreesClassifier,0.75,0.03,0.75,0.03,0.75,0.03,0.75,0.03,2.28,0.3
XGBClassifier,0.74,0.02,0.74,0.02,0.74,0.02,0.74,0.02,12.27,0.1
LGBMClassifier,0.74,0.02,0.74,0.02,0.74,0.02,0.74,0.02,7.05,0.4
RandomForestClassifier,0.74,0.02,0.74,0.02,0.74,0.02,0.74,0.02,6.39,0.32
NuSVC,0.74,0.02,0.74,0.02,0.74,0.02,0.74,0.02,5.36,0.27
SVC,0.74,0.01,0.74,0.01,0.74,0.01,0.74,0.01,5.05,0.29
RidgeClassifier,0.73,0.01,0.73,0.01,0.73,0.01,0.73,0.01,0.3,0.05
LinearSVC,0.73,0.01,0.73,0.01,0.73,0.01,0.73,0.01,109.87,13.42
RidgeClassifierCV,0.73,0.02,0.73,0.02,0.73,0.02,0.73,0.02,1.26,0.54
LogisticRegression,0.72,0.02,0.72,0.02,0.72,0.02,0.72,0.02,0.63,0.19


## gridsearch

In [None]:
# Word2Vec hyper-parameter grid explored by the outer loop.
param_grid_w2v = dict(
    vector_size=[50, 128, 160, 256],  # most relevant
    window=[3, 5],                    # affects local co-occurrence
    sg=[1],                           # skip-gram tends to work better with little data
    epochs=[20],                      # keep fixed unless there are signs of underfitting
    min_count=[1],
)


def _pipeline_escalado(estimador):
    """Wrap an estimator in a StandardScaler -> model pipeline."""
    return Pipeline([("scaler", StandardScaler()), ("model", estimador)])


# Classifiers and their (reduced) parameter grids.
modelos = {
    "RandomForest": {
        "pipeline": _pipeline_escalado(RandomForestClassifier(random_state=42)),
        "param_grid": {
            "model__n_estimators": [50, 100],
            "model__max_depth": [3, 6],
        },
    },
    "LightGBM": {
        "pipeline": _pipeline_escalado(LGBMClassifier(random_state=42, verbose=-1)),
        "param_grid": {
            "model__n_estimators": [50, 100, 200],
            "model__max_depth": [3, 6, 9],
        },
    },
    "ExtraTrees": {
        "pipeline": _pipeline_escalado(ExtraTreesClassifier(random_state=42)),
        "param_grid": {
            "model__n_estimators": [50, 100],
            "model__max_depth": [3, 6],
        },
    },
}

# Scorers: each key is reported under the scikit-learn scorer of the same name.
scoring = {nome: nome for nome in ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')}

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Results file. When it already exists it is loaded so finished runs can be
# skipped, and the CSV header is not written again.
caminho_csv = "resultados_gridsearch_V&D_both.csv"
csv_ja_existe = os.path.exists(caminho_csv)
primeira_execucao = not csv_ja_existe
df_existente = pd.read_csv(caminho_csv) if csv_ja_existe else pd.DataFrame()

# Every Word2Vec combination, as (vector_size, window, epochs, sg, min_count).
combinacoes_w2v = list(product(
    *(param_grid_w2v[chave] for chave in ('vector_size', 'window', 'epochs', 'sg', 'min_count'))
))

# Mean-pooling embedding used by the grid search. Defined once, before the
# loop, instead of being re-created for every Word2Vec combination.
def get_mol_vector(tokens, model):
    """Average the Word2Vec vectors of the tokens known to ``model``.

    Returns a zero vector of length ``model.vector_size`` when no token is known.
    """
    vecs = [model.wv[t] for t in tokens if t in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size)


def _ja_processado(nome_modelo, size, window, epochs, sg, min_count):
    """Tell whether the results loaded into ``df_existente`` already contain this run."""
    if df_existente.empty:
        return False
    filtro = (
        (df_existente['modelo'] == nome_modelo) &
        (df_existente['vector_size'] == size) &
        (df_existente['window'] == window) &
        (df_existente['epochs'] == epochs) &
        (df_existente['sg'] == sg) &
        (df_existente['min_count'] == min_count)
    )
    return bool(filtro.any())


# The descriptor/substructure columns (everything except SMILES and Results)
# do not depend on the Word2Vec settings, so they are extracted only once.
descritores = df_descritores.drop(columns=['SMILES', 'Results']).reset_index(drop=True)

# Columns of GridSearchCV.cv_results_ (plus the run identifiers) written to the CSV.
colunas_mostrar = [
    'modelo', 'vector_size', 'window', 'epochs', 'sg', 'min_count',
    'mean_test_accuracy', 'std_test_accuracy',
    'mean_test_precision', 'std_test_precision',
    'mean_test_recall', 'std_test_recall',
    'mean_test_f1', 'std_test_f1',
    'mean_test_roc_auc', 'std_test_roc_auc',
    'params'
]

for size, window, epochs, sg, min_count in combinacoes_w2v:
    print(f"\n🧠 Word2Vec: size={size}, window={window}, epochs={epochs}, sg={sg}, min_count={min_count}")
    try:
        # Decide what still has to run BEFORE paying for Word2Vec training:
        # when resuming, fully processed combinations are skipped cheaply.
        pendentes = [
            nome for nome in modelos
            if not _ja_processado(nome, size, window, epochs, sg, min_count)
        ]

        if pendentes:
            # Train Word2Vec for this combination
            w2v_model = Word2Vec(
                sentences=tokenized,
                vector_size=size,
                window=window,
                sg=sg,
                min_count=min_count,
                epochs=epochs,
                workers=4
            )

            X = np.array([get_mol_vector(tokens, w2v_model) for tokens in tokenized])

            # Embeddings as a DataFrame with explicit column names
            df_w2v = pd.DataFrame(X, columns=[f'w2v_{i}' for i in range(X.shape[1])])

            # Both tables must describe the same molecules, row for row
            assert df_w2v.shape[0] == descritores.shape[0], "Número de linhas não bate entre W2V e descritores"

            # Embeddings followed by descriptors
            X_concat = pd.concat([df_w2v, descritores], axis=1)
            X_concat.columns = X_concat.columns.astype(str)

        # Evaluate each classifier
        for nome_modelo, config in modelos.items():
            print(f"  🔍 Classificando com: {nome_modelo}")

            # Already stored in the CSV by a previous execution
            if nome_modelo not in pendentes:
                print(f"    ⚠️ Já processado anteriormente. Pulando...\n")
                continue

            grid = GridSearchCV(
                estimator=config["pipeline"],
                param_grid=config["param_grid"],
                scoring=scoring,
                refit="f1",
                cv=cv,
                verbose=0,
                n_jobs=-1,
                return_train_score=False
            )

            # `y` is the label vector prepared earlier in the notebook
            grid.fit(X_concat, y)

            df_resultado = pd.DataFrame(grid.cv_results_)
            df_resultado['modelo'] = nome_modelo
            df_resultado['vector_size'] = size
            df_resultado['window'] = window
            df_resultado['epochs'] = epochs
            df_resultado['sg'] = sg
            df_resultado['min_count'] = min_count

            df_filtrado = df_resultado[colunas_mostrar]

            # Append to the CSV right away; the header is written only once
            df_filtrado.to_csv(caminho_csv, mode='a', header=primeira_execucao, index=False)
            primeira_execucao = False

            print(f"    ✅ Melhor F1: {grid.best_score_:.4f} | Parâmetros: {grid.best_params_}")

    except Exception as e:
        # Best effort: report the failure and move on to the next combination
        print("⚠️ Erro ao treinar com esta combinação:", e)

print("\n✅ GridSearch finalizado para todas as combinações! Resultados salvos em:", caminho_csv)


🧠 Word2Vec: size=50, window=3, epochs=20, sg=1, min_count=1
  🔍 Classificando com: RandomForest
    ✅ Melhor F1: 0.7208 | Parâmetros: {'model__max_depth': 6, 'model__n_estimators': 100}
  🔍 Classificando com: LightGBM
    ✅ Melhor F1: 0.7468 | Parâmetros: {'model__max_depth': 6, 'model__n_estimators': 200}
  🔍 Classificando com: ExtraTrees
    ✅ Melhor F1: 0.6878 | Parâmetros: {'model__max_depth': 6, 'model__n_estimators': 50}

🧠 Word2Vec: size=50, window=5, epochs=20, sg=1, min_count=1
  🔍 Classificando com: RandomForest
    ✅ Melhor F1: 0.7205 | Parâmetros: {'model__max_depth': 6, 'model__n_estimators': 100}
  🔍 Classificando com: LightGBM
    ✅ Melhor F1: 0.7548 | Parâmetros: {'model__max_depth': 9, 'model__n_estimators': 100}
  🔍 Classificando com: ExtraTrees
    ✅ Melhor F1: 0.6946 | Parâmetros: {'model__max_depth': 6, 'model__n_estimators': 100}

🧠 Word2Vec: size=128, window=3, epochs=20, sg=1, min_count=1
  🔍 Classificando com: RandomForest
    ✅ Melhor F1: 0.7139 | Parâmetros:

In [None]:
# Reload the accumulated grid-search results and rank them by mean F1,
# using mean accuracy as the tie-breaker (best first).
df_resultados = pd.read_csv(caminho_csv)
df_resultados.sort_values(by=['mean_test_f1', 'mean_test_accuracy'], ascending=False)

Unnamed: 0,modelo,vector_size,window,epochs,sg,min_count,mean_test_accuracy,std_test_accuracy,mean_test_precision,std_test_precision,mean_test_recall,std_test_recall,mean_test_f1,std_test_f1,mean_test_roc_auc,std_test_roc_auc,params
28,LightGBM,50,5,20,1,1,0.75,0.02,0.76,0.02,0.75,0.03,0.75,0.02,0.83,0.02,"{'model__max_depth': 9, 'model__n_estimators':..."
79,LightGBM,160,3,20,1,1,0.75,0.02,0.77,0.03,0.74,0.03,0.75,0.02,0.82,0.02,"{'model__max_depth': 9, 'model__n_estimators':..."
96,LightGBM,160,5,20,1,1,0.75,0.02,0.77,0.02,0.74,0.03,0.75,0.02,0.83,0.02,"{'model__max_depth': 9, 'model__n_estimators':..."
113,LightGBM,256,3,20,1,1,0.75,0.02,0.76,0.02,0.74,0.03,0.75,0.02,0.82,0.02,"{'model__max_depth': 9, 'model__n_estimators':..."
97,LightGBM,160,5,20,1,1,0.75,0.02,0.77,0.02,0.74,0.03,0.75,0.02,0.82,0.02,"{'model__max_depth': 9, 'model__n_estimators':..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,RandomForest,256,3,20,1,1,0.68,0.02,0.72,0.03,0.61,0.04,0.66,0.03,0.74,0.02,"{'model__max_depth': 3, 'model__n_estimators':..."
119,RandomForest,256,5,20,1,1,0.67,0.02,0.71,0.03,0.62,0.04,0.66,0.03,0.74,0.02,"{'model__max_depth': 3, 'model__n_estimators':..."
85,RandomForest,160,5,20,1,1,0.67,0.02,0.70,0.02,0.62,0.04,0.66,0.03,0.74,0.02,"{'model__max_depth': 3, 'model__n_estimators':..."
103,RandomForest,256,3,20,1,1,0.67,0.02,0.72,0.03,0.60,0.04,0.65,0.03,0.74,0.02,"{'model__max_depth': 3, 'model__n_estimators':..."


In [None]:
# Best Word2Vec configuration.
# NOTE(review): values are hard-coded; presumably read off the grid-search
# results table — confirm.
best_w2v_config = {
    'vector_size': 160,
    'window': 5,
    'epochs': 20,
    'sg': 1,
    'min_count': 1
}

# Train the Word2Vec model with that configuration
print("🧠 Treinando Word2Vec com melhor configuração...")
w2v_model = Word2Vec(
    sentences=tokenized,
    vector_size=best_w2v_config['vector_size'],
    window=best_w2v_config['window'],
    sg=best_w2v_config['sg'],
    min_count=best_w2v_config['min_count'],
    epochs=best_w2v_config['epochs'],
    workers=4
)

# Molecule embedding: mean of the vectors of the known tokens
def get_mol_vector(tokens, model):
    """Average the vectors of the tokens present in ``model.wv``; zeros of length ``model.vector_size`` when none is."""
    vecs = [model.wv[t] for t in tokens if t in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size)

X = np.array([get_mol_vector(tokens, w2v_model) for tokens in tokenized])

# Descriptor/substructure columns (everything except SMILES and Results)
descritores = df_descritores.drop(columns=['SMILES', 'Results']).reset_index(drop=True)

# Embeddings as a DataFrame with explicit column names
df_w2v = pd.DataFrame(X, columns=[f'w2v_{i}' for i in range(X.shape[1])])

# Both tables must have the same number of rows
assert df_w2v.shape[0] == descritores.shape[0], "Número de linhas não bate entre W2V e descritores"

# Embeddings followed by descriptors
X_concat = pd.concat([df_w2v, descritores], axis=1)
X_concat.columns = X_concat.columns.astype(str)

# Pipeline: standardization followed by LightGBM
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LGBMClassifier(
        max_depth=9,
        n_estimators=100,
        random_state=42,
        verbose=-1
    ))
])

# Cross-validation splitter
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Scorers: overall accuracy / ROC AUC, plus precision, recall and F1 computed
# twice, once with label 1 and once with label 0 as the positive class
scoring = {
    'accuracy': 'accuracy',
    'roc_auc': 'roc_auc',

    'precision_pos1': make_scorer(precision_score, average='binary', pos_label=1),
    'recall_pos1': make_scorer(recall_score, average='binary', pos_label=1),
    'f1_pos1': make_scorer(f1_score, average='binary', pos_label=1),

    'precision_pos0': make_scorer(precision_score, average='binary', pos_label=0),
    'recall_pos0': make_scorer(recall_score, average='binary', pos_label=0),
    'f1_pos0': make_scorer(f1_score, average='binary', pos_label=0),
}

# Evaluation (`y` is the label vector prepared earlier in the notebook)
print("\n🚀 Avaliando modelo com validação cruzada (10 folds)...")
resultados = cross_validate(
    pipeline, X_concat, y,
    scoring=scoring,
    cv=cv,
    return_train_score=False
)

# Report
print("\n🔍 Resultados médios na validação cruzada:\n")

# Per-class metrics; label[-1] is the '1' / '0' suffix of the scorer name
for label in ['pos1', 'pos0']:
    print(f"📊 Classe positiva: y = {label[-1]}")
    for metric in ['precision', 'recall', 'f1']:
        key = f'test_{metric}_{label}'
        mean = resultados[key].mean()
        std = resultados[key].std()
        print(f"{metric:<10}: {mean:.4f} ± {std:.4f}")
    print()

# Overall accuracy and AUC
for metric in ['accuracy', 'roc_auc']:
    mean = resultados[f'test_{metric}'].mean()
    std = resultados[f'test_{metric}'].std()
    print(f"{metric:<10}: {mean:.4f} ± {std:.4f}")

🧠 Treinando Word2Vec com melhor configuração...

🚀 Avaliando modelo com validação cruzada (10 folds)...

🔍 Resultados médios na validação cruzada:

📊 Classe positiva: y = 1
precision : 0.7542 ± 0.0181
recall    : 0.7327 ± 0.0294
f1        : 0.7429 ± 0.0165

📊 Classe positiva: y = 0
precision : 0.7289 ± 0.0194
recall    : 0.7494 ± 0.0284
f1        : 0.7385 ± 0.0156

accuracy  : 0.7409 ± 0.0146
roc_auc   : 0.8258 ± 0.0131


# Dataset individual

## in vivo

In [None]:
# Load the in vivo molecule table (SMILES strings live in the 'SMILES' column)
# and the table of alert structures, then keep the SMILES as a plain Python list.
df = pd.read_csv("/content/df_final_vivo.csv")
df_estruturas = pd.read_csv("/content/Estruturas de alerta.csv")
smiles_list = df['SMILES'].tolist()

In [None]:
# Build the feature table (original columns + descriptors + substructure
# flags) for the dataset loaded above and show its shape.
df_descritores = verificar_subestruturas_e_descritores(df, df_estruturas)
df_descritores.shape

Processando moléculas: 100%|██████████| 2223/2223 [00:28<00:00, 78.39molécula/s]


(2223, 368)

In [None]:
def smiles_to_tokens(smiles):
    """Turn a SMILES string into a list of atom-environment tokens.

    Every atom produces one token ``Symbol(N1,N2,...)``: its element symbol
    followed by the element symbols of its neighbours, in the order returned
    by ``GetNeighbors()``.

    Returns an empty list when ``smiles`` is not a string (e.g. a NaN read
    from a CSV) or when RDKit cannot parse it.
    """
    if not isinstance(smiles, str):
        # Missing values are handled like an unparseable SMILES rather than
        # being passed to RDKit.
        return []

    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return []

    tokens = []
    for atom in mol.GetAtoms():
        vizinhos = ",".join(nbr.GetSymbol() for nbr in atom.GetNeighbors())
        tokens.append(f'{atom.GetSymbol()}({vizinhos})')

    return tokens

# Tokenize every molecule; each token list is later passed to Word2Vec as one sentence.
tokenized = [smiles_to_tokens(s) for s in smiles_list]

In [None]:
# Train the Word2Vec model on the token lists.
# NOTE(review): with workers=4 the training is presumably not bit-for-bit
# reproducible between runs — confirm against the gensim documentation.
model = Word2Vec(
    sentences=tokenized,
    vector_size=128,
    window=5,
    sg=1,  # skip-gram
    min_count=1,
    epochs=30,
    workers=4
)


def get_mol_vector(tokens, model):
    """Embed one molecule as the concatenation mean | sum | max of its token vectors.

    Tokens absent from ``model.wv`` are ignored. When no token is known the
    result is a zero vector of length ``3 * model.vector_size``.
    """
    conhecidos = [tok for tok in tokens if tok in model.wv]
    if len(conhecidos) == 0:
        return np.zeros(model.vector_size * 3)

    matriz = np.array([model.wv[tok] for tok in conhecidos])
    agregados = (matriz.mean(axis=0), matriz.sum(axis=0), matriz.max(axis=0))
    return np.concatenate(agregados)

# Embed every molecule and wrap the resulting matrix in a DataFrame
# (one row per entry of `tokenized`, default integer column labels).
mol_vectors = np.array([get_mol_vector(tokens, model) for tokens in tokenized])
df_Vetores = pd.DataFrame(mol_vectors)

In [None]:
# Put the embedding columns next to the descriptor/substructure table (column-wise concat, aligned on the index).
df_final = pd.concat([df_descritores, df_Vetores], axis=1)

In [None]:
# Feature matrix = every column except the SMILES text and the label;
# labels are integer-encoded from the 'Results' column.
le = LabelEncoder()
X = df_final.drop(columns=['SMILES', 'Results'])
X.columns = X.columns.astype(str)  # column labels must all be strings for scikit-learn
y = le.fit_transform(df_final['Results'])

# Whole-dataset scaling is kept only so the notebook-level names `scaler` and
# `X_scaled` stay available; it is NOT used for the cross-validation below,
# because it is fitted on rows that later serve as test folds.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Stratified 10-fold cross-validation (shuffled, fixed seed).
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# One LazyClassifier score table per fold.
results = []

# Unscaled features; each fold fits its own scaler on the training rows only,
# so the test fold never contributes to the statistics used to standardize it.
X_raw = X.to_numpy(dtype=float)

for train_idx, test_idx in cv.split(X_raw, y):
    fold_scaler = StandardScaler().fit(X_raw[train_idx])
    X_train = fold_scaler.transform(X_raw[train_idx])
    X_test = fold_scaler.transform(X_raw[test_idx])
    y_train, y_test = y[train_idx], y[test_idx]

    # Fit and score every LazyPredict classifier on this fold.
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
    models, predictions = clf.fit(X_train, X_test, y_train, y_test)

    # Keep the per-model score table of this fold.
    results.append(models)

# Mean and standard deviation of every metric per model across folds,
# ranked by mean F1.
predict_results_vivo = (
    pd.concat(results)
    .groupby(level=0)
    .agg(['mean', 'std'])
    .sort_values(by=('F1 Score', 'mean'), ascending=False)
)


100%|██████████| 32/32 [02:35<00:00,  4.86s/it]
100%|██████████| 32/32 [02:27<00:00,  4.60s/it]
100%|██████████| 32/32 [02:20<00:00,  4.40s/it]
100%|██████████| 32/32 [02:22<00:00,  4.46s/it]
100%|██████████| 32/32 [02:21<00:00,  4.43s/it]
100%|██████████| 32/32 [02:24<00:00,  4.50s/it]
100%|██████████| 32/32 [02:20<00:00,  4.39s/it]
100%|██████████| 32/32 [02:19<00:00,  4.36s/it]
100%|██████████| 32/32 [02:34<00:00,  4.83s/it]
100%|██████████| 32/32 [02:26<00:00,  4.57s/it]


In [None]:
predict_results_vivo.to_csv('results_vivo_V&D_lazy.csv')

In [None]:
predict_results_vivo

Unnamed: 0_level_0,Accuracy,Accuracy,Balanced Accuracy,Balanced Accuracy,ROC AUC,ROC AUC,F1 Score,F1 Score,Time Taken,Time Taken
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
ExtraTreesClassifier,0.81,0.02,0.78,0.02,0.78,0.02,0.8,0.02,1.18,0.25
RandomForestClassifier,0.8,0.02,0.77,0.02,0.77,0.02,0.8,0.02,3.53,0.27
NuSVC,0.8,0.02,0.77,0.02,0.77,0.02,0.79,0.02,1.38,0.18
XGBClassifier,0.8,0.03,0.77,0.03,0.77,0.03,0.79,0.03,8.39,0.51
LGBMClassifier,0.8,0.02,0.77,0.02,0.77,0.02,0.79,0.02,5.88,0.39
KNeighborsClassifier,0.79,0.03,0.77,0.04,0.77,0.04,0.79,0.03,0.11,0.02
BaggingClassifier,0.79,0.02,0.75,0.03,0.75,0.03,0.78,0.02,10.15,0.4
SVC,0.79,0.02,0.75,0.03,0.75,0.03,0.78,0.02,1.2,0.07
RidgeClassifier,0.77,0.04,0.75,0.03,0.75,0.03,0.77,0.03,0.2,0.06
RidgeClassifierCV,0.77,0.04,0.75,0.04,0.75,0.04,0.77,0.04,0.83,0.11


In [None]:
# Word2Vec hyper-parameter grid explored by the outer loop.
param_grid_w2v = dict(
    vector_size=[50, 128, 160, 256],  # most relevant
    window=[3, 5],                    # affects local co-occurrence
    sg=[1],                           # skip-gram tends to work better with little data
    epochs=[20],                      # keep fixed unless there are signs of underfitting
    min_count=[1],
)


def _pipeline_escalado(estimador):
    """Wrap an estimator in a StandardScaler -> model pipeline."""
    return Pipeline([("scaler", StandardScaler()), ("model", estimador)])


# Classifiers and their (reduced) parameter grids.
modelos = {
    "RandomForest": {
        "pipeline": _pipeline_escalado(RandomForestClassifier(random_state=42)),
        "param_grid": {
            "model__n_estimators": [50, 100],
            "model__max_depth": [3, 6],
        },
    },
    "LightGBM": {
        "pipeline": _pipeline_escalado(LGBMClassifier(random_state=42, verbose=-1)),
        "param_grid": {
            "model__n_estimators": [50, 100, 200],
            "model__max_depth": [3, 6, 9],
        },
    },
    "ExtraTrees": {
        "pipeline": _pipeline_escalado(ExtraTreesClassifier(random_state=42)),
        "param_grid": {
            "model__n_estimators": [50, 100],
            "model__max_depth": [3, 6],
        },
    },
}

# Scorers: each key is reported under the scikit-learn scorer of the same name.
scoring = {nome: nome for nome in ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')}

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Results file. When it already exists it is loaded so finished runs can be
# skipped, and the CSV header is not written again.
caminho_csv = "resultados_gridsearch_V&D_vivo.csv"
csv_ja_existe = os.path.exists(caminho_csv)
primeira_execucao = not csv_ja_existe
df_existente = pd.read_csv(caminho_csv) if csv_ja_existe else pd.DataFrame()

# Every Word2Vec combination, as (vector_size, window, epochs, sg, min_count).
combinacoes_w2v = list(product(
    *(param_grid_w2v[chave] for chave in ('vector_size', 'window', 'epochs', 'sg', 'min_count'))
))

# Mean-pooling embedding used by the grid search. Defined once, before the
# loop, instead of being re-created for every Word2Vec combination.
def get_mol_vector(tokens, model):
    """Average the Word2Vec vectors of the tokens known to ``model``.

    Returns a zero vector of length ``model.vector_size`` when no token is known.
    """
    vecs = [model.wv[t] for t in tokens if t in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size)


def _ja_processado(nome_modelo, size, window, epochs, sg, min_count):
    """Tell whether the results loaded into ``df_existente`` already contain this run."""
    if df_existente.empty:
        return False
    filtro = (
        (df_existente['modelo'] == nome_modelo) &
        (df_existente['vector_size'] == size) &
        (df_existente['window'] == window) &
        (df_existente['epochs'] == epochs) &
        (df_existente['sg'] == sg) &
        (df_existente['min_count'] == min_count)
    )
    return bool(filtro.any())


# The descriptor/substructure columns (everything except SMILES and Results)
# do not depend on the Word2Vec settings, so they are extracted only once.
descritores = df_descritores.drop(columns=['SMILES', 'Results']).reset_index(drop=True)

# Columns of GridSearchCV.cv_results_ (plus the run identifiers) written to the CSV.
colunas_mostrar = [
    'modelo', 'vector_size', 'window', 'epochs', 'sg', 'min_count',
    'mean_test_accuracy', 'std_test_accuracy',
    'mean_test_precision', 'std_test_precision',
    'mean_test_recall', 'std_test_recall',
    'mean_test_f1', 'std_test_f1',
    'mean_test_roc_auc', 'std_test_roc_auc',
    'params'
]

for size, window, epochs, sg, min_count in combinacoes_w2v:
    print(f"\n🧠 Word2Vec: size={size}, window={window}, epochs={epochs}, sg={sg}, min_count={min_count}")
    try:
        # Decide what still has to run BEFORE paying for Word2Vec training:
        # when resuming, fully processed combinations are skipped cheaply.
        pendentes = [
            nome for nome in modelos
            if not _ja_processado(nome, size, window, epochs, sg, min_count)
        ]

        if pendentes:
            # Train Word2Vec for this combination
            w2v_model = Word2Vec(
                sentences=tokenized,
                vector_size=size,
                window=window,
                sg=sg,
                min_count=min_count,
                epochs=epochs,
                workers=4
            )

            X = np.array([get_mol_vector(tokens, w2v_model) for tokens in tokenized])

            # Embeddings as a DataFrame with explicit column names
            df_w2v = pd.DataFrame(X, columns=[f'w2v_{i}' for i in range(X.shape[1])])

            # Both tables must describe the same molecules, row for row
            assert df_w2v.shape[0] == descritores.shape[0], "Número de linhas não bate entre W2V e descritores"

            # Embeddings followed by descriptors
            X_concat = pd.concat([df_w2v, descritores], axis=1)
            X_concat.columns = X_concat.columns.astype(str)

        # Evaluate each classifier
        for nome_modelo, config in modelos.items():
            print(f"  🔍 Classificando com: {nome_modelo}")

            # Already stored in the CSV by a previous execution
            if nome_modelo not in pendentes:
                print(f"    ⚠️ Já processado anteriormente. Pulando...\n")
                continue

            grid = GridSearchCV(
                estimator=config["pipeline"],
                param_grid=config["param_grid"],
                scoring=scoring,
                refit="f1",
                cv=cv,
                verbose=0,
                n_jobs=-1,
                return_train_score=False
            )

            # `y` is the label vector prepared earlier in the notebook
            grid.fit(X_concat, y)

            df_resultado = pd.DataFrame(grid.cv_results_)
            df_resultado['modelo'] = nome_modelo
            df_resultado['vector_size'] = size
            df_resultado['window'] = window
            df_resultado['epochs'] = epochs
            df_resultado['sg'] = sg
            df_resultado['min_count'] = min_count

            df_filtrado = df_resultado[colunas_mostrar]

            # Append to the CSV right away; the header is written only once
            df_filtrado.to_csv(caminho_csv, mode='a', header=primeira_execucao, index=False)
            primeira_execucao = False

            print(f"    ✅ Melhor F1: {grid.best_score_:.4f} | Parâmetros: {grid.best_params_}")

    except Exception as e:
        # Best effort: report the failure and move on to the next combination
        print("⚠️ Erro ao treinar com esta combinação:", e)

print("\n✅ GridSearch finalizado para todas as combinações! Resultados salvos em:", caminho_csv)


🧠 Word2Vec: size=50, window=3, epochs=20, sg=1, min_count=1
  🔍 Classificando com: RandomForest
    ✅ Melhor F1: 0.5743 | Parâmetros: {'model__max_depth': 6, 'model__n_estimators': 100}
  🔍 Classificando com: LightGBM
    ✅ Melhor F1: 0.7157 | Parâmetros: {'model__max_depth': 9, 'model__n_estimators': 100}
  🔍 Classificando com: ExtraTrees
    ✅ Melhor F1: 0.4787 | Parâmetros: {'model__max_depth': 6, 'model__n_estimators': 100}

🧠 Word2Vec: size=50, window=5, epochs=20, sg=1, min_count=1
  🔍 Classificando com: RandomForest
    ✅ Melhor F1: 0.5759 | Parâmetros: {'model__max_depth': 6, 'model__n_estimators': 100}
  🔍 Classificando com: LightGBM
    ✅ Melhor F1: 0.7189 | Parâmetros: {'model__max_depth': 9, 'model__n_estimators': 200}
  🔍 Classificando com: ExtraTrees
    ✅ Melhor F1: 0.4796 | Parâmetros: {'model__max_depth': 6, 'model__n_estimators': 50}

🧠 Word2Vec: size=128, window=3, epochs=20, sg=1, min_count=1
  🔍 Classificando com: RandomForest
    ✅ Melhor F1: 0.5894 | Parâmetros:

In [None]:
# Reload the accumulated grid-search results and rank them by mean F1,
# using mean accuracy as the tie-breaker (best first).
df_resultados = pd.read_csv(caminho_csv)
df_resultados.sort_values(by=['mean_test_f1', 'mean_test_accuracy'], ascending=False)

Unnamed: 0,modelo,vector_size,window,epochs,sg,min_count,mean_test_accuracy,std_test_accuracy,mean_test_precision,std_test_precision,mean_test_recall,std_test_recall,mean_test_f1,std_test_f1,mean_test_roc_auc,std_test_roc_auc,params
45,LightGBM,128,3,20,1,1,0.81,0.02,0.79,0.05,0.68,0.03,0.73,0.02,0.86,0.03,"{'model__max_depth': 9, 'model__n_estimators':..."
94,LightGBM,160,5,20,1,1,0.81,0.01,0.79,0.03,0.67,0.03,0.73,0.01,0.86,0.02,"{'model__max_depth': 6, 'model__n_estimators':..."
43,LightGBM,128,3,20,1,1,0.80,0.03,0.78,0.05,0.68,0.04,0.72,0.04,0.86,0.03,"{'model__max_depth': 6, 'model__n_estimators':..."
46,LightGBM,128,3,20,1,1,0.80,0.02,0.78,0.04,0.67,0.03,0.72,0.02,0.86,0.02,"{'model__max_depth': 9, 'model__n_estimators':..."
131,LightGBM,256,5,20,1,1,0.80,0.02,0.79,0.04,0.67,0.03,0.72,0.02,0.85,0.02,"{'model__max_depth': 9, 'model__n_estimators':..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48,ExtraTrees,128,3,20,1,1,0.68,0.02,0.88,0.05,0.20,0.07,0.32,0.09,0.77,0.03,"{'model__max_depth': 3, 'model__n_estimators':..."
64,ExtraTrees,128,5,20,1,1,0.68,0.02,0.89,0.06,0.19,0.06,0.31,0.08,0.77,0.03,"{'model__max_depth': 3, 'model__n_estimators':..."
31,ExtraTrees,50,5,20,1,1,0.68,0.02,0.87,0.06,0.19,0.07,0.31,0.09,0.78,0.03,"{'model__max_depth': 3, 'model__n_estimators':..."
116,ExtraTrees,256,3,20,1,1,0.68,0.02,0.85,0.06,0.19,0.06,0.31,0.09,0.77,0.03,"{'model__max_depth': 3, 'model__n_estimators':..."


## Melhores resultados

In [None]:
# Melhor configuração do Word2Vec
best_w2v_config = {
    'vector_size': 50,
    'window': 3,
    'epochs': 20,
    'sg': 1,
    'min_count': 1
}

# Treinar o modelo Word2Vec
print("🧠 Treinando Word2Vec com melhor configuração...")
w2v_model = Word2Vec(
    sentences=tokenized,
    vector_size=best_w2v_config['vector_size'],
    window=best_w2v_config['window'],
    sg=best_w2v_config['sg'],
    min_count=best_w2v_config['min_count'],
    epochs=best_w2v_config['epochs'],
    workers=4
)

def get_mol_vector(tokens, model):
    """Embed one molecule as the mean of the Word2Vec vectors of its known tokens.

    Tokens absent from ``model.wv`` are ignored; when none is known the result
    is a zero vector of length ``model.vector_size``.
    """
    conhecidos = [model.wv[tok] for tok in tokens if tok in model.wv]
    if not conhecidos:
        return np.zeros(model.vector_size)
    return np.mean(conhecidos, axis=0)

X = np.array([get_mol_vector(tokens, w2v_model) for tokens in tokenized])

# Pega os descritores (excluindo SMILES e Results)
descritores = df_descritores.drop(columns=['SMILES', 'Results']).reset_index(drop=True)

# Transforma os vetores em DataFrame
df_w2v = pd.DataFrame(X, columns=[f'w2v_{i}' for i in range(X.shape[1])])

# Garante que ambos têm o mesmo número de linhas
assert df_w2v.shape[0] == descritores.shape[0], "Número de linhas não bate entre W2V e descritores"

# Concatena vetores com descritores
X_concat = pd.concat([df_w2v, descritores], axis=1)
X_concat.columns = X_concat.columns.astype(str)

# Pipeline com LightGBM
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LGBMClassifier(
        max_depth=9,
        n_estimators=200,
        random_state=42,
        verbose=-1
    ))
])

# Validação cruzada
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Scorers para ambas as classes
scoring = {
    'accuracy': 'accuracy',
    'roc_auc': 'roc_auc',

    'precision_pos1': make_scorer(precision_score, average='binary', pos_label=1),
    'recall_pos1': make_scorer(recall_score, average='binary', pos_label=1),
    'f1_pos1': make_scorer(f1_score, average='binary', pos_label=1),

    'precision_pos0': make_scorer(precision_score, average='binary', pos_label=0),
    'recall_pos0': make_scorer(recall_score, average='binary', pos_label=0),
    'f1_pos0': make_scorer(f1_score, average='binary', pos_label=0),
}

# Avaliação
print("\n🚀 Avaliando modelo com validação cruzada (10 folds)...")
resultados = cross_validate(
    pipeline, X_concat, y,
    scoring=scoring,
    cv=cv,
    return_train_score=False
)

# Impressão dos resultados
print("\n🔍 Resultados médios na validação cruzada:\n")

# Separando por classe
for label in ['pos1', 'pos0']:
    print(f"📊 Classe positiva: y = {label[-1]}")
    for metric in ['precision', 'recall', 'f1']:
        key = f'test_{metric}_{label}'
        mean = resultados[key].mean()
        std = resultados[key].std()
        print(f"{metric:<10}: {mean:.4f} ± {std:.4f}")
    print()

# Accuracy e AUC gerais
for metric in ['accuracy', 'roc_auc']:
    mean = resultados[f'test_{metric}'].mean()
    std = resultados[f'test_{metric}'].std()
    print(f"{metric:<10}: {mean:.4f} ± {std:.4f}")

🧠 Treinando Word2Vec com melhor configuração...

🚀 Avaliando modelo com validação cruzada (10 folds)...

🔍 Resultados médios na validação cruzada:

📊 Classe positiva: y = 1
precision : 0.7674 ± 0.0442
recall    : 0.6667 ± 0.0253
f1        : 0.7127 ± 0.0249

📊 Classe positiva: y = 0
precision : 0.8103 ± 0.0123
recall    : 0.8744 ± 0.0297
f1        : 0.8409 ± 0.0169

accuracy  : 0.7953 ± 0.0200
roc_auc   : 0.8493 ± 0.0230


## in vitro

In [None]:
# Load the in vitro dataset (SMILES strings are read from its 'SMILES' column)
# and the "Estruturas de alerta" table passed to the substructure check.
df = pd.read_csv("/content/df_final_vitro.csv")
df_estruturas = pd.read_csv("/content/Estruturas de alerta.csv")
smiles_list = df['SMILES'].tolist()

In [None]:
# Check substructures and compute molecular descriptors for every molecule of
# the in vitro dataset, then show the shape of the resulting table.
df_descritores = verificar_subestruturas_e_descritores(df, df_estruturas)
df_descritores.shape

Processando moléculas: 100%|██████████| 3062/3062 [00:31<00:00, 96.38molécula/s]


(3062, 368)

In [None]:
def smiles_to_tokens(smiles):
    """Turn a SMILES string into a list of atom-environment tokens.

    Every atom of the parsed molecule yields one token of the form
    ``Symbol(Neighbor1,Neighbor2,...)`` (for example ``C(C,O)``), emitted in
    the order RDKit iterates over the atoms.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        list[str]: one token per atom; an empty list when RDKit cannot parse
        the SMILES.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return []

    tokens = []
    for atom in mol.GetAtoms():
        # Element symbols of the directly bonded atoms, comma-separated
        neighbors = ",".join(nbr.GetSymbol() for nbr in atom.GetNeighbors())
        tokens.append(f"{atom.GetSymbol()}({neighbors})")

    return tokens

# Tokenize every molecule; tokenized[i] holds the tokens of smiles_list[i]
tokenized = [smiles_to_tokens(s) for s in smiles_list]

In [None]:
# Train the Word2Vec model on the token lists (one "sentence" per molecule).
# NOTE(review): no `seed` is passed and workers=4, so the embeddings are
# presumably not reproducible between runs — confirm whether that matters for
# the reported numbers.
model = Word2Vec(
    sentences=tokenized,
    vector_size=128,
    window=5,
    sg=1,  # skip-gram
    min_count=1,
    epochs=30,
    workers=4
)


# Molecule embedding: mean + sum + max pooling of the token vectors
def get_mol_vector(tokens, model):
    """Pool a molecule's token vectors into one fixed-length vector.

    Tokens missing from ``model.wv`` are ignored. The vectors of the remaining
    tokens are reduced element-wise with mean, sum and max, and the three
    results are concatenated in that order.

    Args:
        tokens: iterable of token strings for one molecule.
        model: object exposing ``wv`` (token -> vector lookup supporting
            ``in``) and ``vector_size``.

    Returns:
        1-D array of length ``3 * model.vector_size``; all zeros when no token
        is known to the model.
    """
    lookup = model.wv
    known = [lookup[tok] for tok in tokens if tok in lookup]
    if len(known) == 0:
        return np.zeros(model.vector_size * 3)

    stacked = np.array(known)
    pooled = (stacked.mean(axis=0), stacked.sum(axis=0), stacked.max(axis=0))
    return np.concatenate(pooled)

# Embed every molecule; rows follow the order of `tokenized`. The DataFrame is
# built without column names, so pandas assigns default integer labels.
mol_vectors = np.array([get_mol_vector(tokens, model) for tokens in tokenized])
df_Vetores = pd.DataFrame(mol_vectors)

In [None]:
# Join descriptors and Word2Vec features column-wise. pd.concat(axis=1) aligns
# rows on the index, so the descriptor table is first put on a fresh positional
# index (df_Vetores already has one) and the row counts are checked; otherwise a
# non-default index would silently produce misaligned, NaN-padded rows.
assert len(df_descritores) == len(df_Vetores), "Número de linhas não bate entre W2V e descritores"
df_final = pd.concat([df_descritores.reset_index(drop=True), df_Vetores], axis=1)

In [None]:
# Feature matrix: every column except the SMILES string and the target
le = LabelEncoder()
X = df_final.drop(columns=['SMILES', 'Results'])
# Cast all column labels to str — the original note says this fixes an error
# (presumably the mix of str descriptor names and int Word2Vec column labels).
X.columns = X.columns.astype(str)
y = le.fit_transform(df_final['Results'])

# 10-fold stratified cross-validation with a fixed shuffle seed
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# One LazyClassifier score table per fold
results = []

for train_idx, test_idx in cv.split(X, y):
    # Fit the scaler on the training fold only, so no statistic of the
    # held-out fold leaks into the features the models are trained on.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X.iloc[train_idx])
    X_test = scaler.transform(X.iloc[test_idx])
    y_train, y_test = y[train_idx], y[test_idx]

    # Fit and score LazyPredict's collection of classifiers on this fold
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
    models, predictions = clf.fit(X_train, X_test, y_train, y_test)

    results.append(models)

# Aggregate per model (the index of each fold table) across folds and rank by
# mean F1 score, best first
predict_results_vitro = (
    pd.concat(results)
    .groupby(level=0)
    .agg(['mean', 'std'])
    .sort_values(by=('F1 Score', 'mean'), ascending=False)
)


100%|██████████| 32/32 [04:14<00:00,  7.96s/it]
100%|██████████| 32/32 [04:02<00:00,  7.57s/it]
100%|██████████| 32/32 [04:22<00:00,  8.20s/it]
100%|██████████| 32/32 [04:31<00:00,  8.49s/it]
100%|██████████| 32/32 [04:42<00:00,  8.83s/it]
100%|██████████| 32/32 [04:22<00:00,  8.22s/it]
100%|██████████| 32/32 [04:41<00:00,  8.78s/it]
100%|██████████| 32/32 [04:40<00:00,  8.76s/it]
100%|██████████| 32/32 [04:56<00:00,  9.27s/it]
100%|██████████| 32/32 [04:25<00:00,  8.30s/it]


In [None]:
# Persist the aggregated LazyPredict table to a CSV in the working directory
predict_results_vitro.to_csv('results_vitro_V&D_lazy.csv')

In [None]:
# Display the per-model mean/std table, sorted by mean F1 score
predict_results_vitro

Unnamed: 0_level_0,Accuracy,Accuracy,Balanced Accuracy,Balanced Accuracy,ROC AUC,ROC AUC,F1 Score,F1 Score,Time Taken,Time Taken
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
LGBMClassifier,0.77,0.02,0.74,0.03,0.74,0.03,0.76,0.02,6.46,0.29
ExtraTreesClassifier,0.76,0.02,0.74,0.02,0.74,0.02,0.76,0.02,1.62,0.25
NuSVC,0.76,0.02,0.73,0.03,0.73,0.03,0.75,0.02,2.91,0.31
RandomForestClassifier,0.75,0.02,0.73,0.02,0.73,0.02,0.75,0.02,4.88,0.26
XGBClassifier,0.75,0.02,0.73,0.03,0.73,0.03,0.75,0.03,10.24,0.84
BaggingClassifier,0.75,0.02,0.74,0.02,0.74,0.02,0.75,0.02,12.78,0.38
RidgeClassifier,0.74,0.02,0.72,0.03,0.72,0.03,0.74,0.02,0.3,0.18
RidgeClassifierCV,0.74,0.02,0.72,0.02,0.72,0.02,0.74,0.02,1.05,0.25
LogisticRegression,0.74,0.02,0.71,0.02,0.71,0.02,0.73,0.02,0.39,0.05
SVC,0.74,0.02,0.71,0.02,0.71,0.02,0.73,0.02,2.69,0.29


In [None]:
# Word2Vec hyperparameter grid explored by the search below
param_grid_w2v = {
    'vector_size': [50 ,128, 160, 256],  # most relevant
    'window': [3, 5],                # affects local co-occurrence
    'sg': [1],                       # skip-gram tends to do better with little data
    'epochs': [20],                 # keep fixed unless there are signs of underfitting
    'min_count': [1]
}

# Classifiers and their (reduced) hyperparameter grids
def _pipeline_escalonado(estimador):
    """Wrap an estimator in the shared scaler -> model pipeline."""
    return Pipeline([
        ("scaler", StandardScaler()),
        ("model", estimador),
    ])


modelos = {
    "RandomForest": {
        "pipeline": _pipeline_escalonado(RandomForestClassifier(random_state=42)),
        "param_grid": {
            "model__n_estimators": [50, 100],
            "model__max_depth": [3, 6],
        },
    },
    "LightGBM": {
        # verbose=-1 is passed to LightGBM to quiet its logging
        "pipeline": _pipeline_escalonado(LGBMClassifier(random_state=42, verbose=-1)),
        "param_grid": {
            "model__n_estimators": [50, 100, 200],
            "model__max_depth": [3, 6, 9],
        },
    },
    "ExtraTrees": {
        "pipeline": _pipeline_escalonado(ExtraTreesClassifier(random_state=42)),
        "param_grid": {
            "model__n_estimators": [50, 100],
            "model__max_depth": [3, 6],
        },
    },
}

# Metrics computed for every grid-search candidate (scikit-learn scorer names)
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'roc_auc': 'roc_auc'
}

# 10-fold stratified cross-validation with a fixed shuffle seed
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# File that accumulates the grid-search results
caminho_csv = "resultados_gridsearch_V&D_vitro.csv"

# Resume support: load rows from a previous run (if the file exists) so that
# combinations already evaluated can be skipped. `primeira_execucao` records
# whether the CSV header still has to be written.
if os.path.exists(caminho_csv):
    primeira_execucao = False
    df_existente = pd.read_csv(caminho_csv)
else:
    primeira_execucao = True
    df_existente = pd.DataFrame()

# Cartesian product of the Word2Vec grid; each tuple is
# (vector_size, window, epochs, sg, min_count)
combinacoes_w2v = list(product(*(
    param_grid_w2v[chave]
    for chave in ('vector_size', 'window', 'epochs', 'sg', 'min_count')
)))

def get_mol_vector(tokens, model):
    """Return the mean Word2Vec vector of a molecule's tokens.

    Args:
        tokens: iterable of token strings for one molecule.
        model: trained Word2Vec model (uses ``model.wv`` and ``model.vector_size``).

    Returns:
        1-D array of length ``model.vector_size``; all zeros when none of the
        tokens is in the model vocabulary.
    """
    vecs = [model.wv[t] for t in tokens if t in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size)


# Descriptor columns (everything except SMILES and Results). They do not depend
# on the Word2Vec settings, so they are extracted once, outside the loop.
descritores = df_descritores.drop(columns=['SMILES', 'Results']).reset_index(drop=True)

# Columns written to the results CSV
colunas_mostrar = [
    'modelo', 'vector_size', 'window', 'epochs', 'sg', 'min_count',
    'mean_test_accuracy', 'std_test_accuracy',
    'mean_test_precision', 'std_test_precision',
    'mean_test_recall', 'std_test_recall',
    'mean_test_f1', 'std_test_f1',
    'mean_test_roc_auc', 'std_test_roc_auc',
    'params'
]

for size, window, epochs, sg, min_count in combinacoes_w2v:
    print(f"\n🧠 Word2Vec: size={size}, window={window}, epochs={epochs}, sg={sg}, min_count={min_count}")
    try:
        # Features are built lazily: Word2Vec is trained only once a classifier
        # that still needs evaluating is found, so a combination that is already
        # fully recorded in the CSV costs nothing when the search is resumed.
        X_concat = None

        # Evaluate each classifier
        for nome_modelo, config in modelos.items():
            print(f"  🔍 Classificando com: {nome_modelo}")

            # Skip (model, Word2Vec combination) pairs already present in the CSV
            if not df_existente.empty:
                filtro = (
                    (df_existente['modelo'] == nome_modelo) &
                    (df_existente['vector_size'] == size) &
                    (df_existente['window'] == window) &
                    (df_existente['epochs'] == epochs) &
                    (df_existente['sg'] == sg) &
                    (df_existente['min_count'] == min_count)
                )
                if filtro.any():
                    print(f"    ⚠️ Já processado anteriormente. Pulando...\n")
                    continue

            if X_concat is None:
                # Train Word2Vec for this combination
                w2v_model = Word2Vec(
                    sentences=tokenized,
                    vector_size=size,
                    window=window,
                    sg=sg,
                    min_count=min_count,
                    epochs=epochs,
                    workers=4
                )

                # One embedding row per molecule, with explicit w2v_<i> column names
                X = np.array([get_mol_vector(tokens, w2v_model) for tokens in tokenized])
                df_w2v = pd.DataFrame(X, columns=[f'w2v_{i}' for i in range(X.shape[1])])

                # Both feature blocks must describe the same molecules, row for row
                assert df_w2v.shape[0] == descritores.shape[0], "Número de linhas não bate entre W2V e descritores"

                # Concatenate embeddings and descriptors column-wise
                X_concat = pd.concat([df_w2v, descritores], axis=1)
                X_concat.columns = X_concat.columns.astype(str)

            grid = GridSearchCV(
                estimator=config["pipeline"],
                param_grid=config["param_grid"],
                scoring=scoring,
                refit="f1",
                cv=cv,
                verbose=0,
                n_jobs=-1,
                return_train_score=False
            )

            grid.fit(X_concat, y)

            # Tag every candidate row with the model name and Word2Vec settings
            df_resultado = pd.DataFrame(grid.cv_results_)
            df_resultado['modelo'] = nome_modelo
            df_resultado['vector_size'] = size
            df_resultado['window'] = window
            df_resultado['epochs'] = epochs
            df_resultado['sg'] = sg
            df_resultado['min_count'] = min_count

            df_filtrado = df_resultado[colunas_mostrar]

            # Append incrementally; the header is written only on the first write
            df_filtrado.to_csv(caminho_csv, mode='a', header=primeira_execucao, index=False)
            primeira_execucao = False

            print(f"    ✅ Melhor F1: {grid.best_score_:.4f} | Parâmetros: {grid.best_params_}")

    except Exception as e:
        # Best effort: report the failure and move on to the next combination
        print("⚠️ Erro ao treinar com esta combinação:", e)

print("\n✅ GridSearch finalizado para todas as combinações! Resultados salvos em:", caminho_csv)


🧠 Word2Vec: size=50, window=3, epochs=20, sg=1, min_count=1
  🔍 Classificando com: RandomForest
    ✅ Melhor F1: 0.7952 | Parâmetros: {'model__max_depth': 6, 'model__n_estimators': 100}
  🔍 Classificando com: LightGBM
    ✅ Melhor F1: 0.8111 | Parâmetros: {'model__max_depth': 9, 'model__n_estimators': 200}
  🔍 Classificando com: ExtraTrees
    ✅ Melhor F1: 0.7831 | Parâmetros: {'model__max_depth': 6, 'model__n_estimators': 100}

🧠 Word2Vec: size=50, window=5, epochs=20, sg=1, min_count=1
  🔍 Classificando com: RandomForest
    ✅ Melhor F1: 0.7956 | Parâmetros: {'model__max_depth': 6, 'model__n_estimators': 100}
  🔍 Classificando com: LightGBM
    ✅ Melhor F1: 0.8070 | Parâmetros: {'model__max_depth': 9, 'model__n_estimators': 50}
  🔍 Classificando com: ExtraTrees
    ✅ Melhor F1: 0.7827 | Parâmetros: {'model__max_depth': 6, 'model__n_estimators': 100}

🧠 Word2Vec: size=128, window=3, epochs=20, sg=1, min_count=1
  🔍 Classificando com: RandomForest
    ✅ Melhor F1: 0.7980 | Parâmetros:

In [None]:
# Reload the accumulated grid-search results and rank them by mean F1,
# breaking ties by mean accuracy (best first)
df_resultados = pd.read_csv(caminho_csv)
df_resultados.sort_values(by=['mean_test_f1', 'mean_test_accuracy'], ascending=False)

Unnamed: 0,modelo,vector_size,window,epochs,sg,min_count,mean_test_accuracy,std_test_accuracy,mean_test_precision,std_test_precision,mean_test_recall,std_test_recall,mean_test_f1,std_test_f1,mean_test_roc_auc,std_test_roc_auc,params
60,LightGBM,128,5,20,1,1,0.77,0.02,0.79,0.02,0.84,0.02,0.82,0.02,0.83,0.02,"{'model__max_depth': 6, 'model__n_estimators':..."
58,LightGBM,128,5,20,1,1,0.76,0.02,0.77,0.02,0.86,0.02,0.81,0.02,0.82,0.02,"{'model__max_depth': 6, 'model__n_estimators':..."
62,LightGBM,128,5,20,1,1,0.76,0.03,0.79,0.03,0.84,0.02,0.81,0.02,0.83,0.02,"{'model__max_depth': 9, 'model__n_estimators':..."
63,LightGBM,128,5,20,1,1,0.76,0.02,0.79,0.03,0.84,0.02,0.81,0.02,0.83,0.02,"{'model__max_depth': 9, 'model__n_estimators':..."
129,LightGBM,256,5,20,1,1,0.76,0.02,0.78,0.02,0.85,0.03,0.81,0.02,0.82,0.02,"{'model__max_depth': 9, 'model__n_estimators':..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,ExtraTrees,256,3,20,1,1,0.62,0.00,0.62,0.00,1.00,0.00,0.76,0.00,0.73,0.02,"{'model__max_depth': 3, 'model__n_estimators':..."
132,ExtraTrees,256,5,20,1,1,0.62,0.00,0.62,0.00,0.99,0.00,0.76,0.00,0.72,0.02,"{'model__max_depth': 3, 'model__n_estimators':..."
14,ExtraTrees,50,3,20,1,1,0.62,0.00,0.62,0.00,1.00,0.00,0.76,0.00,0.73,0.02,"{'model__max_depth': 3, 'model__n_estimators':..."
30,ExtraTrees,50,5,20,1,1,0.62,0.00,0.62,0.00,0.99,0.00,0.76,0.00,0.72,0.03,"{'model__max_depth': 3, 'model__n_estimators':..."


In [None]:
# Word2Vec settings selected as best for the in vitro dataset
best_w2v_config = dict(vector_size=128, window=5, epochs=20, sg=1, min_count=1)

# Train Word2Vec with those settings
print("🧠 Treinando Word2Vec com melhor configuração...")
w2v_model = Word2Vec(sentences=tokenized, workers=4, **best_w2v_config)


def get_mol_vector(tokens, model):
    """Mean-pool the Word2Vec vectors of a molecule's tokens.

    Tokens absent from ``model.wv`` are ignored; when none is known, a zero
    vector of length ``model.vector_size`` is returned.
    """
    vecs = [model.wv[tok] for tok in tokens if tok in model.wv]
    if not vecs:
        return np.zeros(model.vector_size)
    return np.mean(vecs, axis=0)


# One embedding row per molecule, in the order of `tokenized`
X = np.array([get_mol_vector(tokens, w2v_model) for tokens in tokenized])

# Descriptor columns (everything except SMILES and Results)
descritores = df_descritores.drop(columns=['SMILES', 'Results']).reset_index(drop=True)

# Embeddings as a DataFrame with w2v_<i> column names
df_w2v = pd.DataFrame(X, columns=['w2v_' + str(i) for i in range(X.shape[1])])

# Both feature blocks must have the same number of rows
assert len(df_w2v) == len(descritores), "Número de linhas não bate entre W2V e descritores"

# Embeddings followed by descriptors, with string column labels
X_concat = pd.concat([df_w2v, descritores], axis=1)
X_concat.columns = X_concat.columns.astype(str)

# Scaler + LightGBM pipeline
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LGBMClassifier(max_depth=6, n_estimators=200, random_state=42, verbose=-1)),
])

# 10-fold stratified cross-validation with a fixed shuffle seed
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Overall scorers, plus precision / recall / F1 evaluated once with class 1 and
# once with class 0 as the positive label
scoring = {'accuracy': 'accuracy', 'roc_auc': 'roc_auc'}
for pos_label in (1, 0):
    for metric_name, metric_fn in (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    ):
        scoring[f'{metric_name}_pos{pos_label}'] = make_scorer(
            metric_fn, average='binary', pos_label=pos_label
        )

# Evaluation
print("\n🚀 Avaliando modelo com validação cruzada (10 folds)...")
resultados = cross_validate(pipeline, X_concat, y, scoring=scoring, cv=cv, return_train_score=False)

# Report
print("\n🔍 Resultados médios na validação cruzada:\n")


def _imprimir_metrica(nome, valores):
    """Print one metric as 'name: mean ± std' computed over the folds."""
    print(f"{nome:<10}: {valores.mean():.4f} ± {valores.std():.4f}")


# Per-class metrics
for pos_label in (1, 0):
    print(f"📊 Classe positiva: y = {pos_label}")
    for metric_name in ('precision', 'recall', 'f1'):
        _imprimir_metrica(metric_name, resultados[f'test_{metric_name}_pos{pos_label}'])
    print()

# Overall accuracy and ROC AUC
for metric_name in ('accuracy', 'roc_auc'):
    _imprimir_metrica(metric_name, resultados[f'test_{metric_name}'])

🧠 Treinando Word2Vec com melhor configuração...

🚀 Avaliando modelo com validação cruzada (10 folds)...

🔍 Resultados médios na validação cruzada:

📊 Classe positiva: y = 1
precision : 0.7853 ± 0.0150
recall    : 0.8301 ± 0.0253
f1        : 0.8068 ± 0.0144

📊 Classe positiva: y = 0
precision : 0.7046 ± 0.0295
recall    : 0.6393 ± 0.0348
f1        : 0.6697 ± 0.0243

accuracy  : 0.7564 ± 0.0169
roc_auc   : 0.8263 ± 0.0210
