In [None]:
# Remove the installed NumPy, then reinstall it pinned to 1.26.4 (no pip cache, forced).
!pip uninstall -y numpy
!pip install numpy==1.26.4 --no-cache-dir --force-reinstall
import os
# Send signal 9 to the current process; this cell never finishes normally.
os.kill(os.getpid(), 9)  # Restarts the kernel


In [None]:
!pip install rdkit-pypi

In [None]:
# Force-reinstall the latest numpy and gensim.
# NOTE(review): numpy is listed here without a version, so this presumably replaces the
# numpy==1.26.4 pin installed by the first cell — confirm that this is intended.
!pip install --upgrade numpy gensim --force-reinstall

In [None]:
!pip install lazypredict

In [None]:
# Manipulação de dados
import pandas as pd
import numpy as np

# RDKit
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors

# Gensim
from gensim.models import Word2Vec

# Scikit-learn
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import NuSVC
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# XGBoost
from xgboost import XGBClassifier

# LazyPredict
from lazypredict.Supervised import LazyClassifier

# Utilitários
from itertools import product


# Datasets juntos

In [None]:
# Read the dataset CSV and pull the SMILES strings out of its 'SMILES' column.
df = pd.read_csv(filepath_or_buffer="/content/df_both.csv")
smiles_list = df.loc[:, 'SMILES'].tolist()

## Tokenizando os SMILES com RDKit

In [None]:
def smiles_to_tokens(smiles):
    """Turn a SMILES string into one token per atom.

    Each token has the form ``"<symbol>(<n1>,<n2>,...)"``: the atom's element
    symbol followed by the element symbols of its neighbours, in the order
    RDKit returns them.

    Parameters
    ----------
    smiles : str
        SMILES string to tokenize.

    Returns
    -------
    list of str
        One token per atom, in RDKit atom order. An empty list is returned
        when ``smiles`` is not a string (e.g. ``None`` or the ``NaN`` that
        pandas uses for an empty CSV cell) or when RDKit cannot parse it.
    """
    # A non-string value would make Chem.MolFromSmiles raise instead of
    # returning None, so it is treated like any other unparsable entry.
    if not isinstance(smiles, str):
        return []

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return []

    tokens = []
    for atom in mol.GetAtoms():
        neighbors = ",".join(nbr.GetSymbol() for nbr in atom.GetNeighbors())
        tokens.append(f"{atom.GetSymbol()}({neighbors})")
    return tokens

# Tokenize every SMILES string: one token list per molecule, in dataset order.
tokenized = list(map(smiles_to_tokens, smiles_list))



## Vetorizando os tokens com Word2Vec

In [None]:
# Train the Word2Vec model on the per-molecule token lists.
model = Word2Vec(
    sentences=tokenized,
    vector_size=128,
    window=5,
    sg=1,  # skip-gram
    min_count=1,
    epochs=30,
    workers=4,
    # Fixed seed, matching the random_state=42 used for the CV splits in this notebook.
    # Per the gensim documentation, a fully reproducible run additionally needs
    # workers=1 (and a fixed PYTHONHASHSEED); workers is left unchanged here.
    seed=42
)


def get_mol_vector(tokens, model):
    """Pool the token vectors of one molecule into a single feature vector.

    The vectors of all tokens found in ``model.wv`` are stacked and reduced
    along the token axis three ways; the result is the concatenation
    ``[mean, sum, max]``, i.e. ``3 * model.vector_size`` values. When no token
    is found in ``model.wv`` a zero vector of that same length is returned.
    """
    keyed_vectors = model.wv
    known = [keyed_vectors[tok] for tok in tokens if tok in keyed_vectors]
    if len(known) == 0:
        return np.zeros(model.vector_size * 3)

    stacked = np.array(known)
    pooled = (stacked.mean(axis=0), stacked.sum(axis=0), stacked.max(axis=0))
    return np.concatenate(pooled)

# Build the feature matrix (one pooled vector per molecule) and wrap it in a DataFrame.
mol_vectors = np.array([get_mol_vector(mol_tokens, model) for mol_tokens in tokenized])
df_ = pd.DataFrame(data=mol_vectors)

In [None]:
# Features are the pooled Word2Vec vectors; labels are the integer-encoded 'Results' column.
le = LabelEncoder()
X = df_
y = le.fit_transform(df['Results'])

# Stratified 3-fold CV, shuffled with a fixed seed.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# One LazyPredict score table per fold.
results = []

X_values = X.to_numpy()
for train_idx, test_idx in cv.split(X_values, y):
    # Fit the scaler on the training fold only and reuse it for the test fold,
    # so that test-fold statistics never leak into the preprocessing.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_values[train_idx])
    X_test = scaler.transform(X_values[test_idx])
    y_train, y_test = y[train_idx], y[test_idx]

    # Fit and score LazyPredict's classifiers on this fold.
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
    models, predictions = clf.fit(X_train, X_test, y_train, y_test)

    results.append(models)

# Mean and std of every metric across folds, per model, ranked by mean F1.
predict_results_both = (
    pd.concat(results)
    .groupby(level=0)
    .agg(['mean', 'std'])
    .sort_values(by=('F1 Score', 'mean'), ascending=False)
)


In [None]:
predict_results_both

Unnamed: 0_level_0,Accuracy,Accuracy,Balanced Accuracy,Balanced Accuracy,ROC AUC,ROC AUC,F1 Score,F1 Score,Time Taken,Time Taken
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
RandomForestClassifier,0.73,0.01,0.73,0.01,0.73,0.01,0.73,0.01,5.55,1.05
ExtraTreesClassifier,0.72,0.01,0.72,0.01,0.72,0.01,0.72,0.01,1.42,0.39
LGBMClassifier,0.72,0.02,0.72,0.02,0.72,0.02,0.72,0.02,5.67,2.05
NuSVC,0.72,0.01,0.72,0.01,0.72,0.01,0.72,0.01,1.91,0.1
SVC,0.72,0.01,0.72,0.01,0.72,0.01,0.72,0.01,2.05,0.36
XGBClassifier,0.71,0.0,0.71,0.0,0.71,0.0,0.71,0.0,9.7,1.77
RidgeClassifierCV,0.7,0.01,0.7,0.01,0.7,0.01,0.7,0.01,1.22,1.5
RidgeClassifier,0.7,0.02,0.7,0.02,0.7,0.02,0.7,0.02,0.2,0.05
CalibratedClassifierCV,0.69,0.02,0.69,0.02,0.69,0.02,0.69,0.02,15.53,1.51
LogisticRegression,0.69,0.02,0.69,0.02,0.69,0.02,0.69,0.02,0.21,0.01


## GridSearch

In [None]:
# Word2Vec hyperparameter grid; every combination is trained in the loop below.
param_grid_w2v = {
    'vector_size': [128, 160],
    'window': [3, 5],
    'epochs': [20, 30],
    'sg': [0, 1],
    'min_count': [1, 2]
}

# Classifiers to evaluate: each entry is a scaler + model pipeline and its parameter grid.
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases — confirm
# against the installed version before removing it.
modelos = {
    "RandomForest": {
        "pipeline": Pipeline([
            ("scaler", StandardScaler()),
            ("model", RandomForestClassifier(random_state=42))
        ]),
        "param_grid": {
            "model__n_estimators": [50, 100],
            "model__max_depth": [3, 6]
        }
    },
    "XGBoost": {
        "pipeline": Pipeline([
            ("scaler", StandardScaler()),
            ("model", XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'))
        ]),
        "param_grid": {
            "model__n_estimators": [50],
            "model__max_depth": [3],
            "model__learning_rate": [0.1]
        }
    },
    "ExtraTrees": {
        "pipeline": Pipeline([
            ("scaler", StandardScaler()),
            ("model", ExtraTreesClassifier(random_state=42))
        ]),
        "param_grid": {
            "model__n_estimators": [50, 100],
            "model__max_depth": [3, 6]
        }
    }
}

# Metrics computed for every grid point; 'f1' is the one used for refit below.
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'roc_auc': 'roc_auc'
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
resultados_finais = []

# All Word2Vec hyperparameter combinations, in the order the loop unpacks them.
combinacoes_w2v = list(product(
    param_grid_w2v['vector_size'],
    param_grid_w2v['window'],
    param_grid_w2v['epochs'],
    param_grid_w2v['sg'],
    param_grid_w2v['min_count']
))


# Defined once, before the loop, instead of being re-created on every iteration.
def get_mol_vector(tokens, model):
    """Return the mean of the vectors of the tokens present in ``model.wv``.

    Falls back to a zero vector of length ``model.vector_size`` when none of
    the tokens is present.
    """
    vecs = [model.wv[t] for t in tokens if t in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size)


# `tokenized` and `y` are not defined in this cell; they come from cells run earlier.
for size, window, epochs, sg, min_count in combinacoes_w2v:
    print(f"\n🧠 Word2Vec: size={size}, window={window}, epochs={epochs}, sg={sg}, min_count={min_count}")
    try:
        # Train Word2Vec for this combination (seeded like the CV splitter).
        w2v_model = Word2Vec(
            sentences=tokenized,
            vector_size=size,
            window=window,
            sg=sg,
            min_count=min_count,
            epochs=epochs,
            workers=4,
            seed=42
        )

        X = np.array([get_mol_vector(tokens, w2v_model) for tokens in tokenized])

        # Grid-search every classifier on these embeddings.
        for nome_modelo, config in modelos.items():
            print(f"  🔍 Classificando com: {nome_modelo}")

            grid = GridSearchCV(
                estimator=config["pipeline"],
                param_grid=config["param_grid"],
                scoring=scoring,
                refit="f1",
                cv=cv,
                verbose=0,
                n_jobs=-1,
                return_train_score=False
            )

            grid.fit(X, y)

            # Keep the full CV table, tagged with the classifier and Word2Vec settings.
            df_resultado = pd.DataFrame(grid.cv_results_)
            df_resultado['modelo'] = nome_modelo
            df_resultado['vector_size'] = size
            df_resultado['window'] = window
            df_resultado['epochs'] = epochs
            df_resultado['sg'] = sg
            df_resultado['min_count'] = min_count
            resultados_finais.append(df_resultado)

            print(f"    ✅ Melhor F1: {grid.best_score_:.4f} | Parâmetros: {grid.best_params_}")

    except Exception as e:
        # Best effort: report the failure and move on to the next combination.
        print("⚠️ Erro ao treinar com esta combinação:", e)

# If every combination failed there is nothing to concatenate; fail with a clear message.
if not resultados_finais:
    raise ValueError("Nenhuma combinação foi avaliada com sucesso; veja os erros impressos acima.")

# Stack the per-run tables into one comparison frame.
df_comparacao = pd.concat(resultados_finais, ignore_index=True)

# Columns kept in the final ranking.
colunas_mostrar = ['modelo', 'vector_size', 'window', 'epochs', 'sg', 'min_count',
                   'mean_test_accuracy', 'mean_test_precision', 'mean_test_recall',
                   'mean_test_f1', 'mean_test_roc_auc', 'params']

df_resultados_finais = df_comparacao[colunas_mostrar].sort_values(by="mean_test_f1", ascending=False)


🧠 Word2Vec: size=128, window=3, epochs=20, sg=0, min_count=1
  🔍 Classificando com: RandomForest
    ✅ Melhor F1: 0.6753 | Parâmetros: {'model__max_depth': 6, 'model__n_estimators': 50}
  🔍 Classificando com: XGBoost
    ✅ Melhor F1: 0.6795 | Parâmetros: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 50}
  🔍 Classificando com: ExtraTrees
    ✅ Melhor F1: 0.6564 | Parâmetros: {'model__max_depth': 6, 'model__n_estimators': 50}

🧠 Word2Vec: size=128, window=3, epochs=20, sg=0, min_count=2
  🔍 Classificando com: RandomForest
    ✅ Melhor F1: 0.6855 | Parâmetros: {'model__max_depth': 6, 'model__n_estimators': 100}
  🔍 Classificando com: XGBoost
    ✅ Melhor F1: 0.6955 | Parâmetros: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 50}
  🔍 Classificando com: ExtraTrees
    ✅ Melhor F1: 0.6604 | Parâmetros: {'model__max_depth': 6, 'model__n_estimators': 100}

🧠 Word2Vec: size=128, window=3, epochs=20, sg=1, min_count=1
  🔍 Classificando 

In [None]:
df_resultados_finais

Unnamed: 0,modelo,vector_size,window,epochs,sg,min_count,mean_test_accuracy,mean_test_precision,mean_test_recall,mean_test_f1,mean_test_roc_auc,params
274,XGBoost,160,5,30,1,1,0.71,0.73,0.69,0.71,0.76,"{'model__learning_rate': 0.1, 'model__max_dept..."
282,RandomForest,160,5,30,1,2,0.71,0.73,0.68,0.70,0.77,"{'model__max_depth': 6, 'model__n_estimators':..."
272,RandomForest,160,5,30,1,1,0.71,0.73,0.68,0.70,0.77,"{'model__max_depth': 6, 'model__n_estimators':..."
103,XGBoost,128,5,20,1,2,0.71,0.73,0.68,0.70,0.76,"{'model__learning_rate': 0.1, 'model__max_dept..."
283,XGBoost,160,5,30,1,2,0.70,0.71,0.69,0.70,0.77,"{'model__learning_rate': 0.1, 'model__max_dept..."
...,...,...,...,...,...,...,...,...,...,...,...,...
0,RandomForest,128,3,20,0,1,0.65,0.69,0.58,0.63,0.71,"{'model__max_depth': 3, 'model__n_estimators':..."
153,RandomForest,160,3,20,0,2,0.65,0.69,0.58,0.63,0.71,"{'model__max_depth': 3, 'model__n_estimators':..."
82,RandomForest,128,5,20,0,2,0.65,0.70,0.57,0.63,0.71,"{'model__max_depth': 3, 'model__n_estimators':..."
154,RandomForest,160,3,20,0,2,0.65,0.70,0.57,0.62,0.71,"{'model__max_depth': 3, 'model__n_estimators':..."


# Datasets separados


## in vivo

In [None]:
# Read the dataset CSV and pull the SMILES strings out of its 'SMILES' column.
df = pd.read_csv(filepath_or_buffer="/content/df_final_vivo.csv")
smiles_list = df.loc[:, 'SMILES'].tolist()

In [None]:
def smiles_to_tokens(smiles):
    """Turn a SMILES string into one token per atom.

    Each token has the form ``"<symbol>(<n1>,<n2>,...)"``: the atom's element
    symbol followed by the element symbols of its neighbours, in the order
    RDKit returns them.

    Parameters
    ----------
    smiles : str
        SMILES string to tokenize.

    Returns
    -------
    list of str
        One token per atom, in RDKit atom order. An empty list is returned
        when ``smiles`` is not a string (e.g. ``None`` or the ``NaN`` that
        pandas uses for an empty CSV cell) or when RDKit cannot parse it.
    """
    # A non-string value would make Chem.MolFromSmiles raise instead of
    # returning None, so it is treated like any other unparsable entry.
    if not isinstance(smiles, str):
        return []

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return []

    tokens = []
    for atom in mol.GetAtoms():
        neighbors = ",".join(nbr.GetSymbol() for nbr in atom.GetNeighbors())
        tokens.append(f"{atom.GetSymbol()}({neighbors})")
    return tokens

# Tokenize every SMILES string: one token list per molecule, in dataset order.
tokenized = list(map(smiles_to_tokens, smiles_list))



In [None]:
# Train the Word2Vec model on the per-molecule token lists.
model = Word2Vec(
    sentences=tokenized,
    vector_size=128,
    window=5,
    sg=1,  # skip-gram
    min_count=1,
    epochs=30,
    workers=4,
    # Fixed seed, matching the random_state=42 used for the CV splits in this notebook.
    # Per the gensim documentation, a fully reproducible run additionally needs
    # workers=1 (and a fixed PYTHONHASHSEED); workers is left unchanged here.
    seed=42
)


def get_mol_vector(tokens, model):
    """Pool the token vectors of one molecule into a single feature vector.

    The vectors of all tokens found in ``model.wv`` are stacked and reduced
    along the token axis three ways; the result is the concatenation
    ``[mean, sum, max]``, i.e. ``3 * model.vector_size`` values. When no token
    is found in ``model.wv`` a zero vector of that same length is returned.
    """
    keyed_vectors = model.wv
    known = [keyed_vectors[tok] for tok in tokens if tok in keyed_vectors]
    if len(known) == 0:
        return np.zeros(model.vector_size * 3)

    stacked = np.array(known)
    pooled = (stacked.mean(axis=0), stacked.sum(axis=0), stacked.max(axis=0))
    return np.concatenate(pooled)

# Build the feature matrix (one pooled vector per molecule) and wrap it in a DataFrame.
mol_vectors = np.array([get_mol_vector(mol_tokens, model) for mol_tokens in tokenized])
df_Vivo = pd.DataFrame(data=mol_vectors)

In [None]:
# Features are the pooled Word2Vec vectors; labels are the integer-encoded 'Results' column.
le = LabelEncoder()
X = df_Vivo
y = le.fit_transform(df['Results'])

# Stratified 3-fold CV, shuffled with a fixed seed.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# One LazyPredict score table per fold.
results = []

X_values = X.to_numpy()
for train_idx, test_idx in cv.split(X_values, y):
    # Fit the scaler on the training fold only and reuse it for the test fold,
    # so that test-fold statistics never leak into the preprocessing.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_values[train_idx])
    X_test = scaler.transform(X_values[test_idx])
    y_train, y_test = y[train_idx], y[test_idx]

    # Fit and score LazyPredict's classifiers on this fold.
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
    models, predictions = clf.fit(X_train, X_test, y_train, y_test)

    results.append(models)

# Mean and std of every metric across folds, per model, ranked by mean F1.
predict_results_vivo = (
    pd.concat(results)
    .groupby(level=0)
    .agg(['mean', 'std'])
    .sort_values(by=('F1 Score', 'mean'), ascending=False)
)


 97%|█████████▋| 31/32 [00:36<00:01,  1.17s/it]

[LightGBM] [Info] Number of positive: 564, number of negative: 918
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008201 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 73981
[LightGBM] [Info] Number of data points in the train set: 1482, number of used features: 384
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.380567 -> initscore=-0.487143
[LightGBM] [Info] Start training from score -0.487143


100%|██████████| 32/32 [00:39<00:00,  1.25s/it]
 97%|█████████▋| 31/32 [00:32<00:01,  1.07s/it]

[LightGBM] [Info] Number of positive: 564, number of negative: 918
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014245 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 74021
[LightGBM] [Info] Number of data points in the train set: 1482, number of used features: 384
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.380567 -> initscore=-0.487143
[LightGBM] [Info] Start training from score -0.487143


100%|██████████| 32/32 [00:36<00:00,  1.15s/it]
 97%|█████████▋| 31/32 [00:39<00:00,  1.08it/s]

[LightGBM] [Info] Number of positive: 564, number of negative: 918
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008966 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 73696
[LightGBM] [Info] Number of data points in the train set: 1482, number of used features: 384
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.380567 -> initscore=-0.487143
[LightGBM] [Info] Start training from score -0.487143


100%|██████████| 32/32 [00:43<00:00,  1.35s/it]


In [None]:
predict_results_vivo

Unnamed: 0_level_0,Accuracy,Accuracy,Balanced Accuracy,Balanced Accuracy,ROC AUC,ROC AUC,F1 Score,F1 Score,Time Taken,Time Taken
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
ExtraTreesClassifier,0.78,0.02,0.75,0.01,0.75,0.01,0.78,0.02,0.87,0.19
RandomForestClassifier,0.78,0.02,0.75,0.01,0.75,0.01,0.78,0.01,2.61,0.31
LGBMClassifier,0.78,0.01,0.75,0.01,0.75,0.01,0.78,0.01,3.95,0.12
NuSVC,0.78,0.0,0.74,0.0,0.74,0.0,0.77,0.0,0.6,0.02
XGBClassifier,0.77,0.02,0.74,0.02,0.74,0.02,0.77,0.02,5.74,0.94
BaggingClassifier,0.76,0.0,0.72,0.0,0.72,0.0,0.75,0.0,5.13,0.25
KNeighborsClassifier,0.75,0.01,0.73,0.01,0.73,0.01,0.75,0.01,0.12,0.04
SVC,0.76,0.01,0.72,0.01,0.72,0.01,0.75,0.01,0.58,0.04
LogisticRegression,0.74,0.01,0.72,0.01,0.72,0.01,0.74,0.01,0.15,0.02
RidgeClassifier,0.74,0.01,0.72,0.01,0.72,0.01,0.74,0.01,0.08,0.02


### gridsearch

In [None]:
# Word2Vec hyperparameter grid; every combination is trained in the loop below.
param_grid_w2v = {
    'vector_size': [128, 160],
    'window': [3, 5],
    'epochs': [20, 30],
    'sg': [0, 1],
    'min_count': [1, 2]
}

# Classifiers to evaluate: each entry is a scaler + model pipeline and its parameter grid.
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases — confirm
# against the installed version before removing it.
modelos = {
    "RandomForest": {
        "pipeline": Pipeline([
            ("scaler", StandardScaler()),
            ("model", RandomForestClassifier(random_state=42))
        ]),
        "param_grid": {
            "model__n_estimators": [50, 100],
            "model__max_depth": [3, 6]
        }
    },
    "XGBoost": {
        "pipeline": Pipeline([
            ("scaler", StandardScaler()),
            ("model", XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'))
        ]),
        "param_grid": {
            "model__n_estimators": [50],
            "model__max_depth": [3],
            "model__learning_rate": [0.1]
        }
    },
    "ExtraTrees": {
        "pipeline": Pipeline([
            ("scaler", StandardScaler()),
            ("model", ExtraTreesClassifier(random_state=42))
        ]),
        "param_grid": {
            "model__n_estimators": [50, 100],
            "model__max_depth": [3, 6]
        }
    }
}

# Metrics computed for every grid point; 'f1' is the one used for refit below.
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'roc_auc': 'roc_auc'
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
resultados_finais = []

# All Word2Vec hyperparameter combinations, in the order the loop unpacks them.
combinacoes_w2v = list(product(
    param_grid_w2v['vector_size'],
    param_grid_w2v['window'],
    param_grid_w2v['epochs'],
    param_grid_w2v['sg'],
    param_grid_w2v['min_count']
))


# Defined once, before the loop, instead of being re-created on every iteration.
def get_mol_vector(tokens, model):
    """Return the mean of the vectors of the tokens present in ``model.wv``.

    Falls back to a zero vector of length ``model.vector_size`` when none of
    the tokens is present.
    """
    vecs = [model.wv[t] for t in tokens if t in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size)


# `tokenized` and `y` are not defined in this cell; they come from cells run earlier.
for size, window, epochs, sg, min_count in combinacoes_w2v:
    print(f"\n🧠 Word2Vec: size={size}, window={window}, epochs={epochs}, sg={sg}, min_count={min_count}")
    try:
        # Train Word2Vec for this combination (seeded like the CV splitter).
        w2v_model = Word2Vec(
            sentences=tokenized,
            vector_size=size,
            window=window,
            sg=sg,
            min_count=min_count,
            epochs=epochs,
            workers=4,
            seed=42
        )

        X = np.array([get_mol_vector(tokens, w2v_model) for tokens in tokenized])

        # Grid-search every classifier on these embeddings.
        for nome_modelo, config in modelos.items():
            print(f"  🔍 Classificando com: {nome_modelo}")

            grid = GridSearchCV(
                estimator=config["pipeline"],
                param_grid=config["param_grid"],
                scoring=scoring,
                refit="f1",
                cv=cv,
                verbose=0,
                n_jobs=-1,
                return_train_score=False
            )

            grid.fit(X, y)

            # Keep the full CV table, tagged with the classifier and Word2Vec settings.
            df_resultado = pd.DataFrame(grid.cv_results_)
            df_resultado['modelo'] = nome_modelo
            df_resultado['vector_size'] = size
            df_resultado['window'] = window
            df_resultado['epochs'] = epochs
            df_resultado['sg'] = sg
            df_resultado['min_count'] = min_count
            resultados_finais.append(df_resultado)

            print(f"    ✅ Melhor F1: {grid.best_score_:.4f} | Parâmetros: {grid.best_params_}")

    except Exception as e:
        # Best effort: report the failure and move on to the next combination.
        print("⚠️ Erro ao treinar com esta combinação:", e)

# If every combination failed there is nothing to concatenate; fail with a clear message.
if not resultados_finais:
    raise ValueError("Nenhuma combinação foi avaliada com sucesso; veja os erros impressos acima.")

# Stack the per-run tables into one comparison frame.
df_comparacao = pd.concat(resultados_finais, ignore_index=True)

# Columns kept in the final ranking.
colunas_mostrar = ['modelo', 'vector_size', 'window', 'epochs', 'sg', 'min_count',
                   'mean_test_accuracy', 'mean_test_precision', 'mean_test_recall',
                   'mean_test_f1', 'mean_test_roc_auc', 'params']

df_resultados_finais_vivo = df_comparacao[colunas_mostrar].sort_values(by="mean_test_f1", ascending=False)


🧠 Word2Vec: size=128, window=3, epochs=20, sg=0, min_count=1
  🔍 Classificando com: RandomForest
    ✅ Melhor F1: 0.5820 | Parâmetros: {'model__max_depth': 6, 'model__n_estimators': 100}
  🔍 Classificando com: XGBoost
    ✅ Melhor F1: 0.6011 | Parâmetros: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 50}
  🔍 Classificando com: ExtraTrees
    ✅ Melhor F1: 0.4890 | Parâmetros: {'model__max_depth': 6, 'model__n_estimators': 100}

🧠 Word2Vec: size=128, window=3, epochs=20, sg=0, min_count=2
  🔍 Classificando com: RandomForest
    ✅ Melhor F1: 0.5571 | Parâmetros: {'model__max_depth': 6, 'model__n_estimators': 100}
  🔍 Classificando com: XGBoost
    ✅ Melhor F1: 0.5936 | Parâmetros: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 50}
  🔍 Classificando com: ExtraTrees
    ✅ Melhor F1: 0.4955 | Parâmetros: {'model__max_depth': 6, 'model__n_estimators': 50}

🧠 Word2Vec: size=128, window=3, epochs=20, sg=1, min_count=1
  🔍 Classificando

In [None]:
df_resultados_finais_vivo

Unnamed: 0,modelo,vector_size,window,epochs,sg,min_count,mean_test_accuracy,mean_test_precision,mean_test_recall,mean_test_f1,mean_test_roc_auc,params
283,XGBoost,160,5,30,1,2,0.76,0.74,0.55,0.63,0.79,"{'model__learning_rate': 0.1, 'model__max_dept..."
211,XGBoost,160,3,30,1,2,0.76,0.75,0.54,0.63,0.80,"{'model__learning_rate': 0.1, 'model__max_dept..."
175,XGBoost,160,3,20,1,2,0.76,0.75,0.54,0.63,0.81,"{'model__learning_rate': 0.1, 'model__max_dept..."
247,XGBoost,160,5,20,1,2,0.76,0.75,0.54,0.62,0.80,"{'model__learning_rate': 0.1, 'model__max_dept..."
112,XGBoost,128,5,30,0,1,0.75,0.75,0.53,0.62,0.80,"{'model__learning_rate': 0.1, 'model__max_dept..."
...,...,...,...,...,...,...,...,...,...,...,...,...
42,ExtraTrees,128,3,30,0,1,0.65,0.84,0.09,0.16,0.72,"{'model__max_depth': 3, 'model__n_estimators':..."
284,ExtraTrees,160,5,30,1,2,0.65,0.86,0.09,0.16,0.73,"{'model__max_depth': 3, 'model__n_estimators':..."
257,ExtraTrees,160,5,30,0,1,0.64,0.83,0.09,0.16,0.72,"{'model__max_depth': 3, 'model__n_estimators':..."
86,ExtraTrees,128,5,20,0,2,0.64,0.80,0.08,0.15,0.72,"{'model__max_depth': 3, 'model__n_estimators':..."


## in vitro

In [None]:
# Read the dataset CSV and pull the SMILES strings out of its 'SMILES' column.
df = pd.read_csv(filepath_or_buffer="/content/df_final_vitro.csv")
smiles_list = df.loc[:, 'SMILES'].tolist()

In [None]:
def smiles_to_tokens(smiles):
    """Turn a SMILES string into one token per atom.

    Each token has the form ``"<symbol>(<n1>,<n2>,...)"``: the atom's element
    symbol followed by the element symbols of its neighbours, in the order
    RDKit returns them.

    Parameters
    ----------
    smiles : str
        SMILES string to tokenize.

    Returns
    -------
    list of str
        One token per atom, in RDKit atom order. An empty list is returned
        when ``smiles`` is not a string (e.g. ``None`` or the ``NaN`` that
        pandas uses for an empty CSV cell) or when RDKit cannot parse it.
    """
    # A non-string value would make Chem.MolFromSmiles raise instead of
    # returning None, so it is treated like any other unparsable entry.
    if not isinstance(smiles, str):
        return []

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return []

    tokens = []
    for atom in mol.GetAtoms():
        neighbors = ",".join(nbr.GetSymbol() for nbr in atom.GetNeighbors())
        tokens.append(f"{atom.GetSymbol()}({neighbors})")
    return tokens

# Tokenize every SMILES string: one token list per molecule, in dataset order.
tokenized = list(map(smiles_to_tokens, smiles_list))



In [None]:
# Train the Word2Vec model on the per-molecule token lists.
model = Word2Vec(
    sentences=tokenized,
    vector_size=128,
    window=5,
    sg=1,  # skip-gram
    min_count=1,
    epochs=30,
    workers=4,
    # Fixed seed, matching the random_state=42 used for the CV splits in this notebook.
    # Per the gensim documentation, a fully reproducible run additionally needs
    # workers=1 (and a fixed PYTHONHASHSEED); workers is left unchanged here.
    seed=42
)


def get_mol_vector(tokens, model):
    """Pool the token vectors of one molecule into a single feature vector.

    The vectors of all tokens found in ``model.wv`` are stacked and reduced
    along the token axis three ways; the result is the concatenation
    ``[mean, sum, max]``, i.e. ``3 * model.vector_size`` values. When no token
    is found in ``model.wv`` a zero vector of that same length is returned.
    """
    keyed_vectors = model.wv
    known = [keyed_vectors[tok] for tok in tokens if tok in keyed_vectors]
    if len(known) == 0:
        return np.zeros(model.vector_size * 3)

    stacked = np.array(known)
    pooled = (stacked.mean(axis=0), stacked.sum(axis=0), stacked.max(axis=0))
    return np.concatenate(pooled)

# Build the feature matrix (one pooled vector per molecule) and wrap it in a DataFrame.
# NOTE(review): the frame is still called df_Vivo although this section loads
# df_final_vitro.csv; the name is kept because the next cell reads it.
mol_vectors = np.array([get_mol_vector(mol_tokens, model) for mol_tokens in tokenized])
df_Vivo = pd.DataFrame(data=mol_vectors)

In [None]:
# Features are the pooled Word2Vec vectors; labels are the integer-encoded 'Results' column.
# NOTE(review): df_Vivo is the name the preceding cell of this in-vitro section assigns
# its feature frame to — presumably a copy-paste leftover; confirm before renaming.
le = LabelEncoder()
X = df_Vivo
y = le.fit_transform(df['Results'])

# Stratified 3-fold CV, shuffled with a fixed seed.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# One LazyPredict score table per fold.
results = []

X_values = X.to_numpy()
for train_idx, test_idx in cv.split(X_values, y):
    # Fit the scaler on the training fold only and reuse it for the test fold,
    # so that test-fold statistics never leak into the preprocessing.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_values[train_idx])
    X_test = scaler.transform(X_values[test_idx])
    y_train, y_test = y[train_idx], y[test_idx]

    # Fit and score LazyPredict's classifiers on this fold.
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
    models, predictions = clf.fit(X_train, X_test, y_train, y_test)

    results.append(models)

# Mean and std of every metric across folds, per model, ranked by mean F1.
predict_results_vitro = (
    pd.concat(results)
    .groupby(level=0)
    .agg(['mean', 'std'])
    .sort_values(by=('F1 Score', 'mean'), ascending=False)
)


 97%|█████████▋| 31/32 [00:50<00:01,  1.44s/it]

[LightGBM] [Info] Number of positive: 1252, number of negative: 789
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014210 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 74984
[LightGBM] [Info] Number of data points in the train set: 2041, number of used features: 384
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.613425 -> initscore=0.461731
[LightGBM] [Info] Start training from score 0.461731


100%|██████████| 32/32 [00:55<00:00,  1.72s/it]
 97%|█████████▋| 31/32 [00:48<00:01,  1.42s/it]

[LightGBM] [Info] Number of positive: 1252, number of negative: 789
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011316 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 74940
[LightGBM] [Info] Number of data points in the train set: 2041, number of used features: 384
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.613425 -> initscore=0.461731
[LightGBM] [Info] Start training from score 0.461731


100%|██████████| 32/32 [00:52<00:00,  1.63s/it]
 97%|█████████▋| 31/32 [00:44<00:01,  1.51s/it]

[LightGBM] [Info] Number of positive: 1252, number of negative: 790
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011772 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 75095
[LightGBM] [Info] Number of data points in the train set: 2042, number of used features: 384
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.613124 -> initscore=0.460465
[LightGBM] [Info] Start training from score 0.460465


100%|██████████| 32/32 [00:48<00:00,  1.51s/it]


In [None]:
predict_results_vitro

Unnamed: 0_level_0,Accuracy,Accuracy,Balanced Accuracy,Balanced Accuracy,ROC AUC,ROC AUC,F1 Score,F1 Score,Time Taken,Time Taken
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
ExtraTreesClassifier,0.73,0.01,0.71,0.02,0.71,0.02,0.73,0.01,1.03,0.12
RandomForestClassifier,0.73,0.01,0.7,0.01,0.7,0.01,0.73,0.01,3.42,0.02
LGBMClassifier,0.73,0.0,0.7,0.01,0.7,0.01,0.72,0.0,4.18,0.06
XGBClassifier,0.72,0.01,0.7,0.02,0.7,0.02,0.72,0.01,7.58,0.31
NuSVC,0.72,0.01,0.69,0.01,0.69,0.01,0.71,0.01,1.37,0.21
BaggingClassifier,0.7,0.02,0.69,0.02,0.69,0.02,0.7,0.02,6.58,0.3
QuadraticDiscriminantAnalysis,0.7,0.01,0.67,0.0,0.67,0.0,0.7,0.0,0.29,0.01
LogisticRegression,0.7,0.01,0.67,0.02,0.67,0.02,0.7,0.01,0.34,0.14
RidgeClassifierCV,0.7,0.01,0.67,0.02,0.67,0.02,0.69,0.02,0.31,0.02
RidgeClassifier,0.7,0.01,0.67,0.01,0.67,0.01,0.69,0.01,0.1,0.04


### gridsearch

In [None]:
# Word2Vec hyperparameter grid; every combination is trained in the loop below.
param_grid_w2v = {
    'vector_size': [128, 160],
    'window': [3, 5],
    'epochs': [20, 30],
    'sg': [0, 1],
    'min_count': [1, 2]
}

# Classifiers to evaluate: each entry is a scaler + model pipeline and its parameter grid.
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases — confirm
# against the installed version before removing it.
modelos = {
    "RandomForest": {
        "pipeline": Pipeline([
            ("scaler", StandardScaler()),
            ("model", RandomForestClassifier(random_state=42))
        ]),
        "param_grid": {
            "model__n_estimators": [50, 100],
            "model__max_depth": [3, 6]
        }
    },
    "XGBoost": {
        "pipeline": Pipeline([
            ("scaler", StandardScaler()),
            ("model", XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'))
        ]),
        "param_grid": {
            "model__n_estimators": [50],
            "model__max_depth": [3],
            "model__learning_rate": [0.1]
        }
    },
    "ExtraTrees": {
        "pipeline": Pipeline([
            ("scaler", StandardScaler()),
            ("model", ExtraTreesClassifier(random_state=42))
        ]),
        "param_grid": {
            "model__n_estimators": [50, 100],
            "model__max_depth": [3, 6]
        }
    }
}

# Metrics computed for every grid point; 'f1' is the one used for refit below.
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'roc_auc': 'roc_auc'
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
resultados_finais = []

# All Word2Vec hyperparameter combinations, in the order the loop unpacks them.
combinacoes_w2v = list(product(
    param_grid_w2v['vector_size'],
    param_grid_w2v['window'],
    param_grid_w2v['epochs'],
    param_grid_w2v['sg'],
    param_grid_w2v['min_count']
))


# Defined once, before the loop, instead of being re-created on every iteration.
def get_mol_vector(tokens, model):
    """Return the mean of the vectors of the tokens present in ``model.wv``.

    Falls back to a zero vector of length ``model.vector_size`` when none of
    the tokens is present.
    """
    vecs = [model.wv[t] for t in tokens if t in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size)


# `tokenized` and `y` are not defined in this cell; they come from cells run earlier.
for size, window, epochs, sg, min_count in combinacoes_w2v:
    print(f"\n🧠 Word2Vec: size={size}, window={window}, epochs={epochs}, sg={sg}, min_count={min_count}")
    try:
        # Train Word2Vec for this combination (seeded like the CV splitter).
        w2v_model = Word2Vec(
            sentences=tokenized,
            vector_size=size,
            window=window,
            sg=sg,
            min_count=min_count,
            epochs=epochs,
            workers=4,
            seed=42
        )

        X = np.array([get_mol_vector(tokens, w2v_model) for tokens in tokenized])

        # Grid-search every classifier on these embeddings.
        for nome_modelo, config in modelos.items():
            print(f"  🔍 Classificando com: {nome_modelo}")

            grid = GridSearchCV(
                estimator=config["pipeline"],
                param_grid=config["param_grid"],
                scoring=scoring,
                refit="f1",
                cv=cv,
                verbose=0,
                n_jobs=-1,
                return_train_score=False
            )

            grid.fit(X, y)

            # Keep the full CV table, tagged with the classifier and Word2Vec settings.
            df_resultado = pd.DataFrame(grid.cv_results_)
            df_resultado['modelo'] = nome_modelo
            df_resultado['vector_size'] = size
            df_resultado['window'] = window
            df_resultado['epochs'] = epochs
            df_resultado['sg'] = sg
            df_resultado['min_count'] = min_count
            resultados_finais.append(df_resultado)

            print(f"    ✅ Melhor F1: {grid.best_score_:.4f} | Parâmetros: {grid.best_params_}")

    except Exception as e:
        # Best effort: report the failure and move on to the next combination.
        print("⚠️ Erro ao treinar com esta combinação:", e)

# If every combination failed there is nothing to concatenate; fail with a clear message.
if not resultados_finais:
    raise ValueError("Nenhuma combinação foi avaliada com sucesso; veja os erros impressos acima.")

# Stack the per-run tables into one comparison frame.
df_comparacao = pd.concat(resultados_finais, ignore_index=True)

# Columns kept in the final ranking.
colunas_mostrar = ['modelo', 'vector_size', 'window', 'epochs', 'sg', 'min_count',
                   'mean_test_accuracy', 'mean_test_precision', 'mean_test_recall',
                   'mean_test_f1', 'mean_test_roc_auc', 'params']

df_resultados_finais_vitro = df_comparacao[colunas_mostrar].sort_values(by="mean_test_f1", ascending=False)


🧠 Word2Vec: size=128, window=3, epochs=20, sg=0, min_count=1
  🔍 Classificando com: RandomForest
    ✅ Melhor F1: 0.7796 | Parâmetros: {'model__max_depth': 6, 'model__n_estimators': 100}
  🔍 Classificando com: XGBoost
    ✅ Melhor F1: 0.7761 | Parâmetros: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 50}
  🔍 Classificando com: ExtraTrees
    ✅ Melhor F1: 0.7773 | Parâmetros: {'model__max_depth': 6, 'model__n_estimators': 50}

🧠 Word2Vec: size=128, window=3, epochs=20, sg=0, min_count=2
  🔍 Classificando com: RandomForest
    ✅ Melhor F1: 0.7866 | Parâmetros: {'model__max_depth': 6, 'model__n_estimators': 100}
  🔍 Classificando com: XGBoost
    ✅ Melhor F1: 0.7692 | Parâmetros: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 50}
  🔍 Classificando com: ExtraTrees
    ✅ Melhor F1: 0.7764 | Parâmetros: {'model__max_depth': 6, 'model__n_estimators': 50}

🧠 Word2Vec: size=128, window=3, epochs=20, sg=1, min_count=1
  🔍 Classificando 

In [None]:
df_resultados_finais_vitro

Unnamed: 0,modelo,vector_size,window,epochs,sg,min_count,mean_test_accuracy,mean_test_precision,mean_test_recall,mean_test_f1,mean_test_roc_auc,params
57,RandomForest,128,3,30,1,1,0.72,0.73,0.88,0.80,0.78,"{'model__max_depth': 6, 'model__n_estimators':..."
237,RandomForest,160,5,20,1,1,0.72,0.72,0.88,0.79,0.77,"{'model__max_depth': 6, 'model__n_estimators':..."
236,RandomForest,160,5,20,1,1,0.72,0.72,0.88,0.79,0.77,"{'model__max_depth': 6, 'model__n_estimators':..."
101,RandomForest,128,5,20,1,2,0.72,0.73,0.87,0.79,0.78,"{'model__max_depth': 6, 'model__n_estimators':..."
245,RandomForest,160,5,20,1,2,0.72,0.73,0.87,0.79,0.77,"{'model__max_depth': 6, 'model__n_estimators':..."
...,...,...,...,...,...,...,...,...,...,...,...,...
69,ExtraTrees,128,3,30,1,2,0.61,0.61,1.00,0.76,0.71,"{'model__max_depth': 3, 'model__n_estimators':..."
203,ExtraTrees,160,3,30,1,1,0.61,0.61,1.00,0.76,0.70,"{'model__max_depth': 3, 'model__n_estimators':..."
284,ExtraTrees,160,5,30,1,2,0.61,0.61,0.99,0.76,0.70,"{'model__max_depth': 3, 'model__n_estimators':..."
204,ExtraTrees,160,3,30,1,1,0.61,0.61,1.00,0.76,0.71,"{'model__max_depth': 3, 'model__n_estimators':..."
