In [1]:
# ============================================================
# RDKit – manipulação e padronização de moléculas químicas
# ============================================================
from rdkit import Chem, RDLogger
from rdkit.Chem import Descriptors, AllChem
from rdkit.Chem.MolStandardize import rdMolStandardize


# ============================================================
# Utilidades gerais – sistema, regex, datas, progresso e numéricos
# ============================================================
import os
import re
from datetime import datetime
from tqdm import tqdm         # barras de progresso
import joblib                 # salvar/carregar modelos
import numpy as np
import pandas as pd
import torch


# ============================================================
# Transformers – *tokenizers* e modelos pré-treinados (HuggingFace)
# ============================================================
from transformers import AutoTokenizer, AutoModel


# ============================================================
# Scikit-learn – divisão, validação cruzada e *pipelines*
# ============================================================
from sklearn.model_selection import (
    train_test_split, StratifiedKFold, cross_validate, cross_val_score,
    StratifiedGroupKFold
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


# ============================================================
# Scikit-learn – métricas de avaliação
# ============================================================
from sklearn.metrics import (
    make_scorer, precision_score, recall_score, f1_score,
    accuracy_score, roc_auc_score, classification_report
)


# ============================================================
# Scikit-learn – modelos lineares
# ============================================================
from sklearn.linear_model import (
    LogisticRegression, RidgeClassifier, SGDClassifier,
    PassiveAggressiveClassifier, Perceptron
)


# ============================================================
# Scikit-learn – Máquinas de Vetores de Suporte (SVM)
# ============================================================
from sklearn.svm import SVC, LinearSVC, NuSVC


# ============================================================
# Scikit-learn – métodos baseados em vizinhos
# ============================================================
from sklearn.neighbors import KNeighborsClassifier


# ============================================================
# Scikit-learn – árvores de decisão e modelos *ensemble*
# ============================================================
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, BaggingClassifier, HistGradientBoostingClassifier
)


# ============================================================
# Scikit-learn – modelos Naive Bayes
# ============================================================
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB


# ============================================================
# Scikit-learn – análise discriminante
# ============================================================
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
)


# ============================================================
# Scikit-learn – redes neurais (MLP)
# ============================================================
from sklearn.neural_network import MLPClassifier


# ============================================================
# Modelos externos – gradiente boosting modernos
# ============================================================
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


# ============================================================
# Imbalanced-learn – tratamento de desbalanceamento
# ============================================================
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline


In [2]:
import importlib

# Lista de módulos e nomes amigáveis
modules = {
    "rdkit": "RDKit",
    "os": "os",
    "re": "re",
    "datetime": "datetime",
    "tqdm": "tqdm",
    "joblib": "joblib",
    "numpy": "numpy",
    "pandas": "pandas",
    "torch": "torch",
    "transformers": "transformers",
    "sklearn": "scikit-learn",
    "xgboost": "xgboost",
    "lightgbm": "lightgbm",
    "catboost": "catboost",
    "imblearn": "imbalanced-learn",
}

def get_version(module_name, friendly_name):
    try:
        module = importlib.import_module(module_name)

        # Tentativa padrão
        if hasattr(module, "__version__"):
            return f"{friendly_name}: {module.__version__}"

        # Casos especiais --------------------------

        # RDKit
        if module_name == "rdkit":
            from rdkit import rdBase
            return f"{friendly_name}: {rdBase.rdkitVersion}"

        # os, re, datetime → módulos padrão não têm versão
        if module_name in ("os", "re", "datetime"):
            return f"{friendly_name}: módulo padrão (sem versão)"

        return f"{friendly_name}: versão não encontrada"

    except Exception as e:
        return f"{friendly_name}: erro ao importar ({e})"


# Printar tudo organizado
print("=== VERSÕES DAS BIBLIOTECAS ===\n")
for mod, name in modules.items():
    print(get_version(mod, name))


=== VERSÕES DAS BIBLIOTECAS ===

RDKit: 2025.03.2
os: módulo padrão (sem versão)
re: 2.2.1
datetime: módulo padrão (sem versão)
tqdm: 4.67.1
joblib: 1.5.1
numpy: 2.2.6
pandas: 2.3.3
torch: 2.9.1+cu128
transformers: 4.57.1
scikit-learn: 1.6.1
xgboost: 3.0.2
lightgbm: 4.6.0
catboost: 1.2.8
imbalanced-learn: 0.14.0


# Limpeza de Dados

In [3]:
# Disable informational messages from RDKit
RDLogger.DisableLog('rdApp.info')

def is_valid_smiles(smiles):
    """
    Checks whether a SMILES string is valid.

    Parameters:
        smiles (str): The SMILES representation of the molecule.

    Returns:
        bool: True if the SMILES is valid, False otherwise.
    """
    # Return False if SMILES is None, not a string, or empty/whitespace only
    if pd.isna(smiles) or not isinstance(smiles, str) or smiles.strip() == "":
        return False

    # Remove leading/trailing spaces
    smiles = smiles.strip()

    try:
        # First, try parsing the SMILES with normal sanitization
        mol = Chem.MolFromSmiles(smiles, sanitize=True)
        if mol is not None:
            return True
    except Exception:
        # Ignore errors here and try without sanitization
        pass  

    try:
        # If normal sanitization fails, parse without sanitization
        mol = Chem.MolFromSmiles(smiles, sanitize=False)
        if mol is not None:
            # Attempt manual sanitization and catch possible errors
            Chem.SanitizeMol(mol, catchErrors=True)
            return True
    except Exception:
        pass

    # If all parsing attempts fail, return False
    return False

# Vetorização

In [4]:
#=========================
# FUNÇÃO DE EMBEDDING
#=========================
def embed_model(model_name, smiles_list):

    print(f"\nCarregando modelo: {model_name}")

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
    model.to(device).eval()

    def get_embedding(smiles):
        inputs = tokenizer(smiles, return_tensors="pt", truncation=True, padding=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        emb = outputs.last_hidden_state.mean(dim=1).squeeze()
        return emb.cpu().numpy()

    embeddings = [get_embedding(s) for s in tqdm(smiles_list, desc=f"Embeddings {model_name}")]
    return np.vstack(embeddings)

# dataset


## in vivo

In [5]:
df_vivo= pd.read_csv('in vivo + cpdb.csv')
df_vivo.shape

(19883, 9)

In [6]:
df_vivo['SMILES_valido'] = df_vivo['SMILES'].apply(is_valid_smiles)
df_vivo['SMILES_valido'].value_counts()



SMILES_valido
True    19883
Name: count, dtype: int64

In [7]:
# Drop unnecessary columns
df_vivo.drop(columns=['Chemical', 'Identificador', 'SMILES_valido','species',	'strain',	'Male',	'Female'], inplace=True)

# Convert text columns to lowercase
df_vivo['Results'] = df_vivo['Results'].str.lower()
df_vivo['Type'] = df_vivo['Type'].str.lower()

# Remove duplicate rows if any
df_vivo.drop_duplicates(inplace=True)

df_vivo.shape

(3617, 3)

In [8]:
# Identify SMILES (in the filtered DataFrame) that have more than one 'Results' value
smiles_multiple_results = df_vivo.groupby("SMILES")["Results"].nunique()
smiles_multiple_results = smiles_multiple_results[smiles_multiple_results > 1].index

# Remove SMILES that have more than one result
df_final_vivo = df_vivo[~df_vivo["SMILES"].isin(smiles_multiple_results)]

print("Total de compostos não divergentes:", len(df_final_vivo))

df_organicos = df_final_vivo[df_final_vivo["SMILES"].str.contains(r"C(?![a-z])", regex=True, na=False)]

print("Total de compostos orgânicos:", len(df_organicos))

# Drop 'Type' column as it is no longer needed
df_final_vivo = df_organicos.drop(columns='Type')

# Reset index for clean ordering
df_final_vivo.reset_index(drop=True, inplace=True)

# Show final shape
print("Total de compostos final:", len(df_final_vivo))

Total de compostos não divergentes: 2227
Total de compostos orgânicos: 2090
Total de compostos final: 2090


# Experimentation

In [10]:
#=========================
# LISTA DE MODELOS
#=========================
model_list = [
    "DeepChem/ChemBERTa-77M-MTR",
    "DeepChem/ChemBERTa-77M-MLM",
    "DeepChem/ChemBERTa-100M-MLM",
    "ibm-research/MoLFormer-XL-both-10pct",
    "seyonec/ChemBERTa-zinc-base-v1",
    "alvaroalon2/biobert_chemical_ner",
    "google-bert/bert-base-uncased"
]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#=========================
# MODELOS CLÁSSICOS
#=========================
model_zoo = {
    "LogisticRegression": LogisticRegression(max_iter=1000, n_jobs=-1),
    "RidgeClassifier": RidgeClassifier(),
    "SGDClassifier": SGDClassifier(max_iter=1000, tol=1e-3, n_jobs=-1),
    "PassiveAggressive": PassiveAggressiveClassifier(max_iter=1000, random_state=42),
    "Perceptron": Perceptron(max_iter=1000, tol=1e-3, random_state=42),

    "LinearSVC": LinearSVC(),
    "SVC": SVC(),
    "NuSVC": NuSVC(),

    "KNeighbors": KNeighborsClassifier(),

    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "ExtraTree": ExtraTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1),
    "ExtraTrees": ExtraTreesClassifier(n_estimators=300, random_state=42, n_jobs=-1),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "HistGradientBoosting": HistGradientBoostingClassifier(random_state=42),

    "AdaBoost": AdaBoostClassifier(random_state=42),
    "Bagging": BaggingClassifier(random_state=42, n_jobs=-1),

    "GaussianNB": GaussianNB(),
    "BernoulliNB": BernoulliNB(),

    "LDA": LinearDiscriminantAnalysis(),
    "QDA": QuadraticDiscriminantAnalysis(),

    "MLPClassifier": MLPClassifier(max_iter=1000, random_state=42),

    "XGBoost": XGBClassifier(
        n_estimators=500, learning_rate=0.05, max_depth=6,
        subsample=0.8, colsample_bytree=0.8, random_state=42,
        use_label_encoder=False, eval_metric="logloss", n_jobs=-1
    ),

    "LightGBM": LGBMClassifier(
        n_estimators=500, learning_rate=0.05, max_depth=-1,
        subsample=0.8, colsample_bytree=0.8, random_state=42, n_jobs=-1
    ),

    "CatBoost": CatBoostClassifier(
        iterations=500, learning_rate=0.05, depth=6, verbose=False, random_state=42
    )
}

#=========================
# CROSS VALIDATION
#=========================
cv = StratifiedGroupKFold(n_splits=10, shuffle=True, random_state=42)

scoring = {
    "accuracy": "accuracy",
    "balanced_accuracy": "balanced_accuracy",
    "f1": "f1",
    "roc_auc": "roc_auc",
    "precision": "precision",
    "recall": "recall",
}

#=========================
# BALANCEAMENTOS
#=========================
balancers = {
    "original": None,
    "under": RandomUnderSampler(random_state=42),
    "over": RandomOverSampler(random_state=42),
    "smooth": SMOTE(random_state=42)
}

#=========================
# LOOP PRINCIPAL
#=========================
os.makedirs("Resultados", exist_ok=True)
os.makedirs("Resultados_folds", exist_ok=True)

for model_name in model_list:

    smiles_list = df_final_vivo["SMILES"].tolist()
    embeddings = embed_model(model_name, smiles_list)

    X = pd.DataFrame(embeddings).fillna(0)
    y = df_final_vivo["Results"].astype(str).str.lower().map({"positive": 1, "negative": 0})
    groups = df_final_vivo["SMILES"].values

    for balance_name, sampler in balancers.items():

        print(f"\nRodando sampling {balance_name} no modelo {model_name}")

        pipelines = {}

        for name, clf in model_zoo.items():
            steps = [("scaler", StandardScaler())]

            if sampler is not None:
                steps.append(("sampler", sampler))

            steps.append(("clf", clf))
            pipelines[name] = ImbPipeline(steps=steps)

        # ==============================
        # LISTAS PARA SALVAMENTO
        # ==============================
        summary_rows = []   # médias e std
        fold_rows = []      # resultados fold a fold

        for name, pipe in pipelines.items():

            cvres = cross_validate(
                pipe, X, y,
                cv=cv,
                scoring=scoring,
                n_jobs=-1,
                groups=groups,
                return_train_score=False
            )

            # ---------------------------
            # SALVAR FOLDS INDIVIDUAIS
            # ---------------------------
            for i in range(cv.get_n_splits()):
                fold_rows.append({
                    "EmbeddingModel": model_name,
                    "Balance": balance_name,
                    "Model": name,
                    "Fold": i + 1,
                    "Accuracy": cvres["test_accuracy"][i],
                    "BalancedAcc": cvres["test_balanced_accuracy"][i],
                    "F1": cvres["test_f1"][i],
                    "ROC_AUC": cvres["test_roc_auc"][i],
                    "Precision": cvres["test_precision"][i],
                    "Recall": cvres["test_recall"][i],
                    "FitTime": cvres["fit_time"][i],
                    "ScoreTime": cvres["score_time"][i],
                })

            # ---------------------------
            # SALVAR MÉDIA E STD
            # ---------------------------
            summary_rows.append({
                "EmbeddingModel": model_name,
                "Balance": balance_name,
                "Model": name,
                "Accuracy_mean": np.mean(cvres["test_accuracy"]),
                "Accuracy_std":  np.std(cvres["test_accuracy"], ddof=1),
                "BalAcc_mean":   np.mean(cvres["test_balanced_accuracy"]),
                "BalAcc_std":    np.std(cvres["test_balanced_accuracy"], ddof=1),
                "F1_mean":       np.mean(cvres["test_f1"]),
                "F1_std":        np.std(cvres["test_f1"], ddof=1),
                "ROC_AUC_mean":  np.mean(cvres["test_roc_auc"]),
                "ROC_AUC_std":   np.std(cvres["test_roc_auc"], ddof=1),
                "Precision_mean":np.mean(cvres["test_precision"]),
                "Precision_std": np.std(cvres["test_precision"], ddof=1),
                "Recall_mean":   np.mean(cvres["test_recall"]),
                "Recall_std":    np.std(cvres["test_recall"], ddof=1),
                "FitTime_mean":  np.mean(cvres["fit_time"]),
                "FitTime_std":   np.std(cvres["fit_time"], ddof=1),
                "ScoreTime_mean":np.mean(cvres["score_time"]),
                "ScoreTime_std": np.std(cvres["score_time"], ddof=1),
            })

        # ==============================
        # SALVAR RESULTADOS (SUMMARY)
        # ==============================
        df_summary = (
            pd.DataFrame(summary_rows)
            .sort_values("F1_mean", ascending=False)
            .reset_index(drop=True)
        )

        safe = model_name.replace("/", "_")
        path_summary = f"Resultados/Embedding_{safe}_{balance_name}.csv"
        df_summary.to_csv(path_summary, index=False)
        print(f"[OK] Resumo salvo em: {path_summary}")
        print(df_summary.head(3))

        # ==============================
        # SALVAR RESULTADOS POR FOLD
        # ==============================
        df_folds = pd.DataFrame(fold_rows)
        path_folds = f"Resultados_folds/Embedding_{safe}_{balance_name}_folds.csv"
        df_folds.to_csv(path_folds, index=False)

        print(f"[OK] Folds salvos em: {path_folds}")



Carregando modelo: DeepChem/ChemBERTa-77M-MTR


Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Embeddings DeepChem/ChemBERTa-77M-MTR: 100%|██████████| 2090/2090 [00:03<00:00, 577.60it/s]



Rodando sampling original no modelo DeepChem/ChemBERTa-77M-MTR
[OK] Resumo salvo em: Resultados/Embedding_DeepChem_ChemBERTa-77M-MTR_original.csv
               EmbeddingModel   Balance     Model  Accuracy_mean  \
0  DeepChem/ChemBERTa-77M-MTR  original   XGBoost       0.803349   
1  DeepChem/ChemBERTa-77M-MTR  original  CatBoost       0.801435   
2  DeepChem/ChemBERTa-77M-MTR  original  LightGBM       0.797129   

   Accuracy_std  BalAcc_mean  BalAcc_std   F1_mean    F1_std  ROC_AUC_mean  \
0      0.019462     0.772175    0.029561  0.710666  0.048396      0.847973   
1      0.025040     0.770237    0.029860  0.707623  0.045433      0.843544   
2      0.024933     0.767973    0.032698  0.705371  0.050303      0.841123   

   ROC_AUC_std  Precision_mean  Precision_std  Recall_mean  Recall_std  \
0     0.028747        0.809852       0.052165     0.636214    0.063501   
1     0.023372        0.812937       0.059903     0.630672    0.064086   
2     0.028302        0.792964       0.052855

Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MLM and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Embeddings DeepChem/ChemBERTa-77M-MLM: 100%|██████████| 2090/2090 [00:03<00:00, 624.56it/s]



Rodando sampling original no modelo DeepChem/ChemBERTa-77M-MLM
[OK] Resumo salvo em: Resultados/Embedding_DeepChem_ChemBERTa-77M-MLM_original.csv
               EmbeddingModel   Balance     Model  Accuracy_mean  \
0  DeepChem/ChemBERTa-77M-MLM  original     NuSVC       0.787081   
1  DeepChem/ChemBERTa-77M-MLM  original  LightGBM       0.789474   
2  DeepChem/ChemBERTa-77M-MLM  original   XGBoost       0.786603   

   Accuracy_std  BalAcc_mean  BalAcc_std   F1_mean    F1_std  ROC_AUC_mean  \
0      0.036873     0.763175    0.042469  0.702607  0.063133      0.847147   
1      0.029148     0.761084    0.028173  0.698640  0.038184      0.843359   
2      0.032467     0.757327    0.033851  0.692493  0.048144      0.842501   

   ROC_AUC_std  Precision_mean  Precision_std  Recall_mean  Recall_std  \
0     0.035723        0.752695       0.058679     0.661993    0.079767   
1     0.027883        0.775431       0.034106     0.637655    0.054154   
2     0.030843        0.773334       0.044192

Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-100M-MLM and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Embeddings DeepChem/ChemBERTa-100M-MLM:   0%|          | 0/2090 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Embeddings DeepChem/ChemBERTa-100M-MLM: 100%|██████████| 2090/2090 [00:08<00:00, 240.63it/s]



Rodando sampling original no modelo DeepChem/ChemBERTa-100M-MLM
[OK] Resumo salvo em: Resultados/Embedding_DeepChem_ChemBERTa-100M-MLM_original.csv
                EmbeddingModel   Balance          Model  Accuracy_mean  \
0  DeepChem/ChemBERTa-100M-MLM  original          NuSVC       0.769378   
1  DeepChem/ChemBERTa-100M-MLM  original  MLPClassifier       0.752632   
2  DeepChem/ChemBERTa-100M-MLM  original     KNeighbors       0.746890   

   Accuracy_std  BalAcc_mean  BalAcc_std   F1_mean    F1_std  ROC_AUC_mean  \
0      0.038989     0.742871    0.040779  0.674199  0.055872      0.820064   
1      0.039907     0.736783    0.036022  0.673590  0.042565      0.811502   
2      0.035265     0.728004    0.035904  0.661189  0.047598      0.790739   

   ROC_AUC_std  Precision_mean  Precision_std  Recall_mean  Recall_std  \
0     0.034581        0.736008       0.053720     0.624895    0.070970   
1     0.038106        0.688123       0.064419     0.664079    0.052781   
2     0.043764     

Embeddings ibm-research/MoLFormer-XL-both-10pct:   0%|          | 0/2090 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Embeddings ibm-research/MoLFormer-XL-both-10pct: 100%|██████████| 2090/2090 [00:28<00:00, 72.50it/s]



Rodando sampling original no modelo ibm-research/MoLFormer-XL-both-10pct
[OK] Resumo salvo em: Resultados/Embedding_ibm-research_MoLFormer-XL-both-10pct_original.csv
                         EmbeddingModel   Balance          Model  \
0  ibm-research/MoLFormer-XL-both-10pct  original          NuSVC   
1  ibm-research/MoLFormer-XL-both-10pct  original  MLPClassifier   
2  ibm-research/MoLFormer-XL-both-10pct  original     KNeighbors   

   Accuracy_mean  Accuracy_std  BalAcc_mean  BalAcc_std   F1_mean    F1_std  \
0       0.783254      0.028446     0.758938    0.028945  0.697770  0.039326   
1       0.762679      0.033470     0.746405    0.038256  0.685260  0.053690   
2       0.762679      0.031914     0.746053    0.037430  0.682775  0.055658   

   ROC_AUC_mean  ROC_AUC_std  Precision_mean  Precision_std  Recall_mean  \
0      0.839889     0.033663        0.750112       0.038823     0.653584   
1      0.811506     0.047579        0.694913       0.055790     0.677255   
2      0.814163

Embeddings seyonec/ChemBERTa-zinc-base-v1: 100%|██████████| 2090/2090 [00:05<00:00, 403.68it/s]



Rodando sampling original no modelo seyonec/ChemBERTa-zinc-base-v1
[OK] Resumo salvo em: Resultados/Embedding_seyonec_ChemBERTa-zinc-base-v1_original.csv
                   EmbeddingModel   Balance     Model  Accuracy_mean  \
0  seyonec/ChemBERTa-zinc-base-v1  original     NuSVC       0.747368   
1  seyonec/ChemBERTa-zinc-base-v1  original   XGBoost       0.750239   
2  seyonec/ChemBERTa-zinc-base-v1  original  LightGBM       0.750718   

   Accuracy_std  BalAcc_mean  BalAcc_std   F1_mean    F1_std  ROC_AUC_mean  \
0      0.033667     0.721325    0.035911  0.647385  0.050575      0.798741   
1      0.022533     0.717155    0.026589  0.634628  0.040745      0.804977   
2      0.027481     0.716544    0.031338  0.632324  0.050671      0.798863   

   ROC_AUC_std  Precision_mean  Precision_std  Recall_mean  Recall_std  \
0     0.027907        0.696751       0.058986     0.606579    0.057381   
1     0.032709        0.723552       0.049618     0.568937    0.060739   
2     0.033714       

Some weights of BertModel were not initialized from the model checkpoint at alvaroalon2/biobert_chemical_ner and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Embeddings alvaroalon2/biobert_chemical_ner: 100%|██████████| 2090/2090 [00:08<00:00, 234.94it/s]



Rodando sampling original no modelo alvaroalon2/biobert_chemical_ner
[OK] Resumo salvo em: Resultados/Embedding_alvaroalon2_biobert_chemical_ner_original.csv
                     EmbeddingModel   Balance               Model  \
0  alvaroalon2/biobert_chemical_ner  original               NuSVC   
1  alvaroalon2/biobert_chemical_ner  original       MLPClassifier   
2  alvaroalon2/biobert_chemical_ner  original  LogisticRegression   

   Accuracy_mean  Accuracy_std  BalAcc_mean  BalAcc_std   F1_mean    F1_std  \
0       0.727273      0.050636     0.695657    0.058172  0.607498  0.088033   
1       0.710526      0.044213     0.686740    0.044662  0.606384  0.063598   
2       0.692823      0.035791     0.668453    0.034921  0.583921  0.046911   

   ROC_AUC_mean  ROC_AUC_std  Precision_mean  Precision_std  Recall_mean  \
0      0.765314     0.048899        0.677850       0.105036     0.555249   
1      0.739421     0.049251        0.640587       0.094298     0.583626   
2      0.726489    

Embeddings google-bert/bert-base-uncased: 100%|██████████| 2090/2090 [01:05<00:00, 31.68it/s]



Rodando sampling original no modelo google-bert/bert-base-uncased
[OK] Resumo salvo em: Resultados/Embedding_google-bert_bert-base-uncased_original.csv
                  EmbeddingModel   Balance          Model  Accuracy_mean  \
0  google-bert/bert-base-uncased  original  MLPClassifier       0.753589   
1  google-bert/bert-base-uncased  original       LightGBM       0.766986   
2  google-bert/bert-base-uncased  original          NuSVC       0.754545   

   Accuracy_std  BalAcc_mean  BalAcc_std   F1_mean    F1_std  ROC_AUC_mean  \
0      0.041849     0.735088    0.043443  0.669113  0.056595      0.801841   
1      0.038014     0.734986    0.041165  0.660031  0.061114      0.815869   
2      0.034061     0.726044    0.034056  0.652500  0.047758      0.817004   

   ROC_AUC_std  Precision_mean  Precision_std  Recall_mean  Recall_std  \
0     0.037051        0.692486       0.066168     0.652198    0.075959   
1     0.037653        0.747368       0.047160     0.594667    0.080980   
2     0