# requisitos

In [None]:
# Notebook setup cell: installs/upgrades the packages used by the notebook.
# `datasets` supplies the `Dataset` class imported by the training cell;
# `gcsfs` is not referenced in the visible code.
!pip install gcsfs datasets
# The training cell imports SentenceTransformerTrainer and
# SentenceTransformerTrainingArguments from sentence-transformers.
!pip install --upgrade sentence-transformers
!pip install --upgrade transformers
# NOTE(review): unidecode and the spaCy model below are not used in the visible
# training cell -- presumably needed by preprocessing elsewhere; confirm.
!pip install unidecode
!python -m spacy download pt_core_news_md
# NOTE(review): torch is upgraded last, after the packages that depend on it;
# confirm the resulting version is the one intended for training.
!pip install --upgrade torch


# treino

In [None]:
from datasets import Dataset
import pandas as pd
import torch

from sentence_transformers import (
    SentenceTransformer,
    InputExample,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.losses import CosineSimilarityLoss
from sentence_transformers.losses import MultipleNegativesRankingLoss
from transformers import get_linear_schedule_with_warmup
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sklearn.utils.class_weight import compute_sample_weight


# Base encoder: load the `neuralmind/bert-large-portuguese-cased` checkpoint
# as a SentenceTransformer so it can be fine-tuned on sentence pairs.
model = SentenceTransformer("neuralmind/bert-large-portuguese-cased")

# One seed for both torch and the train/eval split, so that repeated runs
# train and evaluate on the same rows.
SEED = 42
torch.manual_seed(SEED)

# The CSV is read with columns `Text1`, `Text2` and `Label` (the only ones
# accessed below).
csv_path = "/content/base_de_dados_binarios_lema.csv"
data = pd.read_csv(csv_path)

# NOTE(review): these class-balanced per-row weights are computed but never
# consumed in the visible code -- kept in case a later cell uses them.
sample_weights = compute_sample_weight(class_weight="balanced", y=data["Label"])

# Column-wise zip yields the same values as DataFrame.iterrows() without
# building a Series object per row.
train_examples = [
    InputExample(texts=[text1, text2], label=float(label))
    for text1, text2, label in zip(data["Text1"], data["Text2"], data["Label"])
]

# Column order matters downstream: two text columns followed by `label`.
dataset = Dataset.from_dict({
    "text1": [example.texts[0] for example in train_examples],
    "text2": [example.texts[1] for example in train_examples],
    "label": [example.label for example in train_examples],
})

# 90/10 split; passing the seed makes the held-out set reproducible
# (the original call reshuffled differently on every run).
split = dataset.train_test_split(test_size=0.1, seed=SEED)
train_dataset = split["train"]
eval_dataset = split["test"]

# Evaluator over the held-out pairs; it only depends on the eval split, so it
# is built first.
evaluator = EmbeddingSimilarityEvaluator(
    name="eval-similarity",
    sentences1=eval_dataset["text1"],
    sentences2=eval_dataset["text2"],
    scores=eval_dataset["label"],
)

# Cosine-similarity loss bound to the model being fine-tuned.
loss = CosineSimilarityLoss(model)

args = SentenceTransformerTrainingArguments(
    # Where checkpoints and logs go.
    output_dir="models/bert-ptbr-regression",
    run_name="bert-ptbr-regression",
    logging_dir="/content/logs",
    logging_steps=5,
    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=1e-5,
    warmup_ratio=0.1,
    weight_decay=0.1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    fp16=True,
    # Evaluate every 200 steps, checkpoint every 1000 (a multiple of the
    # evaluation interval), keeping at most two checkpoints on disk.
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=2,
    # Restore the checkpoint with the lowest eval_loss once training ends.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    loss=loss,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    evaluator=evaluator,
)
trainer.train()

# NOTE(review): the directory name mentions 15 epochs while num_train_epochs
# is 5 -- confirm the intended name.
model.save_pretrained("models/acho14k15epochs")
