In [23]:
import pandas as pd
import numpy as np
import joblib
import csv
from datasets import Dataset, DatasetDict

# Read the corpus CSV into the DataFrame that the Dataset-conversion cell
# consumes; the path is relative to the notebook's working directory.
df_final = pd.read_csv(
    '../Novo_projeto_ML/DataFrame/df_final.csv',
)

In [24]:
from transformers import AutoTokenizer

# Identifier of the pretrained checkpoint used as the starting point; the
# model-loading cell reuses this same name.
model_checkpoint = "t5-small"

# Load the tokenizer that ships with that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
)

In [25]:
from datasets import Dataset, DatasetDict

# Convert the pandas DataFrame 'df_final' into a Hugging Face Dataset.
dataset = Dataset.from_pandas(df_final)

# Hold out 10% of the rows for evaluation. The split shuffles the rows, so a
# fixed seed is passed to get the same train/test membership on every
# Restart & Run All; without it, results from different runs are measured
# on different test sets and cannot be compared.
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)

# Collect both splits in a DatasetDict so later cells can address them by name.
raw_datasets = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['English', 'Portuguese'],
        num_rows: 418644
    })
    test: Dataset({
        features: ['English', 'Portuguese'],
        num_rows: 46517
    })
})


In [26]:
# Task prefix prepended to every English sentence before tokenization.
prefix = "translate English to Portuguese: "


def preprocess_function(examples, max_input_length=128, max_target_length=128):
    """Tokenize a batch of English/Portuguese pairs for seq2seq training.

    Args:
        examples: Batch mapping with an "English" entry (an iterable of
            strings; each one is concatenated to ``prefix``) and a
            "Portuguese" entry that is handed to the tokenizer as target text.
        max_input_length: ``max_length`` used, with truncation enabled, when
            tokenizing the prefixed English inputs. Defaults to 128.
        max_target_length: ``max_length`` used, with truncation enabled, when
            tokenizing the Portuguese targets. Defaults to 128.

    Returns:
        The tokenizer output for the prefixed inputs, with an added "labels"
        entry holding the "input_ids" of the tokenized targets.
    """
    inputs = [prefix + doc for doc in examples["English"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # The targets go through the tokenizer's text_target argument so they are
    # encoded as target text rather than as model input.
    labels = tokenizer(
        text_target=examples["Portuguese"],
        max_length=max_target_length,
        truncation=True,
    )

    # Only the target token ids are kept; they become the training labels.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Apply the preprocessing to every split; batched=True passes the function
# a batch of rows per call instead of one row at a time.
tokenized_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
)

print(tokenized_datasets)

Map: 100%|██████████| 418644/418644 [00:21<00:00, 19818.87 examples/s]
Map: 100%|██████████| 46517/46517 [00:02<00:00, 18681.13 examples/s]

DatasetDict({
    train: Dataset({
        features: ['English', 'Portuguese', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 418644
    })
    test: Dataset({
        features: ['English', 'Portuguese', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 46517
    })
})





In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

# Load the pretrained seq2seq model from the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)


# Training rules and hyperparameters.
training_args = Seq2SeqTrainingArguments(
    output_dir="./meu_modelo_tradutor",

    # NOTE(review): no evaluation strategy is configured here; confirm that
    # do_eval=True alone actually schedules evaluation during train() in the
    # installed transformers version.
    do_eval=True,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_total_limit=3,
    predict_with_generate=True,
)

# The tokenization step applies no padding, so assembling tensors for each
# batch is delegated to the seq2seq collator.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

print("\nTrainer configurado. Pronto para iniciar o treino!")

# Start training.
trainer.train()

# Explicitly write the final model into output_dir. Without this call only
# the periodic checkpoint subfolders exist, and the message below would tell
# the user the model is in 'meu_modelo_tradutor' when nothing loadable was
# written at that path.
trainer.save_model(training_args.output_dir)

print("\n--- PASSO 3 CONCLUÍDO ---")
print("O treino foi finalizado! O seu modelo está guardado na pasta 'meu_modelo_tradutor'.")