In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the food dataset from its parquet file and preview it.
DATASET_PATH = '../../datasets/fsda/fsda_insa_dataset_final.parquet'
df = pd.read_parquet(DATASET_PATH)
df

Unnamed: 0,FOOD_NAME,FOOD_INGREDIENTS,FOOD_SERVING_SIZE,ENERGY (KCAL),PROTEIN (G),TOTAL LIPID (FAT) (G),"CARBOHYDRATE, BY DIFFERENCE (G)",LANGUAGE
0,KELLOGG APPLE JACKS .9OZ 100CT,"SUGAR, CORN FLOUR BLEND (WHOLE GRAIN YELLOW CO...",25.0 G,370.0,5.2,3.7,87.8,EN
1,GARDENBURGER GB VEGGIE BURGERS BLACK BEAN 3OZ ...,"WATER, COOKED BLACK BEANS (BLACK BEANS, WATER)...",85.0 G,112.0,10.8,3.0,16.9,EN
2,KEEBLER CONES WAFFLE CONE 264CT,"BLEACHED AND ENRICHED FLOUR (WHEAT FLOUR, NIAC...",14.0 G,423.0,6.0,5.0,88.0,EN
3,KEEBLER CONES WAFFLE BOWL 60CT,"BLEACHED AND ENRICHED FLOUR (WHEAT FLOUR, NIAC...",19.0 G,424.0,6.5,5.3,87.7,EN
4,KEEBLER CONES WAFFLE CONE 264CT,"BLEACHED AND ENRICHED FLOUR (WHEAT FLOUR, NIAC...",14.0 G,423.0,6.0,5.0,88.0,EN
...,...,...,...,...,...,...,...,...
457777,Vinho maduro tinto,vinho,100g,66.0,0.1,0.0,0.2,PT
457778,Vinho rosé,vinho,100g,72.0,0.2,0.0,2.4,PT
457779,Vinho verde branco,vinho,100g,59.0,0.0,0.0,0.1,PT
457780,Vinho verde tinto,vinho,100g,57.0,0.0,0.0,0.3,PT


In [3]:
# 2. Model input text = food name + serving size, joined by ' - '
food_name = df['FOOD_NAME']
serving_size = df['FOOD_SERVING_SIZE']
df['input_text'] = food_name + ' - ' + serving_size

# 3. Name the regression target columns, then convert the dataframe
#    into a Hugging Face Dataset
macro_columns = [
    "ENERGY (KCAL)",
    "PROTEIN (G)",
    "TOTAL LIPID (FAT) (G)",
    "CARBOHYDRATE, BY DIFFERENCE (G)",
]
dataset = Dataset.from_pandas(df)

In [4]:
# 4. Tokenizer of the pretrained checkpoint (batched tokenization follows)
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

def tokenize_and_format(examples):
    """Tokenize a batch of examples and attach their regression targets.

    Args:
        examples: Batch mapping column names to lists of values. It must
            contain 'input_text' and every column named in ``macro_columns``.

    Returns:
        The tokenizer output for the batch (padded/truncated to 32 tokens)
        with an added "labels" entry: one list per example holding that
        example's values for ``macro_columns``, in the same order.
    """
    texts = examples['input_text']
    encoded = tokenizer(texts, padding="max_length", truncation=True, max_length=32)

    # Fetch each target column once, then read it row by row.
    target_columns = [examples[col] for col in macro_columns]
    encoded["labels"] = [
        [column[row] for column in target_columns]
        for row in range(len(texts))
    ]
    return encoded

# Tokenize the whole dataset, handing the function one batch at a time.
dataset = dataset.map(function=tokenize_and_format, batched=True)

Map: 100%|██████████| 457782/457782 [00:15<00:00, 29496.65 examples/s]


In [5]:
# 5. Train/test split (80% train, 20% test).
# A fixed seed makes the shuffle deterministic, so the held-out set (and
# therefore every reported metric) is identical on each fresh run.
SPLIT_SEED = 42
dataset = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_data = dataset["train"]
test_data = dataset["test"]

# Have both splits return torch-formatted values when rows are read.
train_data.set_format(type="torch")
test_data.set_format(type="torch")

# 6. Regression model: the pretrained checkpoint with a sequence-level head
#    configured for regression, to be trained with the Trainer.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    # One output per entry in macro_columns, so the head size cannot drift
    # out of sync with the label vectors built during tokenization.
    num_labels=len(macro_columns),
    problem_type="regression",
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# 7. Custom metrics
def compute_metrics(eval_pred):
    """Compute regression metrics for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks into (predictions, labels); both are
            converted to NumPy arrays before scoring.

    Returns:
        dict with the keys "mae", "mse", "rmse" and "r2", computed with the
        scikit-learn metric functions at their default settings ("rmse" is
        the square root of "mse").
    """
    y_pred, y_true = (np.array(part) for part in eval_pred)

    squared_error = mean_squared_error(y_true, y_pred)
    return {
        "mae": mean_absolute_error(y_true, y_pred),
        "mse": squared_error,
        "rmse": np.sqrt(squared_error),
        "r2": r2_score(y_true, y_pred),
    }

In [7]:
# 8. Training arguments
training_args = TrainingArguments(
    output_dir="./results/bert_regression",
    # Evaluate and checkpoint on the same schedule (once per epoch) so the
    # best checkpoint can be restored when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    # "mae" is one of the keys returned by compute_metrics; lower is better.
    metric_for_best_model="mae",
    greater_is_better=False,
    logging_dir="./logs/bert_regression",
    # Mixed precision only when a CUDA GPU is present. An unconditional
    # fp16=True is rejected on CPU-only machines, which would defeat the
    # CPU fallback used when the model is moved to its device below.
    fp16=torch.cuda.is_available(),
)

# 9. Final Trainer: ties together the model, the training arguments, both
#    data splits and the metric callback. The test split is also the
#    evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=test_data,
)

# Use the GPU when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
trainer.model.to(device)

# Launch fine-tuning.
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# 10. Save the model and the tokenizer side by side in the same directory
save_dir = "./models/bert_regression"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)