In [1]:
import torch

# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")


Using device: cpu


In [2]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

# --- Configuration -----------------------------------------------------------
# CSV file name (without extension) of the normalized dataset
dataset_name = "normalized_merged_dataset"
# Column holding the input text, and the columns used as regression targets
text_column = "meal_description"
macro_columns = ["carb", "protein", "fat", "energy"]
# Checkpoint name passed to the tokenizer loader below
model_name = "xlm-roberta-base"

# --- Data and tokenizer ------------------------------------------------------
# Read the CSV into a dataframe
df = pd.read_csv("../../data/who_dataset/normalized/" + dataset_name + ".csv")

# Wrap the dataframe in a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Tokenizer that belongs to the checkpoint named in model_name
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_format(examples, max_length=128):
    """Tokenize a batch of texts and attach their regression targets.

    Written for ``Dataset.map(..., batched=True)``, where ``examples`` maps
    each column name to the list of that column's values for the batch.

    Args:
        examples: Batch dictionary. ``examples[text_column]`` supplies the
            texts and each name in ``macro_columns`` supplies one target list.
        max_length: Number of tokens every sequence is padded or truncated
            to. The default of 128 matches the truncation limit that
            ``predict_macros`` uses at inference time. Without an explicit
            value, ``padding="max_length"`` pads every row to the tokenizer's
            model maximum, which is far more expensive for no benefit.

    Returns:
        dict: Every field produced by the tokenizer, as plain Python lists,
        plus ``"labels"``: one list of floats per example, ordered like
        ``macro_columns``.
    """
    # No return_tensors here: Dataset.map stores the batch in Arrow either
    # way, and set_format(type="torch") yields tensors when rows are read.
    tokens = tokenizer(
        examples[text_column],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

    # Transpose the per-column lists into one row of targets per example
    target_columns = [examples[col] for col in macro_columns]
    labels = [[float(value) for value in row] for row in zip(*target_columns)]

    encoded = {key: tokens[key] for key in tokens}
    encoded["labels"] = labels
    return encoded



# Tokenize every row in batches and attach the regression labels
dataset = dataset.map(tokenize_and_format, batched=True)

# Hold out 20% of the rows for evaluation. The fixed seed keeps the split,
# and therefore the reported metrics, reproducible across kernel restarts.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_data = dataset["train"]
test_data = dataset["test"]

# Return PyTorch tensors when rows are read. This only changes the output
# format; it does not move any data to `device`.
train_data.set_format(type="torch")
test_data.set_format(type="torch")


Map:   0%|          | 0/44795 [00:00<?, ? examples/s]

In [3]:
from transformers import AutoModelForSequenceClassification

# One output per entry of macro_columns (carb, protein, fat, energy)
num_labels = len(macro_columns)

# Load the checkpoint with a sequence-classification head set up for regression
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, problem_type="regression", num_labels=num_labels
)

# Place the model on the device selected earlier
model.to(device)

KeyboardInterrupt: 

In [9]:
from transformers import Trainer, TrainingArguments
import torch.nn as nn

class MSETrainer(Trainer):
    """Trainer whose loss is the mean squared error between logits and labels."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the MSE loss for one batch.

        Args:
            model: Model used for the forward pass.
            inputs: Batch mapping containing ``"labels"`` plus the keyword
                arguments forwarded to ``model``. The mapping itself is left
                unmodified.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            **kwargs: Extra keyword arguments supplied by the caller; accepted
                and ignored.

        Returns:
            The loss tensor, or the tuple ``(loss, outputs)`` when
            ``return_outputs`` is true.
        """
        # Follow the device the model actually lives on rather than a
        # notebook-level global, which may not match where the Trainer
        # placed the model.
        target_device = next(model.parameters()).device

        labels = inputs["labels"].to(device=target_device, dtype=torch.float32)
        # Everything except the labels is forwarded to the model
        model_inputs = {
            name: tensor.to(target_device)
            for name, tensor in inputs.items()
            if name != "labels"
        }

        outputs = model(**model_inputs)
        loss = nn.MSELoss()(outputs.logits, labels)

        return (loss, outputs) if return_outputs else loss


import numpy as np

def compute_metrics(eval_pred):
    """Return the mean absolute error between predictions and labels.

    Args:
        eval_pred: Pair ``(logits, labels)``; the two are subtracted
            element-wise.

    Returns:
        dict: ``{"mae": value}`` where ``value`` is the mean of the absolute
        differences over all elements.
    """
    predictions, targets = eval_pred
    absolute_errors = np.abs(predictions - targets)
    return {"mae": np.mean(absolute_errors)}


In [10]:
# Releasing cached CUDA memory is only meaningful when a GPU is present
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Gradient checkpointing trades extra compute for lower activation memory
model.gradient_checkpointing_enable()


training_args = TrainingArguments(
    output_dir="./results_who_insa_dataset",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    logging_dir="./logs_who_insa_dataset",
    # fp16 mixed precision needs CUDA; hard-coding True breaks CPU-only runs
    fp16=torch.cuda.is_available(),
)

trainer = MSETrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics
)

trainer.train()

Epoch,Training Loss,Validation Loss



KeyboardInterrupt



In [4]:
import joblib

# Scaler whose inverse_transform is applied to the model outputs below.
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
scaler_path = "../../data/who_dataset/pickle/scaler_merged_dataset.pkl"
scaler = joblib.load(scaler_path)

def predict_macros(text):
    """Predict the macronutrient values for one meal description.

    Args:
        text: Raw meal description; it is cleaned with
            ``full_clean_pipeline`` before tokenization.

    Returns:
        dict: Keys ``"carbs"``, ``"protein"``, ``"fat"`` and ``"energy"``
        mapped to the model outputs after ``scaler.inverse_transform``.
    """
    # Run the text through the complete pre-processing pipeline.
    # NOTE(review): full_clean_pipeline is not defined in the visible cells;
    # it must be defined or imported before this function is called.
    text = full_clean_pipeline(text)

    # Send the encoded text to wherever the model currently lives
    model_device = next(model.parameters()).device
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {k: v.to(model_device) for k, v in inputs.items()}

    # Inference should be deterministic and cheap: disable dropout and skip
    # autograd bookkeeping, then restore the mode the model was in before.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        model.train(was_training)

    predicted_macros = outputs.logits.detach().cpu().numpy()[0]

    # Undo the scaling applied to the targets.
    # NOTE(review): presumably the scaler was fitted on the columns in the
    # order carb, protein, fat, energy — confirm against the scaler's source.
    predicted_macros = scaler.inverse_transform([predicted_macros])[0]

    return {
        "carbs": predicted_macros[0],
        "protein": predicted_macros[1],
        "fat": predicted_macros[2],
        "energy": predicted_macros[3]
    }

# Write the model and the tokenizer into the same directory
export_dir = "bert_regression_whoInsaDataset"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)

NameError: name 'model' is not defined