In [41]:
import torch

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")


Using device: cuda


In [42]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

dataset_name = "normalized_wweia_meal_natural"

# Column holding the free-text model input, and the numeric columns used as regression targets
text_column = "meal_description"
macro_columns = ["carb", "protein", "fat", "energy"]

# Read the CSV for this dataset name and wrap the dataframe as a Hugging Face Dataset
csv_path = "../data/normalized/" + dataset_name + ".csv"
df = pd.read_csv(csv_path)
dataset = Dataset.from_pandas(df)

# Tokenizer matching the BERT checkpoint that is fine-tuned below
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_and_format(examples):
    """Tokenize a batch of examples and attach their regression targets.

    Args:
        examples: A batch mapping column names to equal-length lists. Must contain
            ``text_column`` and every column named in ``macro_columns``.

    Returns:
        The tokenizer output for the batch, with an extra ``"labels"`` entry holding a
        float32 tensor that has one row per example and one value per entry of
        ``macro_columns`` (in that order).
    """
    encoded = tokenizer(examples[text_column], padding="max_length", truncation=True)

    # Build the target matrix row by row: one row per example, one value per macro column
    batch_size = len(examples[text_column])
    target_rows = []
    for row_idx in range(batch_size):
        target_rows.append([examples[name][row_idx] for name in macro_columns])

    encoded["labels"] = torch.tensor(target_rows, dtype=torch.float32)
    return encoded


# Apply tokenization (batched, so tokenize_and_format receives lists of column values)
dataset = dataset.map(tokenize_and_format, batched=True)

# Split into train & test sets. The seed is fixed so the held-out set, and therefore
# the reported validation loss, is the same on every Restart & Run All.
SPLIT_SEED = 42
dataset = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_data = dataset["train"]
test_data = dataset["test"]

# Return rows as PyTorch tensors. No device is specified here, so this only changes
# the output format; it does not move anything to the GPU.
train_data.set_format(type="torch")
test_data.set_format(type="torch")


Map:   0%|          | 0/5532 [00:00<?, ? examples/s]

In [43]:
from transformers import AutoModelForSequenceClassification

# One regression output per macro target (carb, protein, fat, energy)
num_labels = len(macro_columns)

# Pretrained BERT encoder with a freshly initialised head configured for regression
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=num_labels, problem_type="regression"
)

# Move the model to the selected device (as the cell's last expression this also displays the model)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [44]:
from transformers import Trainer, TrainingArguments
import torch.nn as nn

class MSETrainer(Trainer):
    """Trainer whose loss is the mean squared error between model logits and labels."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the MSE loss for one batch.

        Args:
            model: The model to run the forward pass with.
            inputs: Batch dict; its ``"labels"`` entry is removed and used as the target.
            return_outputs: When true, also return the model outputs.
            **kwargs: Accepted and ignored.

        Returns:
            ``loss`` alone, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        # Targets are taken out of the batch so they are not passed to the forward call
        targets = inputs.pop("labels").to(torch.float32).to(device)

        # Place every remaining tensor on the notebook-level device
        features = {}
        for name, tensor in inputs.items():
            features[name] = tensor.to(device)

        outputs = model(**features)
        loss = nn.functional.mse_loss(outputs.logits, targets)

        if return_outputs:
            return loss, outputs
        return loss


In [45]:
# Evaluate and checkpoint once per epoch. Validation loss is not guaranteed to improve
# every epoch, so the best checkpoint (lowest eval_loss) is restored when training ends;
# otherwise whatever the last epoch produced is what gets used and saved afterwards.
# load_best_model_at_end requires matching eval/save strategies, which "epoch"/"epoch" satisfies.
training_args = TrainingArguments(
    output_dir="./results_" + dataset_name,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs_" + dataset_name,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# MSETrainer supplies the MSE loss; the held-out split doubles as the evaluation set
trainer = MSETrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    processing_class=tokenizer
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0118,0.005531
2,0.006,0.005059
3,0.0044,0.003262
4,0.0031,0.002858
5,0.0024,0.002896


TrainOutput(global_step=2770, training_loss=0.0051910255144649465, metrics={'train_runtime': 1434.6549, 'train_samples_per_second': 15.422, 'train_steps_per_second': 1.931, 'total_flos': 5821436634624000.0, 'train_loss': 0.0051910255144649465, 'epoch': 5.0})

In [46]:
import joblib

# Scaler used to map model outputs back to original units
# (presumably the one fitted on the nutrition targets during pre-processing; verify).
# NOTE: joblib.load unpickles the file, so only load scaler files from a trusted source.
scaler_path = "../pre_processing/scaler_nutrition.pkl"
scaler = joblib.load(scaler_path)

def predict_macros(text):
    """Predict the macro-nutrient values for a meal description.

    Args:
        text: Meal description passed to the tokenizer.

    Returns:
        A dict with keys ``"carbs"``, ``"protein"``, ``"fat"`` and ``"energy"`` holding
        the first row of the model output after ``scaler.inverse_transform``.
    """
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # same device as the model

    # Inference only: disable dropout and skip building the autograd graph
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    # Keep the batch axis for the scaler: inverse_transform is called on a 2-D
    # (n_samples, n_features) array, and only then is the first row selected.
    # Assumes the scaler was fitted on the four targets in the order
    # carb, protein, fat, energy (TODO confirm against the pre-processing step).
    scaled_batch = logits.cpu().numpy()
    predicted_macros = scaler.inverse_transform(scaled_batch)[0]

    return {
        "carbs": predicted_macros[0],
        "protein": predicted_macros[1],
        "fat": predicted_macros[2],
        "energy": predicted_macros[3]
    }

# Write the trained model and its tokenizer into the same export directory
export_dir = "bert_nutrition_classifier_" + dataset_name
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)

('bert_nutrition_classifier_normalized_wweia_meal_natural\\tokenizer_config.json',
 'bert_nutrition_classifier_normalized_wweia_meal_natural\\special_tokens_map.json',
 'bert_nutrition_classifier_normalized_wweia_meal_natural\\vocab.txt',
 'bert_nutrition_classifier_normalized_wweia_meal_natural\\added_tokens.json',
 'bert_nutrition_classifier_normalized_wweia_meal_natural\\tokenizer.json')