In [157]:
import pandas as pd
from transformers import BertTokenizer, BertPreTrainedModel, BertModel, Trainer, TrainingArguments
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [158]:
# Load preprocessed dataset
file_path = "../../data/insa_dataset/processed_nutri_data.csv"
df = pd.read_csv(file_path)

# Define tokenizer for Portuguese (Portugal)
tokenizer = BertTokenizer.from_pretrained("neuralmind/bert-large-portuguese-cased")

In [159]:
# Select numeric and categorical columns for training
numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns.tolist()

df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')
df[numeric_cols] = df[numeric_cols].astype(np.float32)

categorical_cols = ['n√≠vel_1', 'n√≠vel_2', 'n√≠vel_3']
df[categorical_cols] = df[categorical_cols].astype(str)
df['nome_do_alimento'] = df['nome_do_alimento'].astype(str)

df['descricao'] = df['nome_do_alimento'] + " " + df['n√≠vel_1'].fillna('') + " " + df['n√≠vel_2'].fillna('') + " " + df['n√≠vel_3'].fillna('')

# convert descricao values to string, using lambda function
df['descricao'] = df['descricao'].astype(str).str.strip()

# Check if some value of the dataset is object
print(df.select_dtypes(include=['object']).columns)

Index(['nome_do_alimento', 'n√≠vel_1', 'n√≠vel_2', 'n√≠vel_3', 'descricao'], dtype='object')


In [160]:
# Custom Dataset Class for BERT
class NutriDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=128):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        row = self.dataframe.iloc[index]
        text = row['descricao']
        numeric_labels = torch.tensor(row[numeric_cols].values, dtype=torch.float32)

        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "numeric_labels": numeric_labels
        }

In [161]:
class BertNutritionModel(BertPreTrainedModel):
    def __init__(self, config, num_numeric_labels):
        super().__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.numeric_regressor = nn.Linear(config.hidden_size, num_numeric_labels)

    def forward(self, input_ids, attention_mask, numeric_labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(outputs.pooler_output)
        numeric_preds = self.numeric_regressor(pooled_output)

        loss = None
        if numeric_labels is not None:
            loss_fn_reg = nn.MSELoss()
            loss = loss_fn_reg(numeric_preds, numeric_labels)

        return {
            "loss": loss,
            "numeric_preds": numeric_preds
        }

In [162]:
# Create dataset
dataset = NutriDataset(df, tokenizer)

# Split dataset into train, validation, and test
train_size = int(0.7 * len(dataset))
val_size = int(0.15 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = map(lambda ds: torch.utils.data.Subset(dataset, ds.indices), random_split(dataset, [train_size, val_size, test_size]))

# Debugging: Verify dataset types
print(f"Tipo de train_dataset: {type(train_dataset)}")
print(f"Tipo de val_dataset: {type(val_dataset)}")
print(f"Tipo de test_dataset: {type(test_dataset)}")


Tipo de train_dataset: <class 'torch.utils.data.dataset.Subset'>
Tipo de val_dataset: <class 'torch.utils.data.dataset.Subset'>
Tipo de test_dataset: <class 'torch.utils.data.dataset.Subset'>


In [163]:
model = BertNutritionModel.from_pretrained(
    "neuralmind/bert-large-portuguese-cased",
    num_numeric_labels=num_numeric_labels
).to(device)

# Training arguments
training_args = TrainingArguments(
    output_dir="./bert_nutrition_model",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train model
trainer.train()

Some weights of BertNutritionModel were not initialized from the model checkpoint at neuralmind/bert-large-portuguese-cased and are newly initialized: ['numeric_regressor.bias', 'numeric_regressor.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint64, uint32, uint16, uint8, and bool.