In [None]:
import torch
import os
from modules import paths
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, set_seed, TrainerCallback
import pandas as pd
from modules.dataset import get_page_len_dataset
from modules.transformer_model import build_model, metrics
import nest_asyncio
import types




In [2]:
nest_asyncio.apply()

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
set_seed(0)


In [4]:

df_train = get_page_len_dataset('train')
df_val = get_page_len_dataset('valid')


labels = sorted(df_train['label'].unique())
lab2id = {l:i for i,l in enumerate(labels)}
df_train['label'] = df_train['label'].map(lab2id)
df_val  ['label'] = df_val  ['label'].map(lab2id)

train_ds = Dataset.from_pandas(df_train[['description','label']])
val_ds   = Dataset.from_pandas(df_val  [['description','label']])

In [5]:
model_type = "xlm-roberta-base"
batch_size = 32
epochs = 3
learning_rate = 1e-4
weight_decay = 0.01
classes = 3

In [None]:
model = build_model(model_type, classes).to(device)

orig_forward = model.forward
def patched_forward(self, *args, **kwargs):
    # se qualcuno ha passato input_ids/attention_mask come posizionali,
    # li spostiamo in kwargs
    if args:
        kwargs["input_ids"] = args[0]
        if len(args) > 1:
            kwargs["attention_mask"] = args[1]
    return orig_forward(**kwargs)
model.forward = types.MethodType(patched_forward, model)

tokenizer = AutoTokenizer.from_pretrained(model_type)

_real_collator = DataCollatorWithPadding(tokenizer=tokenizer)
def debug_collator(features):
    print("\n>>> DEBUG COL LATOR: feature keys:", list(features[0].keys()))
    # lunghezze raw
    lens = [len(f.get("input_ids", [])) for f in features]
    print("    RAW input_ids lengths:", lens)
    batch = _real_collator(features)
    # stampo le shape padded
    print("    PADDED batch shapes:",
          "input_ids",      batch["input_ids"].shape,
          "attention_mask", batch["attention_mask"].shape,
          "labels",         batch["labels"].shape)
    print(">>> ------------------")
    return batch




Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def tokenize(examples):
    return tokenizer(examples["description"], padding=True, truncation=True)

In [8]:
train_data = train_ds.map(tokenize, batched=True, remove_columns=["description"])
val_data = val_ds.map(tokenize, batched=True, remove_columns=["description"])
train_data = train_data.rename_column("label", "labels")
val_data   = val_data.rename_column("label",   "labels")

train_data.set_format(type="torch", columns=["input_ids","attention_mask","labels"])
val_data.set_format(  type="torch", columns=["input_ids","attention_mask","labels"])



Map:   0%|          | 0/6249 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [9]:
collator = DataCollatorWithPadding(tokenizer=tokenizer)
training_args = TrainingArguments(
    output_dir=".results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=weight_decay,
    logging_dir="./logs",
    load_best_model_at_end=True,
)


In [10]:
train = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    data_collator=debug_collator,
    compute_metrics=metrics,
)

  train = Trainer(


In [11]:
train.train()
train.evaluate()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33murbini-2007465[0m ([33murbini-2007465-sapienza-universit-di-roma[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin



>>> DEBUG COL LATOR: feature keys: ['labels', 'input_ids', 'attention_mask']
    RAW input_ids lengths: [58, 56, 57, 57, 57, 56, 58, 63, 56, 56, 61, 56, 56, 56, 57, 56, 57, 56, 61, 57, 58, 57, 56, 56, 61, 57, 56, 61, 61, 58, 56, 56]
    PADDED batch shapes: input_ids torch.Size([32, 63]) attention_mask torch.Size([32, 63]) labels torch.Size([32])
>>> ------------------

>>> DEBUG COL LATOR: feature keys: ['labels', 'input_ids', 'attention_mask']
    RAW input_ids lengths: [56, 56, 57, 57, 57, 56, 56, 58, 61, 56, 61, 57, 58, 58, 61, 58, 56, 56, 56, 57, 58, 61, 61, 56, 58, 61, 57, 61, 61, 56, 61, 61]
    PADDED batch shapes: input_ids torch.Size([32, 61]) attention_mask torch.Size([32, 61]) labels torch.Size([32])
>>> ------------------

>>> 🕵️ INSIDE MODEL FORWARD
    input_ids.shape:      torch.Size([32, 63])
    attention_mask.shape: torch.Size([32, 63])
    labels.shape:         torch.Size([32])


TypeError: XLMRobertaForSequenceClassification.forward() got multiple values for argument 'input_ids'