In [None]:
# Clona il repository indicato nella consegna
!git clone https://github.com/MarcoNapoleone/FAIR-DA4ER.git
%cd FAIR-DA4ER

# Installa le dipendenze (Ditto richiede librerie specifiche)
!pip install -r requirements.txt
!pip install tensorboardX
# Installa NVIDIA Apex (opzionale ma consigliato per Ditto, se fallisce procedi senza o usa fp16=False)
# Nota: L'installazione di Apex su Colab può essere rognosa, spesso Ditto funziona anche senza se configurato bene.

fatal: destination path 'FAIR-DA4ER' already exists and is not an empty directory.
/content/FAIR-DA4ER/ditto/FAIR-DA4ER


In [None]:
# Fetch the NLTK "stopwords" corpus into the local nltk_data directory.
# NOTE(review): presumably required by Ditto's preprocessing — confirm which module reads it.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Make sure we are in the Ditto folder (the relative paths below depend on it)
%cd /content/FAIR-DA4ER/ditto

# Create the folder that will hold the dataset
!mkdir -p data/vehicles

# Copy the uploaded files into that folder under the names train/val/test.txt
!cp /content/ditto_train.txt data/vehicles/train.txt
!cp /content/ditto_val.txt   data/vehicles/val.txt
!cp /content/ditto_test.txt  data/vehicles/test.txt

# Check that the files ended up in the right place
!ls data/vehicles

/content/FAIR-DA4ER/ditto
test.txt  train.txt  val.txt


In [None]:
# --- CELLA MANCANTE: AGGIORNAMENTO CONFIGS.JSON ---
import json
import os

# Assicuriamoci di essere nella cartella giusta
%cd /content/FAIR-DA4ER/ditto

config_path = 'configs.json'

# 1. Leggi la configurazione esistente
with open(config_path, 'r') as f:
    data = json.load(f)

# 2. Definisci la nuova configurazione per i veicoli
new_config = {
    "name": "vehicles",
    "task_type": "classification",
    "vocab": ["0", "1"],
    "trainset": "data/vehicles/train.txt",
    "validset": "data/vehicles/val.txt",
    "testset": "data/vehicles/test.txt"
}

# 3. Aggiungi alla lista (o dizionario) evitando duplicati
if isinstance(data, list):
    # Rimuovi eventuali vecchie versioni per non fare pasticci
    data = [entry for entry in data if entry.get('name') != 'vehicles']
    data.append(new_config)
else:
    data['vehicles'] = new_config

# 4. Salva le modifiche
with open(config_path, 'w') as f:
    json.dump(data, f, indent=4)

print("✔ Configurazione 'vehicles' aggiunta con successo!")

/content/FAIR-DA4ER/ditto
✔ Configurazione 'vehicles' aggiunta con successo!


In [None]:
%%writefile /content/FAIR-DA4ER/ditto/ditto_light/dataset.py
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer
import json

# Short aliases accepted in place of full Hugging Face checkpoint names.
# 'distilbert' is the default `lm` of DittoDataset and is not a valid
# checkpoint name on its own, so it has to be resolved before loading.
_LM_ALIASES = {
    'roberta': 'roberta-base',
    'distilbert': 'distilbert-base-uncased',
}


def get_tokenizer(lm):
    """Return the tokenizer for a language model.

    Args:
        lm: either a short alias listed in ``_LM_ALIASES`` or any name
            accepted by ``AutoTokenizer.from_pretrained``.

    Returns:
        The object returned by ``AutoTokenizer.from_pretrained`` for the
        resolved name.
    """
    return AutoTokenizer.from_pretrained(_LM_ALIASES.get(lm, lm))

class DittoDataset(Dataset):
    """Entity-pair dataset read from a tab-separated file and tokenized on access.

    Every usable line holds at least three tab-separated fields: the first is
    the left entity, the last is an integer label, and whatever lies in
    between is the right entity (extra fields are joined with single spaces).
    Lines with fewer than three fields are skipped.

    Args:
        path: path of the tab-separated file to read (decoded as UTF-8).
        max_len: length every tokenized pair is padded or truncated to.
        lm: language model name handed to ``get_tokenizer``.
        da: optional augmenter; when set, ``da.transform(x1, x2)`` is applied
            to each pair right before tokenization.
        size: when a positive integer, only the first ``size`` pairs are kept.

    Raises:
        ValueError: if the last field of a line is not an integer.
    """

    def __init__(self, path, max_len=256, lm='distilbert', da=None, size=None):
        self.tokenizer = get_tokenizer(lm)
        self.pairs = []
        self.labels = []
        self.max_len = max_len
        self.size = size
        self.da = da

        # Explicit UTF-8 so that non-ASCII entity text is decoded the same
        # way regardless of the platform's default encoding.
        with open(path, encoding='utf-8') as f:
            for line_no, line in enumerate(f, start=1):
                parts = line.strip().split('\t')
                if len(parts) < 3:
                    continue  # blank or malformed line
                s1, label = parts[0], parts[-1]
                # With exactly three fields this is just parts[1]; stray tabs
                # inside the right entity are folded back into one string.
                s2 = " ".join(parts[1:-1])
                try:
                    label_id = int(label)
                except ValueError as exc:
                    raise ValueError(
                        f"{path}: line {line_no}: label {label!r} is not an integer"
                    ) from exc
                self.pairs.append((s1, s2))
                self.labels.append(label_id)

        if size is not None and size > 0:
            self.pairs = self.pairs[:size]
            self.labels = self.labels[:size]

    def __len__(self):
        """Return the number of pairs kept."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Tokenize pair ``idx`` and return it as a dict of tensors.

        The dict has one tensor per key produced by the tokenizer plus a
        'labels' entry holding the pair's label.
        """
        x1, x2 = self.pairs[idx]
        if self.da is not None:
            # NOTE(review): assumes the augmenter exposes transform(x1, x2)
            # returning the augmented pair — confirm against the augmenter used.
            x1, x2 = self.da.transform(x1, x2)

        # Pad/truncate to a fixed length so that items can be stacked in pad().
        encoded_inputs = self.tokenizer(x1, x2,
                                        max_length=self.max_len,
                                        padding='max_length',
                                        truncation=True)

        item = {key: torch.tensor(val) for key, val in encoded_inputs.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def pad(self, batch):
        """Collate items from ``__getitem__`` into a 3-tuple of stacked tensors.

        Returns:
            (input_ids, attention_mask, labels), each stacked along a new
            leading batch dimension.
        """
        # NOTE(review): the consumer unpacks this as (x1, x2, y) and calls
        # model(x1, x2). In upstream Ditto the model's second input is an
        # augmented token sequence, not an attention mask — confirm that the
        # ditto.py of this fork really expects a mask in that position.
        input_ids = torch.stack([b['input_ids'] for b in batch])
        attention_mask = torch.stack([b['attention_mask'] for b in batch])
        labels = torch.stack([b['labels'] for b in batch])

        return input_ids, attention_mask, labels

Overwriting /content/FAIR-DA4ER/ditto/ditto_light/dataset.py


In [None]:
# Patch for ditto_light/ditto.py
# Rewrites the batch handling so that input_ids and attention_mask are
# unpacked, moved to the device and passed to the model as separate tensors.
# NOTE(review): every line containing one of the patterns below is rewritten,
# wherever it occurs in the file — confirm that only the intended function
# contains them.

file_path = '/content/FAIR-DA4ER/ditto/ditto_light/ditto.py'

with open(file_path, 'r', encoding='utf-8') as f:
    lines = f.readlines()

new_lines = []
patched = 0  # number of source lines that matched a pattern
for line in lines:
    # 1. Fix the unpacking: from "x, y" to "x1, x2, y"
    if 'x, y = batch' in line:
        indent = line.split('x, y')[0]
        new_lines.append(f'{indent}x1, x2, y = batch\n')
        patched += 1

    # 2. Fix the transfer to the device
    elif 'x = x.to(device)' in line:
        indent = line.split('x =')[0]
        new_lines.append(f'{indent}x1 = x1.to(device)\n')
        new_lines.append(f'{indent}x2 = x2.to(device)\n')
        patched += 1

    # 3. Fix the model call
    elif 'logits = model(x)' in line:
        new_lines.append(line.replace('model(x)', 'model(x1, x2)'))
        patched += 1

    else:
        new_lines.append(line)

if patched:
    with open(file_path, 'w', encoding='utf-8') as f:
        f.writelines(new_lines)
    print("✔ Patch applicata a ditto.py! Ora la validazione funzionerà.")
else:
    # Nothing matched (already patched, or a different ditto.py): leave the
    # file untouched and say so instead of claiming success.
    print("⚠ Nessuna riga da correggere trovata in ditto.py: nessuna patch applicata.")

✔ Patch applicata a ditto.py! Ora la validazione funzionerà.


In [None]:
# Train Ditto on the 'vehicles' task from the Ditto folder.
%cd /content/FAIR-DA4ER/ditto

# --fp16 was removed to be on the safe side
!python train_ditto.py \
  --task vehicles \
  --batch_size 16 \
  --max_len 256 \
  --lr 3e-5 \
  --n_epochs 1 \
  --save_model \
  --lm distilbert-base-uncased

/content/FAIR-DA4ER/ditto
Running cuda
Loading weights: 100% 100/100 [00:00<00:00, 1363.62it/s, Materializing param=transformer.layer.5.sa_layer_norm.weight]
[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_transform.bias    | [38;5;208mUNEXPECTED[0m |  | 
vocab_projector.bias    | [38;5;208mUNEXPECTED[0m |  | 
vocab_layer_norm.bias   | [38;5;208mUNEXPECTED[0m |  | 
vocab_layer_norm.weight | [38;5;208mUNEXPECTED[0m |  | 
vocab_transform.weight  | [38;5;208mUNEXPECTED[0m |  | 

[3mNotes:
- [38;5;208mUNEXPECTED[0m[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
step: 0, loss: 0.6919939517974854
step: 10, loss: 0.6929281949996948
step: 20, loss: 0.7240093946456909
step: 30, loss: 0.6856732368469238
step: 40, loss: 0.6813648343086243
step: 50, loss: 0.6941933035850525
step: 60, loss: 0.680903434753418
step: 70,

In [None]:
%%writefile /content/FAIR-DA4ER/ditto/matcher.py
import torch
import torch.nn as nn
import os
import numpy as np
import sys
import json
import argparse
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score

# Import interni
from ditto_light.ditto import DittoModel
from ditto_light.dataset import DittoDataset

def evaluate(model, iterator, threshold=None):
    """Score a binary matcher on labelled batches.

    Args:
        model: torch module called as ``model(x1, x2)`` that returns
            two-dimensional logits whose column 1 is the positive class.
        iterator: iterable of ``(x1, x2, y)`` tensor batches.
        threshold: probability strictly above which a pair is predicted
            positive; ``None`` means 0.5.

    Returns:
        ``(f1, precision, recall, threshold)`` with the threshold actually used.
    """
    model.eval()
    all_y = []
    all_probs = []

    # Follow the model's own device rather than "CUDA if present": the model
    # may have been left on the CPU even when a GPU is available.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    with torch.no_grad():
        for batch in tqdm(iterator):
            x1, x2, y = batch

            if device is not None:
                x1 = x1.to(device)
                x2 = x2.to(device)

            # NOTE(review): x2 is the attention mask produced by the dataset's
            # pad(); in upstream Ditto the model's second input is an augmented
            # token sequence — confirm this fork's model expects a mask here.
            logits = model(x1, x2)
            probs = logits.softmax(dim=1)[:, 1]

            all_probs += probs.cpu().tolist()
            all_y += y.cpu().tolist()  # labels are only collected, never sent to the model

    if threshold is None:
        threshold = 0.5

    pred = [1 if p > threshold else 0 for p in all_probs]

    f1 = f1_score(all_y, pred)
    p = precision_score(all_y, pred)
    r = recall_score(all_y, pred)

    return f1, p, r, threshold

def _find_checkpoint(model_path, task):
    """Resolve ``model_path`` to a checkpoint file, or None when none exists.

    A non-directory path is returned unchanged. For a directory, the first
    ``.pt`` file in alphabetical order is taken from the directory itself or,
    failing that, from its ``<task>`` sub-folder.
    """
    if not os.path.isdir(model_path):
        return model_path
    for folder in (model_path, os.path.join(model_path, task)):
        if os.path.isdir(folder):
            # Sorted: os.listdir returns entries in arbitrary order.
            candidates = sorted(n for n in os.listdir(folder) if n.endswith('.pt'))
            if candidates:
                return os.path.join(folder, candidates[0])
    return None


def main():
    """Command-line entry point: load a checkpoint, score a file, save metrics.

    Builds a DittoModel, loads weights from ``--model_path`` (a file, or a
    directory searched for ``.pt`` files), runs ``evaluate`` on
    ``--input_path`` and writes precision, recall and F1 as JSON to
    ``--output_path``. Returns early, after printing a message, when no
    checkpoint is found.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--task", type=str, default="vehicles")
    parser.add_argument("--input_path", type=str, default="data/vehicles/test.txt")
    parser.add_argument("--output_path", type=str, default="output/result.jsonl")
    parser.add_argument("--model_path", type=str, default="checkpoints/")
    parser.add_argument("--lm", type=str, default="distilbert-base-uncased")
    parser.add_argument("--max_len", type=int, default=256)
    parser.add_argument("--use_gpu", action="store_true")
    parser.add_argument("--fp16", action="store_true")
    parser.add_argument("--batch_size", type=int, default=64)

    args = parser.parse_args()

    device = 'cuda' if args.use_gpu and torch.cuda.is_available() else 'cpu'

    print(f"Loading model architecture ({args.lm})...")
    model = DittoModel(device=device, lm=args.lm)

    # Resolve the checkpoint file
    checkpoint_path = _find_checkpoint(args.model_path, args.task)
    if checkpoint_path is None:
        print(f"❌ Nessun file .pt trovato in {args.model_path}")
        return

    print(f"Loading weights from: {checkpoint_path}")

    # torch.load unpickles the file: only load checkpoints from a trusted source.
    checkpoint = torch.load(checkpoint_path, map_location=device)

    if 'model' in checkpoint:
        # Full training checkpoint: the weights live under the 'model' key
        print("Extracting model weights from training checkpoint...")
        model.load_state_dict(checkpoint['model'])
    else:
        # Otherwise the file is the state dict itself (old format)
        model.load_state_dict(checkpoint)

    model.to(device)

    print(f"Loading data from {args.input_path}...")
    test_dataset = DittoDataset(args.input_path, max_len=args.max_len, lm=args.lm)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False,
                             num_workers=0, collate_fn=test_dataset.pad)

    print("Running inference...")
    f1, p, r, th = evaluate(model, test_loader)

    print("="*30)
    print(f"✅ RISULTATI FINALI (Punto 4.H)")
    print(f"Precision: {p:.4f}")
    print(f"Recall:    {r:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    print("="*30)

    # A bare file name has an empty dirname, and os.makedirs('') raises.
    output_dir = os.path.dirname(args.output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(args.output_path, 'w') as f:
        json.dump({"precision": p, "recall": r, "f1": f1}, f)

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()

Overwriting /content/FAIR-DA4ER/ditto/matcher.py


In [None]:
# Score the test split with the checkpoint saved under checkpoints/vehicles,
# running from the Ditto folder so the relative paths resolve.
%cd /content/FAIR-DA4ER/ditto

!python matcher.py \
  --task vehicles \
  --input_path data/vehicles/test.txt \
  --model_path checkpoints/vehicles/model.pt \
  --lm distilbert-base-uncased \
  --max_len 256 \
  --use_gpu

/content/FAIR-DA4ER/ditto
Loading model architecture (distilbert-base-uncased)...
Loading weights: 100% 100/100 [00:00<00:00, 930.84it/s, Materializing param=transformer.layer.5.sa_layer_norm.weight]
[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | [38;5;208mUNEXPECTED[0m |  | 
vocab_projector.bias    | [38;5;208mUNEXPECTED[0m |  | 
vocab_layer_norm.weight | [38;5;208mUNEXPECTED[0m |  | 
vocab_transform.bias    | [38;5;208mUNEXPECTED[0m |  | 
vocab_transform.weight  | [38;5;208mUNEXPECTED[0m |  | 

[3mNotes:
- [38;5;208mUNEXPECTED[0m[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
Loading weights from: checkpoints/vehicles/model.pt
Extracting model weights from training checkpoint...
Loading data from data/vehicles/test.txt...
Running inference...
100% 57/57 [01:34<00:00,  1.65s/it]
✅ RI