In [1]:
!pip install --upgrade transformers datasets



In [2]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


OSError: [WinError 1114] Routine di inizializzazione della libreria di collegamento dinamico (DLL) non riuscita. Error loading "C:\Users\alexf\OneDrive\Desktop\UPM\biomedical informatics\Assignement 1- Retrieval Information\Binary-Classifier-for-Nutritional-IR\venv\Lib\site-packages\torch\lib\c10.dll" or one of its dependencies.

In [3]:
# Controlla se è disponibile una GPU (MOLTO raccomandato)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Utilizzo del dispositivo: {device}")

Utilizzo del dispositivo: cuda


In [4]:
# --- 1. CARICARE I SET DI DATI ---
try:
    train_df = pd.read_csv("train_set.csv")
    val_df = pd.read_csv("validation_set.csv")
    test_df = pd.read_csv("test_set.csv")
except FileNotFoundError:
    print("ERRORE: File di set non trovati.")
    print("Esegui prima 'split_data.py'.")
    exit()

# Converti i DataFrame di Pandas in 'Dataset' di Hugging Face
ds = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'validation': Dataset.from_pandas(val_df),
    'test': Dataset.from_pandas(test_df)
})

print(f"Dataset caricati:\n{ds}")

Dataset caricati:
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 916
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 197
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 197
    })
})


In [27]:
# --- 2. CARICARE TOKENIZER E MODELLO ---
MODEL_NAME = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"

# Carica il Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Carica il Modello
# num_labels=2 (classe 0 e classe 1)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model.to(device)  # Sposta il modello sulla GPU se disponibile

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [28]:
# --- 3. TOKENIZZARE I DATI ---
def tokenize_function(examples):
    # 'truncation=True' taglia i testi più lunghi di 512 token
    # 'padding="max_length"' aggiunge token finti fino a 512
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)


# Applica la tokenizzazione a tutti i set in parallelo
tokenized_datasets = ds.map(tokenize_function, batched=True)

# Rimuovi la colonna 'text' (non più necessaria) e formatta per il training
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets.set_format("torch")



Map:   0%|          | 0/916 [00:00<?, ? examples/s]

Map:   0%|          | 0/197 [00:00<?, ? examples/s]

Map:   0%|          | 0/197 [00:00<?, ? examples/s]

In [29]:

# --- 4. DEFINIRE LE METRICHE DI VALUTAZIONE ---
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='binary')
    precision = precision_score(labels, predictions, average='binary')
    recall = recall_score(labels, predictions, average='binary')

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [30]:
# --- 5. DEFINIRE GLI ARGOMENTI DI TRAINING ---
training_args = TrainingArguments(
    output_dir="./polyphenol_classifier",  # Cartella dove salvare il modello
    eval_strategy="epoch",  # Valuta alla fine di ogni epoca
    save_strategy="epoch",  # Salva il modello alla fine di ogni epoca
    num_train_epochs=6,  # 3 epoche sono un buon punto di partenza
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,  # Carica il modello migliore alla fine
    metric_for_best_model="f1",  # Scegli il modello migliore in base all'F1-score
    report_to="none"  # Disabilita il logging online (wandb)
)

In [31]:
# --- 6. CREARE IL TRAINER ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [32]:
# --- 7. AVVIARE IL TRAINING (FINE-TUNING) ---
print("\n--- INIZIO FINE-TUNING MODELLO ---")
trainer.train()
print("--- FINE-TUNING COMPLETATO ---")


--- INIZIO FINE-TUNING MODELLO ---


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5099,0.526546,0.746193,0.795082,0.668966,0.979798
2,0.4785,0.50145,0.751269,0.801619,0.668919,1.0
3,0.4827,0.496666,0.751269,0.801619,0.668919,1.0
4,0.5373,0.511827,0.751269,0.801619,0.668919,1.0
5,0.4828,0.507756,0.751269,0.801619,0.668919,1.0
6,0.4007,0.574882,0.700508,0.751055,0.644928,0.89899


--- FINE-TUNING COMPLETATO ---


In [33]:
# --- 8. VALUTAZIONE FINALE SUL TEST SET ---
print("\n--- VALUTAZIONE SUL TEST SET (DATI MAI VISTI) ---")
test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])

print(f"Accuracy sul Test Set: {test_results['eval_accuracy']:.4f}")
print(f"F1-score sul Test Set: {test_results['eval_f1']:.4f}")
print(f"Precision sul Test Set: {test_results['eval_precision']:.4f}")
print(f"Recall sul Test Set: {test_results['eval_recall']:.4f}")


--- VALUTAZIONE SUL TEST SET (DATI MAI VISTI) ---


Accuracy sul Test Set: 0.7259
F1-score sul Test Set: 0.7823
Precision sul Test Set: 0.6467
Recall sul Test Set: 0.9898


In [34]:

# Salva i risultati finali su un file
with open("test_results.txt", "w") as f:
    f.write(str(test_results))

# Salva il modello finale
trainer.save_model("./polyphenol_classifier_final")
print("Modello finale salvato in './polyphenol_classifier_final'")

Modello finale salvato in './polyphenol_classifier_final'


In [35]:
!zip -r modello_finito.zip ./polyphenol_classifier_final

  adding: polyphenol_classifier_final/ (stored 0%)
  adding: polyphenol_classifier_final/training_args.bin (deflated 53%)
  adding: polyphenol_classifier_final/tokenizer_config.json (deflated 74%)
  adding: polyphenol_classifier_final/model.safetensors (deflated 7%)
  adding: polyphenol_classifier_final/vocab.txt (deflated 54%)
  adding: polyphenol_classifier_final/tokenizer.json (deflated 71%)
  adding: polyphenol_classifier_final/special_tokens_map.json (deflated 42%)
  adding: polyphenol_classifier_final/config.json (deflated 49%)
