In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import T5EncoderModel, AutoTokenizer 
from datasets import load_dataset, Dataset
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.metrics import mean_squared_error, accuracy_score
from scipy.stats import spearmanr
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm
import json
import gc
from google.colab import drive


drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:


class T5EncoderForRegression(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        self.encoder = T5EncoderModel.from_pretrained(model_name)
        hidden_size = self.encoder.config.d_model

        # Regressor pi√π profondo
        self.regressor = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2),
            nn.LayerNorm(hidden_size // 2),
            nn.GELU(),
            nn.Dropout(0.15),
            nn.Linear(hidden_size // 2, 64),
            nn.LayerNorm(64),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(64, 1)
        )

        # Inizializzazione
        for module in self.regressor.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)

        self.loss_fn = nn.MSELoss()
        self.config = self.encoder.config
        self.config.problem_type = "regression"
        self.config.num_labels = 1
        self.forward_count = 0

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        labels=None,
        **kwargs
    ):

        self.forward_count += 1
        # Encoder forward
        outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )

        # Mean pooling with safety checks
        last_hidden = outputs.last_hidden_state  # [batch, seq_len, hidden]
        if self.forward_count <= 3 or torch.isnan(last_hidden).any():
            print(f"\nüîç Forward pass #{self.forward_count}")
            print(f"  Encoder output shape: {last_hidden.shape}")
            print(f"  Encoder output has NaN: {torch.isnan(last_hidden).any()}")
            print(f"  Encoder output min/max: {last_hidden.min().item():.4f}/{last_hidden.max().item():.4f}")

        mask = attention_mask.unsqueeze(-1).float()  # [batch, seq_len, 1]

        # Prevent division by zero
        mask_sum = mask.sum(dim=1)
        mask_sum = torch.clamp(mask_sum, min=1e-9)

        # Weighted average by attention mask
        pooled = (last_hidden * mask).sum(dim=1) / mask_sum

        # Debug pooled output
        if self.forward_count <= 3 or torch.isnan(pooled).any():
            print(f"  Pooled output shape: {pooled.shape}")
            print(f"  Pooled has NaN: {torch.isnan(pooled).any()}")
            print(f"  Pooled min/max: {pooled.min().item():.4f}/{pooled.max().item():.4f}")
            print(f"  Regressor weight stats: mean={self.regressor[1].weight.mean().item():.4f}, std={self.regressor[1].weight.std().item():.4f}")
        # Regression head
        logits = self.regressor(pooled).squeeze(-1)  # [batch]

        loss = None
        if labels is not None:
            # Ensure labels are float and handle NaN values
            labels = labels.float()
            valid_mask = ~torch.isnan(labels) & ~torch.isinf(labels)

            if valid_mask.sum() > 0:
                loss = F.mse_loss(logits[valid_mask], labels[valid_mask])
            else:
                # If no valid labels, create a dummy loss
                loss = torch.tensor(0.0, device=logits.device, requires_grad=True)
                print(f"  ‚ö†Ô∏è No valid samples in batch!")
        return {
            "loss": loss,
            "logits": logits
        }


In [None]:


def print_trainable_parameters(model):
    """
    Stampa il numero di parametri trainable nel modello.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

# Caricamento del modello FlanT5 Encoder 
print("Caricamento del modello FlanT5 Encoder (xl version)...")
model_name = "google/flan-t5-xl"  

model= T5EncoderForRegression(model_name)


tokenizer = AutoTokenizer.from_pretrained(model_name)

# Configurazione LoRA - rank ridotto per memoria
lora_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,  
    r=32,
    lora_alpha=64,
    lora_dropout=0.2,
    target_modules=["q", "v", "k","o"],  
    bias="none",
    inference_mode=False,
    modules_to_save=["regressor"]  
)

model.encoder = get_peft_model(model.encoder, lora_config)

# CRITICAL: Ensure all parameters are in float32
for param in model.parameters():
    if param.dtype == torch.float16:
        param.data = param.data.float()

print("Trainable Parameters:")
print_trainable_parameters(model)

Caricamento del modello FlanT5 Encoder (base version)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Trainable Parameters:
trainable params: 14748929 || all params: 1238276353 || trainable%: 1.19


In [13]:
def build_prompt(example):
    """Costruisce prompt per batch di esempi"""
    prompts = []
    batch_size = len(example["precontext"])

    for i in range(batch_size):
        p = example["precontext"][i]
        s = example["sentence"][i]
        e = example["ending"][i]
        homonym = example["homonym"][i]
        judged_meaning = example["judged_meaning"][i]
        example_sentence = example["example_sentence"][i]

        story = f"{p} {s} {e}"

        prompt = f"""Rate how plausible the meaning is in the context.
Answer ONLY with a number between 1 and 5.
You may use decimals (e.g., 2.543, 4.032).

Story:
{story}

Target word: {homonym}
Sense: {judged_meaning}
Example: {example_sentence}

Answer:"""

        prompts.append(prompt)

    return prompts


In [None]:
def preprocess_function(examples):
    prompt = build_prompt(examples)

    model_inputs = tokenizer(
        prompt,
        max_length=320,
        truncation=True,
        padding='max_length',
        return_tensors=None
    )

    
    labels = []
    stdevs = []

    for i, v in enumerate(examples["average"]):
        
        if v is None or (isinstance(v, float) and np.isnan(v)):
            labels.append(3.0)  # Default value
        else:
            try:
                label_val = float(v)
                # Clamp to valid range [1, 5]
                label_val = max(1.0, min(5.0, label_val))
                labels.append(label_val)
            except (ValueError, TypeError):
                labels.append(3.0)

        # Handle stdev
        stdev_val = examples["stdev"][i]
        if stdev_val is None or (isinstance(stdev_val, float) and np.isnan(stdev_val)):
            stdevs.append(1.0)  # Default stdev
        else:
            try:
                stdevs.append(float(stdev_val))
            except (ValueError, TypeError):
                stdevs.append(1.0)

    model_inputs["labels"] = labels
    model_inputs["stdev"] = stdevs

    return model_inputs


In [None]:
# Percorsi originali
train_path = 'dataset/train.json'
dev_path   = 'dataset/dev.json'

with open(train_path, "r") as f:
    data_train = json.load(f)

train_dataset = Dataset.from_list(list(data_train.values()))

with open(dev_path, "r") as f:
    data_dev = json.load(f)

dev_dataset = Dataset.from_list(list(data_dev.values()))


train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=['choices', 'nonsensical', 'sample_id']  
)

dev_dataset = dev_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=['choices', 'nonsensical', 'sample_id']
)


train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels','stdev'])
dev_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels','stdev'])



# Test forward pass before training
print("\nüß™ Testing forward pass with sample batch...")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)




Map:   0%|          | 0/2280 [00:00<?, ? examples/s]

Map:   0%|          | 0/588 [00:00<?, ? examples/s]


üß™ Testing forward pass with sample batch...


T5EncoderForRegression(
  (encoder): PeftModelForFeatureExtraction(
    (base_model): LoraModel(
      (model): T5EncoderModel(
        (shared): Embedding(32128, 2048)
        (encoder): T5Stack(
          (embed_tokens): Embedding(32128, 2048)
          (block): ModuleList(
            (0): T5Block(
              (layer): ModuleList(
                (0): T5LayerSelfAttention(
                  (SelfAttention): T5Attention(
                    (q): lora.Linear(
                      (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                      (lora_dropout): ModuleDict(
                        (default): Dropout(p=0.2, inplace=False)
                      )
                      (lora_A): ModuleDict(
                        (default): Linear(in_features=2048, out_features=32, bias=False)
                      )
                      (lora_B): ModuleDict(
                        (default): Linear(in_features=32, out_features=2048, bias=False)
            

In [None]:
def weighted_contrastive_loss(embeddings, labels, weights):
    """
    Weighted contrastive loss that penalizes pairs with low weights more.
    """
    embeddings = F.normalize(embeddings, dim=1)

    
    emb_dist = torch.cdist(embeddings, embeddings, p=2)
    labels = labels.view(-1, 1)
    label_dist = torch.cdist(labels, labels, p=1)
    label_dist = label_dist / (label_dist.max().detach() + 1e-8)

   
    weights = weights.view(-1, 1)
    pair_weights = torch.matmul(weights, weights.t())

    loss = (pair_weights * (emb_dist - label_dist) ** 2).mean()
    return loss

In [17]:
import numpy as np
from sklearn.metrics import mean_squared_error, accuracy_score
from scipy.stats import spearmanr

def compute_metrics(eval_pred):
    """Compute metrics with proper error handling"""
    predictions, labels = eval_pred

    print(f"\nüìä Computing metrics:")
    print(f"  Predictions shape: {predictions.shape}")
    print(f"  Labels shape: {labels.shape}")
    print(f"  Predictions sample (first 5): {predictions.reshape(-1)[:5]}")
    print(f"  Labels sample (first 5): {labels[:5]}")


    # Convert to numpy arrays
    preds = np.array(predictions).reshape(-1)
    labels = np.array(labels).astype(float)

    print(f"  Predictions has NaN: {np.isnan(preds).any()}")
    print(f"  Predictions has inf: {np.isinf(preds).any()}")
    print(f"  Labels has NaN: {np.isnan(labels).any()}")

    # Remove invalid values
    valid_mask = ~(np.isnan(preds) | np.isnan(labels) | np.isinf(preds) | np.isinf(labels))
    preds = preds[valid_mask]
    labels = labels[valid_mask]

    if len(preds) == 0:
        print("‚ö†Ô∏è Warning: No valid predictions!")
        return {"spearman": 0.0, "accuracy_within_std": 0.0}

    print(f"  Valid predictions range: [{preds.min():.4f}, {preds.max():.4f}]")
    print(f"  Valid labels range: [{labels.min():.4f}, {labels.max():.4f}]")
    # Spearman correlation
    try:
        rho, _ = spearmanr(labels, preds)
        if rho is None or np.isnan(rho):
            rho = 0.0
    except Exception as e:
        print(f"‚ö†Ô∏è Error computing Spearman: {e}")
        rho = 0.0

    # Accuracy within standard deviation (using default std=1.0)
    stds = np.ones_like(labels)
    correct = np.abs(preds - labels) <= stds
    acc = correct.mean()

    return {
        "spearman": float(rho),
        "accuracy_within_std": float(acc),
    }



In [None]:
from transformers import TrainingArguments
from transformers import Trainer


# Custom Trainer to combine Weighted SmoothL1 + contrastive loss
class CustomTrainer(Trainer):
    def __init__(self, *args, cont_weight=0.2, uncertainty_scale=0.5, **kwargs): 
        super().__init__(*args, **kwargs)
        self.cont_weight = cont_weight
        self.uncertainty_scale = uncertainty_scale

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None, **kwargs):
        labels = inputs.get("labels")
        stdevs = inputs.get("stdev")

        labels = labels.float().to(device)
        stdevs = stdevs.float().to(device) if stdevs is not None else torch.ones_like(labels)

        outputs = model(**inputs)
        logits = outputs.get("logits")

        # 1. Compute weights from stdevs
        weights = torch.exp(-self.uncertainty_scale * stdevs)

        # 2. Weighted SmoothL1
        if labels is not None and logits is not None:
            loss_fct = nn.SmoothL1Loss(beta=1.0, reduction='none')
            per_sample_loss = loss_fct(logits, labels)
            smooth_l1_loss = (per_sample_loss * weights).mean() 
        else:
            smooth_l1_loss = outputs.get("loss")

        # 3. Weighted Contrastive Loss 
        embeddings = outputs.get("embeddings", None)
        if embeddings is not None and labels is not None:
            cont_loss = weighted_contrastive_loss(embeddings, labels, weights)
            loss = smooth_l1_loss + self.cont_weight * cont_loss
        else:
            loss = smooth_l1_loss

        return (loss, outputs) if return_outputs else loss


In [None]:
from transformers import Adafactor
from torch.optim.lr_scheduler import ReduceLROnPlateau


optimizer = Adafactor(
    model.parameters(),
    lr=3e-4,                    
    scale_parameter=False,
    relative_step=False
)

scheduler = ReduceLROnPlateau(
    optimizer,
    mode='min',      
    factor=0.5,      
    patience=2,      
)


training_args = TrainingArguments(
    output_dir="./results_t5_regression",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    num_train_epochs=10,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy_within_std",
    greater_is_better=True,
    fp16=False,
    max_grad_norm=1.0,
    save_total_limit=3,
    report_to="none",
    save_safetensors=False,
    remove_unused_columns=False,
    dataloader_num_workers=0,
    lr_scheduler_type="cosine",
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    optimizers=(optimizer, scheduler),
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    cont_weight=0.2,
    uncertainty_scale=0.5 
)


print("\nBegin of the training...")
trainer.train()

print("\n‚úÖ Training ended!")

trainer.save_model("./final_model")
print("Model saved in ./final_model")

  super().__init__(*args, **kwargs)



Inizio training...

üîç Forward pass #1
  Encoder output shape: torch.Size([6, 320, 2048])
  Encoder output has NaN: False
  Encoder output min/max: -1.5571/1.2647
  Pooled output shape: torch.Size([6, 2048])
  Pooled has NaN: False
  Pooled min/max: -0.3711/0.3910
  Regressor weight stats: mean=1.0000, std=0.0000


Epoch,Training Loss,Validation Loss,Spearman,Accuracy Within Std
1,0.2669,0.250629,0.692047,0.732993
2,0.1875,0.202408,0.72903,0.784014
3,0.1409,0.212003,0.720424,0.768707
4,0.148,0.195618,0.761414,0.780612
5,0.0976,0.197717,0.756327,0.785714
6,0.0919,0.191518,0.759292,0.782313
7,0.0797,0.183816,0.75647,0.809524
8,0.0688,0.181774,0.765153,0.797619
9,0.0586,0.178573,0.765958,0.806122
10,0.0673,0.181156,0.765631,0.797619



üîç Forward pass #2
  Encoder output shape: torch.Size([6, 320, 2048])
  Encoder output has NaN: False
  Encoder output min/max: -1.4534/1.1071
  Pooled output shape: torch.Size([6, 2048])
  Pooled has NaN: False
  Pooled min/max: -0.3846/0.3757
  Regressor weight stats: mean=1.0000, std=0.0003

üîç Forward pass #3
  Encoder output shape: torch.Size([6, 320, 2048])
  Encoder output has NaN: False
  Encoder output min/max: -1.6417/1.4015
  Pooled output shape: torch.Size([6, 2048])
  Pooled has NaN: False
  Pooled min/max: -0.3912/0.3940
  Regressor weight stats: mean=1.0000, std=0.0004

üìä Computing metrics:
  Predictions shape: (588,)
  Labels shape: (588,)
  Predictions sample (first 5): [3.815782  4.0848007 3.8892853 4.3121996 3.97677  ]
  Labels sample (first 5): [3.6 3.6 3.8 4.2 3. ]
  Predictions has NaN: False
  Predictions has inf: False
  Labels has NaN: False
  Valid predictions range: [1.5667, 4.6750]
  Valid labels range: [1.0000, 5.0000]

üìä Computing metrics:
  Pre

In [None]:

print("\nüîç Running evaluation on dev set...")

model.eval()
eval_results = trainer.evaluate(eval_dataset=dev_dataset)

print("\n" + "="*50)
print("üìä DEV SET EVALUATION")
print("="*50)
print(f"Loss:                     {eval_results['eval_loss']:.4f}")
print(f"Spearman Correlation:     {eval_results['eval_spearman']:.4f}")
print(f"Accuracy within STD:      {eval_results['eval_accuracy_within_std']:.4f}")
print("="*50)


üîç Running evaluation on dev set...



üìä Computing metrics:
  Predictions shape: (588,)
  Labels shape: (588,)
  Predictions sample (first 5): [4.473003  3.2046552 3.992885  3.3995848 3.872621 ]
  Labels sample (first 5): [3.6 3.6 3.8 4.2 3. ]
  Predictions has NaN: False
  Predictions has inf: False
  Labels has NaN: False
  Valid predictions range: [1.2684, 4.8653]
  Valid labels range: [1.0000, 5.0000]

üìä DEV SET EVALUATION
Loss:                     0.1838
Spearman Correlation:     0.7565
Accuracy within STD:      0.8095


In [20]:
trainer.save_model("./t5eval_contrastive")
tokenizer.save_pretrained("./t5eval_contrastive")
print("Modello salvato in t5eval_contrastive")

!zip -r t5eval_stdev6.zip t5eval_contrastive
!cp "t5eval_stdev6.zip" "/content/drive/MyDrive/Colab Notebooks/LLM_Project/"
print("Evaluation results saved in zip")

Modello salvato in t5eval_contrastive
  adding: t5eval_contrastive/ (stored 0%)
  adding: t5eval_contrastive/training_args.bin (deflated 53%)
  adding: t5eval_contrastive/tokenizer.json (deflated 74%)
  adding: t5eval_contrastive/tokenizer_config.json (deflated 95%)
  adding: t5eval_contrastive/spiece.model (deflated 48%)
  adding: t5eval_contrastive/special_tokens_map.json (deflated 85%)
  adding: t5eval_contrastive/pytorch_model.bin (deflated 7%)
Evaluation results saved in zip
