## Google Colab compatible notebook that runs the inference experiments (100x100)

In [None]:
import transformers
import torch
import json
import os
import numpy as np
from tqdm import tqdm
from datasets import load_dataset, Dataset
from collections import defaultdict
from huggingface_hub import login
from google.colab import userdata
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
login(token=userdata.get("HF_TOKEN"))
from google.colab import drive
drive.mount('/content/drive')

In [None]:
blob_path="/content/drive/MyDrive/DLA_project/Checkpoints/BLOB_seed1"
lora_path="/content/drive/MyDrive/DLA_project/Checkpoints/LoRA_seed1"

In [None]:
ds_test = load_dataset("knkarthick/dialogsum", split="test")

In [None]:
"cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Generation / tokenization settings shared by the cells below.
MAX_NEW_TOKENS = 128      # upper bound on generated summary length (in tokens)
BATCH_SIZE = 8            # dialogues per forward pass in deterministic decoding
#NUM_BEAMS = 4
NUM_BEAMS = 1             # beam width used by deterministic decoding
LENGTH_PENALTY = 1
# Assigned to tokenizer.model_max_length. BART checkpoints provide 1024
# position embeddings, so the previous value (1028) exceeded what the model
# can encode. NOTE(review): confirm 1024 against model.config.max_position_embeddings.
MAX_SOURCE_LENGTH = 1024

In [None]:
# Tokenizer and base checkpoint shared by both fine-tuned adapters.
backbone_model_name = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(backbone_model_name)
# Replace the tokenizer's advertised maximum input length with the notebook-wide setting.
tokenizer.model_max_length = MAX_SOURCE_LENGTH
# NOTE(review): `backbone` is not referenced by any other visible cell — confirm it is still needed.
backbone= AutoModelForSeq2SeqLM.from_pretrained(backbone_model_name)
# Separate copies of the base model: `backbone1` is wrapped by the LoRA adapter
# and `backbone2` by the BLoB adapter in the next cell.
backbone1 = AutoModelForSeq2SeqLM.from_pretrained(backbone_model_name)
backbone2 = AutoModelForSeq2SeqLM.from_pretrained(backbone_model_name)

In [None]:
# Attach the LoRA adapter stored at `lora_path` to its own copy of the base model.
# `is_trainable=False` is passed because the notebook only runs inference.
lora_model = PeftModel.from_pretrained(
    backbone1,
    lora_path,
    is_trainable=False
)

# Same for the BLoB adapter stored at `blob_path`, on a second copy of the base model.
blob_model = PeftModel.from_pretrained(
    backbone2,
    blob_path,
    is_trainable=False
)


# Deterministic Decoding


## BLoB-BART and LoRA-BART on test set with beam=4

In [None]:
def batched_deterministic_generate(model, tokenizer, batch):
    """
    Summarize a batch of dialogues without sampling.

    model: seq2seq model exposing `config.max_position_embeddings`, `device` and `generate`
    tokenizer: tokenizer used both to encode the dialogues and to decode the outputs
    batch: list of strings containing the dialogues to be summarized

    Returns the decoded summaries (special tokens stripped), one per generated sequence.
    Beam width, length penalty and output length come from the notebook-level
    NUM_BEAMS, LENGTH_PENALTY and MAX_NEW_TOKENS settings.
    """
    # Inputs are truncated to the number of positions the model supports.
    source_limit = model.config.max_position_embeddings
    target_device = model.device

    generation_kwargs = dict(
        do_sample=False,
        early_stopping=True,
        use_cache=True,
        num_beams=NUM_BEAMS,
        length_penalty=LENGTH_PENALTY,
        max_new_tokens=MAX_NEW_TOKENS,
        return_dict_in_generate=False,
        output_scores=False,
    )

    with torch.no_grad():
        encoded = tokenizer(
            batch,
            truncation=True,
            max_length=source_limit,
            padding=True,
            return_tensors="pt",
        ).to(target_device)
        generated = model.generate(**encoded, **generation_kwargs)

        # Decode the whole batch in one call.
        return tokenizer.batch_decode(generated, skip_special_tokens=True)



def run_batched_summarization(ds, model, tokenizer, output_json_path):
    """
    Summarize every row of `ds` in batches of BATCH_SIZE and dump the results to JSON.

    ds: dataset supporting slicing, with 'dialogue' and 'id' columns
    model / tokenizer: forwarded to `batched_deterministic_generate`
    output_json_path: file that receives the {id: summary} mapping

    Raises RuntimeError when a batch yields a different number of summaries than ids.
    """
    model.to("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()

    id_to_summary = {}
    n_rows = len(ds)

    for lo in tqdm(range(0, n_rows, BATCH_SIZE)):
        hi = min(lo + BATCH_SIZE, n_rows)
        chunk = ds[lo:hi]
        dialogues, ids = chunk['dialogue'], chunk['id']

        generated = batched_deterministic_generate(model, tokenizer, dialogues)

        # Guard against silently misaligned ids and summaries.
        if len(ids) != len(generated):
            raise RuntimeError(
                f"Batch size mismatch: "
                f"{len(ids)} ids vs {len(generated)} summaries"
            )

        id_to_summary.update(zip(ids, generated))

    with open(output_json_path, "w") as f:
        json.dump(id_to_summary, f, indent=2)

    print(f"Saved results to {output_json_path}")

In [None]:
lora_output = "LoRA_summaries_test.json"
run_batched_summarization(ds_test, lora_model, tokenizer, lora_output)

In [None]:
blob_output = "BLoB_summaries_test.json"
run_batched_summarization(ds_test, blob_model, tokenizer, blob_output)

In [None]:
# write results to google-drive
# Copies the two summary JSON files written by the cells above (named by
# `lora_output` and `blob_output`) into the Drive project folder.

drive_dir = "/content/drive/MyDrive/DLA_project"

import shutil
shutil.copy(lora_output, drive_dir)
shutil.copy(blob_output, drive_dir)

## Evaluation

In [None]:
blob_summaries_path="/content/drive/MyDrive/DLA_project/BLoB_summaries_test.json"
lora_summaries_path="/content/drive/MyDrive/DLA_project/LoRA_summaries_test.json"

In [None]:
# Convert the dataset to a BERTScore- and ROUGE-friendly format:
# one record per dialogue with all of its reference summaries plus the
# BLoB and LoRA candidate summaries.
with open(blob_summaries_path) as f_blob:
    blob_map = json.load(f_blob)

with open(lora_summaries_path) as f_lora:
    lora_map = json.load(f_lora)

groups = {}

for row in ds_test:
    # Row ids look like "test_X_k"; dropping the last component gives "test_X".
    base_id = row['id'].rsplit('_', 1)[0] # test_X

    if base_id not in groups:
        # The candidate stored under the first row id ("test_X_1") represents the dialogue.
        candidate_id = f"{base_id}_1"

        # Fail here with a clear message instead of passing None to the metrics later.
        missing = [
            label
            for label, summary_map in (("BLoB", blob_map), ("LoRA", lora_map))
            if candidate_id not in summary_map
        ]
        if missing:
            raise KeyError(f"No {'/'.join(missing)} summary stored for {candidate_id}")

        groups[base_id] = {
            "id":base_id,
            "references":[],
            "blob_candidate":blob_map[candidate_id],
            "lora_candidate":lora_map[candidate_id]
        }

    groups[base_id]["references"].append(row["summary"])

ds_eval = Dataset.from_list(list(groups.values()))

In [None]:
candidates_blob = ds_eval["blob_candidate"]
candidates_lora = ds_eval["lora_candidate"] 
references = ds_eval["references"]   

In [None]:
ds_eval

### Evaluation BERTScore

Max over references --> Mean over samples.
We have 500 dialogues, and for each dialogue we have 3 reference summaries.
For each candidate summary we take the max BERTScore over the three references, and then average over the 500 dialogues.

In [None]:
!pip install bert-score

In [None]:
from bert_score import score

In [None]:

bertscore_model = "microsoft/deberta-xlarge-mnli" # we use this model for the BERTScores

def compute_stats(candidates, references):
    """
    Compute baseline-rescaled BERTScore F1 for each candidate and summarize it.

    candidates: candidate summaries, one per dialogue
    references: reference summaries aligned with `candidates`
                (here: a list of references per candidate)

    Returns a dict with the mean F1 ("mean_f1"), the population standard
    deviation ("std_f1", ddof=0) and the per-candidate F1 array ("f1_values").
    """
    # Fall back to CPU when no GPU is available instead of failing on "cuda".
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Precision and recall are returned as well but only F1 is reported.
    _, _, F1 = score(
        candidates,
        references,
        lang="en",
        model_type=bertscore_model,
        batch_size=8,
        device=device,
        rescale_with_baseline=True
    )
    F1_np = F1.detach().cpu().numpy()
    return {
        "mean_f1": float(F1_np.mean()),
        "std_f1": float(F1_np.std(ddof=0)),
        "f1_values": F1_np
    }


In [None]:
blob_stats = compute_stats(candidates_blob, references)
print("BLOB:")
print(blob_stats['mean_f1'], blob_stats['std_f1'])

In [None]:
lora_stats = compute_stats(candidates_lora, references)
print("LORA:")
print(lora_stats['mean_f1'], lora_stats['std_f1'])

In [None]:
from scipy.stats import ttest_rel
ttest_rel(lora_stats["f1_values"], blob_stats["f1_values"])

### Evaluation ROUGE(1,2,L)

In [None]:
pip install evaluate rouge-score

In [None]:
import evaluate
import numpy as np

rouge = evaluate.load("rouge")

def compute_rouge_with_std(candidates, references):
    """
    Compute per-example ROUGE-1/2/L and summarize each with mean and std.

    Returns a dict with "<type>_mean" and "<type>_std" (ddof=0) for every
    ROUGE type, plus the raw per-example scores under "per_example".
    """
    rouge_types = ["rouge1", "rouge2", "rougeL"]

    # use_aggregator=False keeps one score per example.
    per_example = rouge.compute(
        predictions=candidates,
        references=references,
        rouge_types=rouge_types,
        use_aggregator=False
    )

    summary = {}
    for rouge_type in rouge_types:
        values = per_example[rouge_type]
        summary[f"{rouge_type}_mean"] = np.mean(values)
        summary[f"{rouge_type}_std"] = np.std(values, ddof=0)

    summary["per_example"] = per_example   # for t-tests
    return summary

In [None]:
rouge_blob = compute_rouge_with_std(candidates_blob, references)
rouge_lora = compute_rouge_with_std(candidates_lora, references)

print("BLOB:", rouge_blob)
print("LORA:", rouge_lora)

In [None]:
from scipy.stats import ttest_rel

ttest_rel(
    rouge_lora["per_example"]["rouge1"],
    rouge_blob["per_example"]["rouge1"]
)

In [None]:
ttest_rel(
    rouge_lora["per_example"]["rouge2"],
    rouge_blob["per_example"]["rouge2"]
)

In [None]:
ttest_rel(
    rouge_lora["per_example"]["rougeL"],
    rouge_blob["per_example"]["rougeL"]
)

# Generate N summaries per input text with beam=1 LoRA

In [None]:
num_examples=100

In [None]:
ds_test

In [None]:
# Map each distinct dialogue text to the row index where it first appears.
# Dicts preserve insertion order, so the values are the first-occurrence
# indices in dataset order.
first_occurrence = {}
for row_index, dialogue_text in enumerate(ds_test['dialogue']):
    first_occurrence.setdefault(dialogue_text, row_index)

# Keep only the first `num_examples` unique dialogues.
unique_indices = list(first_occurrence.values())[:num_examples]

In [None]:
ds_sub = ds_test.select(unique_indices) # ds_sub: dataset containing the first `num_examples` unique dialogues.

In [None]:
# For each dialogue in ds_sub, sample `num_summaries` summaries and save them as a JSON file:
# {"dialogue_id": <record id>, "samples": list[str]}

def generate_100_summaries(model, tokenizer, dialogue, num_summaries=100):
    """
    Sample several summaries for a single dialogue.

    model: seq2seq model exposing `device` and `generate`
    tokenizer: tokenizer used to encode the dialogue and decode the samples
    dialogue: the dialogue text to summarize (a single string)
    num_summaries: number of sampled summaries to return

    Returns a list of `num_summaries` decoded summaries (special tokens stripped).
    Raises RuntimeError if a different number of sequences comes back.

    Sampling is not seeded inside this function, so results vary between
    runs unless the caller fixes the random state.
    """
    device = model.device

    with torch.no_grad():
        inputs = tokenizer(
            "Summarize: " + dialogue,
            truncation=True,
            max_length=512, #match BLOB
            padding=True,
            return_tensors="pt"
        ).to(device)

        # Multinomial sampling (no beam search); all samples are drawn in one call.
        outputs = model.generate(
            **inputs,
            do_sample=True,
            use_cache=True,
            num_beams=1,
            length_penalty=LENGTH_PENALTY,
            num_return_sequences=num_summaries,
            max_new_tokens=MAX_NEW_TOKENS,
            return_dict_in_generate=False,
            output_scores=False
        )

        # decode all samples at once
        summaries = tokenizer.batch_decode(
            outputs,
            skip_special_tokens=True
        )
        if len(summaries) != num_summaries:
            raise RuntimeError(f"Expected {num_summaries} samples, got {len(summaries)}")

        return summaries




def run_generate_and_save(ds_sub, model, tokenizer, drive_dir, tag="LoRA"):
    """
    Sample summaries for every record of `ds_sub` and write one JSON file per record.

    ds_sub: iterable of records with "id" and "dialogue" fields
    model / tokenizer: forwarded to `generate_100_summaries`
    drive_dir: output directory; created if it does not exist
    tag: label embedded in each file name as "<id>(<tag>).json"
         (defaults to "LoRA", the name this function always used)

    Each file holds {"dialogue_id": <id>, "samples": [<summary>, ...]}.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()

    # Writing into a missing directory would fail on the first record.
    os.makedirs(drive_dir, exist_ok=True)

    for record in tqdm(ds_sub):
        record_id = record["id"]
        dialogue = record["dialogue"]

        samples = generate_100_summaries(
            model,
            tokenizer,
            dialogue
        )

        out_path = os.path.join(drive_dir, f"{record_id}({tag}).json")
        with open(out_path, "w") as f:
            json.dump(
                {
                    "dialogue_id": record_id,
                    "samples": samples
                },
                f,
                indent=2
            )

    print(f"Saved {len(ds_sub)} JSON files to {drive_dir}")

In [None]:
drive_dir = "/content/drive/MyDrive/DLA_project/LoRA_100_x_100_summaries_2.0"
run_generate_and_save(ds_sub, lora_model, tokenizer, drive_dir)

# Bayesian Sampling Evaluation

In [None]:
blob_100_path = "/content/drive/MyDrive/DLA_project/BLoB_100_x_100_summaries"
lora_100_path = "/content/drive/MyDrive/DLA_project/LoRA_100_x_100_summaries_2.0"

ds_test= load_dataset("knkarthick/dialogsum", split="test")


# Build a lookup: id -> summary
id_to_summary = {ex["id"]: ex["summary"] for ex in ds_test}

records = []

NUM_DIALOGUES = 100  # test_0 ... test_99
NUM_SAMPLES = 100    # sampled summaries expected per dialogue and model
NUM_REFERENCES = 3   # reference summaries per dialogue (ids test_X_1 ... test_X_3)

# Loop-local names (`lora_file`, `blob_file`, `dialogue_refs`) are deliberately
# distinct from the notebook-level `lora_path`, `blob_path` and `references`
# so that re-running earlier cells keeps working after this cell has run.
for x in range(NUM_DIALOGUES):
    base_id = f"test_{x}"
    file_id = f"{base_id}_1"

    # load LoRA summaries
    lora_file = os.path.join(lora_100_path, f"{file_id}(LoRA).json")
    with open(lora_file, "r") as f:
        lora_data = json.load(f)

    # load BLoB summaries
    blob_file = os.path.join(blob_100_path, f"{file_id}(BLoB).json")
    with open(blob_file, "r") as f:
        blob_data = json.load(f)

    lora_summaries = lora_data["samples"]
    blob_summaries = blob_data["samples"]

    # sanity checks
    if len(lora_summaries) != NUM_SAMPLES:
        raise RuntimeError(f"{file_id}: LoRA has {len(lora_summaries)} samples")
    if len(blob_summaries) != NUM_SAMPLES:
        raise RuntimeError(f"{file_id}: BLoB has {len(blob_summaries)} samples")

    # collect the reference summaries
    dialogue_refs = []
    for i in range(1, NUM_REFERENCES + 1):
        ref_id = f"{base_id}_{i}"
        if ref_id not in id_to_summary:
            raise KeyError(f"Missing reference: {ref_id}")
        dialogue_refs.append(id_to_summary[ref_id])

    records.append({
        "id": base_id,
        "blob_summaries": blob_summaries,
        "lora_summaries": lora_summaries,
        "references": dialogue_refs,
    })

# create Hugging Face dataset
ds_eval1 = Dataset.from_list(records)

In [None]:
ds_eval1

In [None]:
# Every record must carry a string id, 100 samples per model and 3 references.
expected_lengths = {"lora_summaries": 100, "blob_summaries": 100, "references": 3}

for row in ds_eval1:
    assert isinstance(row["id"], str)
    for column, expected in expected_lengths.items():
        assert len(row[column]) == expected

### BERTScore

In [None]:
!pip install bert-score

In [None]:
from bert_score import score


def compute_bertscores(
    ds,
    model_type="microsoft/deberta-xlarge-mnli",
    device="cuda" if torch.cuda.is_available() else "cpu",
    fine_tuning_type=None,
    aggregation=None
):
    """
    BERTScore F1 for every sampled summary, aggregated over its references.

    ds: iterable of rows with "<fine_tuning_type>_summaries" and "references"
    model_type: BERTScore backbone model
    device: device passed to BERTScore
    fine_tuning_type: "blob" or "lora"; selects which summaries column is scored
    aggregation: "mean", "max" or "min"; how the per-reference F1 values of
                 one summary are reduced to a single score

    Returns a 1-D numpy array with one score per summary, concatenated over rows.
    Raises ValueError for an unsupported `fine_tuning_type` or `aggregation`.
    """
    # Validate up front: with the None defaults the loop would otherwise fail
    # only after an expensive scoring pass (or with an opaque KeyError).
    if fine_tuning_type not in {"blob", "lora"}:
        raise ValueError("fine_tuning_type must be 'blob' or 'lora'")
    reducers = {"mean": np.mean, "max": np.max, "min": np.min}
    if aggregation not in reducers:
        raise ValueError("aggregation must be 'mean', 'max', 'min'")
    reduce_over_references = reducers[aggregation]

    all_scores = []

    for row in tqdm(ds):
        summaries = row[f"{fine_tuning_type}_summaries"]
        references = row["references"]

        # Pair every summary with every reference, summary-major:
        # summaries  [s1, s2, ...]            references [r1, r2, r3]
        # expanded   [s1, s1, s1, s2, ...]    expanded   [r1, r2, r3, r1, ...]
        expanded_summaries = [s for s in summaries for _ in references]
        expanded_references = [r for _ in summaries for r in references]

        # Compute BERTScore for all pairs in one call
        _, _, f1 = score(
            expanded_summaries,
            expanded_references,
            model_type=model_type,
            device=device,
            verbose=False,
        )

        # One row per summary, one column per reference.
        f1 = f1.cpu().numpy().reshape(len(summaries), len(references))

        # Reduce over references -> one score per summary
        all_scores.extend(reduce_over_references(f1, axis=1).tolist())

    return np.array(all_scores)

In [None]:
# Make sure the BERTScore output folders exist before the scoring cells below
# write into them. The BLoB mean scores are computed and saved in the BLoB
# scoring cell further down; saving them here would raise NameError on a
# fresh kernel because `dist_bert_blob_mean` does not exist yet.
for bertscore_dir in (
    "/content/drive/MyDrive/DLA_project/BERTSCORES",
    "/content/drive/MyDrive/DLA_project/BERTSCORES_2",
):
    os.makedirs(bertscore_dir, exist_ok=True)

In [None]:
# BERTScore of the LoRA samples under each reference aggregation
# (mean / max / min); each result is written to the BERTSCORES_2 folder.
lora_bertscores = {}
for agg in ("mean", "max", "min"):
    lora_bertscores[agg] = compute_bertscores(
        ds_eval1,
        fine_tuning_type="lora",
        aggregation=agg,
    )
    out_path = f"/content/drive/MyDrive/DLA_project/BERTSCORES_2/lora_bertscores_{agg}.npy"
    np.save(out_path, lora_bertscores[agg])

# Keep the arrays available under their individual names.
dist_bert_lora_mean = lora_bertscores["mean"]
dist_bert_lora_max = lora_bertscores["max"]
dist_bert_lora_min = lora_bertscores["min"]

In [None]:
# BERTScore of the BLoB samples, then the LoRA samples, under each reference
# aggregation (mean / max / min); each result is written to the BERTSCORES folder.
bertscores = {}
for kind in ("blob", "lora"):
    for agg in ("mean", "max", "min"):
        bertscores[kind, agg] = compute_bertscores(
            ds_eval1,
            fine_tuning_type=kind,
            aggregation=agg,
        )
        out_path = f"/content/drive/MyDrive/DLA_project/BERTSCORES/{kind}_bertscores_{agg}.npy"
        np.save(out_path, bertscores[kind, agg])

# Keep the arrays available under their individual names.
dist_bert_blob_mean = bertscores["blob", "mean"]
dist_bert_blob_max = bertscores["blob", "max"]
dist_bert_blob_min = bertscores["blob", "min"]
dist_bert_lora_mean = bertscores["lora", "mean"]
dist_bert_lora_max = bertscores["lora", "max"]
dist_bert_lora_min = bertscores["lora", "min"]

### ROUGE

In [None]:
!pip install evaluate rouge-score

In [None]:
from rouge_score import rouge_scorer


# Single shared scorer for ROUGE-1, ROUGE-2 and ROUGE-L (stemming enabled);
# it is used by `compute_rougescores` below.
scorer = rouge_scorer.RougeScorer(
    ["rouge1", "rouge2", "rougeL"],
    use_stemmer=True,
)

def compute_rougescores(
    ds,
    fine_tuning_type,
    aggregation="mean",
    rouge_type="rouge1",
):
    """
    ROUGE F-measure for every sampled summary, aggregated over its references.

    ds: iterable of rows with "<fine_tuning_type>_summaries" and "references"
    fine_tuning_type: "blob" or "lora"
    aggregation: "mean", "max" or "min" over the per-reference scores
    rouge_type: key of the ROUGE variant to read from the scorer output

    Returns a 1-D numpy array with one score per summary, concatenated over rows.
    Raises ValueError for an unsupported `fine_tuning_type` or `aggregation`.
    """
    if fine_tuning_type not in {"blob", "lora"}:
        raise ValueError("fine_tuning_type must be 'blob' or 'lora'")
    if aggregation not in {"mean", "max", "min"}:
        raise ValueError("aggregation must be 'mean', 'max', 'min'")

    summaries_column = f"{fine_tuning_type}_summaries"
    collected = []

    for row in tqdm(ds):
        row_references = row["references"]

        for candidate in row[summaries_column]:
            # F-measure of this candidate against each reference.
            per_reference = np.asarray([
                scorer.score(reference, candidate)[rouge_type].fmeasure
                for reference in row_references
            ])

            if aggregation == "mean":
                aggregated = per_reference.mean()
            elif aggregation == "max":
                aggregated = per_reference.max()
            else:
                aggregated = per_reference.min()
            collected.append(aggregated)

    return np.asarray(collected)

#### ROUGE1

In [None]:
# ROUGE-1 of the BLoB samples under each reference aggregation:
# "mean" over references, "max" (oracle) and "min" (worst case).
rouge1_blob = {}
for agg in ("mean", "max", "min"):
    rouge1_blob[agg] = compute_rougescores(
        ds_eval1,
        fine_tuning_type="blob",
        aggregation=agg,
        rouge_type="rouge1"
    )
    print(len(rouge1_blob[agg]))
    output_path = f"/content/drive/MyDrive/DLA_project/ROUGEScores/rouge1_blob_{agg}.npy"
    np.save(output_path, rouge1_blob[agg])

# Keep the arrays available under their individual names.
rouge1_blob_mean = rouge1_blob["mean"]
rouge1_blob_max = rouge1_blob["max"]
rouge1_blob_min = rouge1_blob["min"]

In [None]:

#######      LORA      ############
# ROUGE-1 of the LoRA samples under each reference aggregation:
# "mean" over references, "max" (oracle) and "min" (worst case).
rouge1_lora = {}
for agg in ("mean", "max", "min"):
    rouge1_lora[agg] = compute_rougescores(
        ds_eval1,
        fine_tuning_type="lora",
        aggregation=agg,
        rouge_type="rouge1"
    )
    print(len(rouge1_lora[agg]))
    output_path = f"/content/drive/MyDrive/DLA_project/ROUGEScoresLora/rouge1_lora_{agg}.npy"
    np.save(output_path, rouge1_lora[agg])

# Keep the arrays available under their individual names.
rouge1_lora_mean = rouge1_lora["mean"]
rouge1_lora_max = rouge1_lora["max"]
rouge1_lora_min = rouge1_lora["min"]

#### ROUGE2

In [None]:
# ROUGE-2 of the BLoB samples under each reference aggregation:
# "mean" over references, "max" (oracle) and "min" (worst case).
rouge2_blob = {}
for agg in ("mean", "max", "min"):
    rouge2_blob[agg] = compute_rougescores(
        ds_eval1,
        fine_tuning_type="blob",
        aggregation=agg,
        rouge_type="rouge2"
    )
    print(len(rouge2_blob[agg]))
    output_path = f"/content/drive/MyDrive/DLA_project/ROUGEScores/rouge2_blob_{agg}.npy"
    np.save(output_path, rouge2_blob[agg])

# Keep the arrays available under their individual names.
rouge2_blob_mean = rouge2_blob["mean"]
rouge2_blob_max = rouge2_blob["max"]
rouge2_blob_min = rouge2_blob["min"]

In [None]:


#######      LORA      ############
# ROUGE-2 of the LoRA samples under each reference aggregation:
# "mean" over references, "max" (oracle) and "min" (worst case).
rouge2_lora = {}
for agg in ("mean", "max", "min"):
    rouge2_lora[agg] = compute_rougescores(
        ds_eval1,
        fine_tuning_type="lora",
        aggregation=agg,
        rouge_type="rouge2"
    )
    print(len(rouge2_lora[agg]))
    output_path = f"/content/drive/MyDrive/DLA_project/ROUGEScoresLora/rouge2_lora_{agg}.npy"
    np.save(output_path, rouge2_lora[agg])

# Keep the arrays available under their individual names.
rouge2_lora_mean = rouge2_lora["mean"]
rouge2_lora_max = rouge2_lora["max"]
rouge2_lora_min = rouge2_lora["min"]

#### ROUGEL

In [None]:
# ROUGE-L of the BLoB samples under each reference aggregation:
# "mean" over references, "max" (oracle) and "min" (worst case).
rougeL_blob = {}
for agg in ("mean", "max", "min"):
    rougeL_blob[agg] = compute_rougescores(
        ds_eval1,
        fine_tuning_type="blob",
        aggregation=agg,
        rouge_type="rougeL"
    )
    print(len(rougeL_blob[agg]))
    output_path = f"/content/drive/MyDrive/DLA_project/ROUGEScores/rougeL_blob_{agg}.npy"
    np.save(output_path, rougeL_blob[agg])

# Keep the arrays available under their individual names.
rougeL_blob_mean = rougeL_blob["mean"]
rougeL_blob_max = rougeL_blob["max"]
rougeL_blob_min = rougeL_blob["min"]


In [None]:
#######      LORA      ############
# ROUGE-L of the LoRA samples under each reference aggregation:
# "mean" over references, "max" (oracle) and "min" (worst case).
rougeL_lora = {}
for agg in ("mean", "max", "min"):
    rougeL_lora[agg] = compute_rougescores(
        ds_eval1,
        fine_tuning_type="lora",
        aggregation=agg,
        rouge_type="rougeL"
    )
    print(len(rougeL_lora[agg]))
    output_path = f"/content/drive/MyDrive/DLA_project/ROUGEScoresLora/rougeL_lora_{agg}.npy"
    np.save(output_path, rougeL_lora[agg])

# Keep the arrays available under their individual names.
rougeL_lora_mean = rougeL_lora["mean"]
rougeL_lora_max = rougeL_lora["max"]
rougeL_lora_min = rougeL_lora["min"]