## Google Colab compatible notebook that runs the inference experiments (100x100)

In [None]:
import transformers
import torch
import json
import os
import numpy as np
from tqdm import tqdm
from datasets import load_dataset, Dataset
from collections import defaultdict
from huggingface_hub import login
from google.colab import userdata
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
login(token=userdata.get("HF_TOKEN"))
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [None]:
blob_path="/content/drive/MyDrive/DLA_project/Checkpoints/BLOB_seed1"
lora_path="/content/drive/MyDrive/DLA_project/Checkpoints/LoRA_seed1"

In [None]:
ds_test = load_dataset("knkarthick/dialogsum", split="test")

README.md: 0.00B [00:00, ?B/s]

train.csv:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

validation.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/12460 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [None]:
"cuda" if torch.cuda.is_available() else "cpu"

'cpu'

In [None]:
# Generation / tokenization settings shared by the inference cells below.
MAX_NEW_TOKENS = 128   # upper bound on the number of generated tokens per summary
BATCH_SIZE = 8         # dialogues per batch in run_batched_summarization
#NUM_BEAMS = 4
NUM_BEAMS = 1          # passed as `num_beams` to model.generate
LENGTH_PENALTY = 1     # passed as `length_penalty` to model.generate
# BART's positional embeddings cover 1024 positions (`max_position_embeddings`);
# the previous value of 1028 exceeded that limit.
MAX_SOURCE_LENGTH = 1024

In [None]:
# Load the shared tokenizer and the base seq2seq checkpoints the adapters are attached to.
backbone_model_name = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(backbone_model_name)
tokenizer.model_max_length = MAX_SOURCE_LENGTH
# NOTE(review): `backbone` is not referenced by any later cell visible in this
# notebook; it looks like an unused third copy of the model -- confirm before removing.
backbone= AutoModelForSeq2SeqLM.from_pretrained(backbone_model_name)
# One base-model instance per adapter (backbone1 -> LoRA, backbone2 -> BLoB in the next cell);
# presumably kept separate so the two adapters never share a wrapped base model.
backbone1 = AutoModelForSeq2SeqLM.from_pretrained(backbone_model_name)
backbone2 = AutoModelForSeq2SeqLM.from_pretrained(backbone_model_name)

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [None]:
# Attach the adapter stored at `lora_path` to its own base model (inference only).
lora_model = PeftModel.from_pretrained(
    backbone1,
    lora_path,
    is_trainable=False
)

# Attach the adapter stored at `blob_path` to the second base model (inference only).
blob_model = PeftModel.from_pretrained(
    backbone2,
    blob_path,
    is_trainable=False
)




# Deterministic Decoding


## BLoB-BART and LoRA-BART on test set (beam size is set by `NUM_BEAMS`: 4 for beam search, currently 1)

In [None]:
def batched_deterministic_generate(model, tokenizer, batch):
    """
    Summarize a batch of dialogues with deterministic (non-sampling) decoding.

    Decoding settings come from the module-level constants NUM_BEAMS,
    LENGTH_PENALTY and MAX_NEW_TOKENS.

    Args:
        model: model exposing `.config.max_position_embeddings`, `.device`
            and `.generate(...)`.
        tokenizer: tokenizer used both to encode `batch` and to decode the
            generated ids.
        batch: list of strings containing the dialogues to be summarized.

    Returns:
        List of decoded summaries (special tokens stripped), in the same
        order as `batch`. An empty `batch` yields an empty list.
    """
    # Nothing to summarize; also avoids handing an empty list to the tokenizer.
    if not batch:
        return []

    # Inputs are truncated to the number of positions the model supports.
    max_source_len = model.config.max_position_embeddings
    device = model.device

    with torch.no_grad():
        inputs = tokenizer(
            batch,
            truncation=True,
            max_length=max_source_len,
            padding=True,
            return_tensors="pt"
        ).to(device)

        outputs = model.generate(
            **inputs,
            do_sample=False,
            # `early_stopping` is a beam-search option; with a single beam
            # transformers reports it as an invalid generation flag, so it is
            # only enabled when beam search is actually in use.
            early_stopping=NUM_BEAMS > 1,
            use_cache=True,
            num_beams=NUM_BEAMS,
            length_penalty=LENGTH_PENALTY,
            max_new_tokens=MAX_NEW_TOKENS,
            return_dict_in_generate=False,
            output_scores=False
        )

        # decode whole batch at once
        return tokenizer.batch_decode(outputs, skip_special_tokens=True)



def run_batched_summarization(ds, model, tokenizer, output_json_path):
    """
    Summarize every dialogue in `ds` and write an {id: summary} JSON file.

    The model is moved to CUDA when available (CPU otherwise) and put in
    eval mode. `ds` is sliced in steps of BATCH_SIZE; each slice must expose
    'dialogue' and 'id' columns.

    Raises:
        RuntimeError: if a batch yields a different number of summaries
            than ids.
    """
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(target_device)
    model.eval()

    id_to_summary = {}
    total = len(ds)

    for lo in tqdm(range(0, total, BATCH_SIZE)):
        chunk = ds[lo:min(lo + BATCH_SIZE, total)]

        chunk_dialogues = chunk['dialogue']
        chunk_ids = chunk['id']

        generated = batched_deterministic_generate(
            model, tokenizer, chunk_dialogues
        )

        # Guard against ids and summaries drifting out of alignment.
        if len(chunk_ids) != len(generated):
            raise RuntimeError(
                f"Batch size mismatch: "
                f"{len(chunk_ids)} ids vs {len(generated)} summaries"
            )

        id_to_summary.update(zip(chunk_ids, generated))

    # Persist all summaries in one JSON document.
    with open(output_json_path, "w") as f:
        json.dump(id_to_summary, f, indent=2)

    print(f"Saved results to {output_json_path}")

In [None]:
lora_output = "LoRA_summaries_test.json"
run_batched_summarization(ds_test, lora_model, tokenizer, lora_output)

100%|██████████| 188/188 [02:26<00:00,  1.29it/s]

Saved results to LoRA_summaries_test.json





In [None]:
blob_output = "BLoB_summaries_test.json"
run_batched_summarization(ds_test, blob_model, tokenizer, blob_output)

100%|██████████| 188/188 [02:32<00:00,  1.24it/s]

Saved results to BLoB_summaries_test.json





In [None]:
# write results to google-drive

drive_dir = "/content/drive/MyDrive/DLA_project"

import shutil
# Copy both local JSON result files into the Drive project folder;
# the Evaluation section reads them back from this location.
shutil.copy(lora_output, drive_dir)
shutil.copy(blob_output, drive_dir)

'/content/drive/MyDrive/DLA_project/BLoB_summaries_test.json'

## Evaluation

In [None]:
blob_summaries_path="/content/drive/MyDrive/DLA_project/BLoB_summaries_test.json"
lora_summaries_path="/content/drive/MyDrive/DLA_project/LoRA_summaries_test.json"

In [None]:
# convert dataset to a BERTScore and ROUGE friendly format:
# one row per dialogue with all its reference summaries and one candidate per model.
with open(blob_summaries_path) as f_blob:
    blob_map = json.load(f_blob)

with open(lora_summaries_path) as f_lora:
    lora_map = json.load(f_lora)

groups = {}

for row in ds_test:
    full_id = row['id']
    base_id = full_id.rsplit('_', 1)[0] # test_X

    if base_id not in groups:
        # Every reference row of a dialogue shares the same dialogue, so the
        # candidate generated for the "_1" row is used for the whole group.
        candidate_id = f"{base_id}_1"

        # Fail loudly here instead of storing None and crashing later inside
        # the metric computation with a less helpful error.
        missing = [
            name
            for name, summary_map in (("BLoB", blob_map), ("LoRA", lora_map))
            if candidate_id not in summary_map
        ]
        if missing:
            raise KeyError(
                f"Missing candidate summary {candidate_id} for: {', '.join(missing)}"
            )

        groups[base_id] = {
            "id": base_id,
            "references": [],
            "blob_candidate": blob_map[candidate_id],
            "lora_candidate": lora_map[candidate_id],
        }

    groups[base_id]["references"].append(row["summary"])

ds_eval = Dataset.from_list(list(groups.values()))

In [None]:
candidates_blob = ds_eval["blob_candidate"]
candidates_lora = ds_eval["lora_candidate"] 
references = ds_eval["references"]   

In [None]:
ds_eval

Dataset({
    features: ['id', 'references', 'blob_candidate', 'lora_candidate'],
    num_rows: 500
})

### Evaluation BERTScore

Max over references --> Mean over samples.
We have 500 dialogues, and for each dialogue we have 3 reference summaries.
For each candidate summary we choose the max BERTScore over the three references, and then average over the 500 dialogues.

In [None]:
!pip install bert-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [None]:
from bert_score import score

In [None]:

bertscore_model = "microsoft/deberta-xlarge-mnli" # we use this model for the BERTScores

def compute_stats(candidates, references):
    """
    Compute BERTScore F1 for each candidate and summarize it.

    Args:
        candidates: list of candidate summaries (one per dialogue).
        references: references for each candidate; this notebook passes a list
            of reference summaries per candidate.

    Returns:
        dict with
            "mean_f1":   mean F1 over all candidates (float),
            "std_f1":    population standard deviation (ddof=0) of F1 (float),
            "f1_values": per-candidate F1 as a NumPy array.

    NOTE(review): this uses `lang="en"` and `rescale_with_baseline=True`, whereas
    `compute_bertscores` further down passes neither, so the two sets of
    scores are presumably not on the same scale -- confirm this is intended.
    """
    _, _, F1 = score(
        candidates,
        references,
        lang="en",
        model_type=bertscore_model,
        batch_size=8,
        # Fall back to CPU so the cell also runs on a runtime without a GPU.
        device="cuda" if torch.cuda.is_available() else "cpu",
        rescale_with_baseline=True
    )
    F1_np = F1.detach().cpu().numpy()
    return {
        "mean_f1": float(F1_np.mean()),
        "std_f1": float(F1_np.std(ddof=0)),
        "f1_values": F1_np
    }


In [None]:
blob_stats = compute_stats(candidates_blob, references)
print("BLOB:")
print(blob_stats['mean_f1'], blob_stats['std_f1'])

BLOB:
0.4586315453052521 0.16308806836605072


In [None]:
lora_stats = compute_stats(candidates_lora, references)
print("LORA:")
print(lora_stats['mean_f1'], lora_stats['std_f1'])

LORA:
0.45908281207084656 0.15822634100914001


In [None]:
from scipy.stats import ttest_rel
ttest_rel(lora_stats["f1_values"], blob_stats["f1_values"])

TtestResult(statistic=np.float64(0.08596280306091514), pvalue=np.float64(0.931530467068286), df=np.int64(499))

### Evaluation ROUGE(1,2,L)

In [None]:
pip install evaluate rouge-score

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=a91260fbacc3aee03a7e6609c1c627a345be9855d15da3f2808b4dd1e619d981
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score, evaluate
Successfully installed evaluate-0.4.6 rouge-score-0.1.2


In [None]:
import evaluate
import numpy as np

rouge = evaluate.load("rouge")

def compute_rouge_with_std(candidates, references):
    """
    Compute per-example ROUGE-1/2/L with the module-level `rouge` metric and
    summarize each as mean and population standard deviation (ddof=0).

    Returns:
        dict with "<metric>_mean" and "<metric>_std" for rouge1, rouge2 and
        rougeL, plus "per_example" holding the raw per-example results
        (used for the paired t-tests).
    """
    metric_names = ["rouge1", "rouge2", "rougeL"]

    per_example = rouge.compute(
        predictions=candidates,
        references=references,
        rouge_types=metric_names,
        use_aggregator=False
    )

    summary = {}
    for metric in metric_names:
        values = per_example[metric]
        summary[f"{metric}_mean"] = np.mean(values)
        summary[f"{metric}_std"] = np.std(values, ddof=0)

    summary["per_example"] = per_example   # for t-tests
    return summary

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
rouge_blob = compute_rouge_with_std(candidates_blob, references)
rouge_lora = compute_rouge_with_std(candidates_lora, references)

print("BLOB:", rouge_blob)
print("LORA:", rouge_lora)

BLOB: {'rouge1_mean': np.float64(0.4870296371853134), 'rouge1_std': np.float64(0.14065965453233162), 'rouge2_mean': np.float64(0.23904908752325565), 'rouge2_std': np.float64(0.16686269960586003), 'rougeL_mean': np.float64(0.40943744823201994), 'rougeL_std': np.float64(0.15063977292210035), 'per_example': {'rouge1': [0.39999999999999997, 0.41860465116279066, 0.5882352941176471, 0.47058823529411764, 0.4615384615384615, 0.43010752688172044, 0.4390243902439024, 0.3888888888888889, 0.5128205128205129, 0.5641025641025641, 0.4864864864864865, 0.5517241379310344, 0.4444444444444444, 0.3333333333333333, 0.39999999999999997, 0.5652173913043478, 0.37037037037037035, 0.4615384615384615, 0.5, 0.7777777777777777, 0.43636363636363634, 0.5217391304347826, 0.36363636363636365, 0.75, 0.19354838709677416, 0.6363636363636365, 0.6060606060606061, 0.39999999999999997, 0.38709677419354843, 0.5555555555555556, 0.5555555555555556, 0.4848484848484849, 0.43243243243243246, 0.3043478260869565, 0.30303030303030304

In [None]:
from scipy.stats import ttest_rel

ttest_rel(
    rouge_lora["per_example"]["rouge1"],
    rouge_blob["per_example"]["rouge1"]
)

TtestResult(statistic=np.float64(-1.009318278786927), pvalue=np.float64(0.31331117833732186), df=np.int64(499))

In [None]:
ttest_rel(
    rouge_lora["per_example"]["rouge2"],
    rouge_blob["per_example"]["rouge2"]
)

TtestResult(statistic=np.float64(-1.4200023653643254), pvalue=np.float64(0.15623144197925332), df=np.int64(499))

In [None]:
ttest_rel(
    rouge_lora["per_example"]["rougeL"],
    rouge_blob["per_example"]["rougeL"]
)

TtestResult(statistic=np.float64(-1.6009456485887341), pvalue=np.float64(0.11002181468460058), df=np.int64(499))

# Generate N summaries per input text with beam=1 LoRA

In [None]:
num_examples=100

In [None]:
ds_test

Dataset({
    features: ['id', 'dialogue', 'summary', 'topic'],
    num_rows: 1500
})

In [None]:
# Map each distinct dialogue to the index of its first occurrence
# (dicts preserve insertion order, so dataset order is kept).
first_seen_at = {}
for idx, d in enumerate(ds_test['dialogue']):
    first_seen_at.setdefault(d, idx)

# Keep only the first `num_examples` distinct dialogues.
unique_indices = list(first_seen_at.values())[:num_examples]

In [None]:
ds_sub = ds_test.select(unique_indices) # ds_sub: dataset containing the first `num_examples` (100) unique dialogues.

In [None]:
# For each dialogue in ds_sub, sample 100 summaries and save them as one JSON file
# per dialogue: {"dialogue_id": <record id>, "samples": list[str]}

def generate_100_summaries(model, tokenizer, dialogue, num_summaries=100,
                           max_source_length=512, seed=None):
    """
    Sample `num_summaries` summaries for a single dialogue.

    Args:
        model: model exposing `.device` and `.generate(...)`.
        tokenizer: tokenizer used to encode the dialogue and decode the samples.
        dialogue: the dialogue text (a single string).
        num_summaries: number of sampled summaries to return.
        max_source_length: truncation length for the encoded input
            (default 512, chosen to match BLoB).
        seed: optional integer passed to `torch.manual_seed` before sampling
            so the draw can be reproduced; `None` leaves the RNG untouched.

    Returns:
        List of `num_summaries` decoded summaries.

    Raises:
        ValueError: if `num_summaries` is smaller than 1.
        RuntimeError: if generation returns a different number of samples.

    NOTE(review): the input is prefixed with "Summarize: " here, while
    `batched_deterministic_generate` feeds the raw dialogue -- confirm which
    format the adapters were trained on.
    """
    if num_summaries < 1:
        raise ValueError(f"num_summaries must be >= 1, got {num_summaries}")

    if seed is not None:
        torch.manual_seed(seed)

    device = model.device

    with torch.no_grad():
        inputs = tokenizer(
            "Summarize: " + dialogue,
            truncation=True,
            max_length=max_source_length, #match BLOB
            padding=True,
            return_tensors="pt"
        ).to(device)

        outputs = model.generate(
            **inputs,
            do_sample=True,
            # Beam-search-only flag; switched off explicitly so transformers
            # does not report it as an invalid generation flag when sampling
            # with a single beam.
            early_stopping=False,
            use_cache=True,
            num_beams=1,
            length_penalty=LENGTH_PENALTY,
            num_return_sequences=num_summaries,
            max_new_tokens=MAX_NEW_TOKENS,
            return_dict_in_generate=False,
            output_scores=False
        )

        # decode all samples at once
        summaries = tokenizer.batch_decode(
            outputs,
            skip_special_tokens=True
        )
        if len(summaries) != num_summaries:
            raise RuntimeError(f"Expected {num_summaries} samples, got {len(summaries)}")

        return summaries




def run_generate_and_save(ds_sub, model, tokenizer, drive_dir, tag="LoRA"):
    """
    Sample summaries for every record in `ds_sub` and write one JSON file per record.

    Each file is named "<record id>(<tag>).json" inside `drive_dir` and holds
    {"dialogue_id": <record id>, "samples": [<sampled summaries>]}.

    Args:
        ds_sub: iterable of records with "id" and "dialogue" fields.
        model: model handed to `generate_100_summaries`; it is moved to CUDA
            when available (CPU otherwise) and put in eval mode.
        tokenizer: tokenizer handed to `generate_100_summaries`.
        drive_dir: output directory; created if it does not exist.
        tag: label placed in the file name (default "LoRA", the previous
            hard-coded value).
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()

    # open() does not create missing directories.
    os.makedirs(drive_dir, exist_ok=True)

    for record in tqdm(ds_sub):
        record_id = record["id"]
        dialogue = record["dialogue"]

        samples = generate_100_summaries(
            model,
            tokenizer,
            dialogue
        )

        out_path = os.path.join(drive_dir, f"{record_id}({tag}).json")
        with open(out_path, "w") as f:
            json.dump(
                {
                    "dialogue_id": record_id,
                    "samples": samples
                },
                f,
                indent=2
            )

    print(f"Saved {len(ds_sub)} JSON files to {drive_dir}")

In [None]:
drive_dir = "/content/drive/MyDrive/DLA_project/LoRA_100_x_100_summaries_2.0"
run_generate_and_save(ds_sub, lora_model, tokenizer, drive_dir)

  0%|          | 0/100 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
100%|██████████| 100/100 [02:28<00:00,  1.49s/it]

Saved 100 JSON files to /content/drive/MyDrive/DLA_project/LoRA_100_x_100_summaries_2.0





# Bayesian Sampling Evaluation

In [None]:
blob_100_path = "/content/drive/MyDrive/DLA_project/BLoB_100_x_100_summaries"
lora_100_path = "/content/drive/MyDrive/DLA_project/LoRA_100_x_100_summaries_2.0"

ds_test= load_dataset("knkarthick/dialogsum", split="test")


# Build a lookup: id -> summary
id_to_summary = {ex["id"]: ex["summary"] for ex in ds_test}

records = []

NUM_DIALOGUES = 100  # test_0 ... test_99
NUM_SAMPLES = 100    # sampled summaries expected per dialogue and model
NUM_REFERENCES = 3   # reference rows test_X_1 ... test_X_3

# Loop-local names are deliberately distinct from the notebook-level
# `lora_path` / `blob_path` (adapter checkpoint paths) and `references`
# (references of the deterministic evaluation) so this cell does not
# overwrite them.
for x in range(NUM_DIALOGUES):
    base_id = f"test_{x}"
    file_id = f"{base_id}_1"

    # load LoRA summaries
    lora_file = os.path.join(lora_100_path, f"{file_id}(LoRA).json")
    with open(lora_file, "r") as f:
        lora_summaries = json.load(f)["samples"]

    # load BLoB summaries
    blob_file = os.path.join(blob_100_path, f"{file_id}(BLoB).json")
    with open(blob_file, "r") as f:
        blob_summaries = json.load(f)["samples"]

    # sanity checks
    if len(lora_summaries) != NUM_SAMPLES:
        raise RuntimeError(f"{file_id}: LoRA has {len(lora_summaries)} samples")
    if len(blob_summaries) != NUM_SAMPLES:
        raise RuntimeError(f"{file_id}: BLoB has {len(blob_summaries)} samples")

    # collect the reference summaries
    dialogue_references = []
    for i in range(1, NUM_REFERENCES + 1):
        ref_id = f"{base_id}_{i}"
        if ref_id not in id_to_summary:
            raise KeyError(f"Missing reference: {ref_id}")
        dialogue_references.append(id_to_summary[ref_id])

    records.append({
        "id": base_id,
        "blob_summaries": blob_summaries,
        "lora_summaries": lora_summaries,
        "references": dialogue_references,
    })

# create Hugging Face dataset
ds_eval1 = Dataset.from_list(records)

README.md: 0.00B [00:00, ?B/s]

train.csv:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

validation.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/12460 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [None]:
ds_eval1

Dataset({
    features: ['id', 'lora_summaries', 'references'],
    num_rows: 100
})

In [None]:
# Structural check of ds_eval1: string id plus the expected list lengths per row.
expected_lengths = {"lora_summaries": 100, "blob_summaries": 100, "references": 3}

for i in range(len(ds_eval1)):
    row = ds_eval1[i]
    assert isinstance(row["id"], str)
    for column, expected in expected_lengths.items():
        assert len(row[column]) == expected

### BERTScore

In [None]:
!pip install bert-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [None]:
from bert_score import score


def compute_bertscores(
    ds,
    model_type="microsoft/deberta-xlarge-mnli",
    device="cuda" if torch.cuda.is_available() else "cpu",
    fine_tuning_type=None,
    aggregation=None
):
    """
    BERTScore F1 for every sampled summary, aggregated over its references.

    Args:
        ds: iterable of rows holding "<fine_tuning_type>_summaries" (list of
            sampled summaries) and "references" (list of reference summaries).
        model_type: model name handed to `bert_score.score`.
        device: device handed to `bert_score.score`.
        fine_tuning_type: prefix of the summaries column, e.g. "blob" or "lora".
        aggregation: how the per-reference F1 values of one summary are
            reduced: "mean", "max" or "min".

    Returns:
        1-D NumPy array with one aggregated F1 per summary, rows concatenated
        in dataset order.

    Raises:
        ValueError: if `fine_tuning_type` is missing or `aggregation` is not
            one of the supported values. Previously an unsupported
            `aggregation` left the per-summary result undefined (NameError, or
            silently reusing a stale value).
    """
    if not isinstance(fine_tuning_type, str) or not fine_tuning_type:
        raise ValueError(
            "fine_tuning_type must name a '<type>_summaries' column, e.g. 'blob' or 'lora'"
        )
    if aggregation not in {"mean", "max", "min"}:
        raise ValueError("aggregation must be 'mean', 'max', 'min'")

    all_scores = []

    for row in tqdm(ds):
        summaries = row[f"{fine_tuning_type}_summaries"]
        references = row["references"]

        # Pair every summary with every reference:
        # summaries [s1, s2, ...] -> [s1, s1, s1, s2, s2, s2, ...]
        expanded_summaries = []
        expanded_references = []
        for s in summaries:
            for r in references:
                expanded_summaries.append(s)
                expanded_references.append(r)

        # Compute BERTScore
        _, _, f1 = score(
            expanded_summaries,
            expanded_references,
            model_type=model_type,
            device=device,
            verbose=False,
        )

        # One row per summary, one column per reference.
        f1 = f1.cpu().numpy().reshape(len(summaries), len(references))

        if aggregation == "mean":
            f1_per_summary = f1.mean(axis=1)
        elif aggregation == "max":
            f1_per_summary = f1.max(axis=1)
        else:
            f1_per_summary = f1.min(axis=1)

        all_scores.extend(f1_per_summary.tolist())

    return np.array(all_scores)

In [None]:
out_path = "/content/drive/MyDrive/DLA_project/BERTSCORES/blob_bertscores_mean.npy"
# `dist_bert_blob_mean` is only computed (and saved to this same path) in a
# later cell, so on a fresh kernel this cell used to fail with NameError.
# Save only when the variable already exists.
if "dist_bert_blob_mean" in globals():
    np.save(out_path, dist_bert_blob_mean)

In [None]:
# LoRA BERTScore distributions (mean / max / min over references), saved under BERTSCORES_2.
bert_out_dir = "/content/drive/MyDrive/DLA_project/BERTSCORES_2"
os.makedirs(bert_out_dir, exist_ok=True)  # np.save does not create missing directories

lora_bert_dists = {}
for agg in ("mean", "max", "min"):
    lora_bert_dists[agg] = compute_bertscores(
        ds_eval1,
        fine_tuning_type="lora",
        aggregation=agg,
    )
    out_path = os.path.join(bert_out_dir, f"lora_bertscores_{agg}.npy")
    np.save(out_path, lora_bert_dists[agg])

# Keep the original variable names available to later cells.
dist_bert_lora_mean = lora_bert_dists["mean"]
dist_bert_lora_max = lora_bert_dists["max"]
dist_bert_lora_min = lora_bert_dists["min"]

  0%|          | 0/100 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/792 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.04G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.04G [00:00<?, ?B/s]

100%|██████████| 100/100 [06:58<00:00,  4.18s/it]
100%|██████████| 100/100 [06:36<00:00,  3.97s/it]
100%|██████████| 100/100 [06:31<00:00,  3.91s/it]


In [None]:
# BLoB and LoRA BERTScore distributions (mean / max / min over references), saved under BERTSCORES.
bert_out_dir = "/content/drive/MyDrive/DLA_project/BERTSCORES"
os.makedirs(bert_out_dir, exist_ok=True)  # np.save does not create missing directories

bert_dists = {}
for method in ("blob", "lora"):
    for agg in ("mean", "max", "min"):
        bert_dists[(method, agg)] = compute_bertscores(
            ds_eval1,
            fine_tuning_type=method,
            aggregation=agg,
        )
        out_path = os.path.join(bert_out_dir, f"{method}_bertscores_{agg}.npy")
        np.save(out_path, bert_dists[(method, agg)])

# Keep the original variable names available to later cells.
dist_bert_blob_mean = bert_dists[("blob", "mean")]
dist_bert_blob_max = bert_dists[("blob", "max")]
dist_bert_blob_min = bert_dists[("blob", "min")]
dist_bert_lora_mean = bert_dists[("lora", "mean")]
dist_bert_lora_max = bert_dists[("lora", "max")]
dist_bert_lora_min = bert_dists[("lora", "min")]

100%|██████████| 100/100 [05:45<00:00,  3.46s/it]
100%|██████████| 100/100 [05:53<00:00,  3.54s/it]
100%|██████████| 100/100 [05:53<00:00,  3.53s/it]
100%|██████████| 100/100 [06:09<00:00,  3.69s/it]
100%|██████████| 100/100 [06:04<00:00,  3.65s/it]
100%|██████████| 100/100 [06:02<00:00,  3.63s/it]


### ROUGE

In [None]:
!pip install evaluate rouge-score

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=c38372a873088b41f8eb13927c148f98b529de791d93fc399f6146473372d719
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score, evaluate
Successfully installed evaluate-0.4.6 rouge-score-0.1.2


In [None]:
from rouge_score import rouge_scorer


# Shared scorer for compute_rougescores: ROUGE-1, ROUGE-2 and ROUGE-L with stemming enabled.
# NOTE(review): the deterministic evaluation above goes through `evaluate.load("rouge")`
# without passing `use_stemmer`; if its default differs, the two sets of ROUGE numbers
# are not directly comparable -- confirm.
scorer = rouge_scorer.RougeScorer(
    ["rouge1", "rouge2", "rougeL"],
    use_stemmer=True,
)

def compute_rougescores(
    ds,
    fine_tuning_type,
    aggregation="mean",
    rouge_type="rouge1",
):
    """
    ROUGE F-measure for every sampled summary, reduced over its references.

    For each row of `ds`, every entry of "<fine_tuning_type>_summaries" is
    scored against each entry of "references" with the module-level `scorer`;
    the per-reference F-measures are reduced with `aggregation`.

    Returns:
        1-D NumPy array with one value per summary, in dataset order.

    Raises:
        ValueError: for an unsupported `fine_tuning_type` or `aggregation`.
    """
    if fine_tuning_type not in {"blob", "lora"}:
        raise ValueError("fine_tuning_type must be 'blob' or 'lora'")
    if aggregation not in {"mean", "max", "min"}:
        raise ValueError("aggregation must be 'mean', 'max', 'min'")

    # Pick the reduction once instead of branching for every summary.
    reduce_refs = {"mean": np.mean, "max": np.max, "min": np.min}[aggregation]
    column = f"{fine_tuning_type}_summaries"

    aggregated = []
    for row in tqdm(ds):
        candidates = row[column]
        refs = row["references"]

        for candidate in candidates:
            f_measures = np.asarray(
                [scorer.score(ref, candidate)[rouge_type].fmeasure for ref in refs]
            )
            aggregated.append(reduce_refs(f_measures))

    return np.asarray(aggregated)

#### ROUGE1

In [None]:
# BLoB, ROUGE-1: mean over references, oracle (max) and worst case (min).
rouge_out_dir = "/content/drive/MyDrive/DLA_project/ROUGEScores"
os.makedirs(rouge_out_dir, exist_ok=True)  # np.save does not create missing directories

rouge1_blob = {}
for agg in ("mean", "max", "min"):
    rouge1_blob[agg] = compute_rougescores(
        ds_eval1,
        fine_tuning_type="blob",
        aggregation=agg,
        rouge_type="rouge1"
    )
    print(len(rouge1_blob[agg]))
    output_path = os.path.join(rouge_out_dir, f"rouge1_blob_{agg}.npy")
    np.save(output_path, rouge1_blob[agg])

# Keep the original variable names available to later cells.
rouge1_blob_mean = rouge1_blob["mean"]
rouge1_blob_max = rouge1_blob["max"]
rouge1_blob_min = rouge1_blob["min"]

In [None]:

#######      LORA      ############
# LoRA, ROUGE-1: mean over references, oracle (max) and worst case (min).
rouge_out_dir = "/content/drive/MyDrive/DLA_project/ROUGEScoresLora"
os.makedirs(rouge_out_dir, exist_ok=True)  # np.save does not create missing directories

rouge1_lora = {}
for agg in ("mean", "max", "min"):
    rouge1_lora[agg] = compute_rougescores(
        ds_eval1,
        fine_tuning_type="lora",
        aggregation=agg,
        rouge_type="rouge1"
    )
    print(len(rouge1_lora[agg]))
    output_path = os.path.join(rouge_out_dir, f"rouge1_lora_{agg}.npy")
    np.save(output_path, rouge1_lora[agg])

# Keep the original variable names available to later cells.
rouge1_lora_mean = rouge1_lora["mean"]
rouge1_lora_max = rouge1_lora["max"]
rouge1_lora_min = rouge1_lora["min"]

100%|██████████| 100/100 [00:27<00:00,  3.64it/s]


10000


100%|██████████| 100/100 [00:27<00:00,  3.63it/s]


10000


100%|██████████| 100/100 [00:27<00:00,  3.61it/s]

10000





#### ROUGE2

In [None]:
# BLoB, ROUGE-2: mean over references, oracle (max) and worst case (min).
rouge_out_dir = "/content/drive/MyDrive/DLA_project/ROUGEScores"
os.makedirs(rouge_out_dir, exist_ok=True)  # np.save does not create missing directories

rouge2_blob = {}
for agg in ("mean", "max", "min"):
    rouge2_blob[agg] = compute_rougescores(
        ds_eval1,
        fine_tuning_type="blob",
        aggregation=agg,
        rouge_type="rouge2"
    )
    print(len(rouge2_blob[agg]))
    output_path = os.path.join(rouge_out_dir, f"rouge2_blob_{agg}.npy")
    np.save(output_path, rouge2_blob[agg])

# Keep the original variable names available to later cells.
rouge2_blob_mean = rouge2_blob["mean"]
rouge2_blob_max = rouge2_blob["max"]
rouge2_blob_min = rouge2_blob["min"]

In [None]:


#######      LORA      ############
# LoRA, ROUGE-2: mean over references, oracle (max) and worst case (min).
rouge_out_dir = "/content/drive/MyDrive/DLA_project/ROUGEScoresLora"
os.makedirs(rouge_out_dir, exist_ok=True)  # np.save does not create missing directories

rouge2_lora = {}
for agg in ("mean", "max", "min"):
    rouge2_lora[agg] = compute_rougescores(
        ds_eval1,
        fine_tuning_type="lora",
        aggregation=agg,
        rouge_type="rouge2"
    )
    print(len(rouge2_lora[agg]))
    output_path = os.path.join(rouge_out_dir, f"rouge2_lora_{agg}.npy")
    np.save(output_path, rouge2_lora[agg])

# Keep the original variable names available to later cells.
rouge2_lora_mean = rouge2_lora["mean"]
rouge2_lora_max = rouge2_lora["max"]
rouge2_lora_min = rouge2_lora["min"]

100%|██████████| 100/100 [00:28<00:00,  3.57it/s]


10000


100%|██████████| 100/100 [00:27<00:00,  3.60it/s]


10000


100%|██████████| 100/100 [00:27<00:00,  3.63it/s]

10000





#### ROUGEL

In [None]:
# BLoB, ROUGE-L: mean over references, oracle (max) and worst case (min).
rouge_out_dir = "/content/drive/MyDrive/DLA_project/ROUGEScores"
os.makedirs(rouge_out_dir, exist_ok=True)  # np.save does not create missing directories

rougeL_blob = {}
for agg in ("mean", "max", "min"):
    rougeL_blob[agg] = compute_rougescores(
        ds_eval1,
        fine_tuning_type="blob",
        aggregation=agg,
        rouge_type="rougeL"
    )
    print(len(rougeL_blob[agg]))
    output_path = os.path.join(rouge_out_dir, f"rougeL_blob_{agg}.npy")
    np.save(output_path, rougeL_blob[agg])

# Keep the original variable names available to later cells.
rougeL_blob_mean = rougeL_blob["mean"]
rougeL_blob_max = rougeL_blob["max"]
rougeL_blob_min = rougeL_blob["min"]


In [None]:
#######      LORA      ############
# LoRA, ROUGE-L: mean over references, oracle (max) and worst case (min).
rouge_out_dir = "/content/drive/MyDrive/DLA_project/ROUGEScoresLora"
os.makedirs(rouge_out_dir, exist_ok=True)  # np.save does not create missing directories

rougeL_lora = {}
for agg in ("mean", "max", "min"):
    rougeL_lora[agg] = compute_rougescores(
        ds_eval1,
        fine_tuning_type="lora",
        aggregation=agg,
        rouge_type="rougeL"
    )
    print(len(rougeL_lora[agg]))
    output_path = os.path.join(rouge_out_dir, f"rougeL_lora_{agg}.npy")
    np.save(output_path, rougeL_lora[agg])

# Keep the original variable names available to later cells.
rougeL_lora_mean = rougeL_lora["mean"]
rougeL_lora_max = rougeL_lora["max"]
rougeL_lora_min = rougeL_lora["min"]

100%|██████████| 100/100 [00:27<00:00,  3.64it/s]


10000


100%|██████████| 100/100 [00:27<00:00,  3.65it/s]


10000


100%|██████████| 100/100 [00:27<00:00,  3.65it/s]

10000



