In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q "transformers>=4.45.0" "datasets>=2.20.0" accelerate sentencepiece

import transformers, datasets
print("Transformers version:", transformers.__version__)
print("Datasets version:", datasets.__version__)


Transformers version: 4.57.3
Datasets version: 4.0.0


# 0. Import & Config

## Config

- `config.dropout = 0.1`
  - general Transformer dropout
  - Prevents overfitting; this matters more for smaller datasets, which are easier to memorize and overfit

- `config.attention_dropout = 0.1`
  - Attention weights dropout
  - Makes attention more robust and distributed by preventing it from focusing too much on a single token (try a value lower than `dropout`, or even 0.0?)

- `config.activation_dropout = 0.0`
  - Feed-forward MLP dropout
  - Extra FFN regularization (not necessary bc FFNs already have large dim)

Training recipe upgrades
- Learning-rate schedule + warmup
  Add linear warmup (e.g., 5–10% of steps) + linear decay. This often stabilizes seq2seq fine-tuning more than changing dropout.
    - warmup_ratio: first % of training steps, learning rate increases linearly from 0, prevent loss spiking early
    - lr_scheduler_type: after warmup, lr decreases linearly to 0 over training, more conservative than cosine
- Weight decay + label smoothing
  Weight decay (e.g., 0.01) + label smoothing (e.g., 0.1) can reduce overfitting and improve fluency.
    - label_smoothing_factor: improves generalization, encourages paraphrasing instead of copying
    - weight_decay: penalizes large weights (regularization) to reduce overfitting; the hoped-for effect is less verbatim copying of the input and smoother paraphrases

Decoding improvements (huge for perceived quality)
- try num_beams=4
    - beam search keeps the 4 most probable partial outputs at each step and returns the highest-scoring finished sentence
- add length_penalty (e.g., 1.0–1.2)
    - values above 1.0 favor longer outputs, which prevents the model from cutting too much
- add no_repeat_ngram_size=3
    - forbids repeating any 3-token sequence (tokens are subword units, so this is roughly, not exactly, 3 words)
- cap with max_new_tokens
    - 64 tokens ≈ 40–50 words (about 2–3 average sentences)

In [None]:
import os
from typing import Dict, Any

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
)
import matplotlib.pyplot as plt
from transformers import GenerationConfig

# Metrics
import re
import nltk
from nltk.corpus import cmudict
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import numpy as np
import json
from tqdm.auto import tqdm
import random

# Download required NLTK data
try:
    nltk.data.find('corpora/cmudict')
except LookupError:
    nltk.download('cmudict')

In [None]:
# Paths to JSON files
TRAIN_JSON_1 = "/content/drive/MyDrive/AML_Final_Project/Data/synthetic_train.json"
TRAIN_JSON_2 = "/content/drive/MyDrive/AML_Final_Project/Data/asset_train.json"

# TEST_JSON  = "/content/drive/MyDrive/AML_Final_Project/Data/synthetic_test.json"
TEST_JSON  = "/content/drive/MyDrive/AML_Final_Project/Data/asset_test.json"

MODEL_NAME = "facebook/bart-base"
OUTPUT_DIR = "/content/drive/MyDrive/AML_Final_Project/bart-config"

MAX_SOURCE_LENGTH = 128   # input: prompt + original, 80~100 words, 1024 limit
MAX_TARGET_LENGTH = 64    # output: simplified, 40~50 words

# Training hyperparameters
NUM_EPOCHS = 10
BATCH_SIZE = 8
LEARNING_RATE = 3e-5
GRAD_CLIP = 1.0

In [None]:
# Optional instruction text for the model input. Kept as an empty string (not a
# tuple) so it can be concatenated with a sentence if it is ever used.
TASK_PREFIX = ""

# 1. GPU Device

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


# 2. Load Json Data

In [None]:
raw_datasets = load_dataset(
    "json",
    data_files={
        "train": [TRAIN_JSON_1, TRAIN_JSON_2],
        "eval": TEST_JSON,
    },
)

train_dataset = raw_datasets["train"]
eval_dataset = raw_datasets["eval"]

print("Raw train entries:", len(train_dataset))
print("Raw eval entries:", len(eval_dataset))

Generating train split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

Raw train entries: 27330
Raw eval entries: 359


# 3. Flatten Multiple Simplifications

In [None]:
# Turn:
#   original = "..."
#   simplifications = ["a", "b", "c"]
# into:
#   (original, "a"), (original, "b"), (original, "c")

def explode_simplifications(examples: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten a batch so each (original, simplification) pair becomes its own row.

    Args:
        examples: Batched columns. Reads ``examples["original"]`` and
            ``examples["simplifications"]``, which are iterated in parallel;
            each entry of the latter is itself iterated.

    Returns:
        A dict with two equally long lists: ``"original"`` (the source text,
        repeated once per simplification) and ``"simplified"``.
    """
    pairs = [
        (source_text, simple_text)
        for source_text, candidates in zip(examples["original"], examples["simplifications"])
        for simple_text in candidates
    ]

    return {
        "original": [source_text for source_text, _ in pairs],
        "simplified": [simple_text for _, simple_text in pairs],
    }

train_flat = train_dataset.map(
    explode_simplifications,
    batched=True,
    remove_columns=train_dataset.column_names,
)

eval_flat = eval_dataset.map(
    explode_simplifications,
    batched=True,
    remove_columns=eval_dataset.column_names,
)

print("Subset train examples:", len(train_flat))
print("Subset eval examples:", len(eval_flat))


Map:   0%|          | 0/27330 [00:00<?, ? examples/s]

Map:   0%|          | 0/359 [00:00<?, ? examples/s]

Subset train examples: 45330
Subset eval examples: 3590


# 4. Tokenizer & Model (BART)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

config = AutoConfig.from_pretrained(MODEL_NAME)
config.use_cache = False

# Config
config.dropout = 0.1
config.attention_dropout = 0.1
config.activation_dropout = 0.0

model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, config=config)
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_n

# 5. Preprocessing

In [None]:
def preprocess_function(examples: Dict[str, Any]) -> Dict[str, Any]:
    """Tokenize a batch of (original, simplified) pairs for seq2seq training.

    No task prefix is added: the stripped original sentence is the model input.

    Padding is deliberately NOT applied here. Padding inside a batched ``map``
    pads every row to the longest row of the map batch and writes the pad-token
    id into ``labels``. The collator defined later in this notebook uses
    ``label_pad_token_id=-100`` only for the padding it adds itself, so
    pre-padded label positions would still be scored by the loss. Leaving the
    sequences unpadded lets the collator pad each training batch dynamically.

    Args:
        examples: Batched columns; reads ``"original"`` and ``"simplified"``.

    Returns:
        The tokenizer output for the sources (``input_ids``, ``attention_mask``)
        with an added ``"labels"`` entry holding the target token ids.
    """
    sources = [text.strip() for text in examples["original"]]
    targets = [text.strip() for text in examples["simplified"]]

    # Explicit max_length makes truncation effective; without it the tokenizer
    # warns and falls back to no truncation.
    model_inputs = tokenizer(
        sources,
        truncation=True,
        max_length=MAX_SOURCE_LENGTH,
        padding=False,
    )

    labels = tokenizer(
        text_target=targets,
        truncation=True,
        max_length=MAX_TARGET_LENGTH,
        padding=False,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = train_flat.map(
    preprocess_function,
    batched=True,
    remove_columns=train_flat.column_names,
)

tokenized_eval = eval_flat.map(
    preprocess_function,
    batched=True,
    remove_columns=eval_flat.column_names,
)

print("Tokenized train sample keys:", tokenized_train[0].keys())


Map:   0%|          | 0/45330 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/3590 [00:00<?, ? examples/s]

Tokenized train sample keys: dict_keys(['input_ids', 'attention_mask', 'labels'])


# 6. Dataloaders & Optimizer

In [None]:
# Collator that pads each batch at collate time; label padding uses -100.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    # max_length=MAX_SOURCE_LENGTH,
    label_pad_token_id=-100,
)

# NOTE(review): the Seq2SeqTrainer built later in this notebook receives only
# `data_collator` from this cell. The two DataLoaders and the optimizer below
# are not passed to it, so they look like leftovers from a manual training
# loop — confirm before relying on them.
train_dataloader = DataLoader(
    tokenized_train,
    shuffle=True,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
)

eval_dataloader = DataLoader(
    tokenized_eval,
    shuffle=False,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
)

# NOTE(review): uses LEARNING_RATE (3e-5), whereas the training arguments cell
# sets its own learning_rate (2e-5).
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE,
                              weight_decay=0.01)


# 7. Training (seq2seq)

In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,

    # Epoch count, batch size and gradient clipping come from the config cell
    # so there is a single place to tune them (values are unchanged: 10 / 8 / 1.0).
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    max_grad_norm=GRAD_CLIP,

    # NOTE(review): 2e-5 here differs from LEARNING_RATE (3e-5) in the config
    # cell. Left as-is because the reported results were produced with this
    # value — confirm which one is intended.
    learning_rate=2e-5,
    weight_decay=0.01,

    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",

    label_smoothing_factor=0.1, # label smoothing
    warmup_ratio=0.1, # warm up
    lr_scheduler_type = "linear", # LR scheduler

    predict_with_generate=False,
    fp16=torch.cuda.is_available(),

    # Keep the checkpoint with the lowest eval loss.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    report_to="none",
)


In [None]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    processing_class=tokenizer,
    data_collator=data_collator,
)


In [None]:
trainer.train()


Epoch,Training Loss,Validation Loss
1,2.5381,1.724008
2,1.7062,1.670887
3,1.6597,1.651295
4,1.6326,1.640602
5,1.6136,1.6363
6,1.5988,1.632226
7,1.5861,1.632363




In [None]:
import matplotlib.pyplot as plt

# Logged entries recorded by the trainer.
logs = trainer.state.log_history

# Training points: entries that carry both a "loss" and an "epoch" key.
train_loss = [x["loss"] for x in logs if "loss" in x and "epoch" in x]
train_ep   = [x["epoch"] for x in logs if "loss" in x and "epoch" in x]

# Evaluation points: entries that carry an "eval_loss" key.
eval_loss  = [x["eval_loss"] for x in logs if "eval_loss" in x]
eval_ep    = [x["epoch"] for x in logs if "eval_loss" in x]

# Plot both curves against epoch on one set of axes.
plt.figure(figsize=(6,4))
plt.plot(train_ep, train_loss, marker="o", label="Train loss")
plt.plot(eval_ep,  eval_loss,  marker="o", label="Eval loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Train vs Eval Loss")
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
# Write the in-memory model and tokenizer directly into OUTPUT_DIR
# (the directory is created first if it does not exist).
os.makedirs(OUTPUT_DIR, exist_ok=True)
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Training complete. Model saved to:", OUTPUT_DIR)


# 8. Inference

In [None]:
# seq2seq
# Prefer a "checkpoint-final" directory under OUTPUT_DIR when one exists.
# No cell in this notebook creates it, so fall back to OUTPUT_DIR itself,
# which is where the save cell writes the model and tokenizer. This keeps
# Restart & Run All working on a fresh run.
FINAL_CKPT = os.path.join(OUTPUT_DIR, "checkpoint-final")
if not os.path.isdir(FINAL_CKPT):
    FINAL_CKPT = OUTPUT_DIR

tokenizer = AutoTokenizer.from_pretrained(FINAL_CKPT)
inference_model = AutoModelForSeq2SeqLM.from_pretrained(FINAL_CKPT).to(device)

# Inference mode: disable dropout and re-enable the key/value cache that was
# turned off in the training config.
inference_model.eval()
inference_model.config.use_cache = True


In [None]:
# Start from the model's own generation defaults, then override decoding knobs.
gen_config = GenerationConfig.from_model_config(inference_model.config)

# The key/value cache only avoids recomputing past decoder states, so it speeds
# up beam search. It was previously disabled here, which contradicted the
# `use_cache = True` set on the inference model.
gen_config.use_cache = True
gen_config.num_beams = 4
gen_config.max_new_tokens = 64
gen_config.early_stopping = True
gen_config.forced_bos_token_id = 0
gen_config.no_repeat_ngram_size = 3
gen_config.length_penalty = 1.1



In [None]:
def simplify_trained(text: str, max_new_tokens: int = 64, num_beams: int = 4) -> str:
    """Generate a simplified version of ``text`` with the fine-tuned model.

    Uses the module-level ``tokenizer``, ``inference_model``, ``device`` and
    ``gen_config``. The input is stripped and truncated to MAX_SOURCE_LENGTH
    tokens before generation.

    Args:
        text: Sentence (or short passage) to simplify.
        max_new_tokens: Upper bound on generated tokens. Previously accepted
            but ignored; now forwarded to ``generate`` as an override of
            ``gen_config``.
        num_beams: Beam width. Previously accepted but ignored; now forwarded
            to ``generate`` as an override of ``gen_config``.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    model_input = text.strip()

    enc = tokenizer(
        model_input,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_SOURCE_LENGTH,
    )
    enc = {k: v.to(device) for k, v in enc.items()}

    with torch.no_grad():
        # Keyword arguments passed next to generation_config take precedence
        # over the matching fields of that config. The defaults (64, 4) are the
        # values the generation-config cell assigns, so default calls decode
        # as before.
        output_ids = inference_model.generate(
            **enc,
            generation_config=gen_config,
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
        )

    return tokenizer.decode(output_ids[0], skip_special_tokens=True) # seq2seq


In [None]:
test_text = (
    # "Adjacent counties are Marin (to the south), Mendocino (to the north), "
    # "Lake (northeast), Napa (to the east), and Solano and Contra Costa (to the southeast)."
    "Jeddah is the principal gateway to Mecca, Islam's holiest city, "
    "which able-bodied Muslims are required to visit at least once in their lifetime."
    )

print("\n=== ORIGINAL ===\n")
print(test_text)

print("\n=== SIMPLIFIED (MODEL OUTPUT) ===\n")
print(simplify_trained(test_text))



=== ORIGINAL ===

Jeddah is the principal gateway to Mecca, Islam's holiest city, which able-bodied Muslims are required to visit at least once in their lifetime.

=== SIMPLIFIED (MODEL OUTPUT) ===

Jossah is the main entry point for Muslims to visit in their lifetime. It is also the home of Islam's most famous city.


# 9. Evaluation

## 9-1. Metrics

In [None]:
# Initialize CMU Pronouncing Dictionary for syllable counting
d = cmudict.dict()

In [None]:
def count_syllables(word):
    """Return the syllable count of ``word``.

    The lower-cased word is looked up in the module-level pronunciation
    dictionary ``d``. When it is listed, the result is the largest number of
    digit-terminated phonemes found among its pronunciations. Otherwise the
    count is estimated as the number of runs of consecutive vowels
    (a, e, i, o, u, y), minus one for a trailing 'e'; a result of zero is
    reported as one.
    """
    token = word.lower()

    if token in d:
        return max(
            sum(1 for phoneme in pronunciation if phoneme[-1].isdigit())
            for pronunciation in d[token]
        )

    # Fallback heuristic: a vowel group starts wherever a vowel follows a
    # non-vowel (the leading space stands in for "start of word").
    vowels = 'aeiouy'
    groups = sum(
        1
        for before, current in zip(' ' + token, token)
        if current in vowels and before not in vowels
    )

    # Treat a final 'e' as silent.
    if token.endswith('e'):
        groups -= 1

    return 1 if groups == 0 else groups

In [None]:
def flesch_kincaid_grade(text):
    """
    Compute the Flesch-Kincaid Grade Level of ``text``.

    Sentences are the non-blank pieces obtained by splitting on runs of
    '.', '!' or '?'; words are ``\\w+`` tokens of the lower-cased text;
    syllables come from ``count_syllables``.

    Args:
        text (str): The text to analyze

    Returns:
        float: The grade level rounded to two decimals, or 0.0 when the text
        contains no sentence or no word
    """
    sentence_count = sum(1 for piece in re.split(r'[.!?]+', text) if piece.strip())
    tokens = re.findall(r'\b\w+\b', text.lower())

    if sentence_count == 0 or not tokens:
        return 0.0

    word_count = len(tokens)
    syllable_count = sum(map(count_syllables, tokens))

    # Flesch-Kincaid Grade Level formula
    # https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
    words_per_sentence = word_count / sentence_count
    syllables_per_word = syllable_count / word_count
    grade = 0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59

    return round(grade, 2)


In [None]:
def bleu_score(reference, candidate):
    """
    Sentence-level BLEU of ``candidate`` against a single ``reference``.

    Both texts are lower-cased and split into ``\\w+`` word tokens. Smoothing
    (NLTK ``method1``) is applied so that missing n-gram matches do not zero
    the score.

    Args:
        reference (str): The text the candidate is compared against
        candidate (str): The text being scored

    Returns:
        float: BLEU score rounded to four decimals
    """
    word_pattern = r'\b\w+\b'
    ref_words = re.findall(word_pattern, reference.lower())
    cand_words = re.findall(word_pattern, candidate.lower())

    # sentence_bleu takes a list of tokenized references, hence the wrapping list.
    raw_score = sentence_bleu(
        [ref_words],
        cand_words,
        smoothing_function=SmoothingFunction().method1,
    )

    return round(raw_score, 4)

In [None]:
def sari_score(source, reference, candidate):
    """
    Set-based, unigram approximation of SARI for one reference.

    Each text is lower-cased and reduced to its set of ``\\w+`` tokens. Three
    components are averaged and scaled to 0-100:
    - add: share of words the candidate introduced (absent from the source)
      that also occur in the reference
    - keep: F1 between the source words the candidate kept and the source
      words the reference kept
    - delete: share of source words the candidate dropped that the reference
      dropped too

    A component with nothing to measure (e.g. no added words) counts as 0.

    Args:
        source (str): The original source text
        reference (str): The reference simplified text
        candidate (str): The candidate simplified text

    Returns:
        float: Score on a 0-100 scale, rounded to two decimals
    """
    def _vocab(text):
        return set(re.findall(r'\b\w+\b', text.lower()))

    def _precision(selected, gold):
        return len(selected & gold) / len(selected) if selected else 0.0

    src_vocab = _vocab(source)
    ref_vocab = _vocab(reference)
    cand_vocab = _vocab(candidate)

    # Add: candidate words that are new relative to the source.
    add_precision = _precision(cand_vocab - src_vocab, ref_vocab)

    # Delete: source words missing from the candidate, judged against the
    # words the reference removed.
    delete_precision = _precision(src_vocab - cand_vocab, src_vocab - ref_vocab)

    # Keep: source words retained by the candidate vs. retained by the reference.
    cand_kept = src_vocab & cand_vocab
    ref_kept = src_vocab & ref_vocab
    agreed = len(cand_kept & ref_vocab)
    keep_precision = agreed / len(cand_kept) if cand_kept else 0
    keep_recall = agreed / len(ref_kept) if ref_kept else 0
    if keep_precision + keep_recall > 0:
        keep_f1 = 2 * keep_precision * keep_recall / (keep_precision + keep_recall)
    else:
        keep_f1 = 0

    sari = (add_precision + keep_f1 + delete_precision) / 3 * 100

    return round(sari, 2)

In [None]:
def compression_ratio(original, simplified):
    """
    Length of the simplified text relative to the original.

    Ratios are simplified / original, measured once in characters and once in
    ``\\w+`` word tokens. A ratio is reported as 0 when the original has no
    characters (or no words).

    Args:
        original (str): The original text
        simplified (str): The simplified text

    Returns:
        dict: ``char_ratio`` and ``word_ratio`` (rounded to 4 decimals) plus
        ``char_reduction_pct`` and ``word_reduction_pct``, i.e.
        ``(1 - ratio) * 100`` rounded to 2 decimals
    """
    word_pattern = r'\b\w+\b'

    def _ratio(numerator, denominator):
        return numerator / denominator if denominator > 0 else 0

    def _reduction_pct(ratio):
        return round((1 - ratio) * 100, 2)

    char_ratio = _ratio(len(simplified), len(original))
    word_ratio = _ratio(
        len(re.findall(word_pattern, simplified)),
        len(re.findall(word_pattern, original)),
    )

    return {
        'char_ratio': round(char_ratio, 4),
        'word_ratio': round(word_ratio, 4),
        'char_reduction_pct': _reduction_pct(char_ratio),
        'word_reduction_pct': _reduction_pct(word_ratio)
    }

In [None]:
def average_sentence_length(text):
    """
    Average number of words per sentence.

    Sentences are the non-blank pieces obtained by splitting on runs of
    '.', '!' or '?'; words are ``\\w+`` tokens counted within each sentence.

    Args:
        text (str): The text to analyze

    Returns:
        dict: ``avg_sentence_length`` (rounded to 2 decimals) and
        ``total_sentences``; both are zero when no sentence is found
    """
    stripped_pieces = (piece.strip() for piece in re.split(r'[.!?]+', text))
    sentences = [piece for piece in stripped_pieces if piece]
    sentence_count = len(sentences)

    if sentence_count == 0:
        return {
            'avg_sentence_length': 0.0,
            'total_sentences': 0
        }

    word_total = sum(len(re.findall(r'\b\w+\b', sentence)) for sentence in sentences)

    return {
        'avg_sentence_length': round(word_total / sentence_count, 2),
        'total_sentences': sentence_count
    }

In [None]:
def print_results(results):
    """Pretty print the aggregated results.

    Works with the dict returned by ``calculate_dataset_metrics`` (model-output
    statistics under ``'pred'``, BLEU under ``'bleu_max_over_refs'``) and also
    with the older layout that used ``'simplified'`` and ``'bleu'``. Previously
    only the older keys were read, so the current results dict raised KeyError.
    A SARI section is printed when ``results`` contains a ``'sari'`` entry.

    Args:
        results (dict): Aggregated metrics.

    Raises:
        KeyError: If a required section is missing under every accepted name.
    """
    def _pick(section, *names):
        # Return the first of `names` present in `section`.
        for name in names:
            if name in section:
                return section[name]
        raise KeyError(f"none of {names} found; available keys: {sorted(section)}")

    fk = results['flesch_kincaid']
    fk_simplified = _pick(fk, 'pred', 'simplified')
    bleu = _pick(results, 'bleu_max_over_refs', 'bleu')
    compression = results['compression']
    asl = results['avg_sentence_length']
    asl_simplified = _pick(asl, 'pred', 'simplified')

    print("\n" + "="*70)
    print(f"DATASET METRICS (n={results['dataset_size']} pairs)")
    print("="*70)

    print("\n📊 READABILITY (Flesch-Kincaid Grade Level)")
    print(f"  Original:    {fk['original']['mean']:.2f} ± {fk['original']['std']:.2f}")
    print(f"  Simplified:  {fk_simplified['mean']:.2f} ± {fk_simplified['std']:.2f}")
    print(f"  Improvement: {fk['improvement']:.2f} grade levels")

    print("\n📝 SEMANTIC SIMILARITY (BLEU Score)")
    print(f"  Mean:   {bleu['mean']:.4f} ± {bleu['std']:.4f}")
    print(f"  Median: {bleu['median']:.4f}")

    if 'sari' in results:
        sari = results['sari']
        print("\n🎯 SIMPLIFICATION QUALITY (SARI)")
        print(f"  Mean:   {sari['mean']:.2f} ± {sari['std']:.2f}")
        print(f"  Median: {sari['median']:.2f}")

    print("\n📏 COMPRESSION")
    print(f"  Word Ratio:  {compression['word_ratio']['mean']:.4f} ({compression['avg_word_reduction_pct']:.1f}% reduction)")
    print(f"  Char Ratio:  {compression['char_ratio']['mean']:.4f}")

    print("\n📐 SENTENCE LENGTH (words/sentence)")
    print(f"  Original:    {asl['original']['mean']:.2f} ± {asl['original']['std']:.2f}")
    print(f"  Simplified:  {asl_simplified['mean']:.2f} ± {asl_simplified['std']:.2f}")
    print(f"  Reduction:   {asl['reduction']:.2f} words")
    print()

## 9-2. Calculate

In [None]:
def generate_predictions(data, model, tokenizer, device):
    """Generate one simplification per item of ``data``.

    Items are processed one at a time, in order. Each item's ``"original"``
    text is stripped, tokenized (truncated to MAX_SOURCE_LENGTH), moved to
    ``device`` and decoded with the module-level ``gen_config``.

    Args:
        data: Iterable of mappings that each provide an ``"original"`` string.
        model: Model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer used for both encoding and decoding.
        device: Device the encoded tensors are moved to.

    Returns:
        list: Decoded outputs (special tokens skipped), aligned with ``data``.
    """
    model.eval()

    def _simplify(source_text):
        encoded = tokenizer(
            source_text,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_SOURCE_LENGTH,
        )
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

        with torch.no_grad():
            generated = model.generate(
                **encoded,
                generation_config=gen_config
            )

        # Only the first returned sequence is decoded.
        return tokenizer.decode(generated[0], skip_special_tokens=True)

    return [
        _simplify(item["original"].strip())
        for item in tqdm(data, desc="Generating predictions")
    ]


In [None]:
def _mean_or_zero(values):
    """Mean of ``values`` as a plain float; 0.0 for an empty list (avoids NaN)."""
    return float(np.mean(values)) if len(values) > 0 else 0.0


def _summarize(values, ndigits):
    """Mean / std / median of ``values`` as plain floats rounded to ``ndigits``."""
    if len(values) == 0:
        return {"mean": 0.0, "std": 0.0, "median": 0.0}
    return {
        "mean": round(float(np.mean(values)), ndigits),
        "std": round(float(np.std(values)), ndigits),
        "median": round(float(np.median(values)), ndigits),
    }


def calculate_dataset_metrics(data, preds):
    """Score model predictions against a dataset and aggregate the metrics.

    For every (item, prediction) pair this computes Flesch-Kincaid grade of the
    original and the prediction, BLEU (maximum over the item's references),
    the SARI approximation (mean over references), compression ratios and
    average sentence length.

    Changes from the previous version: every statistic is a plain Python float
    (SARI mean/std used to be ``np.float64``), empty input yields zeros instead
    of NaN, and a length mismatch between ``data`` and ``preds`` is reported
    rather than silently truncated by ``zip``.

    Args:
        data: Sequence of mappings with an ``"original"`` string and either a
            ``"simplifications"`` list or a single ``"simplified"`` string.
        preds: Predicted simplifications, aligned with ``data``.

    Returns:
        dict: ``dataset_size`` plus the sections ``flesch_kincaid``,
        ``bleu_max_over_refs``, ``sari``, ``compression`` and
        ``avg_sentence_length``.

    Raises:
        ValueError: If ``data`` and ``preds`` differ in length.
    """
    preds = list(preds)
    if len(preds) != len(data):
        raise ValueError(
            f"data and preds must have the same length, got {len(data)} and {len(preds)}"
        )

    fk_original_scores = []
    fk_pred_scores = []

    bleu_scores = []
    sari_scores = []

    compression_word_ratios = []
    compression_char_ratios = []

    asl_original_scores = []
    asl_pred_scores = []

    for item, pred in tqdm(list(zip(data, preds)), desc="Scoring metrics"):
        original = item["original"]
        # Fall back to the single "simplified" field when no reference list exists;
        # blank references are ignored.
        references = item.get("simplifications", [item.get("simplified", "")])
        usable_refs = [ref for ref in references if ref and ref.strip()]

        # FKGL
        fk_original_scores.append(flesch_kincaid_grade(original))
        fk_pred_scores.append(flesch_kincaid_grade(pred))

        # BLEU: best match over the available references
        bleu_per_refs = [bleu_score(ref, pred) for ref in usable_refs]
        bleu_scores.append(max(bleu_per_refs) if bleu_per_refs else 0.0)

        # SARI approximation: average across refs
        sari_per_refs = [sari_score(original, ref, pred) for ref in usable_refs]
        sari_scores.append(float(np.mean(sari_per_refs)) if sari_per_refs else 0.0)

        # Compression
        comp = compression_ratio(original, pred)
        compression_word_ratios.append(comp["word_ratio"])
        compression_char_ratios.append(comp["char_ratio"])

        # Avg sentence length
        asl_original_scores.append(average_sentence_length(original)["avg_sentence_length"])
        asl_pred_scores.append(average_sentence_length(pred)["avg_sentence_length"])

    results = {
        "dataset_size": len(data),

        "flesch_kincaid": {
            "original": _summarize(fk_original_scores, 2),
            "pred": _summarize(fk_pred_scores, 2),
            # Differences are taken between unrounded means, then rounded.
            "improvement": round(
                _mean_or_zero(fk_original_scores) - _mean_or_zero(fk_pred_scores), 2
            ),
        },

        "bleu_max_over_refs": _summarize(bleu_scores, 4),

        "sari": _summarize(sari_scores, 2),

        "compression": {
            "word_ratio": _summarize(compression_word_ratios, 4),
            "char_ratio": _summarize(compression_char_ratios, 4),
            "avg_word_reduction_pct": round(
                (1 - _mean_or_zero(compression_word_ratios)) * 100, 2
            ),
        },

        "avg_sentence_length": {
            "original": _summarize(asl_original_scores, 2),
            "pred": _summarize(asl_pred_scores, 2),
            "reduction": round(
                _mean_or_zero(asl_original_scores) - _mean_or_zero(asl_pred_scores), 2
            ),
        },
    }

    return results


In [None]:
with open(TEST_JSON, "r") as f:
    eval_data = json.load(f)

'''
random.seed(1)

n_total = len(eval_data)
n_10 = int(0.1 * n_total)

eval_data_10 = random.sample(eval_data, n_10)
'''

best_model = inference_model

preds = generate_predictions(eval_data, best_model, tokenizer, device)

results = calculate_dataset_metrics(eval_data, preds)


Generating predictions:   0%|          | 0/359 [00:00<?, ?it/s]

Scoring metrics:   0%|          | 0/359 [00:00<?, ?it/s]

In [None]:
results

{'dataset_size': 359,
 'flesch_kincaid': {'original': {'mean': 11.8, 'std': 3.92, 'median': 11.68},
  'pred': {'mean': 8.6, 'std': 3.57, 'median': 8.18},
  'improvement': 3.2},
 'bleu_max_over_refs': {'mean': 0.5644, 'std': 0.2107, 'median': 0.5703},
 'sari': {'mean': np.float64(41.35),
  'std': np.float64(11.72),
  'median': 39.61},
 'compression': {'word_ratio': {'mean': 0.8998,
   'std': 0.1712,
   'median': 0.9286},
  'char_ratio': {'mean': 0.8571, 'std': 0.1645, 'median': 0.887},
  'avg_word_reduction_pct': 10.02},
 'avg_sentence_length': {'original': {'mean': 19.37,
   'std': 8.01,
   'median': 19.0},
  'pred': {'mean': 14.33, 'std': 6.48, 'median': 12.5},
  'reduction': 5.04}}

In [None]:
# Save results
OUT_PATH = "/content/drive/MyDrive/AML_Final_Project/Results/config_asset_results.json"
# Create the target folder first so a fresh Drive does not fail on open().
os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)
with open(OUT_PATH, "w") as f:
    json.dump(results, f, indent=2)

print("Saved:", OUT_PATH)

Saved: /content/drive/MyDrive/AML_Final_Project/Results/config_asset_results.json
