In [None]:
# Notebook shell magic: install the transformers and sentencepiece packages
# into the current runtime.
!pip install transformers sentencepiece




In [None]:
# -*- coding: utf-8 -*-
"""
Evaluate baseline BART (no training) on:
  - FULL asset_test.json
  - 10% of synthetic_test.json

For each set:
  - Input = original sentences ONLY
  - Model generates its own simplifications
  - We compare model outputs to originals + human simplifications (if present)
  - We save JSON with: original, simplifications, bart_baseline, metrics
"""

# ============================
# 0. SETUP: DRIVE & INSTALLS
# ============================

# Mount Google Drive so the dataset and output paths under /content/drive
# used later in this notebook are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Quietly (-q) install the libraries imported below; transformers is
# constrained to version 4.45.0 or newer.
!pip install -q "transformers>=4.45.0" sentencepiece nltk

import os
import json
import re
import numpy as np
import nltk
from nltk.corpus import cmudict
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# ============================
# 1. MODEL CONFIG (BASELINE BART)
# ============================

# Hugging Face checkpoint identifier loaded below (no fine-tuning is done here).
MODEL_NAME = "facebook/bart-base"
# Prefer the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Instruction text that simplify_baseline() prepends to every input sentence.
TASK_PREFIX = (
    "Explain this in simple, plain language for a general audience. "
    "Use short sentences and everyday words, but keep all important information.\n\n"
)

# Tokenizer truncation limit; it applies to TASK_PREFIX + sentence together.
MAX_SOURCE_LENGTH = 128
# Generation settings passed to model.generate() in simplify_baseline().
MAX_NEW_TOKENS   = 64
NUM_BEAMS        = 4

# Load tokenizer and model once at module level; the model is moved to the
# selected device and switched to evaluation mode.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model     = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)
model.eval()

def simplify_baseline(text: str) -> str:
    """
    Generate a simplification of `text` with the module-level baseline model.

    The whitespace-stripped input is appended to TASK_PREFIX, tokenized with
    truncation to MAX_SOURCE_LENGTH tokens, and decoded from the first
    sequence returned by beam search with special tokens removed.

    NOTE(review): the sanity-check output recorded in this notebook shows the
    model echoing the instruction prefix, so the returned string may contain
    it -- confirm whether downstream metrics should strip it.
    """
    prompt = f"{TASK_PREFIX}{text.strip()}"

    # Tokenize first, then move the resulting tensors onto the model's device.
    encoded = tokenizer(
        prompt,
        truncation=True,
        max_length=MAX_SOURCE_LENGTH,
        return_tensors="pt",
    )
    encoded = encoded.to(device)

    generation_kwargs = {
        "max_new_tokens": MAX_NEW_TOKENS,
        "num_beams": NUM_BEAMS,
        "early_stopping": True,
    }

    # Inference only: no gradients are needed.
    with torch.no_grad():
        generated = model.generate(**encoded, **generation_kwargs)

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Quick sanity check (optional): run one hard-coded sentence through
# simplify_baseline() and print the input next to the model output.
test_text = (
    "Adjacent counties are Marin (to the south), Mendocino (to the north), "
    "Lake (northeast), Napa (to the east), and Solano and Contra Costa (to the southeast)."
)
print("\n=== QUICK SANITY CHECK (BASELINE BART) ===")
print("ORIGINAL:\n", test_text)
print("\nBASELINE OUTPUT:\n", simplify_baseline(test_text))

# ============================
# 2. NLTK / CMUDICT SETUP
# ============================

# Download the CMU Pronouncing Dictionary only when it is not already
# present in the local NLTK data directory.
try:
    nltk.data.find('corpora/cmudict')
except LookupError:
    nltk.download('cmudict')

# Pronunciation lookup used by count_syllables(): maps a lowercase word to
# its list of pronunciations (each a list of phoneme strings).
d = cmudict.dict()

# ============================
# 3. METRIC FUNCTIONS
# ============================

def count_syllables(word):
    """
    Estimate the number of syllables in a single word.

    Words found in the module-level pronunciation dictionary `d` are scored
    by counting phonemes that end in a digit, taking the largest count over
    all listed pronunciations. Other words fall back to a spelling heuristic:
    count runs of consecutive vowels (a, e, i, o, u, y), subtract one for a
    trailing 'e', and never return less than 1.
    """
    token = word.lower()

    if token in d:
        return max(
            sum(1 for phoneme in pronunciation if phoneme[-1].isdigit())
            for pronunciation in d[token]
        )

    # Each maximal run of vowels counts as one syllable.
    syllables = len(re.findall(r'[aeiouy]+', token))
    if token.endswith('e'):
        syllables -= 1
    return syllables if syllables != 0 else 1

def flesch_kincaid_grade(text):
    """
    Compute the Flesch-Kincaid grade level of `text`, rounded to 2 decimals.

    Sentences are the non-blank pieces obtained by splitting on runs of
    '.', '!' or '?'; words are the \\w+ tokens of the lowercased text.
    Returns 0.0 when the text has no sentences or no words.
    """
    pieces = (piece.strip() for piece in re.split(r'[.!?]+', text))
    sentence_count = sum(1 for piece in pieces if piece)

    tokens = re.findall(r'\b\w+\b', text.lower())

    if sentence_count == 0 or not tokens:
        return 0.0

    word_count = len(tokens)
    syllable_count = sum(map(count_syllables, tokens))

    words_per_sentence = word_count / sentence_count
    syllables_per_word = syllable_count / word_count

    return round(0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59, 2)

def bleu_score(reference, candidate):
    """
    Sentence-level BLEU of `candidate` against a single `reference`.

    Both strings are lowercased and split into \\w+ tokens; the score is
    smoothed with NLTK's SmoothingFunction.method1 and rounded to 4 decimals.
    The caller in this file passes the ORIGINAL sentence as the reference and
    the model output as the candidate.
    """
    word_pattern = r'\b\w+\b'
    ref_words = re.findall(word_pattern, reference.lower())
    hyp_words = re.findall(word_pattern, candidate.lower())

    raw_bleu = sentence_bleu(
        [ref_words],
        hyp_words,
        smoothing_function=SmoothingFunction().method1,
    )
    return round(raw_bleu, 4)

def sari_score(source, reference, candidate):
    """
    Simplified, set-based SARI-style score on a 0-100 scale.

    Arguments:
      - source    = original sentence
      - reference = one human simplification
      - candidate = model output

    Each text is reduced to its set of lowercase \\w+ tokens (unigrams, no
    counts). The result is the mean of three components, times 100 and
    rounded to 2 decimals:
      - add:    precision of tokens the candidate introduced
      - keep:   F1 of source tokens the candidate retained
      - delete: precision of source tokens the candidate dropped
    A component whose denominator is empty contributes 0.
    """
    def vocabulary(text):
        return set(re.findall(r'\b\w+\b', text.lower()))

    src = vocabulary(source)
    ref = vocabulary(reference)
    cand = vocabulary(candidate)

    # --- add: new tokens that also appear in the reference ---
    introduced = cand - src
    add_precision = len(introduced & ref) / len(introduced) if introduced else 0.0

    # --- keep: retained source tokens judged against the reference ---
    retained = src & cand
    should_retain = src & ref
    correctly_retained = retained & ref
    keep_precision = len(correctly_retained) / len(retained) if retained else 0
    keep_recall = len(correctly_retained) / len(should_retain) if should_retain else 0
    precision_plus_recall = keep_precision + keep_recall
    if precision_plus_recall > 0:
        keep_f1 = 2 * keep_precision * keep_recall / precision_plus_recall
    else:
        keep_f1 = 0

    # --- delete: dropped source tokens the reference dropped as well ---
    dropped = src - cand
    dropped_by_reference = src - ref
    delete_precision = len(dropped & dropped_by_reference) / len(dropped) if dropped else 0.0

    return round((add_precision + keep_f1 + delete_precision) / 3 * 100, 2)

def compression_ratio(original, simplified):
    """
    Compare the length of `simplified` with `original`.

    Returns a dict with the simplified/original ratio in characters and in
    \\w+ word tokens (rounded to 4 decimals) plus the matching percentage
    reductions (rounded to 2 decimals, computed from the unrounded ratios).
    A ratio is 0 when the original has no characters / no words.
    """
    def word_count(text):
        return len(re.findall(r'\b\w+\b', text))

    def ratio(numerator, denominator):
        return numerator / denominator if denominator > 0 else 0

    chars = ratio(len(simplified), len(original))
    words = ratio(word_count(simplified), word_count(original))

    return {
        'char_ratio': round(chars, 4),
        'word_ratio': round(words, 4),
        'char_reduction_pct': round((1 - chars) * 100, 2),
        'word_reduction_pct': round((1 - words) * 100, 2)
    }

def average_sentence_length(text):
    """
    Return the mean number of words per sentence and the sentence count.

    Sentences are the non-blank pieces obtained by splitting on runs of
    '.', '!' or '?'; words are \\w+ tokens counted inside each sentence.
    The average is rounded to 2 decimals; text without any sentence yields
    an average of 0.0 and a count of 0.
    """
    stripped = (piece.strip() for piece in re.split(r'[.!?]+', text))
    sentence_list = [piece for piece in stripped if piece]
    sentence_count = len(sentence_list)

    if sentence_count == 0:
        return {
            'avg_sentence_length': 0.0,
            'total_sentences': 0
        }

    word_total = sum(len(re.findall(r'\b\w+\b', piece)) for piece in sentence_list)

    return {
        'avg_sentence_length': round(word_total / sentence_count, 2),
        'total_sentences': sentence_count
    }

def safe_mean(lst):
    """Return the arithmetic mean of `lst` as a float, or NaN when it is empty."""
    if not lst:
        return float('nan')
    return float(np.mean(lst))

# ============================
# 4. GENERIC EVAL HELPER
# ============================

def evaluate_baseline_on_json(
    input_json_path: str,
    output_json_path: str,
    split_name: str,
    subset_fraction: float = None
):
    """
    Evaluate the baseline model on one JSON test split and save the results.

    Steps:
      - Load a JSON list of entries with the fields "original" (required)
        and "simplifications" (optional list of human references).
      - Optionally keep only the leading `subset_fraction` of the entries.
      - Run simplify_baseline() on each original sentence only.
      - Compute readability, BLEU, compression, sentence-length and (when a
        reference exists) SARI metrics per entry.
      - Write every entry with its "bart_baseline" output and "metrics" to
        `output_json_path`, then print corpus-level averages.

    Args:
        input_json_path: Path of the JSON test file to read.
        output_json_path: Path of the JSON results file to write. Its parent
            directory is created when the path names one.
        split_name: Label used only in progress and summary messages.
        subset_fraction: If not None, keep the first
            max(1, int(subset_fraction * N)) entries; None keeps all of them.

    Returns:
        None. Results are written to disk and summarised on stdout.

    Raises:
        KeyError: If an entry has no "original" field.
    """
    with open(input_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    n_total = len(data)
    if subset_fraction is not None:
        # Keep at least one entry, but never report more than the file holds
        # (empty file, or a fraction above 1).
        n_use = min(n_total, max(1, int(subset_fraction * n_total)))
        data = data[:n_use]
        print(f"\nLoaded {n_use} / {n_total} entries from {split_name} (subset_fraction={subset_fraction}).")
    else:
        print(f"\nLoaded {n_total} entries from {split_name} (full set).")

    # Guard the peek at the first entry: an empty split must not crash here.
    if data:
        print("First keys:", list(data[0].keys()))
    else:
        print(f"WARNING: {split_name} has no entries; an empty results file will be written.")

    metric_agg = {
        "fkg_original": [],
        "fkg_bart": [],
        "bleu_original_bart": [],
        "sari_bart_vs_ref0": [],
        "word_ratio": [],
        "char_ratio": [],
        "asl_original": [],
        "asl_bart": [],
    }

    results = []

    for i, entry in enumerate(data):
        original = entry["original"]
        refs = entry.get("simplifications", [])

        # 1) Model generates its own simplification from ORIGINAL only
        bart_text = simplify_baseline(original)

        # 2) Metrics
        fkg_orig = flesch_kincaid_grade(original)
        fkg_bart = flesch_kincaid_grade(bart_text)

        # BLEU is measured against the original, not a human reference.
        bleu_bart = bleu_score(original, bart_text)

        comp      = compression_ratio(original, bart_text)
        asl_orig  = average_sentence_length(original)
        asl_bart  = average_sentence_length(bart_text)

        # SARI needs a human reference; only the first one is used.
        if refs:
            sari_bart = sari_score(
                source=original,
                reference=refs[0],
                candidate=bart_text
            )
        else:
            sari_bart = None

        results.append({
            "index": i,
            "original": original,
            "simplifications": refs,
            "bart_baseline": bart_text,
            "metrics": {
                "fkg_original": fkg_orig,
                "fkg_bart": fkg_bart,
                "bleu_original_bart": bleu_bart,
                "compression": comp,
                "asl_original": asl_orig,
                "asl_bart": asl_bart,
                "sari_bart_vs_first_ref": sari_bart,
            },
        })

        # Aggregate
        metric_agg["fkg_original"].append(fkg_orig)
        metric_agg["fkg_bart"].append(fkg_bart)
        metric_agg["bleu_original_bart"].append(bleu_bart)
        metric_agg["word_ratio"].append(comp["word_ratio"])
        metric_agg["char_ratio"].append(comp["char_ratio"])
        metric_agg["asl_original"].append(asl_orig["avg_sentence_length"])
        metric_agg["asl_bart"].append(asl_bart["avg_sentence_length"])
        # Entries without references are left out of the SARI average.
        if sari_bart is not None:
            metric_agg["sari_bart_vs_ref0"].append(sari_bart)

        if (i + 1) % 50 == 0:
            print(f"  Processed {i+1}/{len(data)} examples in {split_name}...")

    # Save JSON with metrics. A bare filename has an empty dirname, and
    # os.makedirs("") raises, so only create a directory when one is named.
    output_dir = os.path.dirname(output_json_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_json_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"\nSaved {split_name} baseline outputs + metrics to:\n{output_json_path}")

    # Corpus-level stats (safe_mean yields NaN for an empty metric list)
    print(f"\n=== CORPUS-LEVEL AVERAGES (BART BASELINE – {split_name}) ===")
    print("Flesch-Kincaid (original):", safe_mean(metric_agg["fkg_original"]))
    print("Flesch-Kincaid (BART):    ", safe_mean(metric_agg["fkg_bart"]))
    print("BLEU (original vs BART):  ", safe_mean(metric_agg["bleu_original_bart"]))
    print("Word ratio (BART/orig):   ", safe_mean(metric_agg["word_ratio"]))
    print("Char ratio (BART/orig):   ", safe_mean(metric_agg["char_ratio"]))
    print("ASL original (words/s):   ", safe_mean(metric_agg["asl_original"]))
    print("ASL BART (words/s):       ", safe_mean(metric_agg["asl_bart"]))
    print("SARI (if refs present):   ", safe_mean(metric_agg["sari_bart_vs_ref0"]))


# ============================
# 5. RUN EVALS:
#    - FULL asset test
#    - 10% synthetic test
# ============================

# Input test sets, read from the mounted Google Drive.
ASSET_TEST_JSON   = "/content/drive/My Drive/AML_Final_Project/Data/asset_test.json"
SYNTH_TEST_JSON   = "/content/drive/My Drive/AML_Final_Project/Data/synthetic_test.json"

# Output files: per-example model outputs together with their metrics.
ASSET_OUT_JSON    = "/content/drive/My Drive/colab_data/baseline_notrain_asset_test_with_metrics.json"
SYNTH_10P_OUT_JSON = "/content/drive/My Drive/colab_data/baseline_notrain_synthetic_test_10p_with_metrics.json"

# 1) FULL asset test
evaluate_baseline_on_json(
    input_json_path=ASSET_TEST_JSON,
    output_json_path=ASSET_OUT_JSON,
    split_name="ASSET_TEST_FULL",
    subset_fraction=None,        # full
)

# 2) 10% synthetic test (the first 10% of entries in file order)
evaluate_baseline_on_json(
    input_json_path=SYNTH_TEST_JSON,
    output_json_path=SYNTH_10P_OUT_JSON,
    split_name="SYNTH_TEST_10P",
    subset_fraction=0.10,        # 10%
)

print("\n=== DONE: baseline (no-train) BART evaluated on full asset test + 10% synthetic test ===")


Mounted at /content/drive
Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]


=== QUICK SANITY CHECK (BASELINE BART) ===
ORIGINAL:
 Adjacent counties are Marin (to the south), Mendocino (to the north), Lake (northeast), Napa (to the east), and Solano and Contra Costa (to the southeast).

BASELINE OUTPUT:
 Explain this in simple, plain language for a general audience. Use short sentences and everyday words, but keep all important information. For example, the following is a list of counties in California:Adjacent counties are Marin (to the south), Mendocino ( to the north), Lake (nort


[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.



Loaded 359 entries from ASSET_TEST_FULL (full set).
First keys: ['original', 'simplifications']
  Processed 50/359 examples in ASSET_TEST_FULL...
  Processed 100/359 examples in ASSET_TEST_FULL...
  Processed 150/359 examples in ASSET_TEST_FULL...
  Processed 200/359 examples in ASSET_TEST_FULL...
  Processed 250/359 examples in ASSET_TEST_FULL...
  Processed 300/359 examples in ASSET_TEST_FULL...
  Processed 350/359 examples in ASSET_TEST_FULL...

Saved ASSET_TEST_FULL baseline outputs + metrics to:
/content/drive/My Drive/colab_data/baseline_notrain_asset_test_with_metrics.json

=== CORPUS-LEVEL AVERAGES (BART BASELINE – ASSET_TEST_FULL) ===
Flesch-Kincaid (original): 11.798300835654597
Flesch-Kincaid (BART):     9.075376044568246
BLEU (original vs BART):   0.3633807799442897
Word ratio (BART/orig):    2.4642593314763235
Char ratio (BART/orig):    2.541618105849582
ASL original (words/s):    19.366768802228414
ASL BART (words/s):        12.112311977715876
SARI (if refs present):    