In [1]:
import re
import nltk
from nltk.corpus import cmudict
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import numpy as np
import json

In [2]:
# Make sure the CMU Pronouncing Dictionary corpus is available locally.
# nltk.data.find raises LookupError when the resource cannot be located;
# only in that case is the corpus downloaded, so a successful lookup skips
# the download entirely.
try:
    nltk.data.find('corpora/cmudict')
except LookupError:
    nltk.download('cmudict')

In [3]:
# Load the CMU Pronouncing Dictionary once at module level.
# count_syllables looks lower-cased words up in `d` and treats each entry as a
# list of pronunciations, each pronunciation being a sequence of phoneme strings.
d = cmudict.dict()

def count_syllables(word):
    """Count the syllables in a single word.

    The word is lower-cased and looked up in the module-level CMU Pronouncing
    Dictionary ``d``. For a dictionary hit, each pronunciation is scored by
    the number of phonemes whose last character is a digit (the stress-marked
    vowel phonemes), and the largest score across pronunciations is returned.

    Words that are not in the dictionary fall back to a spelling heuristic:
    count runs of consecutive vowel letters (``aeiouy``), then drop one
    syllable for a silent final 'e'. The silent-'e' rule is applied only when
    the 'e' forms its own vowel run (it follows a consonant) and the word does
    not end in consonant + "le". This keeps the syllable in endings such as
    "-ee" / "-ue" (the 'e' is part of a vowel run that was counted once) and
    in syllabic "-ble" / "-ple" / "-tle" endings.

    Args:
        word (str): A single word; case is ignored.

    Returns:
        int: Syllable count. The fallback estimate is never below 1.
    """
    word = word.lower()
    if word in d:
        return max(
            sum(1 for phoneme in pronunciation if phoneme[-1].isdigit())
            for pronunciation in d[word]
        )

    # Fallback: estimate syllables by counting groups of adjacent vowel letters
    vowels = 'aeiouy'
    count = 0
    previous_was_vowel = False
    for char in word:
        is_vowel = char in vowels
        if is_vowel and not previous_was_vowel:
            count += 1
        previous_was_vowel = is_vowel

    # Silent final 'e' (e.g. "made"): only subtract when the 'e' was counted as
    # a vowel run of its own, i.e. it directly follows a consonant.
    if len(word) >= 2 and word.endswith('e') and word[-2] not in vowels:
        # Consonant + "le" (e.g. "table") is pronounced as a syllable: keep it.
        syllabic_le = (
            len(word) >= 3 and word[-2] == 'l' and word[-3] not in vowels
        )
        if not syllabic_le:
            count -= 1

    # Every word is treated as having at least one syllable
    return max(count, 1)

In [4]:
def flesch_kincaid_grade(text):
    """
    Compute the Flesch-Kincaid Grade Level of a text.

    Sentences are the non-blank fragments obtained by splitting on runs of
    '.', '!' or '?' (so decimals and abbreviations also act as boundaries).
    Words are ``\\w+`` tokens of the lower-cased text, and syllables come from
    ``count_syllables``.

    Args:
        text (str): The text to analyze

    Returns:
        float: Grade level rounded to 2 decimals; 0.0 when the text contains
        no sentences or no words
    """
    # Only the number of non-blank sentence fragments matters for the formula
    sentence_count = sum(
        1 for fragment in re.split(r'[.!?]+', text) if fragment.strip()
    )
    tokens = re.findall(r'\b\w+\b', text.lower())

    if sentence_count == 0 or len(tokens) == 0:
        return 0.0

    syllable_total = 0
    for token in tokens:
        syllable_total += count_syllables(token)

    words_per_sentence = len(tokens) / sentence_count
    syllables_per_word = syllable_total / len(tokens)

    # Flesch-Kincaid Grade Level formula
    # https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
    grade = 0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59
    return round(grade, 2)

In [5]:
def bleu_score(reference, candidate):
    """
    Sentence-level BLEU of ``candidate`` against a single ``reference``.

    Both texts are lower-cased and split into ``\\w+`` tokens before being
    handed to NLTK's ``sentence_bleu`` with ``SmoothingFunction().method1``,
    which keeps the score from collapsing when some n-gram order has no
    matches.

    Args:
        reference (str): The reference (original) text
        candidate (str): The candidate (simplified) text

    Returns:
        float: BLEU score between 0 and 1, rounded to 4 decimals
    """
    word_pattern = r'\b\w+\b'

    # sentence_bleu takes a list of tokenized references and one tokenized hypothesis
    references = [re.findall(word_pattern, reference.lower())]
    hypothesis = re.findall(word_pattern, candidate.lower())

    raw_score = sentence_bleu(
        references,
        hypothesis,
        smoothing_function=SmoothingFunction().method1,
    )
    return round(raw_score, 4)

In [6]:
def sari_score(source, reference, candidate):
    """
    Calculate a simplified, set-based SARI-style score for text simplification.

    This is a unigram approximation working on sets of lower-cased ``\\w+``
    tokens (no n-grams, no token counts, a single reference). It averages
    three components:
    - Add: precision of words the candidate introduced (not in source) that
      the reference also introduced
    - Keep: F1 of source words the candidate kept versus those the reference
      kept
    - Delete: precision of source words the candidate dropped that the
      reference also dropped

    When the candidate performs none of an operation, that component is 1.0
    if the reference performs none of it either (they agree) and 0.0
    otherwise. A candidate identical to the reference therefore scores 100
    even if the reference adds or deletes nothing.

    Args:
        source (str): The original source text
        reference (str): The reference simplified text
        candidate (str): The candidate simplified text

    Returns:
        float: SARI-style score (0-100 scale), rounded to 2 decimals
    """
    def tokenize(text):
        return set(re.findall(r'\b\w+\b', text.lower()))

    source_tokens = tokenize(source)
    reference_tokens = tokenize(reference)
    candidate_tokens = tokenize(candidate)

    # Operations performed by the candidate relative to the source
    added = candidate_tokens - source_tokens
    kept = source_tokens & candidate_tokens
    deleted = source_tokens - candidate_tokens

    # The same operations as performed by the reference
    reference_added = reference_tokens - source_tokens
    reference_kept = source_tokens & reference_tokens
    reference_deleted = source_tokens - reference_tokens

    # Add score: share of added words that the reference also added
    if added:
        add_precision = len(added & reference_added) / len(added)
    else:
        # Nothing added: correct only if the reference added nothing either
        add_precision = 0.0 if reference_added else 1.0

    # Keep score: F1 between the candidate's and the reference's kept words
    if kept and reference_kept:
        correctly_kept = len(kept & reference_kept)
        keep_precision = correctly_kept / len(kept)
        keep_recall = correctly_kept / len(reference_kept)
        if keep_precision + keep_recall > 0:
            keep_f1 = 2 * keep_precision * keep_recall / (keep_precision + keep_recall)
        else:
            keep_f1 = 0.0
    elif not kept and not reference_kept:
        # Neither side kept any source word: they agree
        keep_f1 = 1.0
    else:
        keep_f1 = 0.0

    # Delete score: share of deleted words that the reference also deleted
    if deleted:
        delete_precision = len(deleted & reference_deleted) / len(deleted)
    else:
        # Nothing deleted: correct only if the reference deleted nothing either
        delete_precision = 0.0 if reference_deleted else 1.0

    # Final score is the mean of the three components, scaled to 0-100
    sari = (add_precision + keep_f1 + delete_precision) / 3 * 100

    return round(sari, 2)

In [7]:
def compression_ratio(original, simplified):
    """
    Measure how much shorter the simplified text is than the original.

    Ratios are simplified / original, computed once on raw character counts
    and once on ``\\w+`` word counts; values below 1 mean the text shrank.
    Typical good simplifications: 0.6-0.8 (20-40% shorter)

    Args:
        original (str): The original text
        simplified (str): The simplified text

    Returns:
        dict: ``char_ratio`` and ``word_ratio`` (rounded to 4 decimals) plus
        ``char_reduction_pct`` and ``word_reduction_pct`` (rounded to 2
        decimals, derived from the unrounded ratios)
    """
    def ratio_of(part, whole):
        # An empty original yields a ratio of 0 instead of dividing by zero
        return part / whole if whole > 0 else 0

    word_pattern = r'\b\w+\b'

    # Character-level ratio uses the raw string lengths
    char_ratio = ratio_of(len(simplified), len(original))

    # Word-level ratio uses the number of word tokens in each text
    word_ratio = ratio_of(
        len(re.findall(word_pattern, simplified)),
        len(re.findall(word_pattern, original)),
    )

    summary = {}
    summary['char_ratio'] = round(char_ratio, 4)
    summary['word_ratio'] = round(word_ratio, 4)
    summary['char_reduction_pct'] = round((1 - char_ratio) * 100, 2)
    summary['word_reduction_pct'] = round((1 - word_ratio) * 100, 2)
    return summary


In [8]:
def average_sentence_length(text):
    """
    Report the mean number of words per sentence.

    Sentences are the non-blank pieces left after splitting on runs of
    '.', '!' or '?'; words are ``\\w+`` tokens within each sentence.

    Shorter sentences generally indicate simpler text.
    General guidelines:
    - <15 words: Very easy
    - 15-20 words: Easy
    - 20-25 words: Moderate
    - >25 words: Difficult

    Args:
        text (str): The text to analyze

    Returns:
        dict: ``avg_sentence_length`` (rounded to 2 decimals) and
        ``total_sentences``; both are zero when no sentence is found
    """
    stripped = (piece.strip() for piece in re.split(r'[.!?]+', text))
    sentences = [piece for piece in stripped if piece]
    sentence_count = len(sentences)

    if sentence_count == 0:
        return {'avg_sentence_length': 0.0, 'total_sentences': 0}

    # Total word tokens across all sentences
    word_total = sum(
        len(re.findall(r'\b\w+\b', sentence)) for sentence in sentences
    )

    return {
        'avg_sentence_length': round(word_total / sentence_count, 2),
        'total_sentences': sentence_count,
    }

In [9]:
# Example usage: run every metric on one hand-written original/simplified pair
# and print the results.
if __name__ == "__main__":
    # Same record shape that calculate_dataset_metrics reads: an 'original'
    # string and a list of 'simplifications'.
    example =  {
    "original": "When a patient arrives at the emergency room, they are first triaged to assess the severity of their condition, after which they are directed to the appropriate treatment area based on priority and available resources.",
    "simplifications": [
      "When patients arrive at the emergency room, they are checked to see how serious their condition is and then taken to the right treatment area."
    ]
  }

    original = example["original"]
    # Only the first simplification is scored
    simplified = example["simplifications"][0]

    print("Original text:")
    print(original)
    print("\nSimplified text:")
    print(simplified)
    print("\n" + "="*60)

    # Readability before and after simplification
    print("\nMetrics:")
    print(f"Flesch-Kincaid Grade (Original): {flesch_kincaid_grade(original)}")
    print(f"Flesch-Kincaid Grade (Simplified): {flesch_kincaid_grade(simplified)}")

    # The original text is passed as the BLEU reference, the simplification as the candidate
    print(f"\nBLEU Score: {bleu_score(original, simplified)}")

    # TODO(review): sari_score takes three texts (source, reference, candidate).
    # This example has no separate reference simplification, so SARI is skipped;
    # the call below would pass the candidate as its own reference. Decide
    # whether the project has reference simplifications before enabling it.
    # print(f"\nSARI Score: {sari_score(original, simplified, simplified)}")

    # Compression ratio (simplified / original) on words and on characters
    print("\nCompression Ratio:")
    compression = compression_ratio(original, simplified)
    print(f"  Word Ratio: {compression['word_ratio']} ({compression['word_reduction_pct']}% reduction)")
    print(f"  Char Ratio: {compression['char_ratio']} ({compression['char_reduction_pct']}% reduction)")

    # Average sentence length for each text
    print("\nAverage Sentence Length:")
    asl_original = average_sentence_length(original)
    asl_simplified = average_sentence_length(simplified)
    print(f"  Original: {asl_original['avg_sentence_length']} words/sentence ({asl_original['total_sentences']} sentences)")
    print(f"  Simplified: {asl_simplified['avg_sentence_length']} words/sentence ({asl_simplified['total_sentences']} sentences)")


Original text:
When a patient arrives at the emergency room, they are first triaged to assess the severity of their condition, after which they are directed to the appropriate treatment area based on priority and available resources.

Simplified text:
When patients arrive at the emergency room, they are checked to see how serious their condition is and then taken to the right treatment area.


Metrics:
Flesch-Kincaid Grade (Original): 19.64
Flesch-Kincaid Grade (Simplified): 12.1

BLEU Score: 0.1759

Compression Ratio:
  Word Ratio: 0.7143 (28.57% reduction)
  Char Ratio: 0.6514 (34.86% reduction)

Average Sentence Length:
  Original: 35.0 words/sentence (1 sentences)
  Simplified: 25.0 words/sentence (1 sentences)


In [15]:
"""
Calculate metrics across an entire dataset of sentence pairs.
Assumes you have all the individual metric functions already defined.
"""

def _mean_or_nan(values):
    """Mean of ``values``; NaN for an empty list (without NumPy's empty-slice warning)."""
    return np.mean(values) if len(values) > 0 else float('nan')


def _summary_stats(values, ndigits, include_median=True):
    """Rounded mean/std (and optionally median) of ``values`` as built-in floats."""
    if len(values) == 0:
        # NumPy would return the same NaNs here, but with RuntimeWarnings
        stats = {'mean': float('nan'), 'std': float('nan')}
        if include_median:
            stats['median'] = float('nan')
        return stats

    stats = {
        'mean': float(round(np.mean(values), ndigits)),
        'std': float(round(np.std(values), ndigits)),
    }
    if include_median:
        stats['median'] = float(round(np.median(values), ndigits))
    return stats


def calculate_dataset_metrics(data, use_bertscore=True):
    """
    Calculate metrics for an entire dataset of sentence pairs.

    For every item the original text is compared with its FIRST
    simplification using flesch_kincaid_grade, bleu_score, compression_ratio
    and average_sentence_length; the per-item values are then aggregated.

    Args:
        data (list): List of dicts with 'original' and 'simplifications' keys.
            Each 'simplifications' list must contain at least one entry
            (an empty list raises IndexError).
        use_bertscore (bool): Accepted for backward compatibility but not
            used: this function does not compute BERTScore.

    Returns:
        dict: Aggregated metrics across the dataset, all as built-in floats:
            - 'dataset_size'
            - 'flesch_kincaid': 'original' / 'simplified' (mean, std, median)
              and 'improvement' (original mean minus simplified mean)
            - 'bleu': mean, std, median
            - 'compression': 'word_ratio' / 'char_ratio' (mean, std, median)
              and 'avg_word_reduction_pct'
            - 'avg_sentence_length': 'original' / 'simplified' (mean, std)
              and 'reduction'
        For an empty ``data`` every statistic is NaN.
    """
    # Storage for individual scores
    fk_original_scores = []
    fk_simplified_scores = []
    bleu_scores = []
    compression_word_ratios = []
    compression_char_ratios = []
    asl_original_scores = []
    asl_simplified_scores = []

    # Process each example
    for item in data:
        original = item['original']
        simplified = item['simplifications'][0]  # Only the first simplification is scored

        # Flesch-Kincaid
        fk_original_scores.append(flesch_kincaid_grade(original))
        fk_simplified_scores.append(flesch_kincaid_grade(simplified))

        # BLEU (original text acts as the reference)
        bleu_scores.append(bleu_score(original, simplified))

        # Compression
        comp = compression_ratio(original, simplified)
        compression_word_ratios.append(comp['word_ratio'])
        compression_char_ratios.append(comp['char_ratio'])

        # Average Sentence Length
        asl_original_scores.append(average_sentence_length(original)['avg_sentence_length'])
        asl_simplified_scores.append(average_sentence_length(simplified)['avg_sentence_length'])

    # Differences are taken between unrounded means and rounded afterwards
    fk_improvement = _mean_or_nan(fk_original_scores) - _mean_or_nan(fk_simplified_scores)
    word_reduction_pct = (1 - _mean_or_nan(compression_word_ratios)) * 100
    asl_reduction = _mean_or_nan(asl_original_scores) - _mean_or_nan(asl_simplified_scores)

    # Calculate aggregate statistics
    results = {
        'dataset_size': len(data),
        'flesch_kincaid': {
            'original': _summary_stats(fk_original_scores, 2),
            'simplified': _summary_stats(fk_simplified_scores, 2),
            'improvement': float(round(fk_improvement, 2))
        },
        'bleu': _summary_stats(bleu_scores, 4),
        'compression': {
            'word_ratio': _summary_stats(compression_word_ratios, 4),
            'char_ratio': _summary_stats(compression_char_ratios, 4),
            'avg_word_reduction_pct': float(round(word_reduction_pct, 2))
        },
        'avg_sentence_length': {
            'original': _summary_stats(asl_original_scores, 2, include_median=False),
            'simplified': _summary_stats(asl_simplified_scores, 2, include_median=False),
            'reduction': float(round(asl_reduction, 2))
        }
    }

    return results

In [16]:
def print_results(results):
    """Pretty print the aggregated results.

    Args:
        results (dict): Aggregated metrics with the keys read below:
            'dataset_size'; 'flesch_kincaid' ('original' / 'simplified' with
            'mean' and 'std', plus 'improvement'); 'bleu' ('mean', 'std',
            'median'); 'compression' ('word_ratio' / 'char_ratio' with
            'mean', plus 'avg_word_reduction_pct'); and 'avg_sentence_length'
            ('original' / 'simplified' with 'mean' and 'std', plus
            'reduction').

    Returns:
        None. The report is written to standard output.
    """
    fk = results['flesch_kincaid']
    bleu = results['bleu']
    compression = results['compression']
    asl = results['avg_sentence_length']

    print("\n" + "="*70)
    print(f"DATASET METRICS (n={results['dataset_size']} pairs)")
    print("="*70)

    # Section headers use emoji and "±"; these were previously stored as
    # mis-decoded UTF-8 byte sequences.
    print("\n📊 READABILITY (Flesch-Kincaid Grade Level)")
    print(f"  Original:    {fk['original']['mean']:.2f} ± {fk['original']['std']:.2f}")
    print(f"  Simplified:  {fk['simplified']['mean']:.2f} ± {fk['simplified']['std']:.2f}")
    print(f"  Improvement: {fk['improvement']:.2f} grade levels")

    print("\n📝 SEMANTIC SIMILARITY (BLEU Score)")
    print(f"  Mean:   {bleu['mean']:.4f} ± {bleu['std']:.4f}")
    print(f"  Median: {bleu['median']:.4f}")

    print("\n📏 COMPRESSION")
    print(f"  Word Ratio:  {compression['word_ratio']['mean']:.4f} ({compression['avg_word_reduction_pct']:.1f}% reduction)")
    print(f"  Char Ratio:  {compression['char_ratio']['mean']:.4f}")

    print("\n📐 SENTENCE LENGTH (words/sentence)")
    print(f"  Original:    {asl['original']['mean']:.2f} ± {asl['original']['std']:.2f}")
    print(f"  Simplified:  {asl['simplified']['mean']:.2f} ± {asl['simplified']['std']:.2f}")
    print(f"  Reduction:   {asl['reduction']:.2f} words")
    print()

In [17]:
# Example usage (Google Colab): mount Drive, score the test set, save the summary
from google.colab import drive
drive.mount('/content/drive')

if __name__ == "__main__":
    # Both paths are defined once so the confirmation message printed at the
    # end always matches where the file was actually written.
    data_dir = '/content/drive/MyDrive/AML/AML_Final_Project/Data'
    input_path = f'{data_dir}/synthetic_test.json'
    results_path = f'{data_dir}/metrics_results.json'

    # Load your data from JSON file
    with open(input_path, 'r') as f:
        data = json.load(f)

    # Calculate metrics. NOTE: calculate_dataset_metrics accepts use_bertscore
    # but never reads it, so the flag has no effect on speed or output.
    results = calculate_dataset_metrics(data, use_bertscore=True)

    # Print results
    print_results(results)

    # Save the aggregated metrics next to the input data
    with open(results_path, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"Results saved to Google Drive: {results_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

DATASET METRICS (n=4470 pairs)

📊 READABILITY (Flesch-Kincaid Grade Level)
  Original:    21.85 ± 2.84
  Simplified:  11.95 ± 3.10
  Improvement: 9.90 grade levels

📝 SEMANTIC SIMILARITY (BLEU Score)
  Mean:   0.0498 ± 0.0548
  Median: 0.0260

📏 COMPRESSION
  Word Ratio:  0.5916 (40.8% reduction)
  Char Ratio:  0.5208

📐 SENTENCE LENGTH (words/sentence)
  Original:    31.32 ± 5.12
  Simplified:  18.35 ± 4.44
  Reduction:   12.96 words

Results saved to Google Drive: Shareddrives/AML_Final_Project/metrics_results.json
