# Statistical Validation Demo

This notebook demonstrates bootstrap confidence intervals and sensitivity analysis for entropy estimates, following best practices in NLP evaluation.


In [None]:
# %%
%load_ext autoreload
%autoreload 2

from pathlib import Path
import json

import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Markdown

from reducelang.validation import (
    block_bootstrap,
    compute_bootstrap_ci,
    run_ablation_study,
    format_sensitivity_results,
)
from reducelang.models import UnigramModel, NGramModel, PPMModel
from reducelang.alphabet import ENGLISH_ALPHABET, ROMANIAN_ALPHABET


## Bootstrap overview

Bootstrap resampling estimates the sampling distribution of a statistic (here, bits/char) by repeatedly resampling the data with replacement. Block bootstrap resamples contiguous blocks rather than individual characters, so the character-to-character dependencies inside each block are preserved.


In [None]:
# %%
# Load a small English corpus sample (adjust the path as needed).
corpus_file = Path("data/corpora/en/2025-10-01/processed/text8.txt")
# Read only the first 50,000 characters instead of loading the whole corpus
# file and slicing afterwards; the resulting sample is identical.
with corpus_file.open(encoding="utf-8") as corpus_handle:
    text = corpus_handle.read(50000)

# Keep the first 80% of the sample for training and hold out the rest.
split_idx = int(len(text) * 0.8)
train_text = text[:split_idx]
test_text = text[split_idx:]

# Fit a depth-5 PPM model on the normalized training split, then evaluate it
# on the normalized held-out split.
model = PPMModel(ENGLISH_ALPHABET, depth=5)
model.fit(ENGLISH_ALPHABET.normalize(train_text))
bpc = model.evaluate(ENGLISH_ALPHABET.normalize(test_text))
print(f"Point estimate: {bpc:.4f} bpc")


## Bootstrap demonstration


In [None]:
# %%
# Block-bootstrap the held-out English text with the fitted model.
# NOTE(review): the held-out split is at most 10,000 raw characters (20% of the
# 50,000-character sample), so block_size=2000 leaves only a handful of blocks
# to resample from; treat the interval as illustrative.
bootstrap_settings = dict(
    block_size=2000,
    n_resamples=100,
    confidence_level=0.95,
    seed=42,  # fixed seed so the resampling is repeatable
)
normalized_test_text = ENGLISH_ALPHABET.normalize(test_text)
bootstrap_results = block_bootstrap(
    text=normalized_test_text,
    model=model,
    **bootstrap_settings,
)

# Summarize the bootstrap distribution.
print(f"Mean: {bootstrap_results['mean_bpc']:.4f} bpc")
print(f"Std: {bootstrap_results['std_bpc']:.4f} bpc")
ci_summary = f"95% CI: [{bootstrap_results['ci_lower_bpc']:.4f}, {bootstrap_results['ci_upper_bpc']:.4f}]"
print(ci_summary)


In [None]:
# %%
# Turn the bits/char point estimate and its bootstrap results into a
# redundancy estimate with a confidence interval.
redundancy_inputs = {
    "bits_per_char": bpc,
    "log2_alphabet_size": ENGLISH_ALPHABET.log2_size,
    "bootstrap_results": bootstrap_results,
}
redundancy_ci = compute_bootstrap_ci(**redundancy_inputs)

# Values are formatted as percentages.
print(f"Redundancy: {redundancy_ci['redundancy']:.2%}")
redundancy_ci_summary = f"95% CI: [{redundancy_ci['ci_lower_redundancy']:.2%}, {redundancy_ci['ci_upper_redundancy']:.2%}]"
print(redundancy_ci_summary)


In [None]:
# %%
# Mark the point estimate and both bootstrap CI bounds as vertical lines.
fig, ax = plt.subplots(figsize=(8, 5))
ax.axvline(bpc, color='red', linestyle='--', label='Point estimate')

# Both CI bounds share one style; only the lower bound carries the legend
# entry so the legend lists the interval once.
ci_line_style = dict(color='blue', linestyle=':')
ax.axvline(bootstrap_results['ci_lower_bpc'], label='95% CI', **ci_line_style)
ax.axvline(bootstrap_results['ci_upper_bpc'], **ci_line_style)

ax.set(xlabel='Bits per character', title='Bootstrap Confidence Interval')
ax.legend()
plt.show()


## Sensitivity analysis overview

Sensitivity analysis tests how robust entropy estimates are to preprocessing choices. For Romanian, we test the effect of removing diacritics. For all languages, we test removing the space character from the alphabet.


In [None]:
# %%
# Romanian sample (adjust path as needed).
corpus_file_ro = Path("data/corpora/ro/2025-10-01/processed/opus.txt")
# Read only the first 50,000 characters instead of loading the whole corpus
# file and slicing afterwards; the resulting sample is identical.
with corpus_file_ro.open(encoding="utf-8") as corpus_handle_ro:
    text_ro = corpus_handle_ro.read(50000)

# Keep the first 80% of the sample for training and hold out the rest.
split_idx_ro = int(len(text_ro) * 0.8)
train_text_ro = text_ro[:split_idx_ro]
test_text_ro = text_ro[split_idx_ro:]


In [None]:
# %%
# Run the ablation study on the Romanian split with a depth-5 PPM model.
# The raw (un-normalized) splits are passed in here.
# NOTE(review): presumably run_ablation_study normalizes per variant - confirm.
ppm_model_kwargs = {"depth": 5, "escape_method": "A"}
ablation_names = ["no_diacritics", "no_space"]
sensitivity_results = run_ablation_study(
    model_class=PPMModel,
    alphabet=ROMANIAN_ALPHABET,
    train_text=train_text_ro,
    test_text=test_text_ro,
    model_kwargs=ppm_model_kwargs,
    ablations=ablation_names,
)

# Report the baseline first, then each variant with its deltas.
print(f"Baseline: {sensitivity_results['baseline']['bits_per_char']:.4f} bpc")
for variant in sensitivity_results['variants']:
    variant_line = f"{variant['name']}: {variant['bits_per_char']:.4f} bpc (Δ={variant['delta_bpc']:+.4f}, ΔR={variant['delta_redundancy']:+.2%})"
    print(variant_line)


In [None]:
# %%
# Format the ablation results as a Markdown table and render it as the cell's
# rich output. ``Markdown`` is imported in the notebook's top import cell so
# that all dependencies are declared in one place.
sensitivity_table = format_sensitivity_results(sensitivity_results, output_format="markdown")
Markdown(sensitivity_table)


In [None]:
# %%
# Side-by-side bar charts: entropy (left) and redundancy (right) per variant.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Put the baseline first, followed by every ablation variant.
baseline_row = sensitivity_results['baseline']
variant_rows = sensitivity_results['variants']
variants = ['Baseline', *(row['name'] for row in variant_rows)]
bpcs = [baseline_row['bits_per_char'], *(row['bits_per_char'] for row in variant_rows)]
redundancies = [baseline_row['redundancy'], *(row['redundancy'] for row in variant_rows)]

# The baseline bar is blue; the ablation bars share a per-panel colour.
n_ablations = len(variants) - 1
redundancy_percent = [r * 100 for r in redundancies]  # fractions -> percent

ax1.bar(variants, bpcs, color=['blue'] + ['orange'] * n_ablations)
ax1.set(ylabel='Bits per character', title='Entropy by Variant')

ax2.bar(variants, redundancy_percent, color=['blue'] + ['green'] * n_ablations)
ax2.set(ylabel='Redundancy (%)', title='Redundancy by Variant')

# Tilt the variant names on both panels.
for panel in (ax1, ax2):
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()


### Interpretation

Removing diacritics from Romanian often increases entropy, showing diacritics carry information. Removing space typically increases entropy more, indicating word boundaries are informative.


### Summary

Bootstrap confidence intervals quantify uncertainty in entropy estimates. Sensitivity analysis tests robustness to preprocessing choices. Both should be reported alongside point estimates.


In [None]:
# %%
def _json_default(value):
    """Convert NumPy scalars and arrays to plain Python objects for ``json``.

    ``json`` calls this hook only for objects it cannot serialize itself, so
    results made of built-in types are written exactly as before.

    Raises:
        TypeError: if ``value`` is not a NumPy scalar or array.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Bundle both analyses into a single JSON document.
# NOTE(review): the result dicts may contain NumPy scalars/arrays (their exact
# types are not visible here); the default hook covers that case.
output = {"bootstrap": bootstrap_results, "sensitivity": sensitivity_results}

# Serialize before opening the file so a serialization error cannot leave a
# truncated validation_results.json behind.
serialized_output = json.dumps(output, indent=2, default=_json_default)
with open("validation_results.json", "w", encoding="utf-8") as f:
    f.write(serialized_output)
print("Results saved to validation_results.json")
