In [1]:
# Import
import numpy as np
import math
from PIL import Image, ImageDraw, ImageFont
import os
import json

# test perceptual similarity metrics on letters
from skimage.metrics import structural_similarity as ssim
import numpy as np
from PIL import Image
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
def entropy(freqs):
    """Shannon entropy, in nats, of the distribution obtained by normalising `freqs`.

    Args:
        freqs: Iterable of non-negative weights (counts or probabilities).
            They do not need to sum to 1; they are normalised by their total.
            Any iterable is accepted, including one-shot generators.

    Returns:
        ``-sum(p * ln(p))`` over the strictly positive entries, where
        ``p = f / total``. Returns 0 when the total weight is 0 (which
        includes empty input).
    """
    # Materialise once: the values are traversed twice (total, then terms), and
    # a generator would otherwise be exhausted by the first pass, silently
    # yielding an entropy of 0.
    values = list(freqs)
    total = sum(values)
    if total == 0:
        return 0
    return -sum((f / total) * np.log(f / total) for f in values if f > 0)

## Baseline -- English conditional entropy

### As a sanity check, set the threshold to 0 (i.e., keep every pair with score > 0.0) and recompute the marginal probabilities: the conditional entropy should then be the same as, or very close to, the unconditional entropy.

In [None]:
## NOTE: this is the initial version, which uses the raw SSIM score with a hard
## threshold, not the scaled SSIM score.
## The scaled SSIM score discriminates better between similar and dissimilar images.

visible = "upper"
lang = "en"
threshold = 0.2

if lang == "en":
    img_path = f"/swdata/yin/Cui/EM/reveil/data/en/onestop/en_word_onestop_uniWords_newFont/{visible}/images"
    jsonl_file_path = f"/swdata/yin/Cui/EM/reveil/data/en/onestop/en_word_onestop_uniWords_newFont/{visible}/img_dict_test.json"
    out_path = "/swdata/yin/Cui/EM/reveil/precomputed/baseline_onestop/EN"
elif lang == "zh":
    img_path = f"/swdata/yin/Cui/EM/reveil/data/zh/zh-char/onestop/zh_char_onestop_uniqueChar_newFont/{visible}/images"
    jsonl_file_path = f"/swdata/yin/Cui/EM/reveil/data/zh/zh-char/onestop/zh_char_onestop_uniqueChar_newFont/{visible}/img_dict_test.json"
    out_path = "/swdata/yin/Cui/EM/reveil/precomputed/baseline_onestop/ZH"
else:
    # Fail fast: without this, the path variables would be undefined or, worse,
    # silently left over from a previous run of this cell in the same kernel.
    raise ValueError(f"Unsupported lang {lang!r}; expected 'en' or 'zh'")
# img_path = f"/swdata/yin/Cui/Re-Veil/create-dataset-new/dataset_en/en_word_{visible}_20250513/images"
# jsonl_file_path = f"/swdata/yin/Cui/EM/reveil/data/en/en-word/{visible}/img_dict_test.json"

# target label -> (P(target | similar set), entropy of similar set, prior, marginal)
conditional_prob_ent_dict = {}

# Every image is resized to this fixed shape so SSIM compares equal-sized arrays
resize_shape = (128, 128)

with open(jsonl_file_path, 'r') as file:
    lines = file.readlines()
    print("total lines:", len(lines))

# Parse every record and decode/resize every image exactly once. The pairwise
# loop below visits n^2 pairs; re-reading the JSON and the image file inside it
# would cost n^2 disk reads for identical pixel data.
entries = []  # (label, prior frequency, grayscale pixel array)
for line in lines:
    if not line.strip():
        continue  # tolerate blank / trailing lines in the JSONL file
    data = json.loads(line)
    with Image.open(img_path + "/" + data["image"]) as pil_img:
        pixels = np.array(pil_img.convert("L").resize(resize_shape))
    entries.append((data["label"], data["normal_freq"], pixels))

comparison_counter = 0  # only used by the optional visual-inspection snippet below
sum_freq = 0.0

for target, p_target, img_target in entries:
    sum_freq += p_target   ## accumulate total frequency; the sum should be ~1.0

    p_marginal = 0     # total prior mass of the items judged similar to the target
    p_similars = []    # individual priors of those items

    # compare img_target with every image (the target itself included)
    for comparison, p_comp, img_comp in entries:
        similarity = ssim(img_target, img_comp)
        if similarity > threshold:  # threshold for considering similar
            p_similars.append(p_comp)
            p_marginal += p_comp

        # # Show the two images every 400th comparison for visual inspection
        # comparison_counter += 1
        # if comparison_counter % 400 == 0:
        #     print(f"[{comparison_counter}] {target} vs {comparison}, SSIM = {similarity:.3f}")
        #     fig, axes = plt.subplots(1, 2, figsize=(4, 2))
        #     axes[0].imshow(img_target, cmap="gray")
        #     axes[0].set_title(f"Target: {target}")
        #     axes[0].axis("off")
        #     axes[1].imshow(img_comp, cmap="gray")
        #     axes[1].set_title(f"Comp: {comparison}")
        #     axes[1].axis("off")
        #     plt.show()

    # Probability of the target within its similar set, and entropy of that set
    p_target_given_orth = p_target / p_marginal if p_marginal > 0 else 0
    ent_cond = entropy(p_similars) if len(p_similars) > 0 else 0
    conditional_prob_ent_dict[target] = (p_target_given_orth, ent_cond, p_target, p_marginal)
    print(f"{target}|orthographic representations --> prob = {p_target_given_orth}, entropy = {ent_cond}, freq = {p_target}, marginal prob = {p_marginal}")

# compute overall conditional entropy: prior-weighted mean of the per-target entropies
H_W_given_O = 0.0
rows = []
for word, (p_target_given_orth, ent_cond, freq_target, p_marginal) in conditional_prob_ent_dict.items():
    H_W_given_O += freq_target * ent_cond
    rows.append({
        "target": word,
        "prob_target": p_target_given_orth,
        "entropy": ent_cond,
        "freq": freq_target,
        "marginal": p_marginal
    })

print(f"Overall conditional entropy H(L|O) = {H_W_given_O:.4f} nats")
print(f"Sum of frequencies (should be ~1.0) = {sum_freq:.4f}")

# df = pd.DataFrame(rows, columns=["target", "prob_target", "entropy", "freq", "marginal"])
# out_csv = f"{out_path}/baseline_{lang}_word_{visible}_{threshold}.csv"
# df.to_csv(out_csv, index=False)
# print(f"Results saved to {out_csv}")

total lines: 692
Year|orthographic representations --> prob = 0.0012620344886189255, entropy = 4.665206077661126, freq = 0.0012620344886189, marginal prob = 0.9999999999999799
Old|orthographic representations --> prob = 0.0007780594319694201, entropy = 4.663097475696409, freq = 0.0007777010774164, marginal prob = 0.9995394252183114
Bottle|orthographic representations --> prob = 5.176569070873968e-05, entropy = 4.65327268977921, freq = 5.147772256208681e-05, marginal prob = 0.9944370848198835
Message|orthographic representations --> prob = 0.0001383809746292028, entropy = 4.665206077661126, freq = 0.0001383809746292, marginal prob = 0.9999999999999799
Angela|orthographic representations --> prob = 1.453000233607318e-05, entropy = 4.665206077661126, freq = 1.4530002336072889e-05, marginal prob = 0.9999999999999799
Erdmann|orthographic representations --> prob = 1.8266288651063427e-07, entropy = 4.665206077661126, freq = 1.826628865106306e-07, marginal prob = 0.9999999999999799
never|orth

In [None]:

# Unconditional entropy of the prior word frequencies (element 2 of each stored
# tuple), for comparison with the conditional entropy computed above.
print(
    "Entropy of EN word:",
    entropy([stats[2] for stats in conditional_prob_ent_dict.values()]),
)

Entropy of EN word: 4.665206077661126


#### Scale the SSIM score with an exponential function and a temperature alpha = 10.0.
#### These results are the numbers reported in the paper.

In [None]:
import os
import json
import numpy as np
import pandas as pd
from PIL import Image
from skimage.metrics import structural_similarity as ssim
from scipy.stats import entropy

# -----------------------------
# Configurations
# -----------------------------
visible = "whole"   # which part of each glyph is rendered: "whole", "upper" or "lower"
lang = "en"         # "en" or "zh"
resize_shape = (128, 128)   # every image is resized to this shape before SSIM

# Similarity transformation parameters
sim_method = "exp"   # options: "exp", "power", "minmax", "raw"
alpha = 10.0         # strength for exp scaling
beta = 2.0           # strength for power scaling

# Dataset root and output directory depend on the language.
if lang == "en":
    dataset_dir = f"/swdata/yin/Cui/EM/reveil/data/en/onestop/en_word_onestop_uniWords_newFont/{visible}"
    out_path = "/swdata/yin/Cui/EM/reveil/precomputed/baseline_onestop/EN"
elif lang == "zh":
    dataset_dir = f"/swdata/yin/Cui/EM/reveil/data/zh/zh-char/onestop/zh_char_onestop_uniqueChar_newFont/{visible}"
    out_path = "/swdata/yin/Cui/EM/reveil/precomputed/baseline_onestop/ZH"
else:
    # Fail fast: otherwise the path variables would be undefined or silently
    # left over from another cell that assigns the same names.
    raise ValueError(f"Unsupported lang {lang!r}; expected 'en' or 'zh'")

img_path = f"{dataset_dir}/images"
jsonl_file_path = f"{dataset_dir}/img_dict_test.json"

# -----------------------------
# Similarity transformation
# -----------------------------
def transform_similarity(sim, method="exp", alpha=2.0, beta=2.0):
    """
    Transform a similarity score into an (unnormalised) likelihood.

    Args:
        sim: Similarity score(s); a scalar or any array-like. Array input is
            transformed element-wise.
        method: Name of the transformation:
            - "exp":    ``exp(alpha * sim)``
            - "power":  ``max(sim, 1e-6) ** beta``
            - "minmax": ``clip(sim, 0, 1) ** beta`` (clipping only, no rescaling)
            - anything else (e.g. "raw"): ``max(sim, 1e-6)``
        alpha: Scale factor used by "exp".
        beta: Exponent used by "power" and "minmax".

    Returns:
        The transformed value(s): a NumPy float for scalar input, an ndarray
        for array-like input.
    """
    # Work on a float array so every branch handles scalars and arrays alike;
    # the built-in max() used previously raised on array input.
    sim = np.asarray(sim, dtype=float)
    floor = 1e-6  # keeps "power"/"raw" likelihoods strictly positive

    if method == "exp":
        return np.exp(alpha * sim)
    if method == "power":
        return np.power(np.maximum(sim, floor), beta)
    if method == "minmax":
        return np.power(np.clip(sim, 0.0, 1.0), beta)
    # "raw" (also the fallback for unrecognised method names)
    return np.maximum(sim, floor)

# -----------------------------
# Load dataset
# -----------------------------
with open(jsonl_file_path, 'r') as file:
    lines = file.readlines()
print("total lines:", len(lines))

# One JSON record per line; blank lines (e.g. a trailing newline at the end of
# the file) are skipped instead of crashing json.loads.
data_list = [json.loads(line) for line in lines if line.strip()]

# Preload images to memory: label -> (grayscale pixel array, prior frequency).
# NOTE(review): records sharing a label overwrite one another here - confirm
# that labels are unique in the input file.
image_cache = {}
for data in data_list:
    label = data["label"]
    img_file = os.path.join(img_path, data["image"])
    # The context manager closes the underlying file handle deterministically.
    with Image.open(img_file) as pil_img:
        img_array = np.array(pil_img.convert("L").resize(resize_shape))
    image_cache[label] = (img_array, data["normal_freq"])

# -----------------------------
# Bayesian baseline computation
# -----------------------------
results = {}
overall_entropy = 0.0
rows = []

for target_label, (target_img, p_target_prior) in image_cache.items():
    # One (label, prior, likelihood) triple per candidate; the target itself is
    # one of the candidates.
    likelihoods = [
        (
            comp_label,
            p_comp_prior,
            transform_similarity(
                ssim(target_img, comp_img),
                method=sim_method,
                alpha=alpha,
                beta=beta,
            ),
        )
        for comp_label, (comp_img, p_comp_prior) in image_cache.items()
    ]

    # Marginal likelihood (normalising constant of the posterior)
    Z = sum(prior * like for _, prior, like in likelihoods)

    # Posterior over candidates; all zeros if the marginal is not positive
    posteriors = {
        cand: ((prior * like) / Z if Z > 0 else 0)
        for cand, prior, like in likelihoods
    }

    # Entropy of the posterior, in nats
    ent_cond = entropy(list(posteriors.values()))
    post_target = posteriors[target_label]

    results[target_label] = {
        "posterior_entropy": ent_cond,
        "prior": p_target_prior,
        "posterior_target": post_target,
        "marginal_likelihood": Z,
    }

    # Prior-weighted contribution of this target to the overall entropy
    overall_entropy += p_target_prior * ent_cond

    rows.append({
        "target": target_label,
        "prior": p_target_prior,
        "posterior_target": post_target,
        "entropy": ent_cond,
        "marginal_likelihood": Z,
    })

# -----------------------------
# Summarize results
# -----------------------------
print(f"Overall conditional entropy H(W|O) = {overall_entropy:.4f} nats")

# Write the per-target table to CSV; the file name encodes the run settings.
os.makedirs(out_path, exist_ok=True)
out_csv = f"{out_path}/bayesian_baseline_{lang}_{visible}_{sim_method}_{alpha}.csv"

df = pd.DataFrame(rows)
df.to_csv(out_csv, index=False)

print(f"Results saved to {out_csv}")
