# Simple Test Script with EN Letters

In [1]:
# Import
import numpy as np
import math
from PIL import Image, ImageDraw, ImageFont
import os

In [2]:
def entropy(freqs):
    total = sum(freqs)
    if total == 0:
        return 0
    return -sum((f / total) * np.log(f / total) for f in freqs if f > 0)

In [None]:
## from https://en.wikipedia.org/wiki/Letter_frequency
english_letter_freq = {
    'a': 0.08167, 'b': 0.01492, 'c': 0.02782, 'd': 0.04253, 'e': 0.12702,
    'f': 0.02228, 'g': 0.02015, 'h': 0.06094, 'i': 0.06966, 'j': 0.00153,
    'k': 0.00772, 'l': 0.04025, 'm': 0.02406, 'n': 0.06749, 'o': 0.07507,
    'p': 0.01929, 'q': 0.00095, 'r': 0.05987, 's': 0.06327, 't': 0.09056,
    'u': 0.02758, 'v': 0.00978, 'w': 0.02360, 'x': 0.00150, 'y': 0.01974,
    'z': 0.00074
}

print("Entropy of letters:", entropy([english_letter_freq[letter] for letter in english_letter_freq.keys()]))

# annotated groups by 3 colleagues -- upper
## losse grouping
similar_letters_upper = ['acegoq', 'bh', 'd', 'f', 'ij', 'k', 'l', 'm', 'np', 'r', 's', 't', 'uvwxy', 'z']
## tighter grouping
similar_letters_upper = ['agq', 'c', 'eo','bh', 'd', 'f', 'ij', 'k', 'l', 'm', 'np', 'r', 's', 't', 'vxy', 'u', 'w','z']

# combine the list of similar letters to form a string to check if we have all letters
# similar_letters_upper_string = ''.join(similar_letters_upper)
# assert(len(similar_letters_upper_string) == 26)

# annotated groups by 3 colleagues -- lower
## losse grouping
similar_letters_lower = ['adu', 'b', 'ce', 'fhilmnr', 'g', 'j', 'kx', 'o', 'p', 'q', 's', 't', 'vw', 'y','z']
## tighter grouping
similar_letters_lower = ['adu', 'b', 'ce', 'fhilmnrt', 'g', 'j', 'kx', 'o', 'p', 'q', 's', 'vw', 'y','z']
# similar_letters_lower_string = ''.join(similar_letters_lower)
# assert(len(similar_letters_lower_string) == 26)

# Calculate the combined frequency of similar letters and put them in a list
similar_letters_freq_upper = []
for letters in similar_letters_upper:
    freq = 0
    for letter in letters:
        freq += english_letter_freq[letter]
    similar_letters_freq_upper.append(freq)

## here, MI(upper) = H(upper) - H(upper|letter ID), and H(upper|letter ID) = 0
print("Entropy of similar letters (upper) --> MI(upper):", entropy(similar_letters_freq_upper))

# Calculate the combined frequency of similar lower letters and put them in a list
similar_letters_freq_lower = []
for letters in similar_letters_lower:
    freq = 0
    for letter in letters:
        freq += english_letter_freq[letter]
    similar_letters_freq_lower.append(freq)

print("Entropy of similar letters (lower) --> MI(lower):", entropy(similar_letters_freq_lower))

Entropy of letters: 2.8944350702720407
Entropy of similar letters (upper) --> MI(upper): 2.589331493486497
Entropy of similar letters (lower) --> MI(lower): 1.7797182590071035


In [None]:
## If use H(letter ID) - H(letter ID| visible part)) --> sanity check
# Entropy H(L) of letters
H_L = -sum(p * math.log(p) for p in english_letter_freq.values())

# Conditional entropy H(L|O)
H_L_given_O = 0.0
for cluster in similar_letters_upper:
    letters = list(cluster)
    cluster_prob = sum(english_letter_freq[l] for l in letters)
    if cluster_prob == 0:
        continue

    cluster_entropy = 0.0
    for l in letters:
        pl = english_letter_freq[l]
        pl_given_o = pl / cluster_prob
        cluster_entropy -= pl_given_o * math.log(pl_given_o)

    H_L_given_O += cluster_prob * cluster_entropy

# Mutual Information
I_L_O = H_L - H_L_given_O

print(f"Entropy H(L) = {H_L:.4f} nats")
print(f"Conditional Entropy H(L|O) = {H_L_given_O:.4f} nats")
print(f"Mutual Information I(L; O) = {I_L_O:.4f} nats")


Entropy H(L) = 2.8944 nats
Conditional Entropy H(L|C) = 0.3051 nats
Mutual Information I(L; C) = 2.5893 nats


In [6]:
# Config
font_path = '/swdata/yin/Cui/Re-Veil/NotoSansCJK-VF.ttf.ttc'
font_size = 220
alphabet = "abcdefghijklmnopqrstuvwxyz"
save_dir = "/swdata/yin/Cui/EM/reveil/data/temp/letters"

# Load font
font = ImageFont.truetype(font_path, font_size, index=2)
font.set_variation_by_axes([400.0])  # Normal weight

# Ensure directories exist
os.makedirs(f"{save_dir}/whole", exist_ok=True)
os.makedirs(f"{save_dir}/upper", exist_ok=True)
os.makedirs(f"{save_dir}/lower", exist_ok=True)

# Define consistent canvas size based on widest and tallest character
max_width = 0
max_height = 0
for char in alphabet:
    dummy_img = Image.new("RGB", (1, 1))
    draw = ImageDraw.Draw(dummy_img)
    bbox = draw.textbbox((0, 0), char, font=font)
    width = bbox[2] - bbox[0]
    height = bbox[3] - bbox[1]
    max_width = max(max_width, width)
    max_height = max(max_height, height)

# Add generous padding
padding = 40
canvas_width = max_width + 2 * padding
canvas_height = max_height + 2 * padding

for text in alphabet:
    # Create blank image
    image = Image.new("RGB", (canvas_width, canvas_height), "white")
    draw = ImageDraw.Draw(image)

    # Compute character's bounding box
    bbox = draw.textbbox((0, 0), text, font=font)
    text_width = bbox[2] - bbox[0]
    text_height = bbox[3] - bbox[1]

    # Center the character within the canvas
    x = (canvas_width - text_width) // 2 - bbox[0]
    y = (canvas_height - text_height) // 2 - bbox[1]

    # Draw full letter
    draw.text((x, y), text, fill="black", font=font)

    # Create upper- and lower-masked versions
    image_u = image.copy()
    image_l = image.copy()
    draw_u = ImageDraw.Draw(image_u)
    draw_l = ImageDraw.Draw(image_l)

    # Mask lower half
    draw_u.rectangle([0, canvas_height // 2, canvas_width, canvas_height], fill="white")
    # Mask upper half
    draw_l.rectangle([0, 0, canvas_width, canvas_height // 2], fill="white")

    # Save all
    image.save(f"{save_dir}/whole/{text}.png")
    image_u.save(f"{save_dir}/upper/{text}.png")
    image_l.save(f"{save_dir}/lower/{text}.png")


In [None]:
# test perceptual similarity metrics on letter n
from skimage.metrics import structural_similarity as ssim
import numpy as np
from PIL import Image

ssimdict = {}

alphabet = "abcdefghijklmnopqrstuvwxyz"
# load two grayscale images of the same size
img1 = np.array(Image.open(f"/swdata/yin/Cui/EM/reveil/data/temp/letters/lower/n.png").convert("L"))
for i in range(26):
    img2 = np.array(Image.open(f"/swdata/yin/Cui/EM/reveil/data/temp/letters/lower/{alphabet[i]}.png").convert("L"))
    if img1.shape != img2.shape:
        raise ValueError("Images must have the same dimensions.")
    # compute SSIM
    score, diff = ssim(img1, img2, full=True)
    ssimdict[f"n-{alphabet[i]}"] = score

# sort the dictionary by SSIM score
sorted_ssim = dict(sorted(ssimdict.items(), key=lambda item: item[1], reverse=True))
print("Sorted SSIM scores:")
for key, value in sorted_ssim.items():
    print(f"{key}: {value}")


Sorted SSIM scores:
n-n: 1.0
n-h: 0.9790152060658808
n-u: 0.962169018876847
n-a: 0.9549007934654298
n-q: 0.9530690822585127
n-p: 0.9508886732355146
n-o: 0.9482252111750056
n-c: 0.946247322625113
n-b: 0.9452765318750579
n-e: 0.9448563864607069
n-d: 0.9445567754618669
n-x: 0.9397132139774363
n-r: 0.9374586405074
n-s: 0.9373816377852875
n-z: 0.9362951105216054
n-w: 0.930590129496269
n-i: 0.9279220715328209
n-f: 0.9266402154809799
n-v: 0.9262825441820689
n-l: 0.9245180393065847
n-t: 0.9236380948978558
n-g: 0.9209390071624466
n-y: 0.9156827934423832
n-k: 0.9156794387301888
n-j: 0.9132833100876664
n-m: 0.8928683568253987


## Baseline -- Letter upper part conditional entropy

### As a sanity check, if threshold set to 0, i.e., score > 0.0, calculate the marginal prob etc., the conditional entropy should be the same as, or very similar to the unconditional entropy. This is the case here.

In [None]:
# get perceptual similarity metrics on all letters
from skimage.metrics import structural_similarity as ssim
import numpy as np
from PIL import Image

ssimdict = {}
conditional_pob_ent_dict = {}
## each target letter has an entropy value
conditional_entdict = {}

alphabet = "abcdefghijklmnopqrstuvwxyz"
# load two grayscale images of the same size

for i in range(26):
    target = alphabet[i]
    p_marginal = 0
    p_similars = []
    img1 = np.array(Image.open(f"/swdata/yin/Cui/EM/reveil/data/temp/letters/upper/{target}.png").convert("L"))
    for j in range(26):
        comparison = alphabet[j]
        # image loading
        img2 = np.array(Image.open(f"/swdata/yin/Cui/EM/reveil/data/temp/letters/upper/{comparison}.png").convert("L"))
        # ensure same size
        if img1.shape != img2.shape:
            raise ValueError("Images must have the same dimensions.")
        
        score, diff = ssim(img1, img2, full=True)
        ssimdict[f"{target}-{comparison}"] = score

        if score > 0.95:    # set to 0 for sanity check
            p_marginal += english_letter_freq[comparison]
            p_similars.append(english_letter_freq[comparison])

    p_target = english_letter_freq[target]
    p_target_given_orth = p_target / p_marginal if p_marginal > 0 else 0

    # calculate conditional entropy
    ent_cond = entropy(p_similars)
    conditional_pob_ent_dict[target] = (p_target_given_orth, ent_cond)


    print(f"{target}|evidence: prob = {p_target_given_orth}, entropy = {ent_cond}")

    # calculate overall conditional entropy
H_L_given_O = 0.0
for letter, (p_target_given_orth, ent_cond) in conditional_pob_ent_dict.items():
    p_letter = english_letter_freq[letter]
    H_L_given_O += p_letter * ent_cond
print(f"Overall conditional entropy H(L|0) = {H_L_given_O:.4f} nats")

a|evidence: prob = 0.08753108119694764, entropy = 2.760326905908609
b|evidence: prob = 0.01566683817584241, entropy = 2.8034470128141677
c|evidence: prob = 0.028506142858606667, entropy = 2.849550294597128
d|evidence: prob = 0.04465888925057491, entropy = 2.8034470128141677
e|evidence: prob = 0.13337813573026158, entropy = 2.8034470128141677
f|evidence: prob = 0.02331299898502653, entropy = 2.8028490786660387
g|evidence: prob = 0.022969245149670567, entropy = 2.7438408509979437
h|evidence: prob = 0.0639904234876566, entropy = 2.8034470128141677
i|evidence: prob = 0.06966069660696608, entropy = 2.8944350702720407
j|evidence: prob = 0.001561622862975249, entropy = 2.84929960671849
k|evidence: prob = 0.008106433694202641, entropy = 2.8034470128141677
l|evidence: prob = 0.04025040250402505, entropy = 2.8944350702720407
m|evidence: prob = 0.08349238296838671, entropy = 1.5862270416370545
n|evidence: prob = 0.07086829145359277, entropy = 2.8034470128141677
o|evidence: prob = 0.08053165697612

## Baseline -- Letter lower part conditional entropy

In [None]:
# test perceptual similarity metrics on letters
from skimage.metrics import structural_similarity as ssim
import numpy as np
from PIL import Image

ssimdict = {}
conditional_pob_ent_dict = {}
## each target letter has an entropy value
conditional_entdict = {}

alphabet = "abcdefghijklmnopqrstuvwxyz"
# Load two grayscale images of the same size

for i in range(26):
    target = alphabet[i]
    p_marginal = 0
    p_similars = []
    img1 = np.array(Image.open(f"/swdata/yin/Cui/EM/reveil/data/temp/letters/lower/{target}.png").convert("L"))
    for j in range(26):
        comparison = alphabet[j]
        img2 = np.array(Image.open(f"/swdata/yin/Cui/EM/reveil/data/temp/letters/lower/{comparison}.png").convert("L"))
        if img1.shape != img2.shape:
            raise ValueError("Images must have the same dimensions.")
        
        score, diff = ssim(img1, img2, full=True)
        ssimdict[f"{target}-{comparison}"] = score

        if score > 0.95:
            p_marginal += english_letter_freq[comparison]
            p_similars.append(english_letter_freq[comparison])

    p_target = english_letter_freq[target]
    p_target_given_orth = p_target / p_marginal if p_marginal > 0 else 0

    # calculate conditional entropy
    ent_cond = entropy(p_similars)
    conditional_pob_ent_dict[target] = (p_target_given_orth, ent_cond)


    print(f"{target}|evidence: prob = {p_target_given_orth}, entropy = {ent_cond}")

    # calculate overall conditional entropy
H_L_given_O = 0.0
for letter, (p_target_given_orth, ent_cond) in conditional_pob_ent_dict.items():
    p_letter = english_letter_freq[letter]
    H_L_given_O += p_letter * ent_cond
print(f"Overall conditional entropy H(L|0) = {H_L_given_O:.4f} nats")

a|evidence: prob = 0.08368428063488162, entropy = 2.849550294597128
b|evidence: prob = 0.015467230619311234, entropy = 2.8143918208921375
c|evidence: prob = 0.027820278202782035, entropy = 2.8944350702720407
d|evidence: prob = 0.045417654471284256, entropy = 2.7420226977207647
e|evidence: prob = 0.12963340953625083, entropy = 2.8533066783708536
f|evidence: prob = 0.022280222802228026, entropy = 2.8944350702720407
g|evidence: prob = 0.029397175536881422, entropy = 2.6499682961028856
h|evidence: prob = 0.06254105090311989, entropy = 2.8423151586273367
i|evidence: prob = 0.06966069660696608, entropy = 2.8944350702720407
j|evidence: prob = 0.0018261025243181952, entropy = 2.639566589032997
k|evidence: prob = 0.008485381402506046, entropy = 2.7434905852840115
l|evidence: prob = 0.041223281680476044, entropy = 2.8499569429260023
m|evidence: prob = 0.04311209862385322, entropy = 2.2112039332731546
n|evidence: prob = 0.06915455001895628, entropy = 2.849550294597128
o|evidence: prob = 0.0785431

## Baseline -- Letter whole part conditional entropy

In [None]:
# perceptual similarity metrics on all letters
from skimage.metrics import structural_similarity as ssim
import numpy as np
from PIL import Image

ssimdict = {}
conditional_pob_ent_dict = {}
## each target letter has an entropy value
conditional_entdict = {}

alphabet = "abcdefghijklmnopqrstuvwxyz"

for i in range(26):
    target = alphabet[i]
    p_marginal = 0
    p_similars = []
    img1 = np.array(Image.open(f"/swdata/yin/Cui/EM/reveil/data/temp/letters/whole/{target}.png").convert("L"))
    for j in range(26):
        comparison = alphabet[j]
        img2 = np.array(Image.open(f"/swdata/yin/Cui/EM/reveil/data/temp/letters/whole/{comparison}.png").convert("L"))
        if img1.shape != img2.shape:
            raise ValueError("Images must have the same dimensions.")

        score, diff = ssim(img1, img2, full=True)
        ssimdict[f"{target}-{comparison}"] = score

        if score > 0.9:
            p_marginal += english_letter_freq[comparison]
            p_similars.append(english_letter_freq[comparison])

    p_target = english_letter_freq[target]
    p_target_given_orth = p_target / p_marginal if p_marginal > 0 else 0

    # calculate conditional entropy
    ent_cond = entropy(p_similars)
    conditional_pob_ent_dict[target] = (p_target_given_orth, ent_cond)


    print(f"{target}|evidence: prob = {p_target_given_orth}, entropy = {ent_cond}")

    # calculate overall conditional entropy
H_L_given_O = 0.0
for letter, (p_target_given_orth, ent_cond) in conditional_pob_ent_dict.items():
    p_letter = english_letter_freq[letter]
    H_L_given_O += p_letter * ent_cond
print(f"Overall conditional entropy H(L|0) = {H_L_given_O:.4f} nats")

a|evidence: prob = 0.18473196109477497, entropy = 1.7096349315395556
b|evidence: prob = 0.19667809122066965, entropy = 0.49576265101622263
c|evidence: prob = 0.08035585338378441, entropy = 1.5164366746465803
d|evidence: prob = 1.0, entropy = -0.0
e|evidence: prob = 0.287154677397477, entropy = 1.7102104580214696
f|evidence: prob = 0.07883376972613404, entropy = 1.516467418338439
g|evidence: prob = 1.0, entropy = -0.0
h|evidence: prob = 0.4251133589117545, entropy = 0.9537963577549367
i|evidence: prob = 0.24515220834066512, entropy = 1.5418017739397445
j|evidence: prob = 0.013729361091170135, entropy = 0.7203972868810202
k|evidence: prob = 1.0, entropy = -0.0
l|evidence: prob = 0.14165053668836883, entropy = 1.5418017739397445
m|evidence: prob = 1.0, entropy = -0.0
n|evidence: prob = 0.1443358497829295, entropy = 1.8322493701467113
o|evidence: prob = 0.16980321194299933, entropy = 1.7096349315395556
p|evidence: prob = 1.0, entropy = -0.0
q|evidence: prob = 1.0, entropy = -0.0
r|evidence