### Step 1 — Imports & sample text
Use any text; you can later try word‑level tokens. For now, use characters.

In [1]:
import math
from collections import Counter
import heapq

# Sample corpus for the character-level demo. The newline right after the opening
# quotes and the one right before the closing quotes are part of the string, so
# they are counted as symbols along with every other character.
text = """
to be, or not to be, that is the question—
whether 'tis nobler in the mind to suffer
the slings and arrows of outrageous fortune
"""

### Step 2 — Symbol frequencies & entropy
Build the PMF over characters and compute H(X) in bits/symbol.

In [5]:
def pmf_from_text(s, keep_space=True):
    """Estimate the empirical symbol distribution of ``s``.

    Parameters
    ----------
    s : str
        Text whose characters are the symbols.
    keep_space : bool, default True
        When False, every character for which ``str.isspace()`` is true
        (spaces, tabs, newlines, ...) is removed before counting.

    Returns
    -------
    (pmf, counts, total)
        ``pmf`` maps each symbol to its relative frequency, ``counts`` is the
        ``Counter`` of raw occurrences, and ``total`` is the number of symbols
        counted. Empty input yields ``({}, Counter(), 0)``.
    """
    symbols = s if keep_space else ''.join(ch for ch in s if not ch.isspace())
    counts = Counter(symbols)
    total = sum(counts.values())
    pmf = {}
    for sym, occurrences in counts.items():
        pmf[sym] = occurrences / total
    return pmf, counts, total

def entropy(pmf):
    """Return the Shannon entropy of ``pmf`` in bits per symbol.

    Parameters
    ----------
    pmf : dict
        Maps each symbol to its probability. Entries with probability 0 (or
        less) are skipped, following the convention 0 * log2(0) = 0.

    Returns
    -------
    float or int
        ``sum(-p * log2(p))`` over the positive probabilities; 0 for an empty
        mapping.
    """
    # Negate each term rather than the whole sum: a one-symbol distribution
    # then yields 0.0 instead of -0.0 (which would print as "-0.000"), and all
    # other results are unchanged because float negation is exact.
    return sum(-p * math.log2(p) for p in pmf.values() if p > 0)
    
# Character distribution of the sample text; whitespace (spaces and newlines) is kept as symbols.
pmf, counts, total = pmf_from_text(text, keep_space=True)
# H is the entropy of that distribution in bits/symbol.
H = entropy(pmf)
print(f"Symbols: {len(pmf)} Total: {total} Entropy: {H:.3f} bits/sym")

Symbols: 23 Total: 130 Entropy: 4.016 bits/sym


### Step 3 — Build Huffman tree & codebook
We construct a binary tree by repeatedly merging the two least‑probable nodes.

In [12]:
class Node:
    """Node of a binary Huffman tree.

    Attributes (declared via ``__slots__``, so instances have no ``__dict__``):
        prob  -- weight of the node.
        sym   -- symbol stored at the node, ``None`` by default.
        left  -- left child, ``None`` by default.
        right -- right child, ``None`` by default.

    Ordering compares ``prob`` only, which is what ``heapq`` needs to pop the
    lowest-weight node first.
    """

    __slots__ = ('prob', 'sym', 'left', 'right')

    def __init__(self, prob, sym=None, left=None, right=None):
        self.prob, self.sym = prob, sym
        self.left, self.right = left, right

    def __lt__(self, other):
        return self.prob < other.prob
        
def huffman_code(pmf):
    """Build a Huffman prefix code for the symbols of ``pmf``.

    Parameters
    ----------
    pmf : dict
        Maps each symbol to its probability (any mutually comparable,
        addable weights work).

    Returns
    -------
    dict
        Maps each symbol to its codeword, a string of '0'/'1' characters.
        An empty ``pmf`` gives ``{}``; a single-symbol ``pmf`` gives that
        symbol the one-bit codeword '0'. Entries are inserted in left-to-right
        leaf order of the tree.
    """
    heap = [Node(p, sym=s) for s, p in pmf.items()]
    if not heap:
        # Nothing to encode; without this guard heap[0] below raises IndexError.
        return {}
    heapq.heapify(heap)
    if len(heap) == 1:
        # A lone symbol would otherwise get the empty codeword.
        return {heap[0].sym: '0'}
    # Repeatedly merge the two lowest-weight nodes until one root remains.
    while len(heap) > 1:
        a = heapq.heappop(heap)
        b = heapq.heappop(heap)
        heapq.heappush(heap, Node(a.prob + b.prob, left=a, right=b))
    code = {}
    # Iterative depth-first walk, so very skewed trees cannot hit the recursion
    # limit. The right child is pushed first so the left subtree is visited
    # first, keeping the insertion order of a left-first recursive walk.
    stack = [(heap[0], '')]
    while stack:
        node, bits = stack.pop()
        if node.left is None:
            # Leaves are the nodes without children; testing the children
            # rather than ``sym`` lets ``None`` itself be a valid symbol.
            code[node.sym] = bits
        else:
            stack.append((node.right, bits + '1'))
            stack.append((node.left, bits + '0'))
    return code
    
# Codebook for the character PMF: maps each symbol to its bit string.
code = huffman_code(pmf)
# Show only the first 10 entries (in dict insertion order) to keep the output short.
print(dict(list(code.items())[:10]))

{'i': '0000', 'h': '0001', 'e': '001', 't': '010', '\n': '01100', "'": '0110100', '—': '0110101', 'l': '011011', 'a': '01110', ',': '011110'}


### Step 4 — Encode & decode
Use the codebook to map symbols to bits, and then invert for decoding.

In [7]:
def encode(data, code):
    """Concatenate the codewords of every symbol in ``data``.

    ``code`` maps symbols to bit strings; a symbol missing from it raises
    ``KeyError``. Returns a single string (empty for empty ``data``).
    """
    pieces = []
    for symbol in data:
        pieces.append(code[symbol])
    return ''.join(pieces)
    
def decode(bitstring, code):
    """Decode ``bitstring`` with the prefix-free codebook ``code``.

    Parameters
    ----------
    bitstring : str
        Sequence of single-character bits, e.g. the output of ``encode``.
    code : dict
        Maps each symbol (a string) to its codeword.

    Returns
    -------
    str
        The decoded symbols joined together.

    Raises
    ------
    ValueError
        If the stream cannot be split into codewords: either the pending bits
        have reached the longest codeword length without matching, or the
        stream ends in the middle of a codeword. Such input used to be
        dropped silently, returning truncated text.
    """
    inv = {bits: sym for sym, bits in code.items()}
    # No pending buffer may grow beyond the longest codeword.
    max_len = max(map(len, inv), default=0)
    out = []
    buf = ''
    for pos, b in enumerate(bitstring):
        buf += b
        if buf in inv:
            out.append(inv[buf])
            buf = ''
        elif len(buf) >= max_len:
            raise ValueError(f"no codeword matches {buf!r} (ending at bit index {pos})")
    if buf:
        raise ValueError(f"bitstream ends inside a codeword: leftover bits {buf!r}")
    return ''.join(out)
    
# Round-trip check: encoding and then decoding must reproduce the original text exactly.
bitstream = encode(text, code)  # a str of '0'/'1' characters, one character per bit
decoded = decode(bitstream, code)
print('Round-trip OK?', decoded == text)

Round-trip OK? True


### Step 5 — Evaluate compression
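Compute the average code length $L_{avg} = \sum_x p(x)\,\ell(x)$ (where $\ell(x)$ is the codeword length of symbol $x$), compare it with the entropy lower bound $H(X)$, and report the compressed size relative to a fixed 8-bit-per-character baseline (ratio below 1 means Huffman is smaller).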

In [11]:
def average_code_length(pmf, code):
    """Return the expected codeword length in bits per symbol.

    Each symbol's codeword length ``len(code[sym])`` is weighted by its
    probability ``pmf[sym]``. Every symbol of ``pmf`` must be present in
    ``code`` (``KeyError`` otherwise). An empty ``pmf`` gives 0.
    """
    weighted_lengths = (prob * len(code[symbol]) for symbol, prob in pmf.items())
    return sum(weighted_lengths)

Lavg = average_code_length(pmf, code)
bits_huffman = len(bitstream)  # bitstream holds one character per bit, so its length is the bit count
bits_fixed8 = len(text) * 8  # baseline: 8 bits per character (len counts characters, not UTF-8 bytes)
# Compressed size divided by baseline size, so values below 1 mean the Huffman stream is smaller.
ratio_vs_8bit = bits_huffman / bits_fixed8 if bits_fixed8>0 else float('inf')
redundancy = Lavg - H  # extra bits/symbol spent relative to the entropy

print(f"L_avg = {Lavg:.3f} bits/sym | Entropy H = {H:.3f} bits/sym | Redundancy = {redundancy:+.3f} bits/sym")
print(f"Total bits (Huffman) = {bits_huffman} | Fixed 8-bit = {bits_fixed8} | Compression ratio = {ratio_vs_8bit:.3f}")

L_avg = 4.054 bits/sym | Entropy H = 4.016 bits/sym | Redundancy = +0.038 bits/sym
Total bits (Huffman) = 527 | Fixed 8-bit = 1040 | Compression ratio = 0.507
