# Jane Eyre - A Mathematical Theory of Communication

[A Mathematical Theory of Communication by Claude Shannon](https://people.math.harvard.edu/~ctm/home/text/others/shannon/entropy/entropy.pdf)

[The Mathematical Theory of Communication by Claude Shannon and Warren Weaver](https://pure.mpg.de/rest/items/item_2383164/component/file_2383163/content)

[Jane Eyre: An Autobiography by Charlotte Brontë](https://www.gutenberg.org/ebooks/1260)

## Imports

In [1]:
# The random module from the Python Standard Library.
import random

## Zero-order Approximation

In [2]:
# The alphabet of allowed symbols: the 26 upper-case letters A-Z followed by a space.
symbols = "".join(chr(code) for code in range(ord("A"), ord("Z") + 1)) + " "

Select several elements from `symbols` uniformly at random, with replacement.

https://docs.python.org/3/library/random.html#random.choices

In [3]:
# Draw 100 symbols uniformly at random (with replacement) from the allowed symbols.
L = random.choices(population=symbols, k=100)

# Display the draws joined into one string.
"".join(L)

'UFBIGLFOMQPIFQUHMLZTSCTKQEUUBYLGODRXGGMUCRYWEPJUYMKRXERMUSEMSCTT X EATBKCH NMRYTRJSLKV CPJPPFHOVSVUJ'

## First-order Approximation

In [4]:
# Open the book. The encoding is passed explicitly because open() otherwise
# falls back to a platform-dependent default, which can mis-decode the file.
# NOTE(review): assumes the file is the UTF-8 plain-text download - confirm.
with open('data/janeeyre.txt', 'r', encoding='utf-8') as f:
  # Read the book into one long string, upper-cased so that its letters match
  # the upper-case symbols that are counted later.
  text = f.read().upper()

FileNotFoundError: [Errno 2] No such file or directory: 'data/janeeyre.txt'

In [None]:
# For every allowed symbol, the number of times it occurs in the book.
counts = dict(zip(symbols, map(text.count, symbols)))

In [None]:
# Show the counts: a dictionary mapping each allowed symbol to its number of
# occurrences in the book.
counts

{'A': 62860,
 'B': 11161,
 'C': 18771,
 'D': 37462,
 'E': 100508,
 'F': 16889,
 'G': 15093,
 'H': 45904,
 'I': 56073,
 'J': 1223,
 'K': 6035,
 'L': 32495,
 'M': 22285,
 'N': 54288,
 'O': 60583,
 'P': 12143,
 'Q': 948,
 'R': 47417,
 'S': 50123,
 'T': 67127,
 'U': 23495,
 'V': 7629,
 'W': 18705,
 'X': 1277,
 'Y': 17307,
 'Z': 329,
 ' ': 168745}

In [None]:
# The number of E's: a single symbol's count is looked up by its key.
counts['E']

100508

In [None]:
# Number of different items: the dictionary has one entry per allowed symbol.
len(counts)

27

In [None]:
# List the (symbol, count) pairs from the most frequent to the least frequent.
# Adapted from: https://stackoverflow.com/a/613218
sorted(counts.items(), key=lambda pair: pair[1], reverse=True)

[(' ', 168745),
 ('E', 100508),
 ('T', 67127),
 ('A', 62860),
 ('O', 60583),
 ('I', 56073),
 ('N', 54288),
 ('S', 50123),
 ('R', 47417),
 ('H', 45904),
 ('D', 37462),
 ('L', 32495),
 ('U', 23495),
 ('M', 22285),
 ('C', 18771),
 ('W', 18705),
 ('Y', 17307),
 ('F', 16889),
 ('G', 15093),
 ('P', 12143),
 ('B', 11161),
 ('V', 7629),
 ('K', 6035),
 ('X', 1277),
 ('J', 1223),
 ('Q', 948),
 ('Z', 329)]

In [None]:
# The dictionary keys: the symbols, later used as the population to draw from.
counts.keys()

dict_keys(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', ' '])

In [None]:
# The dictionary values: the counts, later used as the weights of the draw.
counts.values()

dict_values([62860, 11161, 18771, 37462, 100508, 16889, 15093, 45904, 56073, 1223, 6035, 32495, 22285, 54288, 60583, 12143, 948, 47417, 50123, 67127, 23495, 7629, 18705, 1277, 17307, 329, 168745])

In [None]:
# Draw 100 symbols at random (with replacement), each symbol weighted by the
# number of times it occurs in the book.
L = random.choices(list(counts), weights=list(counts.values()), k=100)

# Display the draws joined into one string.
"".join(L)

'  EETAI  SODUN AFALENK   LHDEGEEIDEILEP FTRHMAREOWTSDC N ET ORKTSNSALA RE  IEDTHLSOAEOSILETP DT ONEE'

## Second-order Approximation

In [None]:
# Every ordered pair of allowed symbols, as two-character strings.
symsq = [lead + follow for lead in symbols for follow in symbols]

# Display the first ten and the last ten pairs.
(symsq[:10], symsq[-10:])

(['AA', 'AB', 'AC', 'AD', 'AE', 'AF', 'AG', 'AH', 'AI', 'AJ'],
 [' R', ' S', ' T', ' U', ' V', ' W', ' X', ' Y', ' Z', '  '])

In [None]:
# Counts of every pair of adjacent symbols in the book.
# Each position in the text is inspected exactly once, so overlapping
# occurrences are included: 'AAA' contains the pair 'AA' twice, whereas
# 'AAA'.count('AA') only reports non-overlapping matches and gives 1.
# A single pass also avoids scanning the whole book once per pair.
countsq = dict.fromkeys(symsq, 0)
for first, second in zip(text, text[1:]):
  pair = first + second
  # Pairs containing a character outside the allowed symbols are ignored.
  if pair in countsq:
    countsq[pair] += 1

In [None]:
# Example: the count recorded for the pair 'AA'.
countsq['AA']

3

In [None]:
# Another example: the count recorded for the pair 'EA'.
countsq['EA']

5239

In [None]:
# Our eventual output, starting from the single symbol 'T'.
output = 'T'

In [None]:
# Pick out the pair counts whose pair starts with the last symbol of the
# output, keyed by the symbol that follows it.
condcounts = dict(zip(symbols, (countsq[output[-1] + follower] for follower in symbols)))

# Display the conditional counts.
condcounts

{'A': 2263,
 'B': 2,
 'C': 310,
 'D': 3,
 'E': 6382,
 'F': 52,
 'G': 1,
 'H': 18811,
 'I': 4190,
 'J': 0,
 'K': 0,
 'L': 1079,
 'M': 92,
 'N': 83,
 'O': 7190,
 'P': 0,
 'Q': 0,
 'R': 1926,
 'S': 1100,
 'T': 1368,
 'U': 1258,
 'V': 0,
 'W': 394,
 'X': 0,
 'Y': 836,
 'Z': 4,
 ' ': 15030}

In [None]:
# Draw one next symbol at random, weighted by the conditional counts.
random.choices(list(condcounts), weights=list(condcounts.values()), k=1)

['H']

In [None]:
# Our eventual output, starting from the single symbol 'T'.
output = 'T'

# Append 99 more symbols, giving 100 in total.
for i in range(1, 100):
  # The counts for keys beginning with the last letter.
  condcounts = {s: countsq[output[-1] + s] for s in symbols}
  # The next letter, drawn with the conditional counts as weights.
  # Named next_symbol so that the built-in next() is not shadowed for the
  # rest of the kernel session.
  next_symbol = random.choices(list(condcounts), weights=list(condcounts.values()), k=1)[0]
  # Add to the output.
  output += next_symbol

# Show.
output

'TED PEANG AMAND THE ATSHAFFA CAS ORATHTHEERD WHIVE ITCECHEE IOFLOF BE HERAD HENCANEAND F MED GSS N B'

## End