In [131]:
import numpy as np
import pandas as pd
import math
import functools
import operator
import itertools
from scipy.stats import poisson

In [121]:
monogram = pd.read_table('english_monograms.txt', header=None, delim_whitespace=True)
bigram = pd.read_csv('english_bigrams_1.txt', header=None, delim_whitespace=True)
words = pd.read_csv('english_words.txt', header=None, delim_whitespace=True)
monogram[1]

0     529117365
1     390965105
2     374061888
3     326627740
4     320410057
5     313720540
6     294300210
7     277000841
8     216768975
9     183996130
10    169330528
11    138416451
12    117295780
13    110504544
14     95422055
15     91258980
16     90376747
17     79843664
18     75294515
19     70195826
20     46337161
21     35373464
22      9613410
23      8369915
24      4975847
25      4550166
Name: 1, dtype: int64

In [122]:
def compute_entropy(table):
    grams = table[0]
    freqs = table[1]
    
    total = sum(freqs)
    
    # Compute the entropy
    entropy = 0
    for freq in freqs:
        p = freq / total
        entropy -= p * math.log2(p)
    
    return entropy


In [123]:
print(compute_entropy(monogram))
print(compute_entropy(bigram))

4.184600828019136
7.843215598838259


In [127]:
# Generate bigram probabilities from monogram
monogram_p = pd.DataFrame(np.zeros(np.shape(monogram)))
monogram_p[0] = monogram[0]
monogram_p[1] = monogram[1]/sum(monogram[1])

bigram_random = pd.DataFrame(np.zeros([26**2,2]))
bigram_all = [p for p in itertools.product(monogram_p[0], repeat=2)]
bigram_allp = [p for p in itertools.product(monogram_p[1], repeat=2)]
for i in range(0,26**2):
    bigram_random[0][i] = ''.join(bigram_all[i])
    bigram_random[1][i] = np.prod(bigram_allp[i])

bigram_random

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bigram_random[1][i] = np.prod(bigram_allp[i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bigram_random[0][i] = ''.join(bigram_all[i])


Unnamed: 0,0,1
0,EE,0.014633
1,ET,0.010812
2,EA,0.010345
3,EO,0.009033
4,EI,0.008861
...,...,...
671,QK,0.000008
672,QJ,0.000002
673,QX,0.000002
674,QZ,0.000001


In [128]:
print(compute_entropy(bigram_random))

8.369201656038285


In [209]:
print(math.e**(-0.01)*0.01)
print(poisson.pmf(1, 0.01))
print(1-poisson.cdf(k=1, mu=0.01))
print(poisson.pmf(1, 0.05))
print(poisson.pmf(1, 0.01) + poisson.pmf(1, 0.05))

0.009900498337491681
0.009900498337491688
4.9667913340267766e-05
0.04756147122503571
0.0574619695625274


In [210]:
joint_prob = [poisson.pmf(1, 0.05)/2, poisson.pmf(1, 0.01)/2, poisson.pmf(0, 0.05)/2, poisson.pmf(0, 0.01)/2]
joint_entropy = -sum([p * math.log2(p) for p in joint_prob])
joint_entropy

1.1782748705776316

In [219]:
pn0 = poisson.pmf(0, 0.05)/2 + poisson.pmf(0, 0.01)/2
pn1 = poisson.pmf(1, 0.05)/2 + poisson.pmf(1, 0.01)/2
px1 = 0.5
px2 = 0.5
hn = -sum([p * math.log2(p) for p in [pn0, pn1]])
hx = -sum([p * math.log2(p) for p in [px1, px2]])
print(hn, hx)

0.1888685804483239 1.0


In [212]:
mi = hn + hx - joint_entropy
print(mi)

0.010593709870692303


In [213]:
n12 = pd.DataFrame(np.zeros([4,2]))
n12[0] = [poisson.pmf(0, 0.025)*poisson.pmf(0, 0.025)/2, poisson.pmf(0, 0.025)*poisson.pmf(1, 0.025)/2,
         poisson.pmf(1, 0.025)*poisson.pmf(0, 0.025)/2, poisson.pmf(1, 0.025)*poisson.pmf(1, 0.025)/2]
n12[1] = [poisson.pmf(0, 0.005)*poisson.pmf(0, 0.005)/2, poisson.pmf(0, 0.005)*poisson.pmf(1, 0.005)/2,
         poisson.pmf(1, 0.005)*poisson.pmf(0, 0.005)/2, poisson.pmf(1, 0.005)*poisson.pmf(1, 0.005)/2]
hn12 = -sum([p * math.log2(p) for p in n12[0]])-sum([p * math.log2(p) for p in n12[1]])
print(hn12)

1.210690290525888


In [214]:
n12_marg = np.sum(n12, axis = 1)
hn12_marg = -sum([p * math.log2(p) for p in n12_marg])
hn12_marg

0.2212090230986569

In [215]:
mi_n12 = hn12_marg + hx - hn12
print(mi_n12)

0.010518732572768918


In [220]:
pn1

0.0287309847812637

In [217]:
n12_marg

0    0.970640
1    0.014365
2    0.014365
3    0.000310
dtype: float64