In [18]:
import nltk
nltk.download('gutenberg') 
nltk.download('punkt')

[nltk_data] Downloading package gutenberg to /home/alok/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /home/alok/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [19]:
corpus = nltk.corpus.gutenberg.sents(u'austen-emma.txt')

# Get unigrams and Bigrams

In [20]:
def get_unigrams(corpus):
    unigrams = {}
    for sentence in corpus:
        for word in sentence:
            word = word.lower()
            if word not in unigrams:
                unigrams[word] = 0
            unigrams[word] += 1
    return unigrams

def sort_dict(d, reverse = True):
    return sorted(d.items(), key = lambda x : x[1], reverse = reverse)

In [21]:
from ipy_table import *

In [22]:
unigrams = get_unigrams(corpus)
sorted_unigrams = sort_dict(unigrams)

In [23]:
make_table(sorted_unigrams)

0,1
",",11454
.,6928
to,5239
the,5201
and,4896
of,4291
i,3178
a,3129
it,2528
her,2469


In [24]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *
init_notebook_mode(connected=True)
iplot([{"x" : list(zip(*sorted_unigrams))[0], "y": list(zip(*sorted_unigrams))[1]}])

In [25]:
import math
def unigram_probs(unigrams):
    new_unigrams = {}
    N = sum(unigrams.values())
    for word in unigrams:
        new_unigrams[word] =  round(unigrams[word] / float(N), 15)
        #new_unigrams[word] =  math.log(unigrams[word] / float(N))
    return new_unigrams
    

In [26]:
uprobs = unigram_probs(unigrams)
sorted_uprobs = sort_dict(uprobs)
make_table(sorted_uprobs)

0,1
",",0.0595
.,0.036
to,0.0272
the,0.027
and,0.0254
of,0.0223
i,0.0165
a,0.0163
it,0.0131
her,0.0128


In [27]:
iplot([{"x" : list(zip(*sorted_uprobs))[0], "y": list(zip(*sorted_uprobs))[1]}])
#iplot([{"x" : [math.log(i+1) for i in range(len(sorted_uprobs))], "y": list(zip(*sorted_uprobs))[1]}])

# Add-One Smoothing

In [29]:
def add_one(unigrams,V):
    new_unigrams = {}
    N = sum(unigrams.values())
    for word in unigrams:
        new_unigrams[word] = round((unigrams[word] + 1)/ float(N+V), 15) * N
        #new_unigrams[word] = math.log((unigrams[word] + 1)/ float(N+V))
    return new_unigrams


In [30]:
smoothed_200 = sort_dict(add_one(unigrams, 200))
smoothed_2000 = sort_dict(add_one(unigrams, 2000))
smoothed_double = sort_dict(add_one(unigrams, len(unigrams)*10))



#p1 = Scatter(x = list(zip(*sorted_uprobs))[0], y= list(zip(*sorted_uprobs))[1], mode='lines', name="unsmoothed")
p1 = Scatter(x = list(zip(*sorted_uprobs))[0], y= list(zip(*sorted_unigrams))[1], mode='lines', name="unsmoothed")
p2 = Scatter(x = list(zip(*smoothed_200))[0], y= list(zip(*smoothed_200))[1], mode='lines', name="200")
p3 = Scatter(x = list(zip(*smoothed_2000))[0], y= list(zip(*smoothed_2000))[1], mode='lines', name="2000")
p4 = Scatter(x = list(zip(*smoothed_double))[0], y= list(zip(*smoothed_double))[1], mode='lines', name="double")
iplot([p1, p2, p3, p4])

In [32]:
def get_bigrams(corpus):
    bigrams = {}
    for sentence in corpus:
        for index, word in enumerate(sentence):
            if index > 0:                
                word = word.lower()            
                prev = sentence[index - 1].lower()
                if prev not in bigrams:
                    bigrams[prev] = {}
                if word not in bigrams[prev]:
                    bigrams[prev][word]  = 0
                bigrams[prev][word] += 1
    return bigrams

In [33]:
bigrams = get_bigrams(corpus)
bg_word = sort_dict(unigram_probs(bigrams['kind']))
p1 = Scatter(x = list(zip(*bg_word))[0], y = list(zip(*bg_word))[1], name='unsmoothed')
bg_smoothed = sort_dict(add_one(bigrams['kind'], len(unigrams)))
p2 = Scatter(x = list(zip(*bg_smoothed))[0], y = list(zip(*bg_smoothed))[1], name='smoothed')
iplot([p1,p2])

# Heldout Smoothing

In [37]:
#Heldout
training = corpus[:5000]
heldout = corpus[5000:]

In [38]:
utraining = get_unigrams(training)
uheldout = get_unigrams(heldout)

In [39]:
def eq_classes(ugrams):
    eq = {}
    for k, v in ugrams.items():
        if v not in eq:
            eq[v] = []
        eq[v].append(k)
    return eq

In [40]:
tr_eq_cls = eq_classes(utraining)
for r in tr_eq_cls:
    print(r, tr_eq_cls[r])


1 ['[', 'austen', '1816', ']', 'vex', 'indulgent', 'indistinct', 'caresses', 'nominal', 'mildness', 'impose', 'esteeming', 'disadvantages', 'threatened', 'alloy', 'misfortunes', 'mournful', 'debt', 'owing', 'footing', 'dearer', 'tenderer', 'dearly', 'valetudinarian', 'everywhere', 'friendliness', 'comparatively', 'amounting', 'equals', 'lieu', 'awoke', 'reconciled', 'chatted', 'housemaid', 'curtseys', 'needlework', 'lock', 'bangs', 'speck', 'surprising', 'rained', 'gainer', 'punctual', 'bears', 'deathbed', 'broadway', 'drizzle', 'borrowed', 'umbrellas', 'mitchell', ',\'"', 'supposes', 'delicately', 'encouragements', 'interference', 'grievously', 'fitted', 'fish', 'homely', 'embodied', 'military', 'dissuaded', 'threw', 'decorum', 'pursue', 'luxuries', 'amazing', 'poorer', 'lingering', 'reconciliation', 'kindred', 'offered', 'decease', 'competence', 'portionless', 'tyrannic', 'choose', 'tacitly', 'avowed', 'adoption', 'assume', 'capricious', 'boasts', 'achieved', 'dissentient', 'prized',

In [41]:
def get_heldout(hugrams, eq):
    hest = {}
    for r in eq:
        t_r = sum([hugrams[u] for u in eq[r] if u in hugrams])
        hest[r] = t_r
    return hest

In [42]:
hestimate = get_heldout(uheldout, tr_eq_cls)
N = sum(list(utraining.values()))
new_estimates = []
old_estimates = []
for r, t_r in sorted(hestimate.items()):
    new_estimates +=  [t_r/(len(tr_eq_cls[r])*N)]
    old_estimates += [r/N]
    print(r, len(tr_eq_cls[r]), hestimate[r], hestimate[r]/len(tr_eq_cls[r]), hestimate[r]/(len(tr_eq_cls[r]) * N), r/N)

1 2467 1131 0.45845156059991893 3.782260360857669e-06 8.250076313205898e-06
2 929 969 1.0430570505920345 8.605300266411749e-06 1.6500152626411795e-05
3 487 736 1.51129363449692 1.246828781626189e-05 2.4750228939617693e-05
4 298 606 2.033557046979866 1.6777000824841523e-05 3.300030525282359e-05
5 248 653 2.6330645161290325 2.1722983195659077e-05 4.1250381566029484e-05
6 180 666 3.7 3.052528235886182e-05 4.9500457879235385e-05
7 144 547 3.798611111111111 3.133883155085851e-05 5.775053419244128e-05
8 118 490 4.1525423728813555 3.425879147009229e-05 6.600061050564718e-05
9 87 409 4.7011494252873565 3.878484151840474e-05 7.425068681885307e-05
10 62 414 6.67741935483871 5.508921925269744e-05 8.250076313205897e-05
11 65 376 5.7846153846153845 4.7723518365621806e-05 9.075083944526487e-05
12 56 371 6.625 5.465675557498907e-05 9.900091575847077e-05
13 40 249 6.225 5.135672504970671e-05 0.00010725099207167666
14 28 195 6.964285714285714 5.7455888609826785e-05 0.00011550106838488256
15 36 401 11.1

In [43]:
p1 = Bar(x = list(hestimate.keys()), y = old_estimates, name='unsmoothed')
p2 = Bar(x = list(hestimate.keys()) , y = new_estimates, name='smoothed')
iplot([p1,p2])

# Good Turing

In [44]:
nrs = eq_classes(unigrams)
nr_counts = {k : len(v) for k, v in nrs.items()}
nr_probs = {k : (k*v)/float(N) for k, v in nr_counts.items()} # P = r * Nr / N
sorted_nrs = sorted(nr_counts.items())
sorted_probs = sorted(nr_probs.items())
nr_counts

{2: 1062,
 865: 1,
 571: 1,
 301: 2,
 1: 2888,
 3: 575,
 3178: 1,
 56: 9,
 313: 1,
 11454: 1,
 38: 14,
 27: 21,
 4896: 1,
 14: 57,
 1217: 1,
 3129: 1,
 34: 9,
 130: 2,
 125: 3,
 24: 25,
 141: 1,
 5239: 1,
 262: 1,
 4291: 1,
 5201: 1,
 85: 5,
 6: 202,
 8: 139,
 2199: 1,
 1624: 1,
 25: 22,
 30: 12,
 574: 1,
 452: 1,
 57: 4,
 2188: 1,
 81: 4,
 1202: 1,
 359: 1,
 19: 38,
 494: 1,
 2469: 1,
 6928: 1,
 2340: 1,
 2398: 1,
 4: 430,
 178: 1,
 7: 161,
 248: 1,
 9: 114,
 207: 1,
 33: 11,
 1007: 1,
 935: 1,
 35: 9,
 759: 2,
 11: 99,
 1145: 1,
 95: 5,
 546: 1,
 41: 10,
 18: 31,
 72: 3,
 254: 1,
 146: 3,
 32: 14,
 1347: 1,
 1320: 1,
 467: 1,
 415: 1,
 464: 1,
 93: 1,
 5: 310,
 131: 2,
 1436: 1,
 294: 1,
 70: 5,
 50: 6,
 599: 1,
 48: 8,
 1153: 1,
 77: 3,
 68: 5,
 177: 2,
 39: 16,
 1441: 1,
 46: 10,
 73: 6,
 2528: 1,
 21: 28,
 12: 82,
 132: 2,
 250: 2,
 16: 45,
 36: 12,
 71: 3,
 40: 12,
 654: 1,
 358: 3,
 309: 1,
 55: 5,
 138: 2,
 540: 1,
 45: 7,
 165: 1,
 536: 1,
 29: 15,
 42: 9,
 10: 110,
 202: 1,
 

### MAX = sorted_nrs[0][1]
# R vs NR
iplot([
    Bar({"x" : list(zip(*sorted_nrs))[0], "y": list(zip(*sorted_nrs))[1]}, name="original"),    
    Bar({"x" : list(zip(*sorted_nrs))[0], "y": [(MAX)*(x[0]**-2) for x in sorted_nrs]}, name="aR^b") # ab^r
])
# R vs Probability Mass of R
regress_probs = sorted({k : (MAX*k**-2)/float(N) for k, v in nr_counts.items()}.items())
iplot([
    Bar({"x" : list(zip(*sorted_probs))[0], "y": list(zip(*sorted_probs))[1]}, name="original"),    
    Bar({"x" : list(zip(*regress_probs))[0], "y": list(zip(*regress_probs))[1]}, name="aR^b"), # ab^r
])
# Ps. Zoom to see the bars. 

In [47]:
def good_turing(nrs):
    new_nrs = {}
    for r, nr in nrs.items():
        if (r+1) in nrs:
            new_nr = ((r+1) * nrs[r+1]) / float(N) 
        else:
            new_nr = MAX*r**-2 / float(N)
        new_nrs[r] = new_nr
    return new_nrs

In [48]:
new_nrs = good_turing(nr_counts)
sorted_newnrs = sorted(new_nrs.items())
sorted_newnrs

NameError: name 'MAX' is not defined

In [49]:

iplot([
    Bar({"x" : list(zip(*sorted_probs))[0], "y": list(zip(*sorted_probs))[1]}, name="original"),
    Bar({"x" : list(zip(*sorted_newnrs))[0], "y": list(zip(*sorted_newnrs))[1]}, name="good-turing"),
     Bar({"x" : list(zip(*regress_probs))[0], "y": list(zip(*regress_probs))[1]}, name="aR^b"), # ab^r

])

NameError: name 'sorted_newnrs' is not defined

# Kneser-Ney

In [27]:
def witten_bell(w, bigrams, unigrams):
    
    #wb_lambda = get_lambdaval(prior, text)
    #new_prior = ' '.join(prior.split()[:-1])
    #wb_prob = (1-wb_lambda)*laplace_mle(lik, prior, text) + wb_lambda*witten_bell(lik, new_prior, text)
    wprobs = {}
    prior_count = sum(list(bigrams[w].values()))
    type_count = len(list(bigrams[w].keys()))
    bprobs = {}
    for b in bigrams[w]:
        ngram_count = bigrams[w][b] / float(prior_count)
        bprobs[b]= ngram_count
        #vocab_size = len(get_vocab(text))
        wblambda = type_count / float(type_count + prior_count)
        wb_prob = (1-wblambda)*ngram_count + wblambda*unigrams[w]
        #print(unigrams[w])
        wprobs[b] = wb_prob
    return bprobs, wprobs
    #z = vocab_size - type_count
    #if ngram_count == 0:
    #    wb_prob = float(type_count)/float(z*(prior_count + type_count))
    #else:
    #    wb_prob = float(ngram_count)/float(prior_count + type_count)
    #return wb_prob

In [28]:
bprobs, wprobs = witten_bell('good', bigrams, uprobs)
sorted_bprobs = sort_dict(bprobs)
sorted_wprobs = sort_dict(wprobs)

iplot([
    Scatter({"x" : list(zip(*sorted_bprobs))[0], "y": list(zip(*sorted_bprobs))[1]}, name="original"),
    Scatter({"x" : list(zip(*sorted_wprobs))[0], "y": list(zip(*sorted_wprobs))[1]}, name="wittenbell"),
     

])