In [0]:
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.probability import FreqDist

import re

In [0]:
## Import
with open("littleprince.txt", "r",encoding='utf-8') as file:
    d = file.read()

## Cleansing
d = d.lower()
d = re.sub(r'[^a-zA-Z.\s]', ' ', d)
d = d.replace('\n','').replace('  ',' ')

In [0]:
## Remove punctuation
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
token = tokenizer.tokenize(d)

In [0]:
## Remove stop words

stop_words=set(stopwords.words("english"))

w_token = word_tokenize(d)

w_token_filtered =[]
for w in token:
    if w not in stop_words:
        w_token_filtered.append(w)

In [0]:
trigram = list(ngrams(w_token_filtered, 3))

fdist = FreqDist(trigram)
print(fdist)
fdist.most_common(10)

<FreqDist with 6980 samples and 7306 outcomes>


[(('said', 'little', 'prince'), 47),
 (('asked', 'little', 'prince'), 11),
 (('good', 'morning', 'said'), 11),
 (('little', 'prince', 'said'), 9),
 (('little', 'prince', 'added'), 6),
 (('little', 'prince', 'went'), 6),
 (('concerned', 'matters', 'consequence'), 5),
 (('little', 'prince', 'asked'), 5),
 (('planet', 'little', 'prince'), 5),
 (('one', 'never', 'knows'), 5)]

In [0]:
mle = nltk.probability.MLEProbDist(fdist)
add1 = nltk.probability.LaplaceProbDist(fdist)
kns = nltk.probability.KneserNeyProbDist(fdist)

print('No Smoothing : ',fdist.freq(('said', 'little', 'prince')))
    ## 47 / 7306
print('No Smoothing unseen : ',fdist.freq(('king', 'queen', 'jack')))
print('MLE : ',mle.prob(('said', 'little', 'prince')))
    ##47 / 7306
print('Add-1 Smoothing : ',add1.prob(('said', 'little', 'prince')))
    ## 47 / (7306+6980)
print('Add-1 Smoothing unseen : ',add1.prob(('king', 'queen', 'jack')))
print('Kneser-Ney Smoothing : ',kns.prob(('said', 'little', 'prince')))
print('Kneser-Ney Smoothing unseen : ',kns.prob(('king', 'queen', 'jack')))

No Smoothing :  0.006433068710648782
No Smoothing unseen :  0.0
MLE :  0.006433068710648782
Add-1 Smoothing :  0.0033599328013439733
Add-1 Smoothing unseen :  6.999860002799944e-05
Kneser-Ney Smoothing :  0.9635416666666666
Kneser-Ney Smoothing unseen :  0.0
