[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AtomicWiZ/BADS_NLP/blob/master/LanguageModel.ipynb)

In [0]:
import numpy as np
import requests

import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.probability import FreqDist

In [0]:
# download one-time
# NLTK data packages fetched at runtime:
#   'stopwords' - word lists read by stopwords.words("english") in the
#                 stop-word filtering cell
#   'punkt'     - presumably for word_tokenize (imported above); no visible
#                 cell calls it - TODO confirm it is still needed
nltk.download('stopwords')
nltk.download('punkt')

In [0]:
git_url = "https://raw.githubusercontent.com/AtomicWiZ/BADS_NLP/master/littleprince.txt"

## Import
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on a 4xx/5xx response instead of
# silently treating an error page as the corpus.
response = requests.get(git_url, timeout=30)
response.raise_for_status()
raw_text = response.content.decode('utf8')

## Cleansing
# Lower-case everything so that e.g. "Prince" and "prince" count as one token.
d = raw_text.lower()

In [0]:
## Remove punctuation
from nltk.tokenize import RegexpTokenizer
# RegexpTokenizer splits a string into the substrings that match a regular expression.

tokenizer = RegexpTokenizer(r'\w+')
# \w matches a single "word" character: a letter, a digit, or an underscore.
# \w+ therefore keeps each maximal run of word characters as one token and
# drops everything in between (whitespace and punctuation). Note that an
# apostrophe also splits a word, e.g. "don't" becomes "don" and "t".

token = tokenizer.tokenize(d)

In [0]:
## Remove stop words

# Hold the English stop words in a set for fast membership checks.
stop_words = set(stopwords.words("english"))

# Keep, in their original order, only the tokens that are not stop words.
w_token_filtered = [word for word in token if word not in stop_words]

In [9]:
# Every run of TRIGRAM_ORDER consecutive tokens from the filtered corpus.
TRIGRAM_ORDER = 3
trigram = list(ngrams(w_token_filtered, TRIGRAM_ORDER))

# Count the occurrences of each distinct trigram; printing the FreqDist
# reports the number of distinct trigrams (samples) and the total number of
# trigrams (outcomes).
fdist = FreqDist(trigram)
print(fdist)

<FreqDist with 7011 samples and 7337 outcomes>


In [23]:
# Three probability estimators built from the same trigram counts.
mle = nltk.probability.MLEProbDist(fdist)
add1 = nltk.probability.LaplaceProbDist(fdist)
kns = nltk.probability.KneserNeyProbDist(fdist)
# NOTE(review): LaplaceProbDist is given no `bins`, so its denominator uses the
# number of observed trigram types (7011) - confirm that this is the intended
# vocabulary size for add-one smoothing.

query_text = ('said', 'little', 'prince')   # trigram that occurs in the corpus
unseen_text = ('king', 'queen', 'jack')     # trigram that never occurs

print(type(query_text))

# (label, distribution) pairs, in the order they are reported.
estimators = (
    ('MLE', mle),
    ('Add-1 Smoothing', add1),
    ('Kneser-Ney Smoothing', kns),
)

# Seen trigram:
#   MLE        -> 47 / 7337
#   Add-1      -> (47+1) / (7337+7011)
#   Kneser-Ney -> prob that 'said little' (wi-2 wi-1) is followed by 'prince' (wi);
#                 a conditional value, so not on the same scale as the two above
for label, dist in estimators:
    print(label + ' : ', dist.prob(query_text))

# Unseen trigram:
#   Add-1      -> 1 / (7337+7011)
for label, dist in estimators:
    print(label + ' unseen : ', dist.prob(unseen_text))

<class 'tuple'>
MLE :  0.006405887965108355
Add-1 Smoothing :  0.003345413994981879
Kneser-Ney Smoothing :  0.9635416666666666
MLE unseen :  0.0
Add-1 Smoothing unseen :  6.969612489545581e-05
Kneser-Ney Smoothing unseen :  0.0


In [27]:
# The two-word history whose continuations are inspected.
context = ("said", "little")

prob_sum = 0
# list all samples with non zero probabilities
for sample in kns.samples():
    # Skip every trigram that does not begin with the chosen context.
    if sample[:2] != context:
        continue
    # Look the probability up once, then both accumulate and report it.
    sample_prob = kns.prob(sample)
    prob_sum += sample_prob
    print("{0}:{1}".format(sample, sample_prob))
print('Total prob : ',prob_sum)
# The total stays below 1: the smoothing algorithm saves some probability
# mass to distribute to the unseen N-grams.

('said', 'little', 'prince'):0.9635416666666666
('said', 'little', 'frightened'):0.005208333333333333
Total prob :  0.96875
