### N-gram Language Model [Thai-Eng]

Test Step

1. Test ด้วย String ที่เราสร้างเอง
    - Thai
    - Eng
    - Thai + Eng
2. Test ด้วยผล OCR
    - เอาคำที่ไม่ใช่คำออกไป
    - วัด Accuracy, Precision, Recall, F1 Score

In [30]:
from nltk import ngrams
from nltk import FreqDist
from math import e
from math import log
from pythainlp import word_tokenize
from nltk.corpus import wordnet
import numpy as np

-----------------------------------
### Corpus

* Thai Corpus

In [2]:
from pythainlp.corpus.wordnet import all_lemma_names
len(list(all_lemma_names()))
thai_corpus = [word for word in list(all_lemma_names()) \
                if word[0].isalpha() and '-' not in word and '.' not in word and '_' not in word]
len(thai_corpus)

75582

* คำมั่วๆ

* Eng Corpus

In [4]:
eng_corpus = [word for word in list(wordnet.all_lemma_names()) \
               if word[0].isalpha() and '-' not in word and '.' not in word and '_' not in word]
len(eng_corpus)

77578

-----------------------------------
### Trigrams

* Thai Trigrams

### เตรียมข้อมูลของ ThaiCorpus

In [5]:
tha_trigrams_list = [tri for word in thai_corpus for tri in list(ngrams(word, 
                                                                        3, 
                                                                        pad_left=True,
                                                                        pad_right=True, 
                                                                        left_pad_symbol='<s>',
                                                                        right_pad_symbol='</s>'))]
fdist_tha = FreqDist(tha_trigrams_list)
tha_trigrams = {k:v/len(tha_trigrams_list) for k,v in dict(fdist_tha).items()}

### เตรียมข้อมูลภาษาไทย มั่วๆ

In [6]:
tha_1_list = [tri for word in thai_1 for tri in list(ngrams(word, 
                                                            3, 
                                                            pad_left=True,
                                                            pad_right=True, 
                                                            left_pad_symbol='<s>',
                                                            right_pad_symbol='</s>'))]
fdist_tha_1 = FreqDist(tha_1_list)
tha_1_trigrams = {k:v/len(tha_1_list) for k,v in dict(fdist_tha_1).items()}

### เตรียมข้อมูล Thai wiki

In [15]:
text = open('wiki.txt', mode='r', encoding='utf-8')
wiki = word_tokenize(text.read(), engine="attacut")
wiki = [w for w in wiki if w != ' ']

tha_wiki_list = [tri for word in wiki for tri in list(ngrams(word, 
                                                            3, 
                                                            pad_left=True,
                                                            pad_right=True, 
                                                            left_pad_symbol='<s>',
                                                            right_pad_symbol='</s>'))]

fdist_tha_wiki = FreqDist(tha_wiki_list)
tha_1_trigrams = {k:v/len(tha_wiki_list) for k,v in dict(fdist_tha_wiki).items()}

### Perplexity

In [17]:
def perplexity(word, dict_trigrams):
    l = []
    ans = []
    trigrams_list = list(ngrams(word, 3, pad_left=True,pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
    for i in range(len(trigrams_list)):
        try:
            l.append(np.log(dict_trigrams[trigrams_list[i]]))
            ans.append(trigrams_list[i])
        except:
#             dict_trigrams[trigrams_list[i]] = 1/len(dict_trigrams)
            l.append(np.log(1/len(dict_trigrams)))
            ans.append(trigrams_list[i])
#     return np.power(e, -sum(l)/len(trigrams_list))
#     return sum(l)/len(word)
    return (sum(l)/len(word)) , l, ans

### Test avg

In [18]:
def cal_like(word, dict_trigrams):
    l = []
    trigrams_list = list(ngrams(word, 3, pad_left=True,pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
    for i in range(len(trigrams_list)):
        try:
            l.append(np.log(dict_trigrams[trigrams_list[i]]))
        except:
            l.append(np.log(1/len(dict_trigrams)))
            
    return sum(l)/len(word)

In [19]:
def avg(corpus, trigrams):
    a = []
    for i in corpus:
        avg = cal_like(i, trigrams)
        a.append(avg)
    return sum(a)/len(thai_corpus)
#     return a

### ลองกับ ThaiCorpus ที่เป็นคำที่มีจริงๆ

In [20]:
a = avg(thai_corpus, tha_trigrams)
a

-10.33581442828286

### ลองกับคำมั่วๆ

In [29]:
_, l, ans = perplexity(thai_1[0], tha_trigrams)
thai_1[0], list(zip(ans,l))

('กฟกพๆๆไำๆ',
 [(('<s>', '<s>', 'ก'), -4.9571017597635665),
  (('<s>', 'ก', 'ฟ'), -10.462588922467008),
  (('ก', 'ฟ', 'ก'), -10.462588922467008),
  (('ฟ', 'ก', 'พ'), -10.462588922467008),
  (('ก', 'พ', 'ๆ'), -10.462588922467008),
  (('พ', 'ๆ', 'ๆ'), -10.462588922467008),
  (('ๆ', 'ๆ', 'ไ'), -10.462588922467008),
  (('ๆ', 'ไ', 'ำ'), -10.462588922467008),
  (('ไ', 'ำ', 'ๆ'), -10.462588922467008),
  (('ำ', 'ๆ', '</s>'), -13.198014301452465),
  (('ๆ', '</s>', '</s>'), -8.431575967868252)])

In [21]:
b = avg(thai_1, tha_trigrams)
b

-0.00418356540152761

### wiki

In [22]:
c = avg(wiki, tha_trigrams)
c

-0.10152193330441954

### wiki

* Eng Trigrams

In [14]:
eng_trigrams = [tri for word in eng_corpus for tri in list(ngrams(word, 3, pad_left=True,\
                                                                        pad_right=True, left_pad_symbol='<s>',\
                                                                        right_pad_symbol='</s>'))]
len(eng_trigrams)

819776

### NLTK Language Model

In [33]:
import nltk
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE
from nltk.lm import Vocabulary

In [34]:
train_sentences = ['an apple', 'an orange']
tokenized_text = [list(map(str.lower, nltk.tokenize.word_tokenize(sent))) for sent in train_sentences]

In [84]:
n = 2
train_data = [nltk.bigrams(t,  pad_right=True, pad_left=True, left_pad_symbol="<s>", right_pad_symbol="</s>") for t in tokenized_text]
words = [word for sent in tokenized_text for word in sent]
words.extend(["<s>", "</s>"])
type(words)

list

In [36]:
padded_vocab = Vocabulary(words)

In [37]:
model = MLE(n)
model.fit(train_data, padded_vocab)

In [38]:
test_sentences = ['an apple', 'an ant']
tokenized_text = [list(map(str.lower, nltk.tokenize.word_tokenize(sent))) for sent in test_sentences]

In [39]:
test_data = [nltk.bigrams(t,  pad_right=True, pad_left=True, left_pad_symbol="<s>", right_pad_symbol="</s>") for t in tokenized_text]
for test in test_data:
    print ("MLE Estimates:", [((ngram[-1], ngram[:-1]),model.score(ngram[-1], ngram[:-1])) for ngram in test])

MLE Estimates: [(('an', ('<s>',)), 1.0), (('apple', ('an',)), 0.5), (('</s>', ('apple',)), 1.0)]
MLE Estimates: [(('an', ('<s>',)), 1.0), (('ant', ('an',)), 0.0), (('</s>', ('ant',)), 0)]


In [40]:
test_data = [nltk.bigrams(t,  pad_right=True, pad_left=True, left_pad_symbol="<s>", right_pad_symbol="</s>") for t in tokenized_text]
for i, test in enumerate(test_data):
    print("PP({0}):{1}".format(test_sentences[i], model.perplexity(test)))

PP(an apple):1.2599210498948732
PP(an ant):inf


### NLTK Language Model @1

In [61]:
import nltk
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE
from nltk.lm import Vocabulary

In [85]:
words = [char for word in thai_corpus for char in word]
# words.extend(["<s>", "</s>"])
words.append("<s>")
words.append("</s>")
type(words)

list

In [81]:
padded_vocab = Vocabulary(words)

In [82]:
n = 3

In [83]:
model = MLE(n)
model.fit(tha_trigrams_list, padded_vocab)

TypeError: Ngram <<s>> isn't a tuple, but <class 'str'>