In [133]:
from nltk import ngrams
from nltk import FreqDist
from math import e
from math import log
from pythainlp import word_tokenize
from nltk.corpus import wordnet
import numpy as np
import random
import re

## Corpus

### Thai Corpus

1. wordnet

In [2]:
from pythainlp.corpus.wordnet import all_lemma_names

# Thai vocabulary: WordNet lemma names that are non-empty, start with an
# alphabetic character and contain none of '-', '.', '_'.
# The lemma names are iterated exactly once; the `if word` guard keeps an
# empty lemma from raising IndexError on `word[0]`.
thai_corpus = [word for word in all_lemma_names()
               if word
               and word[0].isalpha()
               and '-' not in word
               and '.' not in word
               and '_' not in word]
len(thai_corpus)

75582

2. Not word

In [3]:
# Hard-coded strings used as the "not word" sample: they are scored against
# the WordNet trigram model in the "Probability Not word" section.
# NOTE(review): "พไำพพจจไพๆำ" appears twice, and "-ภคพัำไรพ" / "ภคพัำไรพ" differ
# only by the leading '-' — confirm the duplicates are intentional.
thai_not_word = ["กฟกพๆๆไำๆ", "ายนดาาหทอปแๆ", "เนกดเดนกยเ", "พไำพพจจไพๆำ", "-ภคพัำไรพ",
          "ถภนยบๆ", "หทอปแ", "สบกยำวส", "กกำหกอแปอ", "อแปทอปแดรีน",
          "อแปอาวสกด", "แผปแวฟกำบ", "นยบพ", "พไำพพจจไพๆำ", "ภคพัำไรพ",
          "ปผอแิทม", "สมไำพทา", "ฟมสนะวหด", "นหกดมสยวปากด" , "นดาหกสทฟก",
          "ดาพนฟทเ", "กฟหก่าอปท", "ยนตพจยหนยุขน", "พนงรจตกมด", "ภคตจกดดทมกด่สว" ]

3. Wiki

In [149]:
# Read wiki.txt inside a context manager so the file handle is closed as soon
# as the text has been tokenised.
with open('wiki.txt', mode='r', encoding='utf-8') as text:
    wiki = word_tokenize(text.read(), engine="attacut")

# Keep only tokens that contain at least one character from the Thai Unicode
# block (U+0E00-U+0E7F) and that do not start with a space. Tokens made up
# solely of other characters (digits, Latin letters, punctuation, whitespace)
# are dropped; the character class deliberately contains nothing but the Thai
# range, so numeric tokens such as "10" are not let through.
wiki = [w for w in wiki
        if re.search('[\u0E00-\u0E7F]', w)
        and w[0] != " "]

### English Corpus

In [6]:
# English vocabulary: WordNet lemma names that start with an alphabetic
# character and contain none of '-', '.', '_'.
eng_corpus = [
    lemma
    for lemma in wordnet.all_lemma_names()
    if lemma[0].isalpha()
    and not any(symbol in lemma for symbol in ('-', '.', '_'))
]
len(eng_corpus)

77578

## Generate Trigrams

1. wordnet

In [23]:
# Character trigrams of every word in thai_corpus, flattened into one list.
# Words are padded on the left with '<s>' and on the right with '</s>'.
thai_trigrams_wordnet_list = [
    trigram
    for lemma in thai_corpus
    for trigram in ngrams(lemma, 3,
                          pad_left=True, pad_right=True,
                          left_pad_symbol='<s>', right_pad_symbol='</s>')
]

2. Not word

In [26]:
# Character trigrams of every string in thai_not_word, flattened into one list.
# Strings are padded on the left with '<s>' and on the right with '</s>'.
thai_not_word_list = [
    trigram
    for candidate in thai_not_word
    for trigram in ngrams(candidate, 3,
                          pad_left=True, pad_right=True,
                          left_pad_symbol='<s>', right_pad_symbol='</s>')
]

3. Wiki

In [30]:
# Character trigrams of every token in wiki, flattened into one list.
# Tokens are padded on the left with '<s>' and on the right with '</s>'.
thai_wiki_list = [
    trigram
    for token in wiki
    for trigram in ngrams(token, 3,
                          pad_left=True, pad_right=True,
                          left_pad_symbol='<s>', right_pad_symbol='</s>')
]

## Smoothing 1: Add-one estimation

In [152]:
def fdist_trigrams(trigrams_list, n=0):
    """Count the trigrams in `trigrams_list` and add `n` to every count.

    Parameters
    ----------
    trigrams_list : iterable of hashable
        The trigrams to count (tuples of three symbols in this notebook).
    n : number, default 0
        Constant added to the count of every trigram that occurs at least
        once. Trigrams that never occur get no entry at all.

    Returns
    -------
    dict
        Maps each distinct trigram to its occurrence count plus `n`.
    """
    observed = FreqDist(trigrams_list)

    shifted_counts = {}
    for trigram, count in observed.items():
        shifted_counts[trigram] = count + n

    return shifted_counts

In [153]:
# WordNet trigram counts with 1 added to every observed count.
dict_trigrams_1 = fdist_trigrams(thai_trigrams_wordnet_list, n=1)

In [154]:
# Raw WordNet trigram counts (nothing added).
dict_trigrams_0 = fdist_trigrams(thai_trigrams_wordnet_list, n=0)

In [156]:
def add_one_est(word, trigrams_list, dict_trigrams_0, dict_trigrams_1):
    """Return the log relative frequency of each character trigram of `word`.

    The word is split into character trigrams (padded with '<s>' on the left
    and '</s>' on the right). If every trigram is known to the model, the raw
    counts in `dict_trigrams_0` are used. If at least one trigram is unknown,
    the incremented counts in `dict_trigrams_1` are used instead and each
    unknown trigram is given a count of 1.

    Parameters
    ----------
    word : str
        The string to score.
    trigrams_list : sequence
        All trigrams of the training corpus; only its length is used, as the
        denominator of every relative frequency.
    dict_trigrams_0 : dict
        Trigram -> raw count. Must have been built from `trigrams_list`
        (e.g. ``fdist_trigrams(trigrams_list)``): its keys decide whether a
        trigram counts as "seen".
    dict_trigrams_1 : dict
        Trigram -> count + 1, built from the same `trigrams_list`.

    Returns
    -------
    list
        One ``np.log(count / len(trigrams_list))`` value per trigram of
        `word`, in order.

    Raises
    ------
    ZeroDivisionError
        If `trigrams_list` is empty.

    Notes
    -----
    NOTE(review): the denominator is not enlarged by the number of distinct
    trigrams, so the smoothed values do not form a normalised add-one
    (Laplace) distribution. The smoothed table is also applied only to words
    containing an unseen trigram, which puts the two kinds of words on
    slightly different scales. Left as is so existing results are unchanged.
    """
    trigrams_word = list(ngrams(word, 3,
                                pad_left=True,
                                pad_right=True,
                                left_pad_symbol='<s>',
                                right_pad_symbol='</s>'))

    total = len(trigrams_list)

    # A trigram is "seen" iff it has an entry in the raw count table. This is
    # a hash lookup, whereas `in trigrams_list` scans the whole corpus list
    # once per trigram.
    has_unseen = any(tri not in dict_trigrams_0 for tri in trigrams_word)
    dict_trigrams = dict_trigrams_1 if has_unseen else dict_trigrams_0

    # Unknown trigrams fall back to a count of 1. Using .get() rather than a
    # bare `except:` means unrelated errors are no longer silently swallowed.
    return [np.log(dict_trigrams.get(tri, 1) / total) for tri in trigrams_word]

## Probability Wordnet (mean log relative frequency of trigrams)

In [171]:
# Draw 25 words (random.choices samples with replacement) from the Thai
# WordNet corpus and score the trigrams of each one.
corpus_wordnet_simple = random.choices(thai_corpus, k=25)
proba = [
    add_one_est(sampled, thai_trigrams_wordnet_list, dict_trigrams_0, dict_trigrams_1)
    for sampled in corpus_wordnet_simple
]

len(proba)

25

In [172]:
# Per-word mean of the trigram log scores.
avg = [sum(word_scores) / len(word_scores) for word_scores in proba]

In [173]:
# Mean of the per-word averages over the sampled WordNet words.
sum(avg)/len(avg)

-8.693850615664402

## Probability Not word (mean log relative frequency of trigrams)

In [174]:
# Draw 25 strings (random.choices samples with replacement) from the
# "not word" list and score the trigrams of each one against the WordNet model.
corpus_not_word_simple = random.choices(thai_not_word, k=25)
proba = [
    add_one_est(sampled, thai_trigrams_wordnet_list, dict_trigrams_0, dict_trigrams_1)
    for sampled in corpus_not_word_simple
]

len(proba)

25

In [175]:
# Per-string mean of the trigram log scores.
avg = [sum(word_scores) / len(word_scores) for word_scores in proba]

In [176]:
# Mean of the per-string averages over the sampled "not word" strings.
sum(avg)/len(avg)

-11.535587927097264

## Probability Wiki (mean log relative frequency of trigrams)

In [177]:
# Draw 25 tokens (random.choices samples with replacement) from the wiki
# token list and score the trigrams of each one against the WordNet model.
corpus_wiki_simple = random.choices(wiki, k=25)
proba = [
    add_one_est(sampled, thai_trigrams_wordnet_list, dict_trigrams_0, dict_trigrams_1)
    for sampled in corpus_wiki_simple
]

len(proba)

25

In [178]:
# Per-token mean of the trigram log scores.
avg = [sum(word_scores) / len(word_scores) for word_scores in proba]

In [179]:
# Mean of the per-token averages over the sampled wiki tokens.
sum(avg)/len(avg)

-8.079900658258584