In [1]:
from nltk import ngrams
from nltk import FreqDist
from math import e
from math import log
from pythainlp import word_tokenize
from nltk.corpus import wordnet
import numpy as np
import random
import re

## Corpus

### Thai Corpus

1. wordnet

In [2]:
from pythainlp.corpus.wordnet import all_lemma_names

# Build the Thai word list from the WordNet lemma names, keeping only entries
# whose first character is alphabetic and that contain none of '-', '.', '_'.
# The lemma names are iterated once; the previous mid-cell `len(list(...))`
# expression was never displayed and only materialised the full list again.
thai_corpus = [word for word in all_lemma_names()
               if word[0].isalpha()
               and not any(sep in word for sep in ('-', '.', '_'))]
len(thai_corpus)

75582

2. Not word

In [3]:
# Hand-written strings used as the "not a word" sample set that is scored
# against the WordNet trigram counts later in the notebook.
# presumably random keyboard input — TODO confirm how these were produced.
# NOTE(review): the 4th entry of the first row appears to be repeated as the
# 4th entry of the third row, and the last entry of the third row looks like
# the last entry of the first row without its leading '-'. Confirm whether the
# duplicates are intentional.
thai_not_word = ["กฟกพๆๆไำๆ", "ายนดาาหทอปแๆ", "เนกดเดนกยเ", "พไำพพจจไพๆำ", "-ภคพัำไรพ",
          "ถภนยบๆ", "หทอปแ", "สบกยำวส", "กกำหกอแปอ", "อแปทอปแดรีน",
          "อแปอาวสกด", "แผปแวฟกำบ", "นยบพ", "พไำพพจจไพๆำ", "ภคพัำไรพ",
          "ปผอแิทม", "สมไำพทา", "ฟมสนะวหด", "นหกดมสยวปากด" , "นดาหกสทฟก",
          "ดาพนฟทเ", "กฟหก่าอปท", "ยนตพจยหนยุขน", "พนงรจตกมด", "ภคตจกดดทมกด่สว" ]

3. Wiki

In [4]:
# Read and tokenize wiki.txt. The context manager guarantees the file handle
# is closed (the previous version opened the file and never closed it).
with open('wiki.txt', mode='r', encoding='utf-8') as text:
    wiki = word_tokenize(text.read(), engine="attacut")

# Keep tokens that are not a single space, still contain at least one character
# after everything outside the regex class is removed, and do not start with a
# space.
# NOTE(review): the class is U+0E00-U+0E7F plus the literal digit '0', so
# tokens containing a '0' also pass — possibly '0-9' was intended; confirm.
wiki = [w for w in wiki if w != ' '
                        and re.sub('[^\u0E00-\u0E7F0]+', '', w)
                        and w[0] != " "]

4. Image1

In [106]:
# Read and tokenize image1.txt. Using a context manager closes the file even
# if reading or tokenizing raises, which the manual open()/close() pair did not.
with open('image1.txt', mode='r', encoding='utf-8') as text1:
    image1 = word_tokenize(text1.read(), engine="attacut")

# Keep tokens that are not a single space, still contain at least one character
# after everything outside the regex class is removed, and do not start with a
# space.
# NOTE(review): the class is U+0E00-U+0E7F plus the literal digit '0', so
# tokens containing a '0' also pass — possibly '0-9' was intended; confirm.
image1 = [w for w in image1 if w != ' '
                        and re.sub('[^\u0E00-\u0E7F0]+', '', w)
                        and w[0] != " "]
len(image1)

58

5. Image2

In [100]:
# Read image2.txt line by line. The context manager closes the file even if
# reading raises, which the manual open()/close() pair did not guarantee.
with open('image2.txt', mode='r', encoding='utf-8') as text2:
    x = text2.read().splitlines()

# Strip '[', quote, comma and leading-space residue from each line.
# presumably the file holds a printed Python list, one item per line — TODO confirm.
ans_text2 = [
    txt.strip("[").lstrip("'").rstrip(",").rstrip("'").lstrip(' ').lstrip("''")
    for txt in x
]

# Keep entries that are not a single space, still contain at least one
# character after everything outside the regex class is removed, and do not
# start with a space.
# NOTE(review): the class is U+0E00-U+0E7F plus the literal digit '0', so
# entries containing a '0' also pass — possibly '0-9' was intended; confirm.
image2 = [w for w in ans_text2 if w != ' '
                        and re.sub('[^\u0E00-\u0E7F0]+', '', w)
                        and w[0] != " "]

len(image2)

38

6. Image3

In [101]:
# Read image3.txt line by line. The context manager closes the file even if
# reading raises, which the manual open()/close() pair did not guarantee.
with open('image3.txt', mode='r', encoding='utf-8') as text3:
    x = text3.read().splitlines()

# Strip '[', quote, comma and leading-space residue from each line.
# presumably the file holds a printed Python list, one item per line — TODO confirm.
ans_text3 = [
    txt.strip("[").lstrip("'").rstrip(",").rstrip("'").lstrip(' ').lstrip("''")
    for txt in x
]

# Keep entries that are not a single space, still contain at least one
# character after everything outside the regex class is removed, and do not
# start with a space.
# NOTE(review): the class is U+0E00-U+0E7F plus the literal digit '0', so
# entries containing a '0' also pass — possibly '0-9' was intended; confirm.
image3 = [w for w in ans_text3 if w != ' '
                        and re.sub('[^\u0E00-\u0E7F0]+', '', w)
                        and w[0] != " "]

len(image3)

4

7. Image4

In [120]:
# Read image4.txt line by line. The context manager closes the file even if
# reading raises, which the manual open()/close() pair did not guarantee.
with open('image4.txt', mode='r', encoding='utf-8') as text4:
    x = text4.read().splitlines()

# Strip '[', quote, comma and leading-space residue from each line.
# presumably the file holds a printed Python list, one item per line — TODO confirm.
ans_text4 = [
    txt.strip("[").lstrip("'").rstrip(",").rstrip("'").lstrip(' ').lstrip("''")
    for txt in x
]

# Keep entries that are not a single space, still contain at least one
# character after everything outside the regex class is removed, and do not
# start with a space.
# NOTE(review): the class is U+0E00-U+0E7F plus the literal digit '0', so
# entries containing a '0' also pass — possibly '0-9' was intended; confirm.
image4 = [w for w in ans_text4 if w != ' '
                        and re.sub('[^\u0E00-\u0E7F0]+', '', w)
                        and w[0] != " "]
# len(image4)
# image4

### Eng Corpus

In [41]:
# English word list from the NLTK WordNet lemma names: the first character
# must be alphabetic and the name must contain none of '-', '.', '_'.
eng_corpus = [
    lemma
    for lemma in wordnet.all_lemma_names()
    if lemma[0].isalpha() and not any(sep in lemma for sep in ('-', '.', '_'))
]
len(eng_corpus)

77578

## Generate Trigrams

1. wordnet

In [42]:
# Flat list of 3-grams over the characters of every word in thai_corpus,
# padded on both sides with '<s>' / '</s>'.
thai_trigrams_wordnet_list = [
    trigram
    for lemma in thai_corpus
    for trigram in ngrams(
        lemma, 3,
        pad_left=True, left_pad_symbol='<s>',
        pad_right=True, right_pad_symbol='</s>',
    )
]

2. Not word

In [43]:
# Flat list of 3-grams over the characters of every string in thai_not_word,
# padded on both sides with '<s>' / '</s>'.
thai_not_word_list = [
    trigram
    for entry in thai_not_word
    for trigram in ngrams(
        entry, 3,
        pad_left=True, left_pad_symbol='<s>',
        pad_right=True, right_pad_symbol='</s>',
    )
]

3. Wiki

In [44]:
# Flat list of 3-grams over the characters of every token in wiki,
# padded on both sides with '<s>' / '</s>'.
thai_wiki_list = [
    trigram
    for token in wiki
    for trigram in ngrams(
        token, 3,
        pad_left=True, left_pad_symbol='<s>',
        pad_right=True, right_pad_symbol='</s>',
    )
]

## Smoothing 1: Add-one estimation

In [45]:
def fdist_trigrams(trigrams_list, n=0):
    """Count the items of ``trigrams_list`` and add ``n`` to every count.

    Args:
        trigrams_list: iterable of hashable items (trigram tuples in this notebook).
        n: constant added to each observed count (default 0).

    Returns:
        A plain dict mapping each distinct item to ``count + n``. Items that
        never occur in ``trigrams_list`` are not present in the result.
    """
    counts = FreqDist(trigrams_list)
    shifted = {}
    for trigram, freq in counts.items():
        shifted[trigram] = freq + n
    return shifted

In [46]:
dict_trigrams_1 = fdist_trigrams(thai_trigrams_wordnet_list, 1)

In [47]:
dict_trigrams_0 = fdist_trigrams(thai_trigrams_wordnet_list)

In [48]:
def add_one_est(word, trigrams_list, dict_trigrams_0, dict_trigrams_1):
    """Return the log relative frequency of each padded trigram of ``word``.

    ``word`` is split into 3-grams (padded left and right with '<s>' / '</s>').
    If every one of those trigrams occurs in ``trigrams_list`` the counts are
    read from ``dict_trigrams_0``; if at least one does not, the counts are
    read from ``dict_trigrams_1`` instead. A trigram missing from the chosen
    dict is given a count of 1.

    Args:
        word: sequence to score (a string in this notebook).
        trigrams_list: reference collection of trigrams; used for membership
            tests and its length is the denominator of every ratio.
        dict_trigrams_0: trigram -> count mapping used when all trigrams are seen.
        dict_trigrams_1: trigram -> count mapping used when any trigram is unseen.

    Returns:
        A list with one ``np.log(count / len(trigrams_list))`` value per
        trigram of ``word``, in trigram order.

    Raises:
        ZeroDivisionError: if ``trigrams_list`` is empty.

    NOTE(review): the denominator is ``len(trigrams_list)`` in both branches,
    with no vocabulary-size term, so the values are not normalised add-one
    probabilities — confirm this is intended.
    """
    trigrams_word = list(ngrams(word, 3,
                                pad_left=True,
                                pad_right=True,
                                left_pad_symbol='<s>',
                                right_pad_symbol='</s>'))

    # Any unseen trigram switches the whole word to the second count table.
    # Membership is tested against trigrams_list itself (a linear scan when it
    # is a list), exactly as before.
    has_unseen = any(tri not in trigrams_list for tri in trigrams_word)
    dict_trigrams = dict_trigrams_1 if has_unseen else dict_trigrams_0

    total = len(trigrams_list)
    proba = []
    for tri in trigrams_word:
        # Only a missing key falls back to a count of 1; any other error
        # (e.g. a non-numeric count) now propagates instead of being hidden
        # by a bare `except:`.
        try:
            count = dict_trigrams[tri]
        except KeyError:
            count = 1
        proba.append(np.log(count / total))

    return proba

## Probability Wordnet

In [49]:
# Draw 25 words from thai_corpus (random.choices samples with replacement)
# and score each one against the WordNet trigram counts.
corpus_wordnet_simple = random.choices(thai_corpus, k=25)
proba = [
    add_one_est(sample, thai_trigrams_wordnet_list, dict_trigrams_0, dict_trigrams_1)
    for sample in corpus_wordnet_simple
]

len(proba)

25

In [50]:
# Mean per-trigram log score of each sampled WordNet word.
avg = [sum(scores) / len(scores) for scores in proba]

In [51]:
# Mean of the per-word averages in `avg` (displayed as the cell output).
sum(avg)/len(avg)

-8.664902693391069

## Probability Not word

In [52]:
# Draw 25 strings from thai_not_word (random.choices samples with replacement)
# and score each one against the WordNet trigram counts.
corpus_not_word_simple = random.choices(thai_not_word, k=25)
proba = [
    add_one_est(sample, thai_trigrams_wordnet_list, dict_trigrams_0, dict_trigrams_1)
    for sample in corpus_not_word_simple
]

len(proba)

25

In [53]:
# Mean per-trigram log score of each sampled non-word string.
avg = [sum(scores) / len(scores) for scores in proba]

In [54]:
# Mean of the per-word averages in `avg` (displayed as the cell output).
sum(avg)/len(avg)

-11.768992292918886

## Probability Wiki

In [55]:
# Draw 25 tokens from wiki (random.choices samples with replacement)
# and score each one against the WordNet trigram counts.
corpus_wiki_simple = random.choices(wiki, k=25)
proba = [
    add_one_est(sample, thai_trigrams_wordnet_list, dict_trigrams_0, dict_trigrams_1)
    for sample in corpus_wiki_simple
]

len(proba)

25

In [56]:
# Mean per-trigram log score of each sampled wiki token.
avg = [sum(scores) / len(scores) for scores in proba]

In [57]:
# Mean of the per-word averages in `avg` (displayed as the cell output).
sum(avg)/len(avg)

-8.25890559186671

## Probability Image 1

In [135]:
# Score every token of image1 (no sampling) against the WordNet trigram counts.
proba = [
    add_one_est(token, thai_trigrams_wordnet_list, dict_trigrams_0, dict_trigrams_1)
    for token in image1
]

# Token -> mean per-trigram log score; a repeated token keeps a single key.
{token: sum(scores) / len(scores) for token, scores in zip(image1, proba)}

{'การ': -5.88784470746459,
 'พางพิเศษแห่งประเทศไทย': -9.47849566161584,
 "โพร'": -11.00186805384449,
 '0': -13.891161482012413,
 '5380': -13.891161482012409,
 'เลย': -7.8691026887829425,
 'ประจํา': -9.371148421747964,
 'ตัว': -6.939584666454313,
 'ผู้': -7.144941963443349,
 'เสีย': -6.462008663688724,
 "ภาษีอากร',": -10.282164794753703,
 '00016542': -13.89116148201241,
 'ใบ': -8.332264522170846,
 'รับ': -7.244983781376675,
 'ค่า': -7.002367237162535,
 'ผ่าน': -7.520333232342129,
 "พางพิเศษ',\n": -11.35452443531372,
 'สายมางพณี': -9.685237305801321,
 '-สุขสวัสดิ์': -11.244843351123933,
 'ทาง': -6.635896273325277,
 'พิเศษแห่งประเทศไทย': -9.226119833085942,
 'เล่ม': -7.689213699624379,
 'ที่': -6.378380588355644,
 'เลข': -8.28627083916749,
 '2016': -13.891161482012409,
 '07': -13.89116148201241,
 '03': -13.89116148201241,
 'ค่าน': -6.97792055236404,
 'เข้า': -6.7209009951215,
 'ด่าน': -8.031817456363532,
 'ออก': -6.8750503530039495,
 'ช่องทาง': -7.36287376043796,
 '-MX03': -13.89116148201

In [129]:
# Mean per-trigram log score of each entry currently held in `proba`.
avg = [sum(scores) / len(scores) for scores in proba]

In [130]:
# Mean of the per-word averages in `avg` (displayed as the cell output).
sum(avg)/len(avg)

-9.048700966824462

## Probability Image 2

In [136]:
# Score every entry of image2 (no sampling) against the WordNet trigram counts.
proba = [
    add_one_est(token, thai_trigrams_wordnet_list, dict_trigrams_0, dict_trigrams_1)
    for token in image2
]

# Entry -> mean per-trigram log score; a repeated entry keeps a single key.
{token: sum(scores) / len(scores) for token, scores in zip(image2, proba)}

{'231119012731': -13.841650969115275,
 'ญะนัน่': -11.515937837149776,
 '17-DEC-2019': -13.891161482012413,
 '550': -13.89116148201241,
 '02-627-4523': -13.891161482012413,
 '0': -13.891161482012413,
 'ล|ลทล': -11.19965319212128,
 '026278627': -13.89116148201241,
 '10330': -13.891161482012409,
 '0105537133562': -13.891161482012414,
 '2015': -13.891161482012409,
 '80': -13.89116148201241,
 '49000': -13.891161482012409,
 '0495558000071': -13.844951669975083,
 '00000': -13.891161482012409,
 '081': -13.75253204590042,
 '0761881': -13.81414512861686,
 '201': -13.75253204590042,
 'ซะ๒๒': -12.527588032725847,
 'บอฟ': -9.681696492307424,
 '0120-BKK2(KO)-Premises': -13.816504837461247,
 '93,000.00': -13.89116148201241,
 '20': -13.89116148201241,
 '2311.RB3.0000.': -13.891161482012414,
 '1952100940:': -13.891161482012413,
 '00000.': -13.891161482012409,
 '00000.00000.0000.00.000.': -13.891161482012418,
 '0000000000.00000': -13.891161482012414,
 '0220-BKK2(KO)-Premises': -13.816504837461247,
 '87,

In [125]:
# Mean per-trigram log score of each entry currently held in `proba`.
avg = [sum(scores) / len(scores) for scores in proba]

In [112]:
# Mean of the per-word averages in `avg` (displayed as the cell output).
sum(avg)/len(avg)

-13.402525309382085

## Probability Image 3

In [137]:
# Score every entry of image3 (no sampling) against the WordNet trigram counts.
proba = [
    add_one_est(token, thai_trigrams_wordnet_list, dict_trigrams_0, dict_trigrams_1)
    for token in image3
]

# Entry -> mean per-trigram log score; a repeated entry keeps a single key.
{token: sum(scores) / len(scores) for token, scores in zip(image3, proba)}

{'วันที่': -7.254528992757594,
 '120819': -13.891161482012409,
 '130529': -13.891161482012409,
 '130919': -13.891161482012409}

In [114]:
# Mean per-trigram log score of each entry currently held in `proba`.
avg = [sum(scores) / len(scores) for scores in proba]

In [115]:
# Mean of the per-word averages in `avg` (displayed as the cell output).
sum(avg)/len(avg)

-12.232003359698705

## Probability Image 4

In [138]:
# Score every entry of image4 (no sampling) against the WordNet trigram counts.
proba = [
    add_one_est(token, thai_trigrams_wordnet_list, dict_trigrams_0, dict_trigrams_1)
    for token in image4
]

# Entry -> mean per-trigram log score; a repeated entry keeps a single key.
{token: sum(scores) / len(scores) for token, scores in zip(image4, proba)}

{'บิลเงินสุดแนกัวกับภูาปี่เหณ': -10.073608626360075,
 'ต้นฉบับ': -8.378253009599,
 'ดูฏถั้ว,,': -12.496301653340693,
 'ถนนสุขุมวิท': -10.17523358081556,
 'แขวงคลองตัน': -8.979887613128476,
 'เขตคลองเตย': -9.231531663682828,
 'เรตีบํารับตา': -9.7018317429963,
 '28/11/2016': -13.891161482012413,
 'กรุงเทพฯ': -9.654677267704635,
 '10110': -13.891161482012409,
 'โทร.': -10.972760137478447,
 '(02)269-1000': -13.891161482012413,
 'กล': -6.92261907472367,
 'หน้าที่': -7.268742688199053,
 'บริษัท': -8.641233180885243,
 'บัตรกรุงไทย': -9.152765036956282,
 'จํากัด': -10.114443992067294,
 '(มหาซน)': -13.054680174600493,
 'สํานักงานใหญ่': -9.30332702508253,
 'เลขที่': -8.309244109267821,
 'อาคารสมัชชซาวาณิช': -9.972537819583005,
 'ชั้น14': -10.710740398561969,
 'ถนนลสุขุมวิท': -10.599943745198134,
 'เลขประจําตักผู้เสียภาษีอากิว': -9.589206737086634,
 '๓56': -13.89116148201241,
 '0016253': -13.891161482012409,
 'ที่อยู่': -7.392242925213354,
 'แขวงคลองต้นเหนือ': -8.994593193866573,
 'เขตวัฒนา': -9.

In [117]:
# Mean per-trigram log score of each entry currently held in `proba`.
avg = [sum(scores) / len(scores) for scores in proba]

In [118]:
# Mean of the per-word averages in `avg` (displayed as the cell output).
sum(avg)/len(avg)

-10.90119882501532