In [1]:
from src.Normalizer import normalize_file
from src.Tokenizer import Tokenizer, SentenceTokenizer
from src.NgramModel import NGModel
from src.LIdentify import LIdentify
import numpy as np
import matplotlib.pyplot as plt

## Text Normalization

In [2]:
af_file = 'data/train.af.txt'
en_file = 'data/train.en.txt'
nl_file = 'data/train.nl.txt'
xh_file = 'data/train.xh.txt'
zu_file = 'data/train.zu.txt'

af_val_file = 'data/val.af.txt'
en_val_file = 'data/val.en.txt'
nl_val_file = 'data/val.nl.txt'
xh_val_file = 'data/val.xh.txt'
zu_val_file = 'data/val.zu.txt'

norm_af_file = 'data/normalized.af.txt'
norm_en_file = 'data/normalized.en.txt'
norm_nl_file = 'data/normalized.nl.txt'
norm_xh_file = 'data/normalized.xh.txt'
norm_zu_file = 'data/normalized.zu.txt'

normalize_file(af_file, norm_af_file)
normalize_file(en_file, norm_en_file)
normalize_file(nl_file, norm_nl_file)
normalize_file(xh_file, norm_xh_file)
normalize_file(zu_file, norm_zu_file)

Document Normalized Successfully!
Document Normalized Successfully!
Document Normalized Successfully!
Document Normalized Successfully!
Document Normalized Successfully!


## Language Modelling

In [3]:
CHARS = [' ','0','</s>','<s>','a','b','c',
		 'd','e','f','g','h','i','j','k',
		 'l','m','n','o','p','q','r','s',
		 't','u','v','w','x','y','z']

In [4]:
af_model = NGModel(norm_af_file, CHARS, 'af', 3)
en_model = NGModel(norm_en_file, CHARS, 'en', 3)
nl_model = NGModel(norm_nl_file, CHARS, 'nl', 3)
xh_model = NGModel(norm_xh_file, CHARS, 'xh', 3)
zu_model = NGModel(norm_zu_file, CHARS, 'zu', 3)

In [5]:
test = "abantu basebenzisa olunye ulwimi lwokuthetha ebantwini kukho iilwimi ezininzi ezahlukileyo isingesi isixhosa isizulu isibhulu isisotho isipedi iinkonzo zeelwimi zesizwe"
a = SentenceTokenizer(test)
af_lp, af_count = af_model.sent_logprob(a, 3)
en_lp, en_count = en_model.sent_logprob(a, 3)
nl_lp, nl_count = nl_model.sent_logprob(a, 3)
xh_lp, xh_count = xh_model.sent_logprob(a, 3)
zu_lp, zu_count = zu_model.sent_logprob(a, 3)
print(af_lp, en_lp, nl_lp, xh_lp, zu_lp)


-482.0261844809487 -520.3255283503499 -495.1275058716431 -137.232557774365 -164.24989633845786


In [6]:
# k_list = np.linspace(1e-8,1, 400)
# pp_list = []
# for k in k_list:
#     a, b = perplexity(en_val_file, en_model, 3, k)
#     pp_list.append(b)

In [7]:
models = (af_model, en_model, nl_model, xh_model, zu_model)
val_files = (af_val_file, en_val_file, nl_val_file, xh_val_file, zu_val_file)

In [8]:
# for model in models:
#     for file in val_files:
#         print("-"*30)
#         print("| model = ",model.name, "|",
#               "val file",file.split('/')[-1].split('.')[1])
#         hc, pp = perplexity(file, model, 3, 0.6)
#         print("-"*30)
#         print(f"|HC = {hc:.3f}, PP = {pp:.3f}")
#     print("="*30)

### Text Generation

In [9]:
en_model.generate(start='t')

'<s>ther mand minen li x0 calgention thave expecal a posecuras thersishillect ed minamende logra itear ac'

### Perplexity

In [14]:
def perplexity(file_name, model, order, k=1e-8):
    sum_prob = 0
    sum_counter = 0
    with open(file_name) as file:
        for line in file:
            token = SentenceTokenizer(line.strip())
            a, b = model.sent_logprob(token, order, k)
            sum_prob +=a 
            sum_counter +=b
    hc = - sum_prob /sum_counter 
    pp = 2**hc 
    return hc, pp, np.exp(sum_prob)**(-1/sum_counter)

In [15]:
for model in models:
    for file in val_files:
        print("-"*30)
        print("| model = ",model.name, "|",
              "val file",file.split('/')[-1].split('.')[1])
        hc, pp, pp2 = perplexity(file, model, 3, 0.6)
        print("-"*30)
        print(f"|HC = {hc:.3f}, PP = {pp:.3f}, PP2 = {pp2:.3f}")
    print("="*30)

------------------------------
| model =  af | val file af


  return hc, pp, np.exp(sum_prob)**(-1/sum_counter)


------------------------------
|HC = 2.541, PP = 5.818, PP2 = inf
------------------------------
| model =  af | val file en
------------------------------
|HC = 3.285, PP = 9.748, PP2 = inf
------------------------------
| model =  af | val file nl
------------------------------
|HC = 2.791, PP = 6.923, PP2 = inf
------------------------------
| model =  af | val file xh
------------------------------
|HC = 3.856, PP = 14.479, PP2 = inf
------------------------------
| model =  af | val file zu
------------------------------
|HC = 3.900, PP = 14.930, PP2 = inf
------------------------------
| model =  en | val file af
------------------------------
|HC = 3.314, PP = 9.943, PP2 = inf
------------------------------
| model =  en | val file en
------------------------------
|HC = 2.500, PP = 5.657, PP2 = inf
------------------------------
| model =  en | val file nl
------------------------------
|HC = 3.281, PP = 9.719, PP2 = inf
------------------------------
| model =  en | val file x

## Language Identification

In [12]:
models = (af_model, en_model, nl_model, xh_model, zu_model)
identifiers = LIdentify(models)

In [13]:
print(identifiers.scoring(val_af_grams[3],3))
print(identifiers.scoring(val_en_grams[3],3))
print(identifiers.scoring(val_nl_grams[3],3))
print(identifiers.scoring(val_xh_grams[3],3))
print(identifiers.scoring(val_zu_grams[3],3))

NameError: name 'val_af_grams' is not defined