In [15]:
from src.Normalizer import normalize_file
from src.Tokenizer import Tokenizer
from src.NgramModel import NGModel
from src.LIdentify import LIdentify
import numpy as np

## Text Normalization

In [16]:
af_files = 'data/train.af.txt'
en_files = 'data/train.en.txt'
nl_files = 'data/train.nl.txt'
xh_files = 'data/train.xh.txt'
zu_files = 'data/train.zu.txt'

norm_af_files = 'data/normalized.af.txt'
norm_en_files = 'data/normalized.en.txt'
norm_nl_files = 'data/normalized.nl.txt'
norm_xh_files = 'data/normalized.xh.txt'
norm_zu_files = 'data/normalized.zu.txt'

# normalize_file(af_files, norm_af_files)
# normalize_file(en_files, norm_en_files)
# normalize_file(nl_files, norm_nl_files)
# normalize_file(xh_files, norm_xh_files)
# normalize_file(zu_files, norm_zu_files)

## Language Modelling

In [17]:
CHARS = [' ','0','</s>','<s>','a','b','c',
         'd','e','f','g','h','i','j','k',
         'l','m','n','o','p','q','r','s',
         't','u','v','w','x','y','z']

In [104]:
from src.Tokenizer import Tokenizer
import numpy as np
from src.utils import sort_dict


class NGModel:
    def __init__(self, file_name: str, vocab:list, name: str, orders: int = 1):
        self.vocab = vocab
        self.token = Tokenizer(file_name)
        self.orders = orders
        self.name = name
        self.log_joints = self.__get_probs()

    def __ngram(self, tokens, order):
        counters = {}
        L = len(tokens) - order
        for i in range(L):
            current = tuple(tokens[i * order:(i + 1) * order])
            if counters.get(current):
                counters[current] += 1
            else:
                counters[current] = 1
        return counters

    def __get_probs(self):
        ngrams = {}
        token_list = list(self.token)
        for order in range(1, self.orders + 1):
            _tmp = self.__ngram(token_list, order)
            w = sum(_tmp.values())
            ngrams[order] = sort_dict({k: v/w
                                      for k, v in _tmp.items()})
        return ngrams
    
    def generate(self, start, max_len=100, smoothing=0):
        text = '<s>'+start
        tokens = ["<s>", start]
        for _ in range(max_len):
            tokens = tokens[-(self.orders -2):]
            probs = self.__get_next_probs(tokens, smoothing)
            next_word = self.__sample_word(probs)
            tokens.append(next_word)
            text += ''.join(next_word)
        return text
    
    def __addk(self, numer, denom, smoothing):
        b = 
        return a/b
    
    def __get_next_probs(self, tokens, smoothing):
        order = self.orders
        context= tuple(tokens[-(self.orders -2):])
        pcontext = self.log_joints[self.orders -1].get(context, 0) + smoothing * len(self.vocab) 
        probs = np.zeros((len(self.vocab),))
        for i in range(len(self.vocab)):
            joint = tuple(tokens[-(self.orders -2):] + [self.vocab[i]])
        	pjoint= self.log_joints[self.orders].get(joint, 0) + smoothing
            probs[i] = pjoint/pcontext
        print(sum(probs))
        return probs/sum(probs)
    
    def __sample_word(self, probs):
        idx = np.random.multinomial(1, probs).argmax() 
        return self.vocab[idx]
    
    def perplexity(self, text, order):
        token = ['<s>']
        token.extend(text)
        token +=['</s>']
        N = len(token)
        P = 1
        for i in range(N-1):
            joint = token[i*(order):(i+1)*order]
            den = joint[:-1]
            p = self.log_joints[order].get(tuple(joint), 0)/self.log_joints[order -1].get(tuple(den), 0)
            P *=(1/p)
        return P**(1/N)
        
    def __repr__(self):
        return self.name
    

In [105]:
# af_model = NGModel(norm_af_files, CHARS, 'af', 3)
en_model = NGModel(norm_en_files, CHARS, 'en', 3)
# nl_model = NGModel(norm_nl_files, CHARS, 'nl', 3)
# xh_model = NGModel(norm_xh_files, CHARS, 'xh', 3)
# zu_model = NGModel(norm_zu_files, CHARS, 'zu', 3)

In [106]:
en_model.generate(start='t', smoothing=0.5)

'<s>t</s>atmftxryqjme<s>lldckcrptjowk0gsi0mkwkqvyjgkmwxnmublkxv lqa<s>rbbcvuaml</s>ukmptz<s>eitwhl xk0xoyqpdsb0pmouku'

In [107]:
en_model.perplexity("bonjour mon coeur", 3)

ZeroDivisionError: division by zero

In [None]:
val_af_files = 'data/val.af.txt'
val_en_files = 'data/val.en.txt'
val_nl_files = 'data/val.nl.txt'
val_xh_files = 'data/val.xh.txt'
val_zu_files = 'data/val.zu.txt'

val_af_token = Tokenizer(val_af_files)
val_en_token = Tokenizer(val_en_files)
val_nl_token = Tokenizer(val_nl_files)
val_xh_token = Tokenizer(val_xh_files)
val_zu_token = Tokenizer(val_zu_files)

### Text Generation

In [73]:
en_model.generate(start='w')

'<s>w0<s>limobagkceoem0brv</s>d</s>kvge</s>qmn0h</s>zubmfgo ilyv <s>apatfropgtsb</s>so sxlhv<s>ysxhedrj0xrc ojvyrncasmuvijbjkx'

### Perplexity

In [None]:
val_af_grams = val_af_token.build_ngram(3)
val_en_grams = val_en_token.build_ngram(3)
val_nl_grams = val_nl_token.build_ngram(3)
val_xh_grams = val_xh_token.build_ngram(3)
val_zu_grams = val_zu_token.build_ngram(3)

In [None]:
set(val_en_grams[1])

{(' ',),
 ('0',),
 ('</s>',),
 ('<s>',),
 ('a',),
 ('b',),
 ('c',),
 ('d',),
 ('e',),
 ('f',),
 ('g',),
 ('h',),
 ('i',),
 ('j',),
 ('k',),
 ('l',),
 ('m',),
 ('n',),
 ('o',),
 ('p',),
 ('q',),
 ('r',),
 ('s',),
 ('t',),
 ('u',),
 ('v',),
 ('w',),
 ('x',),
 ('y',),
 ('z',)}

## Language Identification

In [None]:
models = (af_model, en_model, nl_model, xh_model, zu_model)
identifiers = LIdentify(models)

In [None]:
print(identifiers.scoring(val_af_grams[3],3))
print(identifiers.scoring(val_en_grams[3],3))
print(identifiers.scoring(val_nl_grams[3],3))
print(identifiers.scoring(val_xh_grams[3],3))
print(identifiers.scoring(val_zu_grams[3],3))

AttributeError: 'NGModel' object has no attribute 'ngrams'