In [1]:
import re
import string
import unicodedata
from itertools import permutations
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np

In [2]:
def normalize(file_name):
	tokens = ""
	normalized = open(f"{file_name.replace('train', 'normalized')}", "w+")
	with open(file_name) as f:
		# asume that each line is a paragrap(block)
		for text in f:
			# diacritics
			text = text.strip()
			text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
			# acronyms
			acronyms = re.findall(r"\b([A-Z]{2,}|[A-Z]+[0-9]+|[A-Z]+[a-z]+[A-Z])", text)
			for i, acronym in enumerate(acronyms):
				if any(char in string.punctuation for char in acronym) and len(acronym)>1:
					normalized = " ".join(acronym)
					text = text.replace(acronym, normalized)
	
			# normalized.write('\n')
			# Sentences split
			text = re.sub(r"([.!?])\s", "\n", text)
			# Replace numbers with "0"
			text = re.sub(r"\d", "0", text)
			# Special characters
			text = re.sub(r"[^\w\s\n]", "   ", text)
			# _ to space
			text = re.sub(r"_", " ", text)
			# Remove multiple space
			text = re.sub(r"\s{2,}", " ", text)
			# Lowercase all letters
			text = text.lower()
			normalized.write(text)
			normalized.write('\n')
	normalized.close()

In [3]:
def get_type(file_name: str) -> set:
	vocabs = set()
	with open(file_name) as file:
		for line in file:
			tmp = []
			tmp.extend(line)
			vocabs = vocabs.union(set(tmp))
	return vocabs

In [4]:
def n_gram(tokens, order=1):
	counters = {}
	L = len(tokens) - order
	for i in range(L):
		current = "".join(tokens[i*order:(i+1)*order])
		if counters.get(current):
			counters[current]+=1
		else:
			counters[current] =1
	return counters

In [56]:
def tokenize(files):
	token = []
	with open(files) as file:
		for line in file:
			token.extend('\\'+line)
	return token

def sort_dict(dictionnary, key='val'):
    if key=='key':
    	return dict(sorted(dictionnary.items(), key=lambda items: items[0], reverse=True))        
    else:
    	return dict(sorted(dictionnary.items(), key=lambda items: items[1], reverse=True))        
    
def build_lm(ngram):
    w = sum(ngram.values())
    return {k:v/w for k, v in ngram.items()}

In [57]:
af_files = 'data/train.af.txt'
en_files = 'data/train.en.txt'
nl_files = 'data/train.nl.txt'
xh_files = 'data/train.xh.txt'
zu_files = 'data/train.zu.txt'
normalize(af_files)
normalize(en_files)
normalize(nl_files)
normalize(xh_files)
normalize(zu_files)

In [58]:
norm_af_files = 'data/normalized.af.txt'
norm_en_files = 'data/normalized.en.txt'
norm_nl_files = 'data/normalized.nl.txt'
norm_xh_files = 'data/normalized.xh.txt'
norm_zu_files = 'data/normalized.zu.txt'

af_token = tokenize(norm_af_files)
en_token = tokenize(norm_en_files)
nl_token = tokenize(norm_nl_files)
xh_token = tokenize(norm_xh_files)
zu_token = tokenize(norm_zu_files)

In [59]:
af_trigram = n_gram(af_token, 3)
en_trigram = n_gram(en_token, 3)
nl_trigram = n_gram(nl_token, 3)
xh_trigram = n_gram(xh_token, 3)
zu_trigram = n_gram(zu_token, 3)

af_trigram_model = build_lm(af_trigram)
en_trigram_model = build_lm(en_trigram)
nl_trigram_model = build_lm(nl_trigram)
xh_trigram_model = build_lm(xh_trigram)
zu_trigram_model = build_lm(zu_trigram)

In [60]:
val_af_files = 'data/val.af.txt'
val_en_files = 'data/val.en.txt'
val_nl_files = 'data/val.nl.txt'
val_xh_files = 'data/val.xh.txt'
val_zu_files = 'data/val.zu.txt'

val_af_token = tokenize(val_af_files)
val_en_token = tokenize(val_en_files)
val_nl_token = tokenize(val_nl_files)
val_xh_token = tokenize(val_xh_files)
val_zu_token = tokenize(val_zu_files)

val_af_trigram = n_gram(val_af_token, 3)
val_en_trigram = n_gram(val_en_token, 3)
val_nl_trigram = n_gram(val_nl_token, 3)
val_xh_trigram = n_gram(val_xh_token, 3)
val_zu_trigram = n_gram(val_zu_token, 3)

In [61]:
def get_score(text):
    score = {
        'af': 0,
        'en': 0,
        'nl': 0,
        'xh': 0,
        'zu': 0
    }
    for trigram in text.keys():
        score['af'] += af_trigram_model.get(trigram, 0)
        score['en'] += en_trigram_model.get(trigram, 0)
        score['nl'] += nl_trigram_model.get(trigram, 0)
        score['xh'] += xh_trigram_model.get(trigram, 0)
        score['zu'] += zu_trigram_model.get(trigram, 0)
    # Softmax
    score = {k:np.exp(v) for k,v in score.items()}
    w = sum(x for x in score.values())
    score = {k:np.round(v/w, 5) for k,v in score.items()}
    return sort_dict(score)

In [62]:
print(get_score(val_af_trigram))
print(get_score(val_en_trigram))
print(get_score(val_nl_trigram))
print(get_score(val_xh_trigram))
print(get_score(val_zu_trigram))

{'af': 0.20949, 'nl': 0.20555, 'en': 0.20246, 'zu': 0.1919, 'xh': 0.1906}
{'en': 0.21346, 'nl': 0.20305, 'af': 0.20292, 'zu': 0.19085, 'xh': 0.18971}
{'nl': 0.20937, 'af': 0.20717, 'en': 0.20356, 'zu': 0.19043, 'xh': 0.18946}
{'xh': 0.20596, 'zu': 0.20478, 'en': 0.20045, 'nl': 0.19443, 'af': 0.19438}
{'zu': 0.20459, 'xh': 0.2044, 'en': 0.20083, 'af': 0.19539, 'nl': 0.19479}


In [63]:
th = {k:np.round(v,7) for k, v in en_trigram_model.items() if k.startswith('th')}
sort_dict(th)

{'the': 0.0046058,
 'th ': 0.0006857,
 'tha': 0.0005537,
 'thi': 0.0003188,
 'tho': 0.0002569,
 'thr': 0.0002418,
 'thn': 3.98e-05,
 'ths': 2.47e-05,
 'thu': 1.92e-05,
 'thy': 9.6e-06,
 'th\n': 8.2e-06,
 'thd': 4.1e-06,
 'thl': 4.1e-06,
 'thw': 4.1e-06,
 'thp': 2.7e-06,
 'thm': 2.7e-06,
 'thc': 1.4e-06,
 'tht': 1.4e-06}