In [132]:
import re


def process_line(line):
	# Replace sentence-ending punctuation with <eos>
	line = re.sub(r'([.!?])', ' <eos> ', line)
	# Insert spaces around specific punctuation to ensure they are treated as separate tokens
	line = re.sub(r'([{[\]()},$])', r' \1 ', line)
	# Collapse multiple spaces into a single space
	line = re.sub(r'\s{2,}', ' ', line)
	line = line.lower().strip()

	# Split the line into tokens
	tokens = line.split()

	# Handle special case where <eos> is followed by }
	if len(tokens) >= 2 and tokens[-2] == "<eos>":
		tokens[-2], tokens[-1] = tokens[-1], tokens[-2]

	latex_mode = False
	latex_stack = ""
	tokenized_line = ['<sos>']

	for token in tokens:
		if latex_mode:
			latex_stack += token
			if token == "$":
				latex_mode = False
				# latex_stack = latex_stack.replace(" ", "")
				latex_stack = '<ltx>'
				tokenized_line.append(latex_stack)
				latex_stack = ""
		elif token == "$":
			latex_mode = True
			latex_stack += token
		else:
			if latex_stack:
				# latex_stack = latex_stack.replace(" ", "")
				latex_stack = '<ltx>'
				tokenized_line.append(latex_stack)
				latex_stack = ""
			tokenized_line.append(token)

	# Append any remaining LaTeX stack
	if latex_stack:
		# latex_stack = latex_stack.replace(" ", "")
		latex_stack = '<ltx>'
		tokenized_line.append(latex_stack)
	text = " ".join(tokenized_line)
	text = re.sub(r"\\", "  ", text)
	text = re.sub(r"(<eos>)(\w)+", "\1", text)
	text = re.sub(r"(\d)+", " <num> ", text)
	text = re.sub(r"[\(\[\{]", " <opn> ", text)
	text = re.sub(r"[\)\]\}]", " <cld> ", text)
	text = re.sub(r"\\\w+\b", " <ltx> ", text)
	text = re.sub(r"(\\%|%)", "<prc>", text)
	text = re.sub(r",", "<com>", text)
	text = re.sub(r"/", " ", text)
	text = re.sub(r"-", " ", text)
	text = re.sub(r"[^\w<>]", " ", text)
	return text.split()


def normalize_data(data, dest):
	tokenized_data = []
	vocab = set()
	with open(dest, 'w+') as dest_file:
		for line in data:
			if len(line) > 0 and line[0] != "%":
				line = process_line(line)
				vocab = vocab.union(set(line))
				line = " ".join(line)
				dest_file.write(line + "\n")
	print(f">>> {source}, {target} Done!")
	return vocab

In [133]:
DATA = (("data414_2020_a1.af.txt", "data414_2020_a1.en.txt"),
	("data414_2021_a1.af.txt", "data414_2021_a1.en.txt"),
	("data414_2021_a2.af.txt", "data414_2021_a2.en.txt"),
	("ss414_2018_a1.af.txt", "ss414_2018_a1.en.txt"),
	("ss414_2018_a2.af.txt", "ss414_2018_a2.en.txt"),
	("ss414_2018_a3.af.txt", "ss414_2018_a3.en.txt"),
	("ss414_2019_a1.af.txt", "ss414_2019_a1.en.txt"),
	("ss414_2019_a2.af.txt", "ss414_2019_a2.en.txt"),
	("ss414_2019_a3.af.txt", "ss414_2019_a3.en.txt"))

In [134]:
English = set()
Afrikans = set()
for source, target in DATA:
    with open(source) as file:
	    data = file.read()
    data = data.strip().split("\n")
    vocab = normalize_data(data, f"norm/{source}")
    Afrikans = Afrikans.union(vocab)
    
    with open(target) as file:
	    data = file.read()
    data = data.strip().split("\n")
    vocab = normalize_data(data, f"norm/{target}")
    English = English.union(vocab)

>>> data414_2020_a1.af.txt, data414_2020_a1.en.txt Done!
>>> data414_2020_a1.af.txt, data414_2020_a1.en.txt Done!
>>> data414_2021_a1.af.txt, data414_2021_a1.en.txt Done!
>>> data414_2021_a1.af.txt, data414_2021_a1.en.txt Done!
>>> data414_2021_a2.af.txt, data414_2021_a2.en.txt Done!
>>> data414_2021_a2.af.txt, data414_2021_a2.en.txt Done!
>>> ss414_2018_a1.af.txt, ss414_2018_a1.en.txt Done!
>>> ss414_2018_a1.af.txt, ss414_2018_a1.en.txt Done!
>>> ss414_2018_a2.af.txt, ss414_2018_a2.en.txt Done!
>>> ss414_2018_a2.af.txt, ss414_2018_a2.en.txt Done!
>>> ss414_2018_a3.af.txt, ss414_2018_a3.en.txt Done!
>>> ss414_2018_a3.af.txt, ss414_2018_a3.en.txt Done!
>>> ss414_2019_a1.af.txt, ss414_2019_a1.en.txt Done!
>>> ss414_2019_a1.af.txt, ss414_2019_a1.en.txt Done!
>>> ss414_2019_a2.af.txt, ss414_2019_a2.en.txt Done!
>>> ss414_2019_a2.af.txt, ss414_2019_a2.en.txt Done!
>>> ss414_2019_a3.af.txt, ss414_2019_a3.en.txt Done!
>>> ss414_2019_a3.af.txt, ss414_2019_a3.en.txt Done!


In [135]:
def build_vocab(files)
V = {
	"<pad>": 0,
	"<sos>": 1,
	"<eos>": 2,
	"<unk>": 3,
	"<num>": 4,
	"<com>": 5,
	"<prc>": 6,
	"<opn>": 7,
	"<cld>": 8,
	"<ltx>": 9,
}

In [136]:
# sorted(list(English), key=lambda x: len(x))
English

{'<cld>',
 '<com>',
 '<eos>',
 '<ltx>',
 '<num>',
 '<opn>',
 '<prc>',
 '<sos>',
 'a',
 'about',
 'above',
 'accidentally',
 'accomplish',
 'according',
 'accuracies',
 'accuracy',
 'actually',
 'adc',
 'add',
 'adding',
 'additional',
 'additive',
 'advisable',
 'affect',
 'affected',
 'after',
 'again',
 'aim',
 'algebraically',
 'algorithm',
 'aliasing',
 'all',
 'also',
 'alternative',
 'amplitude',
 'amplitudes',
 'an',
 'analog',
 'analogue',
 'analysed',
 'analysis',
 'analytics',
 'and',
 'annotate',
 'another',
 'answer',
 'answering',
 'answers',
 'anti',
 'any',
 'anything',
 'aperiodic',
 'applied',
 'apply',
 'approach',
 'approaches',
 'appropriate',
 'are',
 'around',
 'array',
 'artist',
 'as',
 'assigned',
 'assignment',
 'assignments',
 'associative',
 'assume',
 'assumed',
 'assuming',
 'assumption',
 'assumptions',
 'assuring',
 'at',
 'audio',
 'autocorrelation',
 'automatically',
 'average',
 'avoid',
 'away',
 'axes',
 'axis',
 'b',
 'back',
 'band',
 'bandwidth',

In [126]:
Afrikans

{'<cld>',
 '<com>',
 '<eos>',
 '<ltx>',
 '<num>',
 '<opn>',
 '<prc>',
 '<sos>',
 'a',
 'aan',
 'aanbeveel',
 'aandui',
 'aangedui',
 'aangel',
 'aangeteken',
 'aangetoon',
 'aanleer',
 'aanname',
 'aannames',
 'aanneem',
 'aantal',
 'aanvaar',
 'aanvaarding',
 'aanvanklike',
 'adc',
 'add',
 'addisionele',
 'ado',
 'af',
 'afgemonster',
 'afgeprojekteer',
 'afgerig',
 'afleiding',
 'afloop',
 'afmonstering',
 'afnyfrekwensie',
 'afrig',
 'afrigakkuraatheid',
 'afrigdatastel',
 'afrigfout',
 'afrigpunte',
 'afrigverlies',
 'afsetterm',
 'afsnyfrekwensie',
 'afstand',
 'akkuraathede',
 'akkuraatheid',
 'al',
 'algebra',
 'algemeen',
 'algoritme',
 'aliasing',
 'all',
 'alle',
 'alternatiewe',
 'amerikaanse',
 'amplitude',
 'amplitudes',
 'amplitudeweergawe',
 'analitika',
 'analoog',
 'and',
 'ander',
 'anders',
 'annotasies',
 'annoteer',
 'anti',
 'antwoord',
 'antwoorde',
 'apart',
 'artist',
 'as',
 'asse',
 'assosiatief',
 'b',
 'baie',
 'bandbeperk',
 'banddeurlaatfilter',
 'bandwy