## Translation model

In [None]:
from collections import Counter
import re
import os

# Directory that holds the corpus files; every file found in it is processed.
path = "/work/dat410_europarl"
# NOTE(review): os.listdir returns entries in arbitrary order, yet later cells
# pick files by position (file_list[0], file_list[2]) — confirm the order on
# the machine that runs this notebook.
file_list = os.listdir(path)

def count_words(f, p = False, data_dir = "/work/dat410_europarl"):
    """Count how often each word token occurs in a corpus file.

    Args:
        f: File name, resolved relative to ``data_dir``.
        p: When true, print the name of the file before reading it.
        data_dir: Directory containing the corpus files. Defaults to the
            directory the original implementation had hard-coded.

    Returns:
        A ``Counter`` mapping each token (a maximal run of regex word
        characters) to its number of occurrences in the file.
    """
    if p:
        print(f"Reading words from file {f}")
    # The context manager closes the handle deterministically, and the explicit
    # encoding keeps non-ASCII letters intact regardless of the locale default.
    with open(os.path.join(data_dir, f), encoding="utf-8") as handle:
        words = re.findall(r'\w+', handle.read())
    return Counter(words)

def warmup(counter):
    """Print the ten most common words and two example word probabilities.

    Args:
        counter: ``Counter`` of word occurrences, as returned by ``count_words``.

    The probability of a word is its count divided by the total number of
    counted tokens. For an empty counter the probabilities are reported as 0.0
    instead of dividing by zero.
    """
    print("10 most common words are:")
    print(counter.most_common(10))
    c_total = sum(counter.values())
    for word in ("zebra", "speaker"):
        # Read from the argument, not from a notebook-level global, so the
        # function reports on the counter it was actually given.
        probability = counter[word] / c_total if c_total else 0.0
        print(f"Probability for word beeing '{word}': {probability}")
    print("------------------------------------------------------")

We decided to check all documents (including all versions of the English documents), as they produce slightly different results. Notably, we can see that the probability that a word is 'speaker' is highest in the French-English document.

In [None]:
# Warmup: print the most common words and the example word probabilities
# for every file found in the corpus directory.
for f in file_list:
    c = count_words(f, True)
    warmup(c)
    

Reading words from file europarl-v7.sv-en.lc.sv
10 most common words are:
[('att', 9181), ('och', 7038), ('i', 5954), ('det', 5687), ('som', 5028), ('för', 4959), ('av', 4013), ('är', 3840), ('en', 3724), ('vi', 3211)]
Probability for word beeing 'zebra': 0.0
Probability for word beeing 'speaker': 0.0
------------------------------------------------------
Reading words from file europarl-v7.de-en.lc.de
10 most common words are:
[('die', 10521), ('der', 9374), ('und', 7028), ('in', 4175), ('zu', 3169), ('den', 2976), ('wir', 2863), ('daß', 2738), ('ich', 2670), ('das', 2669)]
Probability for word beeing 'zebra': 0.0
Probability for word beeing 'speaker': 0.0
------------------------------------------------------
Reading words from file europarl-v7.sv-en.lc.en
10 most common words are:
[('the', 19327), ('of', 9344), ('to', 8814), ('and', 6949), ('in', 6124), ('is', 4400), ('that', 4357), ('a', 4271), ('we', 3223), ('this', 3222)]
Probability for word beeing 'zebra': 0.0
Probability for w

#### Language modeling

In [None]:
def read_sentences(f, add_NULL = True, data_dir = "/work/dat410_europarl"):
    """Split a corpus file into sentence strings.

    The text is split on newlines and on every period; fragments that are
    exactly the empty string are dropped. Because every period is a
    delimiter, abbreviations containing periods are split into several
    fragments.

    Args:
        f: File name, resolved relative to ``data_dir``.
        add_NULL: When true, prefix every sentence with the token ``NULL``
            followed by a space.
        data_dir: Directory containing the corpus files. Defaults to the
            directory the original implementation had hard-coded.

    Returns:
        List of sentence strings in file order.
    """
    # The context manager closes the handle; the explicit encoding avoids
    # depending on the locale default for non-ASCII text.
    with open(os.path.join(data_dir, f), encoding="utf-8") as handle:
        text = handle.read()

    # One linear pass instead of repeatedly scanning the list and removing
    # '' entries, which is quadratic in the number of fragments.
    sentences = [fragment for fragment in re.split(r'[\n.]', text) if fragment != '']
    if add_NULL:
        sentences = ['NULL ' + sentence for sentence in sentences]
    return sentences

def word_count_w_NULL(f):
    """Count word occurrences in a corpus file, including the NULL markers.

    Every sentence returned by ``read_sentences`` carries a leading ``NULL``
    token, so ``NULL`` is counted once per sentence alongside the real words.

    Args:
        f: Corpus file name, passed on to ``read_sentences``.

    Returns:
        A plain ``dict`` mapping each token to its total number of occurrences.
    """
    totals = Counter()
    for line in read_sentences(f, add_NULL=True):
        # Accumulate the tokens of this sentence into the running totals.
        totals.update(re.findall(r'\w+', line))
    # Callers receive an ordinary dict, not a Counter.
    return dict(totals)
    
# Preview the first eight parsed sentences of the first corpus file, without the NULL prefix.
read_sentences(file_list[0], add_NULL=False)[0:8]
# Note: because sentences are split on every '.', the abbreviation "bl.a." is
# cut into separate fragments (see the output below).

['jag förklarar europaparlamentets session återupptagen efter avbrottet den 17 december ',
 ' jag vill på nytt önska er ett gott nytt år och jag hoppas att ni haft en trevlig semester ',
 'som ni kunnat konstatera ägde &quot; den stora år 2000-buggen &quot; aldrig rum ',
 ' däremot har invånarna i ett antal av våra medlemsländer drabbats av naturkatastrofer som verkligen varit förskräckliga ',
 'ni har begärt en debatt i ämnet under sammanträdesperiodens kommande dagar ',
 'till dess vill jag att vi , som ett antal kolleger begärt , håller en tyst minut för offren för bl',
 'a',
 ' stormarna i de länder i europeiska unionen som drabbats ']

In [None]:
def bigram_probs(f):
    """Estimate bigram probabilities P(word2 | word1) from a corpus file.

    Bigrams are counted within each sentence only, so no bigram spans a
    sentence boundary. Every sentence starts with the ``NULL`` token, which
    makes ``('NULL', w)`` the bigram for "w starts a sentence".

    Args:
        f: Corpus file name, passed on to ``word_count_w_NULL`` and
            ``read_sentences``.

    Returns:
        A tuple ``(bi_probs, word_count)`` where ``bi_probs`` maps
        ``(word1, word2)`` to ``count(word1, word2) / count(word1)`` and
        ``word_count`` is the word-count dict from ``word_count_w_NULL``.
    """
    # Unigram counts (including NULL) serve as the denominators.
    word_count = word_count_w_NULL(f)

    # Counting sentence by sentence prevents bigrams across sentences.
    tot_bi_counts = Counter()
    for sentence in read_sentences(f):
        sentence_words = re.findall(r'\w+', sentence)
        # Each adjacent pair is counted exactly once per occurrence. The
        # previous loop over Counter.elements() added a bigram's count once
        # per occurrence, so a bigram seen k times in a sentence added k*k.
        tot_bi_counts.update(zip(sentence_words, sentence_words[1:]))

    # Dictionary that stores P(word2|word1)
    bi_probs = {
        bigram: count / word_count[bigram[0]]
        for bigram, count in tot_bi_counts.items()
    }

    return bi_probs , word_count

# Split into functions for easier debugging and improvement
def calculate_sentence_prob(sentence, bi_probs, word_count):
    """Score a sentence as the product of its bigram probabilities.

    Args:
        sentence: Text to score; it is tokenised into runs of word characters.
        bi_probs: Mapping ``(word1, word2) -> P(word2 | word1)``.
        word_count: Mapping ``word -> occurrence count`` in the training data.

    Returns:
        The product over all adjacent word pairs. A pair that is missing from
        ``bi_probs`` contributes ``1 / count(word1)`` when ``word1`` is known
        (the probability a single occurrence of the pair would have had) and
        1 when ``word1`` is unknown. Sentences with fewer than two words
        score 1.
    """
    words = re.findall(r'\w+', sentence)

    prob = 1
    for first, second in zip(words, words[1:]):
        bigram = (first, second)
        if bigram in bi_probs:
            prob *= bi_probs[bigram]
            continue
        # Unseen bigram: the fallback was previously computed but never
        # multiplied into the result, so unseen pairs did not lower the score.
        first_count = word_count.get(first, 0)
        if first_count > 0:
            prob *= 1 / first_count
        # Note: P(Known|Unknown) = 1/1, so an unknown first word leaves prob unchanged.
    return prob


In our code we try to calculate the probability that a single occurrence of the unseen word would have had if it had been part of the dataset. This is not entirely accurate, since we do not recalculate the probabilities of the other items, but it seems reasonable because we cannot always trust the input text passed to this function.

As we can see, the probabilities are quite low and get significantly lower as the sentences get longer.

In [None]:
# Train the bigram model on the first corpus file and score three example sentences.
bp, word_count = bigram_probs(file_list[0])
print('Sentence from source text....')
print(calculate_sentence_prob("alla gläder vi oss åt att domstolen har friat honom", bp, word_count)) 
# NOTE(review): the corpus appears to be lower-cased, so the capitalised "Jag"
# is presumably treated as an unknown word — confirm.
print('Sentence with unknown words...')
print(calculate_sentence_prob("Jag gillar vill gå på bio och titta på spiderman på fredag", bp, word_count)) 
print('Long sentence... ')
print(calculate_sentence_prob("på uppmaning av en fransk parlamentsledamot , zimeray , har redan en framställning gjorts , undertecknad av många , bland annat jag själv , men jag uppmanar er , i enlighet med de riktlinjer som europaparlamentet och hela den europeiska gemenskapen alltid har hållit fast vid , att med all den tyngd ni har i kraft av ert ämbete och den institution ni företräder , uppmana texas guvernör , bush , att uppskjuta verkställigheten och att benåda den dömde", bp, word_count)) 

Sentence from source text....
6.955939123260558e-18
Sentence with unknown words...
5.381281887355061e-12
Long sentence... 
4.198484462055951e-111


#### Translation modeling

In [None]:
import random

def IBMOne(scr_file, en_file, iterations = 10, data_dir = '/work/dat410_europarl'):
    """Train word translation probabilities with the IBM Model 1 EM procedure.

    The two files are read line by line and line ``i`` of the source file is
    paired with line ``i`` of the English file. Every English sentence is
    extended with a leading ``NULL`` token.

    Args:
        scr_file: Name of the source-language file inside ``data_dir``.
        en_file: Name of the English file inside ``data_dir``.
        iterations: Number of EM iterations to run.
        data_dir: Directory containing the corpus files. Defaults to the
            directory the original implementation had hard-coded.

    Returns:
        Dict mapping ``(scr_word, en_word)`` to the estimated probability of
        ``scr_word`` given ``en_word``. It holds one entry for every word pair
        that co-occurs in some aligned line pair.
    """
    # Extract lines from files; the context managers close the handles.
    with open(os.path.join(data_dir, scr_file), encoding='utf-8') as scr_handle:
        scr_lines = scr_handle.read().splitlines()
    with open(os.path.join(data_dir, en_file), encoding='utf-8') as en_handle:
        en_lines = en_handle.read().splitlines()

    # Randomly initialised translation table over all co-occurring pairs.
    scr_given_en = init_word_translation_probs(scr_lines, en_lines)

    # Tokenise the corpus once instead of on every EM iteration.
    sentence_pairs = [
        (re.findall(r'\w+', scr_line), ['NULL'] + re.findall(r'\w+', en_line))
        for scr_line, en_line in zip(scr_lines, en_lines)
    ]

    for _ in range (iterations):
        # Fresh pseudocounts for this iteration. Pair counts reuse the keys of
        # the translation table, so no second random table is needed.
        scr_en_pair_count = dict.fromkeys(scr_given_en, 0)
        en_word_count = {}

        # E-step: distribute each source word over the English words of its line.
        for scr_words, en_words in sentence_pairs:
            for scr_word in scr_words:
                p_sum = 0
                for en_word in en_words:
                    p_sum += scr_given_en[(scr_word, en_word)]

                for en_word in en_words:
                    # Alignment probability of this English word for scr_word.
                    prob = scr_given_en[(scr_word, en_word)] / p_sum
                    scr_en_pair_count[(scr_word, en_word)] += prob
                    en_word_count[en_word] = en_word_count.get(en_word, 0) + prob

        # M-step: re-estimate probabilities from the pseudocounts.
        for word_pair in scr_given_en:
            scr_given_en[word_pair] = scr_en_pair_count[word_pair] / en_word_count[word_pair[1]]

    return scr_given_en #This is what we later call "dictionary" during the translation part


def init_word_translation_probs(scr_lines,en_lines, zeroes = False):
    """Build the initial table over all word pairs from aligned lines.

    A pair ``(scr_word, en_word)`` is created for every source word and every
    English word (plus the ``NULL`` token) that occur on the same line index.

    Args:
        scr_lines: Source-language lines.
        en_lines: English lines; ``en_lines[i]`` is paired with ``scr_lines[i]``.
        zeroes: When true, every pair is initialised to 0 (useful for count
            tables). When false (default), every pair receives a random value
            drawn with ``random.uniform(0, 0.99)``. This flag was previously
            accepted but ignored.

    Returns:
        Dict mapping ``(scr_word, en_word)`` to its initial value.

    Raises:
        IndexError: If ``en_lines`` has fewer lines than ``scr_lines``.
    """
    translation_probs = dict()

    for idx, scr_line in enumerate(scr_lines):
        scr_words = re.findall(r'\w+', scr_line)
        en_words = ['NULL'] + re.findall(r'\w+', en_lines[idx])

        for scr_word in scr_words:
            for en_word in en_words:
                # The random draws are unseeded, so the start values differ
                # between runs.
                translation_probs[(scr_word, en_word)] = 0 if zeroes else random.uniform(0, 0.99)

    return translation_probs

# Train the translation table on the files at positions 0 and 2 of file_list.
# NOTE(review): the warm-up output suggests these are the Swedish and English
# sides of the sv-en corpus, but the positions depend on the os.listdir order — confirm.
svGivenEn= IBMOne(file_list[0], file_list[2])

In [None]:
# The ten Swedish words with the highest stored probability given the English word "european",
# as (probability, swedish_word) tuples in descending order.
european_in_sv = sorted(
    (
        (prob, sv_word)
        for (sv_word, en_word), prob in svGivenEn.items()
        if en_word == "european"
    ),
    reverse=True,
)
european_in_sv[0:10]

[(0.8539941237408167, 'europeiska'),
 (0.08068846303480778, 'europeisk'),
 (0.01621272820284629, 'i'),
 (0.007984925393355307, 'till'),
 (0.006425479935892108, 'en'),
 (0.005689820584388745, 'för'),
 (0.003993640802996053, 'den'),
 (0.0037668360247995096, 'unionen'),
 (0.0036315362233786206, 'europaparlamentet'),
 (0.0032533871471700777, 'om')]

#### Decoding

In [None]:
import itertools
import numpy as np

class Translator():
    """Word-by-word translator from a source language into English.

    Each source word is replaced by its most probable English counterpart from
    the translation table, and the English words are then ordered with the
    bigram language model.

    Attributes are set in ``__init__``:
        bp: Bigram probabilities ``(word1, word2) -> P(word2 | word1)``.
        en_word_count: Word counts of the English training file.
        dictionary: Translation table ``(scr_word, en_word) -> probability``.
    """

    def __init__(self, scr_file, en_file, iterations=10):
        """Train the language model and the translation table.

        Args:
            scr_file: Name of the source-language corpus file.
            en_file: Name of the English corpus file.
            iterations: Number of training iterations passed to ``IBMOne``.
        """
        self.bp, self.en_word_count = bigram_probs(en_file)
        self.dictionary = IBMOne(scr_file, en_file, iterations)

    def translate(self, sentence):
        """Translate a sentence and return the English words as one string.

        Args:
            sentence: Source-language text; tokenised into runs of word characters.

        Returns:
            The translated words in the order chosen by
            ``calculate_sentence_prob_maximize_two``, each followed by a
            single space. An input without any words yields ``""``.
        """
        words = re.findall(r'\w+', sentence)
        translated_words = [self.most_prob_word(word) for word in words]

        # Order the words with the sentence probability model (maximize).
        return self.calculate_sentence_prob_maximize_two(translated_words)

    def most_prob_word(self, scr_word):
        """Return the English word with the highest translation probability.

        Args:
            scr_word: Source-language word to look up.

        Returns:
            The English word of the highest-probability ``(scr_word, en_word)``
            entry in ``self.dictionary``, or ``scr_word`` itself when the
            dictionary has no entry with a positive probability for it.
        """
        highest_prob = 0
        most_prob_word = ""
        # Every dictionary key is a word pair (scr_word, en_word).
        for (candidate_scr, candidate_en), prob in self.dictionary.items():
            if candidate_scr == scr_word and prob > highest_prob:
                most_prob_word = candidate_en
                highest_prob = prob
        if most_prob_word == "":
            # If word is not in our dictionary, pass it through untranslated.
            most_prob_word = scr_word
        return most_prob_word

    # Tries all orderings of the words in the translated word list and returns the most likely one
    # using the P(E) function. An optimal layout would also try different tenses of the words
    # and different filler words, but for this assignment we have implemented the simplest model.
    def calculate_sentence_prob_maximize(self, word_list):
        """Return the most probable ordering of ``word_list`` by exhaustive search.

        Every permutation is scored with ``calculate_sentence_prob``, so the
        cost grows factorially with the number of words.

        Args:
            word_list: English words to order.

        Returns:
            The best-scoring ordering as a space-separated string with all
            ``NULL`` tokens removed.
        """
        res_sentence = ""
        highest_prob = 0

        for possible_sentence in map(" ".join, itertools.permutations(word_list)):
            sentence_prob = calculate_sentence_prob(possible_sentence, self.bp, self.en_word_count)
            if sentence_prob > highest_prob:
                res_sentence = possible_sentence
                highest_prob = sentence_prob

        # Strip NULL tokens once, after the search. Strings have no remove()
        # method, so the earlier in-loop removal raised AttributeError.
        return " ".join(word for word in res_sentence.split() if word != 'NULL')

    def _most_likely_successor(self, previous_word, candidates):
        """Pick the candidate most likely to follow ``previous_word``.

        The candidate with the highest bigram probability wins. When no
        candidate has a known bigram, the first candidate whose bigram is
        missing from the model is used.
        """
        best_word = ""
        best_prob = 0
        for word in candidates:
            prob = self.bp.get((previous_word, word))
            if prob is None:
                # Unknown bigram: only a fallback while nothing better exists.
                if best_word == "":
                    best_word = word
            elif prob > best_prob:
                best_word = word
                best_prob = prob
        # Guard against a model that only offers non-positive probabilities.
        return best_word if best_word != "" else candidates[0]

    # Looks only at the most probable word of translated words to come after the previous word according to bi_probs model.
    def calculate_sentence_prob_maximize_two(self, word_list):
        """Greedily order ``word_list`` with the bigram model.

        Starting from the sentence-start token ``NULL``, the word most likely
        to follow the previously chosen word is appended until every word has
        been used. The caller's list is left unmodified.

        Args:
            word_list: English words to order.

        Returns:
            The ordered words, each followed by a single space, with all
            ``NULL`` tokens dropped. An empty ``word_list`` yields ``""``.
        """
        remaining = list(word_list)  # work on a copy; do not mutate the argument
        ordered = []
        last_word = 'NULL'  # sentence-start marker used by the bigram model
        while remaining:
            next_word = self._most_likely_successor(last_word, remaining)
            remaining.remove(next_word)
            ordered.append(next_word)
            last_word = next_word

        # NULL is an alignment placeholder, not an English word.
        return "".join(word + " " for word in ordered if word != 'NULL')


In [None]:
# Build the translator from the files at positions 0 and 2 of file_list.
# NOTE(review): these positions depend on the os.listdir order — confirm they
# are the intended source and English files.
sv_to_en = Translator(file_list[0], file_list[2])

In [None]:
# Short test sentence: translate a three-word input.
sv_to_en.translate("vi måste enas")

'we must agreeing '

In [None]:
# Longer test: translate a sentence taken from the text.
sv_to_en.translate("frågan för närvarande inte föremål för någon begäran")

'for not for question currently presently anyone request '

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=4ba8ee87-29e0-4e7b-bdf0-521dbe79e112' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>