## c) Translation modeling. We will now estimate the parameters of the translation model P(f|e).

Self-check: if our goal is to translate from some language into English, why does our conditional probability seem to be written backwards? Why don't we estimate P(e|f) instead?

Write code that implements the estimation algorithm for IBM model 1. Then print, for either Swedish, German, or French, the 10 words that the English word european is most likely to be translated into, according to your estimate. It can be interesting to look at this list of 10 words and see how it changes during the EM iterations.


In [28]:
import random
from collections import Counter

class LanguageModel:
    def __init__(self, n, corpus):
        self.n = n
        self.ngrams = {}
        self.total_ngrams = 0
        self.corpus = corpus

    def train(self):
        for sentence in self.corpus:
            for i in range(len(sentence) - self.n + 1):
                ngram = tuple(sentence[i:i+self.n])
                self.ngrams[ngram] += 1
                self.total_ngrams += 1

    def probability(self, sentence):
        tokens = sentence.split()
        prob = 1.0
        for i in range(len(tokens) - self.n + 1):
            ngram = tuple(tokens[i:i+self.n])
            prob *= self.ngrams[ngram] / self.total_ngrams
        return prob

class IBMModel1:
    def __init__(self, swedish_file, english_file):
        self.swedish_file = swedish_file
        self.english_file = english_file
        self.swedish_sentences = []
        self.english_sentences = []
        self.swedish_vocab = {}
        self.english_vocab = {}
        self.bigram_probabilities = {}
        self.translation_probabilities = {}
        self.ngram_order = 2
        
    def read_files(self):
        with open(self.swedish_file, 'r', encoding='utf-8') as f:
            self.swedish_sentences = f.readlines()
        with open(self.english_file, 'r', encoding='utf-8') as f:
            self.english_sentences = f.readlines()
    
    def load_subset(self, num_sentences):
        with open(self.swedish_file, 'r', encoding='utf-8') as f:
            self.swedish_sentences = [next(f).strip() for _ in range(num_sentences)]
        with open(self.english_file, 'r', encoding='utf-8') as f:
            self.english_sentences = [next(f).strip() for _ in range(num_sentences)]

    def preprocess_data(self):
        self.load_subset(100)

        for i in range(len(self.swedish_sentences)):
            swedish_tokens = self.swedish_sentences[i].split()  # Splitting by whitespace
            english_tokens = self.english_sentences[i].split()  # Splitting by whitespace

            for token in swedish_tokens:
                if token not in self.swedish_vocab:
                    self.swedish_vocab[token] = len(self.swedish_vocab)

            for token in english_tokens:
                if token not in self.english_vocab:
                    self.english_vocab[token] = len(self.english_vocab)
    
    def run_lang_model(self):
        self.language_model = LanguageModel(self.ngram_order, self.english_sentences)

    def init_trans_prob(self):
        self.translation_probabilities = {
            (swedish_word, english_word): 1.0 / len(self.english_vocab)
            for swedish_word in self.swedish_vocab
            for english_word in self.english_vocab
        }

    def train(self, num_iterations=10):
        self.init_trans_prob()
        
        for iteration in range(num_iterations):
            print(self.translation_probabilities[("jag", "declare")])
            count_fe = {}

            for swedish_word in self.swedish_vocab:
                count_fe[swedish_word] = {}
                for english_word in self.english_vocab:
                    count_fe[swedish_word][english_word] = 0.0
        
            c_e = {}
            for english_word in self.english_vocab:
                    c_e[english_word] = 0.0

        

            #Expectation
            for k in range(len(self.swedish_sentences)): # For each sentence pair
             swedish_tokens = self.swedish_sentences[k].split()  
             english_tokens = self.english_sentences[k].split()

             for swedish_word in swedish_tokens:  # For each swedish word
                 total_sw = 0.0
                 for english_word in english_tokens:
                     total_sw += self.translation_probabilities[(swedish_word, english_word)]

                 for english_word in english_tokens:     #For each english word
                        delta = self.translation_probabilities[(swedish_word, english_word)] / total_sw  # Compute alignment prob
                        count_fe[swedish_word][english_word] += delta  # Update pseudocount
                        c_e[english_word] += delta  # Update pseudocount

         # Maximization step (Reestimate probabilities)
            for english_word in self.english_vocab:
                for swedish_word in self.swedish_vocab:
                    self.translation_probabilities[(swedish_word, english_word)] = count_fe[swedish_word][english_word] / c_e[english_word]

IBM = IBMModel1('datasets/europarl-v7.sv-en.lc.sv', 'datasets/europarl-v7.sv-en.lc.en')
IBM.preprocess_data()
IBM.train()



0.0013157894736842105
0.0967741935483871
0.08168535863305357
0.0612108085841098
0.04675750078441429
0.0368707668538915
0.030013385320389337
0.025152961997690494
0.021631448258980935
0.019030246860963575
