In [37]:
import random
import math
from collections import Counter

class LanguageModel:
    def __init__(self, n, corpus, vocabulary):
        self.n = n
        self.ngrams = Counter()
        self.total_ngrams = 0
        self.corpus = corpus
        self.vocabulary_size = len(vocabulary)

    def train(self):
        for sentence in self.corpus:
            tokens = sentence.split()
            for i in range(len(tokens) - self.n + 1):
                ngram = tuple(tokens[i:i+self.n])
                self.ngrams[ngram] += 1
                self.total_ngrams += 1

    def log_probability(self, sentence):
        tokens = sentence.split()
        log_prob = 0.0
        for i in range(len(tokens) - self.n + 1):
            ngram = tuple(tokens[i:i+self.n])
            prob = (self.ngrams[ngram] + 1) / (self.total_ngrams + self.vocabulary_size)  # Laplace smoothing
            log_prob += math.log(prob)
        return log_prob

class IBMModel1:
    def __init__(self, swedish_file, english_file):
        self.swedish_file = swedish_file
        self.english_file = english_file
        self.swedish_sentences = []
        self.english_sentences = []
        self.swedish_vocab = {}
        self.english_vocab = {}
        self.translation_probabilities = {}
        self.ngram_order = 2

    def read_files(self):
        with open(self.swedish_file, 'r', encoding='utf-8') as f:
            self.swedish_sentences = f.readlines()
        with open(self.english_file, 'r', encoding='utf-8') as f:
            self.english_sentences = f.readlines()

    def load_subset(self, num_sentences):
        with open(self.swedish_file, 'r', encoding='utf-8') as f:
            self.swedish_sentences = [next(f).strip() for _ in range(num_sentences)]
        with open(self.english_file, 'r', encoding='utf-8') as f:
            self.english_sentences = [next(f).strip() for _ in range(num_sentences)]

    def preprocess_data(self):
        self.load_subset(100)

        for i in range(len(self.swedish_sentences)):
            swedish_tokens = self.swedish_sentences[i].split()  # Splitting by whitespace
            english_tokens = self.english_sentences[i].split()  # Splitting by whitespace

            for token in swedish_tokens:
                if token not in self.swedish_vocab:
                    self.swedish_vocab[token] = len(self.swedish_vocab)

            for token in english_tokens:
                if token not in self.english_vocab:
                    self.english_vocab[token] = len(self.english_vocab)

    def init_trans_prob(self):
        self.translation_probabilities = {
            (swedish_word, english_word): random.random()
            for swedish_word in self.swedish_vocab
            for english_word in self.english_vocab
        }

    def train(self, num_iterations=10):
        self.init_trans_prob()
        self.language_model = LanguageModel(self.ngram_order, self.english_sentences, self.english_vocab)
        self.language_model.train()

        for iteration in range(num_iterations):
            count_fe = {}
            print(self.translation_probabilities[("europeiska", "european")])

            for swedish_word in self.swedish_vocab:
                count_fe[swedish_word] = {}
                for english_word in self.english_vocab:
                    count_fe[swedish_word][english_word] = 0.0

            c_e = {}
            for english_word in self.english_vocab:
                c_e[english_word] = 0.0
                
            # Expectation
            for k in range(len(self.swedish_sentences)):  # For each sentence pair
                swedish_tokens = self.swedish_sentences[k].split()
                english_tokens = self.english_sentences[k].split()

                for swedish_word in swedish_tokens:  # For each swedish word
                    total_sw = sum(self.translation_probabilities.get((swedish_word, english_word), 0) for english_word in english_tokens)

                    for english_word in english_tokens:  # For each english word
                        try:
                            delta = self.translation_probabilities[(swedish_word, english_word)] / total_sw  # Compute alignment prob
                            count_fe[swedish_word][english_word] += delta  # Update pseudocount
                            c_e[english_word] += delta  # Update pseudocount
                        except ZeroDivisionError:
                            pass  # Skip if total_sw is 0

            # Maximization step (Reestimate probabilities)
            for english_word in self.english_vocab:
                for swedish_word in self.swedish_vocab:
                    try:
                        self.translation_probabilities[(swedish_word, english_word)] = count_fe[swedish_word][english_word] / c_e[english_word]
                    except ZeroDivisionError:
                        pass  # Skip if c_e[english_word] is 0

    def log_probability(self, swedish_sentence, english_sentence):
        log_prob = 0.0
        for swedish_word in swedish_sentence.split():
            for english_word in english_sentence:
                try:
                    prob = self.translation_probabilities[(swedish_word, english_word)]
                    if prob <= 0:  # Handle non-positive probabilities
                        log_prob += math.log(1e-10)  # Add small epsilon instead of log(prob)
                    else:
                        log_prob += math.log(prob)
                except KeyError:
                    pass  # Skip if translation probability is not found
        return log_prob   
              
    def get_top_10(self, english_word, number_of_trans=10):
        translation = []
        for swedish_word, trans_prob in self.translation_probabilities.items():
            if swedish_word[1] == english_word:
                translation.append((swedish_word[0], trans_prob))
        translation.sort(key=lambda x: x[1], reverse=True)
        return translation[:number_of_trans]

    def decoder(self, swedish_sentence):
        max_log_prob = float('-inf')
        best_translation = None

        for english_sentence in self.english_sentences:
            p_e = self.language_model.log_probability(english_sentence)
            p_f_e = self.log_probability(swedish_sentence, english_sentence)
            joint_log_prob = p_e + p_f_e

            if joint_log_prob > max_log_prob:
                max_log_prob = joint_log_prob
                best_translation = english_sentence

        return best_translation

# Example usage:
IBM = IBMModel1('datasets/europarl-v7.sv-en.lc.sv', 'datasets/europarl-v7.sv-en.lc.en')
IBM.preprocess_data()
IBM.train()

swedish_sentence = 'jag tror att det är ett bra sätt .'
english_translation = IBM.decoder(swedish_sentence)
print(english_translation)




0.47932198471520016
0.007710276573754092
0.016504454337880652
0.033074328597251894
0.061048580398416784
0.10112491681696638
0.15055645469581197
0.20434536360870698
0.2577484921837751
0.3077500379609673
that did not happen .


In [38]:
top_translations = IBM.get_top_10("european")
print("Top translations for 'european' in Swedish:")
for swedish_word, trans_prob in top_translations:
    print(swedish_word,trans_prob)

Top translations for 'european' in Swedish:
europeiska 0.35301184881870934
för 0.15145843834191944
i 0.12903371480804549
de 0.0575277424221576
på 0.04495331308244658
med 0.036299816522218864
ett 0.029792549882698856
. 0.02837076835401436
av 0.023968595840000516
samt 0.023156696006807215
