In [1]:
import random
import math
from collections import Counter, defaultdict  
import nltk
from nltk.tokenize import word_tokenize
import numpy as np

In [4]:
class BigramLanguageModel:
    """Unsmoothed bigram language model built from tokenized sentences.

    Every sentence is wrapped in a ``'NULL'`` boundary token on both sides
    before its bigrams are counted or scored.

    Attributes:
        bigram_counts: ``bigram_counts[w1][w2]`` is how often ``w2``
            directly followed ``w1`` in the training corpus.
        unigram_counts: how often each token occurred as the first element
            of a bigram (the denominator of the conditional probability).
        total_bigrams: total number of bigrams counted.
    """

    def __init__(self, corpus):
        """Count bigrams over ``corpus``, an iterable of token sequences."""
        self.bigram_counts = defaultdict(lambda: defaultdict(int))
        self.unigram_counts = defaultdict(int)
        self.total_bigrams = 0

        for sentence in corpus:
            padded = ['NULL', *sentence, 'NULL']
            for previous, current in zip(padded, padded[1:]):
                self.bigram_counts[previous][current] += 1
                self.unigram_counts[previous] += 1
                self.total_bigrams += 1

    def score(self, sentence):
        """Return the log-probability of ``sentence`` under the model.

        Args:
            sentence: sequence of tokens (without boundary markers).

        Returns:
            Sum of ``log P(w_i | w_{i-1})`` over the boundary-padded
            sentence. Any bigram that was not observed in training,
            including one whose context word is unknown, contributes
            ``log(1e-3)`` instead.
        """
        padded = ['NULL', *sentence, 'NULL']
        unseen_log_prob = np.log(1e-3)
        score = 0.0
        for previous, current in zip(padded, padded[1:]):
            # Use .get() rather than indexing: indexing a defaultdict would
            # insert the missing key and silently grow the model on every
            # call to score().
            context_count = self.unigram_counts.get(previous, 0)
            followers = self.bigram_counts.get(previous)
            pair_count = followers.get(current, 0) if followers else 0
            # An unknown context word has a zero denominator; treat it as an
            # unseen bigram instead of dividing by zero.
            if context_count > 0 and pair_count > 0:
                score += np.log(pair_count / context_count)
            else:
                score += unseen_log_prob
        return score



class IBMModel1:
    """IBM Model 1 lexical translation model trained with EM.

    The model is estimated from sentence-aligned Swedish/English token
    lists. ``translation_probs[en_word][sw_word]`` stores the current
    estimate of t(en_word | sw_word); each M-step normalizes these values so
    that, for a fixed Swedish word, they sum to one over the English words it
    co-occurred with. No NULL source word is modelled.
    """

    def __init__(self, swedish_sentences, english_sentences, iterations=10):
        """Store the parallel corpus; no training happens until ``train()``.

        Args:
            swedish_sentences: list of tokenized Swedish sentences.
            english_sentences: list of tokenized English sentences, aligned
                index-by-index with ``swedish_sentences``.
            iterations: number of EM iterations run by ``train()``.
        """
        self.swedish_sentences = swedish_sentences
        self.english_sentences = english_sentences
        self.iterations = iterations
        self.translation_probs = defaultdict(lambda: defaultdict(float))

    def get_top_translations(self, target_word, top_n=10):
        """Return the best-scoring Swedish words for an English word.

        Args:
            target_word: English word to look up.
            top_n: maximum number of entries to return.

        Returns:
            Up to ``top_n`` ``(swedish_word, probability)`` pairs sorted by
            descending probability; an empty list if ``target_word`` is
            unknown.
        """
        # .get() avoids creating an empty row for unknown words.
        translations = self.translation_probs.get(target_word, {})
        ranked = sorted(translations.items(), key=lambda item: item[1], reverse=True)
        return ranked[:top_n]

    def train(self, report_word="european", top_n=10):
        """Run ``self.iterations`` rounds of EM over the parallel corpus.

        After every iteration the current top translations of
        ``report_word`` are printed as a progress report.

        Args:
            report_word: English word whose top translations are printed
                after each iteration; pass ``None`` to train silently.
            top_n: number of translations shown in each report.
        """
        vocab_e = {word for sentence in self.english_sentences for word in sentence}
        if not vocab_e:
            # Nothing to estimate; also avoids dividing by a zero vocabulary size.
            return
        init_prob = 1.0 / len(vocab_e)

        # Uniform initialization over every co-occurring word pair.
        for sw, en in zip(self.swedish_sentences, self.english_sentences):
            if not sw or not en:
                continue
            for en_word in en:
                row = self.translation_probs[en_word]
                for sw_word in sw:
                    row[sw_word] = init_prob

        for i in range(self.iterations):
            count = defaultdict(float)
            total = defaultdict(float)

            # E-step: distribute each English word's unit of mass over the
            # Swedish words of its sentence, proportionally to t(en | sw).
            for sw, en in zip(self.swedish_sentences, self.english_sentences):
                if not sw or not en:
                    continue
                for en_word in en:
                    row = self.translation_probs[en_word]
                    total_en = sum(row[sw_word] for sw_word in sw)
                    if total_en == 0:
                        # All probabilities underflowed; no mass to distribute.
                        continue
                    for sw_word in sw:
                        share = row[sw_word] / total_en
                        count[(en_word, sw_word)] += share
                        total[sw_word] += share

            # M-step: renormalize the expected counts per Swedish word.
            for (en_word, sw_word), val in count.items():
                self.translation_probs[en_word][sw_word] = val / total[sw_word]

            if report_word is not None:
                top_translations = self.get_top_translations(report_word, top_n=top_n)
                print(f"Iteration {i+1}:")
                print(f"Top {top_n} translations for the English word '{report_word}':")
                for translation, probability in top_translations:
                    print(f"{translation}: {probability}")
                print()

    def translate(self, swedish_sentence):
        """Look up the most probable English word(s) for each Swedish word.

        Args:
            swedish_sentence: sequence of Swedish tokens.

        Returns:
            A list with one entry per input token. Each entry is the list of
            English words sharing the highest t(en | sw) for that token
            (several words on ties, an empty list if the token is unknown).
        """
        english_sentence = []
        for sw_word in swedish_sentence:
            candidates = []
            best_prob = 0.0
            for en_word, probs in self.translation_probs.items():
                # Membership test first: indexing the inner defaultdict
                # would insert a 0.0 entry for every English word.
                if sw_word not in probs:
                    continue
                prob = probs[sw_word]
                if prob > best_prob:
                    best_prob = prob
                    candidates = [en_word]
                elif prob == best_prob:
                    candidates.append(en_word)
            english_sentence.append(candidates)
        return english_sentence


def argmax_translation(ibm_model, language_model, swedish_sentence):
    """Pick the candidate English translation the language model prefers.

    ``ibm_model.translate`` yields, for every Swedish token, a list of
    equally probable English words. Every combination of one word per token
    is scored with ``language_model.score`` and the highest-scoring one is
    kept. Tokens with no candidate translation are dropped from the output.

    Note: the number of combinations is the product of the per-token
    candidate counts, so heavily tied translation tables make this expensive.

    Args:
        ibm_model: object with ``translate(sentence)`` returning a list of
            candidate-word lists, one per input token.
        language_model: object with ``score(tokens)`` returning a
            log-probability (higher is better).
        swedish_sentence: sequence of Swedish tokens.

    Returns:
        ``(best_translation, best_score)`` where ``best_translation`` is a
        flat list of English tokens and ``best_score`` is its language-model
        score.
    """
    from itertools import product

    per_word_candidates = [
        candidates
        for candidates in ibm_model.translate(swedish_sentence)
        if candidates
    ]

    best_translation = []
    best_score = None
    # product() with no iterables yields a single empty tuple, so an empty or
    # fully untranslatable sentence is still scored exactly once.
    for combination in product(*per_word_candidates):
        candidate = list(combination)
        score = language_model.score(candidate)
        # Log-probabilities are negative: compare them directly, not by
        # magnitude, so the most probable candidate wins.
        if best_score is None or score > best_score:
            best_score = score
            best_translation = candidate

    return best_translation, best_score

# Driver: load the Swedish/English Europarl files, train IBM Model 1 and a
# bigram language model on a small slice, then translate one test sentence.

# Fraction of the corpus used for training (presumably kept small so that EM
# training finishes in reasonable time -- adjust as needed).
SAMPLE_FRACTION = 0.01

with open('datasets/europarl-v7.sv-en.lc.sv', 'r', encoding='utf-8') as f:
    swedish_lines = f.readlines()

with open('datasets/europarl-v7.sv-en.lc.en', 'r', encoding='utf-8') as f:
    english_lines = f.readlines()

sample_size = int(len(swedish_lines) * SAMPLE_FRACTION)

# Take the first `sample_size` lines of each file (a deterministic prefix,
# not a random sample). Using the same index range on both sides keeps the
# sentence pairs aligned.
sampled_swedish_lines = swedish_lines[:sample_size]
sampled_english_lines = english_lines[:sample_size]

# Tokenize each sampled line into a list of word tokens.
swedish_corpus = [word_tokenize(line.strip()) for line in sampled_swedish_lines]
english_corpus = [word_tokenize(line.strip()) for line in sampled_english_lines]

# Translation model: estimated from the aligned sentence pairs.
ibm_model = IBMModel1(swedish_corpus, english_corpus)
ibm_model.train()

# Language model: estimated from the English side only.
bigram_lm = BigramLanguageModel(english_corpus)

swedish_sentence = ['jag', 'ber', 'er']
best_translation, best_score = argmax_translation(ibm_model, bigram_lm, swedish_sentence)

print("Swedish Sentence:", swedish_sentence)
print("Best English Translation:", best_translation)
print("Translation Score (log probability):", best_score)

Iteration 1:
Top 10 translations for the English word 'european':
personer: 0.03703703703703704
nyligen: 0.03703703703703704
mördades: 0.03703703703703704
kumar: 0.03703703703703704
ponnambalam: 0.03703703703703704
besökte: 0.03703703703703704
bara: 0.03703703703703704
månader: 0.03703703703703704
föreslår: 0.03448275862068965
röstar: 0.03448275862068965

Iteration 2:
Top 10 translations for the English word 'european':
dess: 0.04969107246932571
europaparlamentet: 0.04447611835136042
uttalande: 0.04119467091259389
samt: 0.04076888134095893
personer: 0.03958641739566613
nyligen: 0.03958641739566613
mördades: 0.03958641739566613
kumar: 0.03958641739566613
ponnambalam: 0.03958641739566613
besökte: 0.03958641739566613

Iteration 3:
Top 10 translations for the English word 'european':
dess: 0.08513355581513131
europaparlamentet: 0.07796174800151279
europeiska: 0.0775092581862694
samt: 0.0773235225511381
själv: 0.055163383258520896
uttalande: 0.04930062672755437
personer: 0.04123285974870843

KeyboardInterrupt: 