In [1]:
import pandas as pd
import numpy as np
import collections
import copy

In [2]:
# Load the sentences
with open(r'data/europarl-v7.sv-en.lc.sv') as f:
    sv_sentences = [line.rstrip('\n') for line in f]
sv_sentences = [i[:-2] for i in sv_sentences]

with open(r'data/europarl-v7.sv-en.lc.en') as f:
    en_sentences = [line.rstrip('\n') for line in f]
en_sentences = [i[:-2] for i in en_sentences]

In [3]:
# Save all the words
sv_words = [word for sentence in sv_sentences for word in sentence.split()]
en_words = [word for sentence in en_sentences for word in sentence.split()]

In [4]:
# Use a counter to count all the words
sv_counter = collections.Counter(sv_words)
en_counter = collections.Counter(en_words)

In [5]:
sv_counter.most_common(10)

[('att', 9181),
 (',', 8876),
 ('och', 7038),
 ('i', 5949),
 ('det', 5687),
 ('som', 5028),
 ('fÃ¶r', 4959),
 ('av', 4013),
 ('Ã¤r', 3840),
 ('en', 3724)]

In [6]:
en_counter.most_common(10)

[('the', 19322),
 (',', 13513),
 ('of', 9312),
 ('to', 8801),
 ('and', 6946),
 ('in', 6090),
 ('is', 4400),
 ('that', 4357),
 ('a', 4269),
 ('we', 3223)]

In [7]:
total_occurence = sum(en_counter.values())

print('Probability of speaker appearing', en_counter['speaker']/total_occurence)
print('Probability of zebra appearing', en_counter['zebra']/total_occurence)

Probability of speaker appearing 3.682725806332815e-05
Probability of zebra appearing 0.0


In [8]:
# Counts occurance of two words next to eachother
def occurance(words, next_word, previous_word=None):
    count = 0
    t = False
    if (previous_word == None):
        count = collections.Counter(words)[next_word]
    
    for word in words:
        if (word == next_word and t):
            count += 1
        if (word == previous_word):
            t = True
        else:
            t = False
    return count

In [9]:
# Based on words being next to each other, calculate the probability of a sentence
def sentence_proba(words,str):
    counter = collections.Counter(words)
    probabilities = []
    previous_word = None
    
    # Iterate over words
    for next_word in str.split():
        numb_of_occ = occurance(words, next_word, previous_word)
        
        # if it's the first word, calc the chance of just that word
        if (previous_word == None):
            probabilities.append((numb_of_occ + 1)/sum(counter.values()))
            
        # Else calculate the chance of those words being next to one another
        else:
            probabilities.append((numb_of_occ + 1)/counter[previous_word])
        previous_word = next_word
        
    return np.prod(probabilities) * len(probabilities)    

In [10]:
sentence_proba(en_words, 'i have received your attention')

1.7791098071080344e-07

If a word or combination of two words never occur in training, probability becomes 0

if its a long sentese probabiliy goes towards 0 

to avoid getting probability of 0 on a sentece, we add a seudo count of 1 all occurances, if something appears 0 times we now say it apreas 1 time so probability does not go to 0, if it apear 5000 times we now say it appear 5001 times

to avoid long senteses getting to small chance, we devide on number of words in sentes to get avrage probability.

# C


In [11]:
# Set dictionary
sv_dict = list(set(sv_words))
en_dict = list(set(en_words))+['NULL']

In [12]:
# Save indexes for speed
sv_dict_indexes = dict(zip(sv_dict,range(len(sv_dict))))
en_dict_indexes = dict(zip(en_dict,range(len(en_dict))))

In [19]:
t0 = pd.DataFrame(np.random.rand(len(sv_dict), len(en_dict)),columns=en_dict, index=sv_dict)
t = copy.deepcopy(t0)

In [54]:
# Run EM algorithm for iterations.
for j in range(9):
    
    # Reset the counter for every run
    Ces = pd.DataFrame(np.zeros((len(sv_dict), len(en_dict))),columns=en_dict, index=sv_dict)
    Ce  = {e:0 for e in en_dict}
    
    # For every sentence..
    for i in range(len(sv_sentences)): 
        current_en_sentence = en_sentences[i].split() + ['NULL']
        
        # Loop through every swedish word..
        for sv_word in sv_sentences[i].split():
            sv_index = sv_dict_indexes[sv_word]
            t_total = t.loc[sv_word, current_en_sentence].sum()
            
            # Pair every swedish word up with an english word..
            for en_word in current_en_sentence:
                en_index = en_dict_indexes[en_word]
                
                # And calculate the counts according to the IMB Model 1
                delta = (t.values[sv_index, en_index] / t_total)
                Ces.values[sv_index, en_index] = Ces.values[sv_index, en_index] + delta
                Ce[en_word] = Ce.get(en_word) + 1
    
    # Update the t matrix with the new values.
    for en_word in en_dict:   
        en_index = en_dict_indexes[en_word]
        t.values[:,en_index] = Ces.values[:,en_index]/Ce.get(en_word)
        
    # Keep track of iterations to compare.
    t_list.append(copy.deepcopy(t))

1
2
3
4
5


### Translating
The model is trained, save it as pickle file so we can use it in the future!

In [None]:
# Save the t matrix to use in future.
t.to_pikle('t_e_s.pkl')

In [13]:
t_used = pd.read_pickle('t_e_s.pkl')

In [14]:
# Find the words most probable to be transalte from "european"
t_used['european'].sort_values(ascending=False)[:10]

europeiska    0.009607
i             0.000622
att           0.000286
och           0.000134
den           0.000125
som           0.000080
en            0.000070
fÃ¶r          0.000059
europeisk     0.000044
till          0.000039
Name: european, dtype: float64

Dictionary gets shuffled so lets update it to the matrix

In [15]:
# Update the dictionaries and indexes to match the current t matrix used.
en_dict = t_used.columns
sv_dict = list(t_used.index.values)

sv_dict_indexes = dict(zip(sv_dict,range(len(sv_dict))))
en_dict_indexes = dict(zip(en_dict,range(len(en_dict))))

We use bayes furmula to get values proportional to the probability by multiplying the rate of which a word occus by the transition probability.

In [16]:
# Create our probability matrix which is p(e|s)=p(e)*t(s|e)
q = copy.deepcopy(t_used)
counter = collections.Counter(en_words)
total = sum(counter.values())

# For every word in the dictionary
for en_word in en_dict: 
        en_index = en_dict_indexes[en_word]
        
        # p(e|s)=p(e)*t(s|e)
        q.values[:,en_index] = t_used.values[:,en_index]*(counter[en_word]/total)

In [19]:
q.drop(',', axis=1, inplace=True)
q.drop(',', axis=0, inplace=True)

Let's test a swedish word

In [17]:
q.loc['kunna'].sort_values(ascending=False)[:10]

able           4.878776e-06
could          1.889025e-06
wound          4.602928e-07
enhancement    4.506636e-07
join           3.504941e-07
faster         2.991363e-07
preclude       2.985990e-07
heal           2.945410e-07
medal          2.630517e-07
schedule       2.539770e-07
Name: kunna, dtype: float64

In [21]:
import itertools
from itertools import chain

# Translate a sentence, given all the words used.
def translate(sentence,words):
    
    # Find the best words based on our q matrix
    best_words = []
    for word in sentence.split(): 
        best_words.append(q.loc[word,:].sort_values(ascending=False).index[0])
    
    while 'NULL' in best_words:
        best_words.remove('NULL')
    
    # Iterate over all perutations and find the most probable one.
    permutations = itertools.permutations(best_words)
    max_proba = 0
    best_translation = None
    for permutation in permutations:
        permutation = ' '.join(word for word in permutation)
        sp = sentence_proba(words,permutation)
        
        # Save best translation
        if sp > max_proba:
            max_proba = sp
            best_translation = permutation
            
    return best_translation

In [22]:
print(translate('trots relativ stabilitet', en_words))
print(translate('det finns nu visserligen', en_words))
print(translate('de ansvariga har inte tagit', en_words))

despite relative stability
now interpretative there is
not accountable stillborn they have
