In [5]:
import pickle as pk
import numpy as np
import sys
import os
import optparse
import logging

import optparse
import logging
import os

def get_arguments():
    parser = optparse.OptionParser()
    
    parser.add_option("-d", "--data", dest="dataset_prefix", default="data/hansards", help="Data file prefix (default=data)")
    parser.add_option("-e", "--english", dest="english_suffix", default="e", help="English file suffix (default=e)")
    parser.add_option("-f", "--french", dest="french_suffix", default="f", help="French file suffix (default=f)")
    parser.add_option("--eps", dest="epsilon", default=1e-3, type="float", help="Error threshold for stopping the EM algorithm (default=1e-3)")
    parser.add_option("-n", "--num_sentences", dest="sentence_count", default=100000000000, type="int", help="Number of sentences for training")
    parser.add_option("-l", "--lower", action="store_true", help="Lowercase the corpus (default=False)")
    parser.add_option("-g", "--debugging", action="store_true", help="Enable debugging (default=False)")
    parser.add_option("-s", "--save", dest="model_save_path", default="ibm1.pkl", help="Path to save the model")
    parser.add_option("-r", "--save_r", dest="reversed_model_save_path", default="ibm1_r.pkl", help="Path to save the reversed model")
    parser.add_option("-p", "--pretrained", dest="model_load_path", default="", help="Path to load the model")
    parser.add_option("-t", "--pretrained_r", dest="reversed_model_load_path", default="", help="Path to load the reversed model")
    parser.add_option("-m", "--max_iterations", dest="max_iterations", default=500, type="int", help="Maximum training iterations")
    parser.add_option("-c", "--continue", dest="current_iteration", type=int, default=None, help="Resume from given iteration. Requires -p option.")
    
    options, _ = parser.parse_args()

    french_file = f"{options.dataset_prefix}.{options.french_suffix}"
    english_file = f"{options.dataset_prefix}.{options.english_suffix}"

    if options.current_iteration:
        assert options.model_load_path, "Using -c requires -p"

    for model_path in [options.model_save_path, options.reversed_model_save_path]:
        if not (model_path == options.model_load_path or model_path == options.reversed_model_load_path):
            base_name, extension = os.path.splitext(model_path)
            new_base = f"{base_name}_num{options.sentence_count}_eps{options.epsilon}"
            new_path = new_base + extension
            directory, file_name = os.path.split(new_path)
            new_directory = os.path.join(directory, new_base)

            assert not os.path.exists(new_directory), f"{new_directory} already exists."
            os.makedirs(new_directory)

            if model_path == options.model_save_path:
                options.model_save_path = os.path.join(new_directory, file_name)
            else:
                options.reversed_model_save_path = os.path.join(new_directory, file_name)

    return options, french_file, english_file


class Tokenizer:
    def __init__(self, corpus, case_sensitive=True) -> None:
        words_set = set()
        for sent in corpus:
            for word in sent:
                if not case_sensitive:
                    word = word.lower()
                words_set.add(word)
        self.i2w = list(words_set)
        self.w2i = {word:index for index, word in enumerate(self.i2w)}
        self.V = len(self.i2w)
    
    def encode(self, sent):

        return [self.w2i[word] for word in sent]
    
    def decode(self, sent_ind):

        return [self.i2w[id] for id in sent_ind]


class IBMModel1:
    def __init__(self, bitext, f_tokenizer, e_tokenizer, save_path, eps=1e-5, max_iters=500) -> None:
        '''
        For all given f, \sum_e t(e|f) = 1

        Suppose Vocabulary size of e and f are Ve and Vf.
        Use a Vf*Ve matrix to represent t(e|f), where the rows are different f
        and the columns are different e. Thus, each row sum to 1.
        '''
        bitext_tokenized = [[f_tokenizer.encode(sent[0]), e_tokenizer.encode(sent[1])] for sent in bitext]
        self.bitext = bitext_tokenized
        '''Integerized parallel corpus'''
        self.f_tokenizer = f_tokenizer
        self.e_tokenizer = e_tokenizer
        self.Vf = f_tokenizer.V
        self.Ve = e_tokenizer.V
        self.t = np.ones((self.Vf, self.Ve)) / self.Ve # so that each entry is uniformly 1 / Ve (row normalized)
        self.save_path = save_path

        self.eps = eps
        self.max_iters = max_iters
        self.is_converged = False
        self.diff = -1
    
    def save(self, iters):
        base, ext = os.path.splitext(opts.save_path)
        base = f"{base}_iter{iters}"
        path = base + ext
        pickle.dump(self, open(path, "wb"))

    def em_train(self, iter_now=0):
        iters = iter_now
        while (iters < self.max_iters) and (not self.is_converged):
            iters += 1
            logging.info(f"Iteration {iters}, diff={self.diff:.8f}")
            logging.debug(f"t(the|la)={self.t_e_f('the', 'la')}")
            logging.debug(f"t(of|la)={self.t_e_f('of', 'la')}")

            # initialize
            count = np.zeros((self.Vf, self.Ve))
            total = np.zeros(self.Vf)

            for f_sent, e_sent in self.bitext:
                # compute normalization
                s_total = {}
                for e in e_sent:
                    s_total[e] = 0
                    for f in f_sent:
                        s_total[e] += self.t[f, e]
                # collect counts
                for e in e_sent:
                    for f in f_sent:
                        count[f, e] += self.t[f, e]/s_total[e]
                        total[f] += self.t[f, e]/s_total[e]
            
            # estimate probabilities
            self.diff = 0
            for f in range(self.Vf):
                for e in range(self.Ve):
                    new_t = count[f, e] / total[f]
                    el_diff = np.abs(new_t - self.t[f, e])
                    self.diff = max(self.diff, el_diff)
                    self.t[f, e] = new_t

            # check convergence and save
            if self.diff < self.eps:
                self.is_converged = True
            self.save(iters)

        if self.is_converged:
            logging.info(f"Model converged after {iters} iteration under error {self.eps}!")
        else:
            logging.info(f"Training stopped after reaching max number of iterations {self.max_iters}")

    def t_e_f(self, e, f):
        '''
        Computes t(e|f), where e & f are both string words
        
        If either e or f is not in the vocabulary, return -1
        '''
        try:
            ei = self.e_tokenizer.w2i[e]
            fi = self.f_tokenizer.w2i[f]
        except KeyError:
            return -1
        return self.t[fi, ei]

def align(model, model_r, bitext):
    
    for (f, e) in bitext:
        alignment = []
        alignment_r = []
        for (i, f_i) in enumerate(f): 
            max_prob = 0
            best_j = 0
            for (j, e_j) in enumerate(e):
                tmp_prob = model.t_e_f(e_j, f_i)
                if tmp_prob > max_prob:
                    best_j = j
                    max_prob = tmp_prob
            if(max_prob > 0):
                alignment.append([i,best_j])

        for (j, e_j) in enumerate(e):
            max_prob = 0
            best_i = 0
            for (i, f_i) in enumerate(f):
                tmp_prob = model_r.t_e_f(f_i, e_j)
                if tmp_prob > max_prob:
                    best_i = i
                    max_prob = tmp_prob
            if(max_prob > 0):
                alignment_r.append([best_i,j])
    
        for i in range(len(f)):
            for j in range(len(e)):
                if [i, j] in alignment and [i, j] in alignment_r:
                    sys.stdout.write("%i-%i " % (i, j))

        sys.stdout.write("\n")

if __name__ == "__main__": 
    opts, f_data, e_data = parse_args()
    loglvl = logging.DEBUG if opts.debugging else logging.INFO
    logging.basicConfig(level=loglvl)
    bitext = [[sentence.strip().split() for sentence in pair] for pair in zip(open(f_data), open(e_data))][:opts.num_sents]
    bitext_r = [[sentence.strip().split() for sentence in pair] for pair in zip(open(e_data), open(f_data))][:opts.num_sents]
    if opts.lower:
        bitext = [[[w.lower() for w in fs],[w.lower() for w in es]] for fs, es in bitext]
    f_text = [sent[0] for sent in bitext]
    e_text = [sent[1] for sent in bitext]
    f_tokenizer = Tokenizer(f_text)
    e_tokenizer = Tokenizer(e_text)
    
    f_text_r = [sent[0] for sent in bitext_r]
    e_text_r = [sent[1] for sent in bitext_r]
    f_tokenizer_r = Tokenizer(e_text)
    e_tokenizer_r = Tokenizer(f_text)

    # bitext_tokenized = []
    # for sent in bitext:
    #     print(sent)
    #     bitext_tokenized.append([f_tokenizer.encode(sent[0]), e_tokenizer.encode(sent[1])])

    logging.info(f"Vocabulary size - foreign:{f_tokenizer.V}, english:{e_tokenizer.V}")
    if opts.load_path:
        ibm = pickle.load(open(opts.load_path, "rb"))
        if opts.iter_now is not None:
            ibm.em_train(opts.iter_now)
    else:
        ibm = IBMModel1(bitext, f_tokenizer, e_tokenizer, opts.save_path, eps=opts.eps, max_iters=opts.max_iters)
        ibm.em_train()
        
    if opts.load_path_r:
        ibm_r = pickle.load(open(opts.load_path_r, "rb"))
        if opts.iter_now is not None:
            ibm_r.em_train(opts.iter_now)
    else:
        ibm_r = IBMModel1(bitext_r, f_tokenizer_r, e_tokenizer_r, opts.save_path_r, eps=opts.eps, max_iters=opts.max_iters)
        ibm_r.em_train()

    align(ibm, ibm_r, bitext)

AssertionError: ibm1_num100000000000_eps0.001 already exists. Aborting.