<div style="text-align:right;">Statistical Translation</div>
<div style="text-align:right;">Zixiao Wang with materials from Dino Konstantopoulos</div>
<div style="text-align:right;">2020.2.18</div>

In [37]:
import os
import argparse
import json
import math
import time
import multiprocessing
from operator import itemgetter
from copy import deepcopy
import pandas as pd

def load_corpus(corpus):
    """Read a corpus file and return its lines, lower-cased.

    The file is decoded as UTF-8 explicitly instead of with the platform's
    default encoding, so non-ASCII corpora (e.g. Chinese) are read the same
    way on every system.

    Args:
        corpus: Path of a text file holding one sentence per line.

    Returns:
        A list with one lower-cased string per line. The text is split on
        newline characters, so a file that ends with a newline yields a
        trailing empty string.
    """
    with open(corpus, 'r', encoding='utf-8') as f:
        sents = f.read().split('\n')
    return [sent.lower() for sent in sents]

def build_vocab(corpus):
    """Collect the distinct tokens of a corpus file.

    Args:
        corpus: Path of the corpus file; it is read with ``load_corpus``.

    Returns:
        A list holding every distinct whitespace-separated token exactly
        once. The list is produced from a set, so its order is arbitrary.
    """
    tokens = set()
    for line in load_corpus(corpus):
        tokens.update(line.split())
    return list(tokens)

def init_t(f_sents, e_sents, e_vocab):
    """Build the initial, uniform translation table.

    Args:
        f_sents: Source-side sentences (whitespace-separated tokens).
        e_sents: Target-side sentences, paired with ``f_sents`` by position.
        e_vocab: All target-side words; each one gets a (possibly empty) row.

    Returns:
        A nested dict ``table[e][f]``. Every (e, f) pair that occurs together
        in some sentence pair is set to ``1 / len(e_vocab)``.
    """
    vocab_size = len(e_vocab)

    table = {}
    for target_word in e_vocab:
        table[target_word] = {}

    for source_line, target_line in zip(f_sents, e_sents):
        source_words = source_line.split()
        for target_word in target_line.split():
            for source_word in source_words:
                table[target_word][source_word] = 1 / vocab_size
    return table

def distance(t_1, t_2):
    """Return the Euclidean distance between two translation tables.

    Only the (e, f) pairs stored in ``t_1`` are visited; each of them must
    also be present in ``t_2``.

    Args:
        t_1: Nested dict ``t_1[e][f]`` of numbers.
        t_2: Nested dict with (at least) the same pairs as ``t_1``.

    Returns:
        The square root of the summed squared differences, as a float.
    """
    squared_sum = 0
    for e_word, row in t_1.items():
        for f_word, value in row.items():
            squared_sum += (value - t_2[e_word][f_word]) ** 2
    return math.sqrt(squared_sum)

def is_converged(t_prev, t_curr, epsilon):
    """Tell whether two successive translation tables are close enough.

    Args:
        t_prev: Table from the previous iteration.
        t_curr: Table from the current iteration.
        epsilon: Threshold; the tables count as converged when their
            distance is strictly below it.

    Returns:
        A ``(converged, delta)`` tuple, where ``delta`` is the value returned
        by ``distance(t_prev, t_curr)``.
    """
    change = distance(t_prev, t_curr)
    has_converged = change < epsilon
    return has_converged, change




def train_iter(f_sents, e_sents, f_vocab, e_vocab, t_prev):
    """Run one expectation-maximisation pass and return the updated table.

    For every sentence pair, each target word ``e`` distributes one unit of
    count over the source words ``f`` of the paired sentence in proportion to
    ``t_prev[e][f]``; the new ``t[e][f]`` is the count collected for the pair
    divided by the total count collected for ``f``.

    The accumulators are sparse: they only hold the (e, f) pairs stored in
    ``t_prev`` instead of one cell per vocabulary combination, and no
    per-sentence dict over the whole vocabulary is built.

    Args:
        f_sents: Source-side sentences (whitespace-separated tokens).
        e_sents: Target-side sentences, paired with ``f_sents`` by position.
        f_vocab: Not needed by the sparse implementation; kept so existing
            callers keep working.
        e_vocab: Not needed by the sparse implementation; kept so existing
            callers keep working.
        t_prev: Current table ``t_prev[e][f]``. It is not modified.

    Returns:
        A new nested dict with exactly the (e, f) pairs of ``t_prev``. A pair
        whose ``f`` never received any count is set to ``0.0``.

    Raises:
        KeyError: If a sentence pair contains an (e, f) combination that is
            missing from ``t_prev``.
    """
    # count[e][f] starts at 0 for every known pair; total[f] is filled lazily.
    count = {e: dict.fromkeys(row, 0) for e, row in t_prev.items()}
    total = {}

    for f_sent, e_sent in zip(f_sents, e_sents):
        fs = f_sent.split()
        if not fs:
            # Nothing to align against (e.g. the empty trailing line).
            continue
        for e in e_sent.split():
            t_e = t_prev[e]
            count_e = count[e]

            # Compute normalization
            # Eq 4.13 denominator part
            s_total = 0
            for f in fs:
                s_total += t_e[f]

            # Collect counts
            for f in fs:
                fraction = t_e[f] / s_total
                # Eq 4.14 numerator part
                count_e[f] += fraction
                # Eq 4.14 denominator part
                total[f] = total.get(f, 0) + fraction

    # Estimate probabilities
    # Eq 4.14
    t = {}
    for e, row in t_prev.items():
        count_e = count[e]
        new_row = {}
        for f in row:
            f_total = total.get(f)
            # A pair with no evidence at all gets probability zero instead of
            # triggering a division by zero.
            new_row[f] = count_e[f] / f_total if f_total else 0.0
        t[e] = new_row
    return t

def train(f_corpus, e_corpus, epsilon, iter_num, save_dir, save_iteration=False, save_alignment=True):
    """Train the word translation table on a parallel corpus.

    The table is initialised with ``init_t`` and refined with ``train_iter``
    until the distance between two successive tables drops below
    ``epsilon`` or ``iter_num`` iterations have been run.

    Args:
        f_corpus: Path of the source-side corpus file.
        e_corpus: Path of the target-side corpus file.
        epsilon: Convergence threshold passed to ``is_converged``.
        iter_num: Maximum number of iterations.
        save_dir: Prefix handed to ``output_iteration`` / ``output_alignment``,
            which prepend it verbatim to their file names (so a directory
            must end with a path separator).
        save_iteration: If true, record the table of every iteration
            (including the initial one) with ``output_iteration``.
        save_alignment: If true, write the alignment files with
            ``output_alignment`` after every iteration.

    Returns:
        The most recent translation table. When ``iter_num`` is zero or
        negative this is the initial table.
    """
    def timestamp():
        # Current local time, formatted for the progress messages.
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

    f_sents = load_corpus(f_corpus)
    e_sents = load_corpus(e_corpus)
    f_vocab = build_vocab(f_corpus)
    e_vocab = build_vocab(e_corpus)

    # `t` is bound before the loop so that the final `return t` is valid even
    # when the loop body never runs.
    t = init_t(f_sents, e_sents, e_vocab)

    converged = False
    i = 0
    if save_iteration:
        output_iteration(t, save_dir, i)

    while not converged and i < iter_num:
        t_prev = t
        t = train_iter(f_sents, e_sents, f_vocab, e_vocab, t_prev)
        # t = train_iter_parallel(f_sents, e_sents, f_vocab, e_vocab, t_prev, multiprocessing.cpu_count())
        converged, delta = is_converged(t_prev, t, epsilon)
        i += 1

        print(timestamp() +
              '  Iteration {} finished! Delta = {}.'.format(i, delta))
        if save_iteration:
            output_iteration(t, save_dir, i)
            print(timestamp() +
                  '    Iteration information saved!')
        if save_alignment:
            output_alignment(t, save_dir)
            print(timestamp() +
                  '    Alignment result saved!')

    return t

def output_iteration(t, save_dir, iter_num):
    """Record the probabilities of one iteration in ``iterations.csv``.

    For ``iter_num == 0`` the file is created with one row per (e, f) pair of
    ``t`` and the columns ``e``, ``f`` and ``0 it.``. For any other value the
    existing file is read back and a column named ``'<iter_num> it.'`` with
    ``t[e][f]`` for every stored pair is added.

    Args:
        t: Nested dict ``t[e][f]`` of probabilities.
        save_dir: Prefix prepended verbatim to ``iterations.csv``; a
            directory must therefore end with a path separator.
        iter_num: Index of the iteration being recorded.

    Raises:
        KeyError: If the existing file lists a pair that is not in ``t``.
    """
    path = save_dir + 'iterations.csv'
    column = str(iter_num) + ' it.'

    if iter_num == 0:
        # Collect all rows first and build the frame once; enlarging a
        # DataFrame one row at a time is very slow for large tables.
        rows = [(e, f, prob) for e, fs in t.items() for f, prob in fs.items()]
        df = pd.DataFrame(rows, columns=['e', 'f', column])
    else:
        # The word columns are used as dict keys, so they must come back as
        # the exact strings that were written: without these options tokens
        # such as 'null' or 'nan' are read as NaN and numeric tokens may be
        # parsed as numbers.
        df = pd.read_csv(path, dtype={'e': str, 'f': str}, keep_default_na=False)
        df[column] = [t[e][f] for e, f in zip(df['e'], df['f'])]

    df.to_csv(path, index=False)

def output_alignment(t, save_dir):
    """Write the translation table and the best alignment per word to disk.

    Three files are written, each named ``save_dir`` + file name:

    * ``alignment_all.json`` - the complete table ``t``.
    * ``alignment.json`` - for every ``e``, the ``f`` with the highest
      probability.
    * ``alignment.txt`` - one ``e<TAB>f<TAB>probability`` line per ``e``,
      ordered by descending probability.

    All files are written as UTF-8 because the JSON is dumped with
    ``ensure_ascii=False`` and may contain non-ASCII words. Words whose row
    in ``t`` is empty have no candidate and are left out of the last two
    files.

    Args:
        t: Nested dict ``t[e][f]`` of probabilities.
        save_dir: Prefix prepended verbatim to the file names; a directory
            must therefore end with a path separator.
    """
    # Save all candidate alignment with probabilities.
    with open(save_dir + 'alignment_all.json', 'w', encoding='utf-8') as out:
        json.dump(t, out, ensure_ascii=False)

    # Best (f, probability) per e. max() returns the first maximal item, the
    # same one a stable descending sort would place first.
    best = {e: max(fs.items(), key=itemgetter(1))
            for e, fs in t.items() if fs}

    # Save one to one alignment without probabilities.
    with open(save_dir + 'alignment.json', 'w', encoding='utf-8') as out:
        json.dump({e: f for e, (f, _) in best.items()}, out, ensure_ascii=False)

    # Save one to one alignment with probabilities in descending order.
    ranked = sorted(best.items(),
                    key=lambda item: item[1][1],
                    reverse=True)
    with open(save_dir + 'alignment.txt', 'w', encoding='utf-8') as out:
        for e, (f, prob) in ranked:
            out.write('{}\t{}\t{}\n'.format(e, f, prob))


# Methodology

## Introducing statistical translation by way of a statistical tagger

We can use the [Brown corpus](https://en.wikipedia.org/wiki/Brown_Corpus) to build a [POS tagger](https://en.wikipedia.org/wiki/Part-of-speech_tagging), first using a simple [Bag of Words](https://en.wikipedia.org/wiki/Bag-of-words_model) model (***most probable POS by count***), then using a **Hidden Markov Model** (HMM) that gets *transition* and *emission* probabilities from [POS bigrams](https://en.wikipedia.org/wiki/Bigram) (given a POS, what's the most probable ***next*** POS in the sentence?).

We can divide the Brown corpus into training and test sets, and compare accuracies for BOW and HMM models.

We can use some advanced python structures that are often used in Natural Language Processing (NLP).

## Statistical translation

Basically, statistical translation builds a mapping function based on probability: for example, how often a word in one language is translated as a given word in the other language.

Let's say we want to translate "你好吗" into "How are you?"

We need to know the probability of translating "你" into "You", etc.

In order to train the translation model, we need a parallel corpus for the two languages.

Here I'm going to use a Chinese-English parallel corpus.

In order to achieve higher accuracy, we also use a bigram method to improve performance.

## Requirements

Use the methodology in this notebook to build a statistical language translator, *from your language to english*. So, from Hindi or Chinese to English. Teams of **3** students. You *have* to use a Hidden Markov Model and `pomegranate` as your HMM library, to ensure all student teams start from the same baseline. Start from a Most Frequent Word (BOW) translation baseline, then move on to a Hidden Markov Model to improve translation. How much can you improve it by? The translation engine with the best accuracy, per language, will be presented in class.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
from IPython.core.display import HTML
from itertools import chain
from collections import Counter, defaultdict, namedtuple, OrderedDict
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution
import os
from io import BytesIO
from itertools import chain
import random

In [45]:
# Paths of the two sides of the parallel corpus read by the cells below.
en_file = './data/fbis.en.10k'  # English side
zh_file = './data/fbis.zh.10k'  # Chinese side

In [46]:
# Peek at the first two lines of the Chinese corpus: token count, then the
# tokens themselves. The file is decoded as UTF-8 explicitly so the Chinese
# text does not depend on the platform's default encoding.
with open(zh_file, 'r', encoding='utf-8') as test_zh:
    for _ in range(2):
        line = test_zh.readline().split(" ")
        print(len(line))
        print(line)
        

10
['伊犁', '大规模', '开展', '“', '面', '对', '面', '”', '宣讲', '活动\n']
56
['新华社', '乌鲁木齐', '2', '月', '1', '日', '电', '(', '记者', '樊英利', '、', '丁建刚', '、', '李秀芩', ')', '为', '促进', '民族', '团结', ',', '维护', '社会', '安定', ',', '近年', '来', '新疆', '伊犁', '创造', '性地', '开展', '了', '“', '面', '对', '面', '”', '宣讲', '活动', ',', '让', '各', '族', '群众', '听到', '了', '党', '和', '政府', '的', '声音', ',', '受到', '热烈', '欢迎', '。\n']


In [53]:
# Peek at the first two lines of the English corpus: token count, then the
# tokens themselves. Decoded as UTF-8 explicitly, like the Chinese side.
with open(en_file, 'r', encoding='utf-8') as test_en:
    for _ in range(2):
        line = test_en.readline().split(" ")
        print(len(line))
        print(line)

10
['"', 'Yili', 'Launches', 'Large-Scale', "'", 'Face-to-face', "'", 'Propaganda', 'Activity', '"\n']
64
['Urumqi', ',', '1', 'Feb', '(', 'Xinhua', ')', '--', 'Over', 'the', 'past', 'few', 'years', ',', 'in', 'order', 'to', 'promote', 'nationality', 'solidarity', 'and', 'safeguard', 'social', 'stability', ',', 'Xinjiang', "'s", 'Yili', 'has', 'launched', 'in', 'a', 'creative', 'way', 'a', '"', 'face-to-face', '"', 'propaganda', 'activity', 'to', 'let', 'the', 'people', 'of', 'all', 'nationalities', 'hear', 'the', 'voice', 'of', 'the', 'party', 'and', 'government', ',', 'and', 'the', 'activity', 'has', 'been', 'warmly', 'welcomed', '.\n']


In [48]:
def load_corpus(corpus):
    """Read a corpus file and return its lines, lower-cased.

    The file is decoded as UTF-8 explicitly instead of with the platform's
    default encoding, so non-ASCII corpora (e.g. Chinese) are read the same
    way on every system.

    Args:
        corpus: Path of a text file holding one sentence per line.

    Returns:
        A list with one lower-cased string per line. The text is split on
        newline characters, so a file that ends with a newline yields a
        trailing empty string.
    """
    with open(corpus, 'r', encoding='utf-8') as f:
        sents = f.read().split('\n')
    return [sent.lower() for sent in sents]

def build_vocab(corpus):
    """Collect the distinct tokens of a corpus file.

    Args:
        corpus: Path of the corpus file; it is read with ``load_corpus``.

    Returns:
        A list holding every distinct whitespace-separated token exactly
        once. The list is produced from a set, so its order is arbitrary.
    """
    tokens = set()
    for line in load_corpus(corpus):
        tokens.update(line.split())
    return list(tokens)

In [62]:
# Dump the English vocabulary to a text file, one token per line. The file is
# written as UTF-8 explicitly so any non-ASCII token is encoded the same way
# on every platform.
with open('./data/vocabulart_en.txt', 'w', encoding='utf-8') as test:
    for word in build_vocab(en_file):
        test.write(word + '\n')

In [71]:
# Show the first two tab-separated fields of the first three lines of the
# alignment file. It contains Chinese text, so it is decoded as UTF-8
# explicitly.
with open('./data/alignment.txt', encoding='utf-8') as test:
    for _ in range(3):
        print((test.readline().split("\t"))[:2])

['introduction', '引言']
[':', '略论']
['summary', '略论']


In [50]:
# Number of distinct whitespace-separated tokens in the Chinese corpus.
len(build_vocab(zh_file))

10001

In [39]:
# train(f_corpus='./data/fbis.zh.10k',e_corpus='./data/fbis.en.10k',epsilon=1e3,iter_num=10,save_dir='./data/')

In [None]:
def read_data(filename):
    """Load keyed sentence records from a text file.

    Records are separated by blank lines. The first line of a record is its
    key; every remaining line is stripped and split on tabs, and the
    resulting columns are transposed and passed positionally to
    ``Sentence``. Records whose first line is empty are skipped.

    Args:
        filename: Path of the file to read.

    Returns:
        An ``OrderedDict`` mapping each record key to its ``Sentence``, in
        file order.
    """
    with open(filename, 'r') as handle:
        records = handle.read().split("\n\n")

    data = OrderedDict()
    for record in records:
        key, *rows = record.split("\n")
        if not key:
            continue
        columns = zip(*[row.strip().split("\t") for row in rows])
        data[key] = Sentence(*columns)
    return data

def read_tags(filename):
    """Return the distinct lines of a file as a frozenset.

    The text is split on newline characters, so a file that ends with a
    newline contributes an empty string to the result.

    Args:
        filename: Path of the file listing one tag class per line.

    Returns:
        A ``frozenset`` of the lines found in the file.
    """
    with open(filename, 'r') as handle:
        contents = handle.read()
    return frozenset(contents.split("\n"))