In [51]:
from __future__ import print_function
from __future__ import division
import os
import cPickle as pickle
import json
import subprocess
from IPython.display import display
from IPython.display import Audio
import bisect
from collections import namedtuple
import numpy as np
import pandas as pd
from collections import Counter
from prettytable import PrettyTable
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib.ticker import MultipleLocator, \
     FormatStrFormatter, AutoMinorLocator
%matplotlib inline

In [52]:
# Parse the notebook-wide settings from config.json in the working directory.
with open("config.json") as json_data_file:
    config = json.loads(json_data_file.read())

In [25]:
# Every file name below is read from the "es" section of the config.
_es_config = config["es"]

# Node files.
nodes_fname = _es_config['nodes_fname']
seg_nodes_fname = _es_config['seg_nodes_fname']
nodes_dict_fname = _es_config['nodes_dict_fname']

# Edge files.
edges_utd_fname = _es_config['edges_utd_fname']
edges_olap_fname = _es_config['edges_olap_fname']
edges_all_fname = _es_config['edges_all_fname']
edges_score_fname = _es_config['edges_score_fname']

# Cluster files.
clusters_utd_fname = _es_config['clusters_utd_fname']
clusters_fname = _es_config['clusters_fname']
clusters_stats_fname = _es_config['clusters_stats_fname']

# Scored / evaluated pair files (note the config keys differ from the variable names).
pairs_fname = _es_config['score_pairs_fname']
eval_fname = _es_config['eval_pairs_fname']

# Feature files.
feats_fname = _es_config['feats_fname']
feats_dict_fname = _es_config['feats_dict_fname']

In [26]:
# Lightweight record types; namedtuple accepts the field list as a
# whitespace-separated string, which yields the same fields in the same order.
Align = namedtuple('Align', 'word start end')
Node = namedtuple('Node', 'file seg start end es es_cnt')
Eval = namedtuple('Eval', 'n1 n2 dtw es_sim es_cnt_sim en_j_sim')

In [27]:
def _load_pickle(fname):
    """Return the object unpickled from ``fname``.

    The file is opened in binary mode and closed as soon as loading finishes,
    instead of leaving the handle open until garbage collection.
    """
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(fname, "rb") as in_f:
        return pickle.load(in_f)


# Load the precomputed data structures, in the same order as before.
segment_map = _load_pickle(config['es']['segment_dict_fname'])
align_dict = _load_pickle(config['es']['align_dict_fname'])
nodes_dict = _load_pickle(nodes_dict_fname)
pairs_list = _load_pickle(pairs_fname)
eval_dict = _load_pickle(eval_fname)
clusters = _load_pickle(clusters_fname)
clusters_stats = _load_pickle(clusters_stats_fname)
feats_dict = _load_pickle(feats_dict_fname)

# Create parallel corpus

- Use the list of files specified for training
- Create the gold parallel corpus from the Spanish (es) transcriptions
- Create the pseudotext ||| English parallel corpus
- For English, keep only content words


In [44]:
def create_parallel_corpus(segment_list_fname, corpus_fname, pseudotext=False, es_w_key='es', en_w_key='en'):
    """Write a ``source ||| target`` parallel corpus with one line per segment.

    Reads segment ids (one per line) from ``segment_list_fname`` and, for each,
    writes ``<source words> ||| <target words>`` to ``corpus_fname``.

    Relies on the notebook-level ``align_dict`` and ``feats_dict``:
    ``align_dict[file_id][seg_id][key]`` must be a sequence of items with a
    ``.word`` attribute, where ``file_id`` is the part of ``seg_id`` before the
    first ``'.'``; ``feats_dict[seg_id]`` must be an iterable of features.

    Args:
        segment_list_fname: path of the text file listing segment ids.
        corpus_fname: path of the corpus file to (over)write.
        pseudotext: if True the source side is ``feats_dict[seg_id]`` rendered
            with ``str``; otherwise it is the words under ``es_w_key``.
        es_w_key: ``align_dict`` key for the source words (ignored when
            ``pseudotext`` is True).
        en_w_key: ``align_dict`` key for the target words; when that list is
            empty the words under ``'en'`` are used instead.

    Raises:
        KeyError: if a listed segment is missing from ``align_dict`` (or from
            ``feats_dict`` in pseudotext mode).
    """
    with open(corpus_fname, "w") as out_f, open(segment_list_fname) as in_f:
        for seg_id in in_f:
            seg_id = seg_id.strip()
            if not seg_id:
                # Blank (e.g. trailing) lines name no segment; previously they
                # crashed the run with a KeyError on ''.
                continue

            # Look the segment up once instead of once per word list.
            seg_align = align_dict[seg_id.split('.')[0]][seg_id]

            if pseudotext:
                # Build a real list so the emptiness check below also works
                # where map() returns a lazy iterator.
                es_words = [str(feat) for feat in feats_dict[seg_id]]
            else:
                es_words = [w.word for w in seg_align[es_w_key]]
            if not es_words:
                # Never emit an empty source side; the same "-1" placeholder is
                # now used for both gold text and pseudotext.
                es_words = ["-1"]

            en_words = [w.word for w in seg_align[en_w_key]]
            if not en_words:
                # Fall back to the words stored under 'en'.
                en_words = [w.word for w in seg_align['en']]

            out_f.write(" ".join(es_words) + " ||| " + " ".join(en_words) + "\n")
    print("Finished generating MT corpus")
        

In [45]:
# Gold corpus: source words come from the 'es_cnt' alignments, target words from 'en_cnt'.
segment_list_fname = config['es']['mt_train_files']
corpus_fname = config['es']['mt_corpus_train_gold']
create_parallel_corpus(
    segment_list_fname,
    corpus_fname,
    pseudotext=False,
    es_w_key='es_cnt',
    en_w_key='en_cnt',
)

Finished generating MT corpus


In [46]:
# Pseudotext corpus: source side comes from feats_dict, so es_w_key is not consulted.
segment_list_fname = config['es']['mt_train_files']
corpus_fname = config['es']['mt_corpus_train_pseudo']
create_parallel_corpus(
    segment_list_fname,
    corpus_fname,
    pseudotext=True,
    es_w_key='',
    en_w_key='en_cnt',
)

Finished generating MT corpus


# Train MT model using *fast_align*

In [57]:
def train_mt(corpus_fname, probs_fname):
    """Run the configured fast_align binary on a parallel corpus.

    The executable path is read from ``config['base']['fast_align']`` and the
    call blocks until the process exits.

    Args:
        corpus_fname: path passed to fast_align via ``-i`` (the input corpus).
        probs_fname: path passed to fast_align via ``-p``.

    Returns:
        The exit status of the fast_align process (0 on success). Previously
        the status was discarded, so a failed run looked like a successful one.
    """
    FASTALIGN = config['base']['fast_align']
    # NOTE(review): -v / -N / -p are presumably "Dirichlet prior", "no null
    # word" and "write probability table" -- confirm against the installed
    # fast_align's usage text.
    command = [FASTALIGN, "-i", corpus_fname, "-v", "-N", "-p", probs_fname]
    exit_status = subprocess.call(command)
    if exit_status != 0:
        # Surface the failure instead of silently leaving a stale/missing table.
        print("fast_align exited with status {}".format(exit_status))
    return exit_status
    

In [58]:
# Train on the gold corpus and write its table to the configured path.
corpus_fname = config['es']['mt_corpus_train_gold']
probs_fname = config['es']['mt_probs_gold']

train_mt(corpus_fname=corpus_fname, probs_fname=probs_fname)

In [59]:
# Train on the pseudotext corpus and write its table to the configured path.
corpus_fname = config['es']['mt_corpus_train_pseudo']
probs_fname = config['es']['mt_probs_pseudo']

train_mt(corpus_fname=corpus_fname, probs_fname=probs_fname)

# Make predictions

# Evaluate