In [103]:
from __future__ import print_function
from __future__ import division
import os
import cPickle as pickle
import json
import subprocess
from IPython.display import display
from IPython.display import Audio
import bisect
from collections import namedtuple
import numpy as np
import pandas as pd
from collections import Counter
from prettytable import PrettyTable
import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate
from matplotlib.ticker import MultipleLocator, \
     FormatStrFormatter, AutoMinorLocator
%matplotlib inline

In [104]:
# Read the notebook-wide settings from config.json into `config`.
with open("config.json") as config_file:
    config = json.load(config_file)

In [105]:
# Every path below is read from the "es" section of the loaded config.
es_config = config['es']

# Node files
nodes_fname = es_config['nodes_fname']
seg_nodes_fname = es_config['seg_nodes_fname']
nodes_dict_fname = es_config['nodes_dict_fname']

# Edge files
edges_utd_fname = es_config['edges_utd_fname']
edges_olap_fname = es_config['edges_olap_fname']
edges_all_fname = es_config['edges_all_fname']
edges_score_fname = es_config['edges_score_fname']

# Cluster files
clusters_utd_fname = es_config['clusters_utd_fname']
clusters_fname = es_config['clusters_fname']
clusters_stats_fname = es_config['clusters_stats_fname']

# Scored / evaluated pair files
pairs_fname = es_config['score_pairs_fname']
eval_fname = es_config['eval_pairs_fname']

# Feature files
feats_fname = es_config['feats_fname']
feats_dict_fname = es_config['feats_dict_fname']

# MT probability tables (gold)
gold_probs_fname = es_config['mt_probs_gold']
gold_probs_dict_fname = es_config['mt_probs_dict_gold']

# MT probability tables (pseudo)
pseudo_probs_fname = es_config['mt_probs_pseudo']
pseudo_probs_dict_fname = es_config['mt_probs_dict_pseudo']

# MT training inputs / corpora
train_segment_list_fname = es_config['mt_train_files']
gold_corpus_fname = es_config['mt_corpus_train_gold']
pseudo_corpus_fname = es_config['mt_corpus_train_pseudo']

gold_feats_dict_fname = es_config['gold_feats']

In [106]:
# Record types used by the pickled data structures loaded in this notebook.
# Field names are given as space-separated strings (equivalent to a list of names).
Align = namedtuple('Align', 'word start end')
Node = namedtuple('Node', 'file seg start end es es_cnt')
Eval = namedtuple('Eval', 'n1 n2 dtw es_sim es_cnt_sim en_j_sim')

In [117]:
def _load_pickle(fname):
    """Unpickle and return the object stored in ``fname``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes (the bare ``pickle.load(open(...))`` form never closes it).
    """
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # produced by this project, never untrusted input.
    with open(fname, "rb") as in_f:
        return pickle.load(in_f)


segment_map = _load_pickle(config['es']['segment_dict_fname'])
align_dict = _load_pickle(config['es']['align_dict_fname'])
nodes_dict = _load_pickle(nodes_dict_fname)
pairs_list = _load_pickle(pairs_fname)
eval_dict = _load_pickle(eval_fname)
clusters = _load_pickle(clusters_fname)
clusters_stats = _load_pickle(clusters_stats_fname)
feats_dict = _load_pickle(feats_dict_fname)
gold_feats_dict = _load_pickle(gold_feats_dict_fname)

# The MT probability dictionaries are only loaded when their files already
# exist; otherwise the names stay undefined until save_mt_probs() is run.
if os.path.exists(gold_probs_dict_fname):
    gold_probs_dict = _load_pickle(gold_probs_dict_fname)

if os.path.exists(pseudo_probs_dict_fname):
    pseudo_probs_dict = _load_pickle(pseudo_probs_dict_fname)

# Create parallel corpus

- Use the list of files specified for training
- Create the gold parallel corpus, using the es transcriptions
- Create the pseudotext ||| English parallel corpus
- For English, filter for content words


In [82]:
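# NOTE: superseded draft kept for reference only; the live create_parallel_corpus is defined in the next cell.
# def create_parallel_corpus(segment_list_fname, corpus_fname, pseudotext=False, es_w_key='es', en_w_key='en'):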
#     with open(corpus_fname, "w") as out_f, open(segment_list_fname) as in_f:
#         for seg_id in in_f:
#             seg_id = seg_id.strip()
#             if pseudotext:
#                 es_words = map(str, feats_dict[seg_id])
#             else:
#                 es_words = [w.word for w in align_dict[seg_id.split('.')[0]][seg_id][es_w_key]]
#                 if not es_words:
#                     es_words = ['-1']
            
#             en_words = [w.word for w in align_dict[seg_id.split('.')[0]][seg_id][en_w_key]]
#             if not en_words:
#                 en_words = [w.word for w in align_dict[seg_id.split('.')[0]][seg_id]['en']]
            
#             if es_words != ['-1']:
#                 outline = " ".join(es_words) + " ||| " + " ".join(en_words) + "\n"
#                 out_f.write(outline)
#     print("Finished generating MT corpus")
        

In [122]:
def create_parallel_corpus(segment_list_fname, corpus_fname, feats_dict, en_w_key='en',
                           alignments=None):
    """Write a ``source ||| target`` parallel corpus, one line per segment.

    For every segment id listed in ``segment_list_fname`` the source side is
    taken from ``feats_dict[seg_id]`` and the target side from the words stored
    under ``en_w_key`` in the segment's alignment entry. If that word list is
    empty, the words under the ``'en'`` key are used instead.

    Segments whose source side is empty or equal to ``['-1']`` are not written;
    they are counted and the count is printed at the end.

    Args:
        segment_list_fname: text file with one segment id per line. Blank
            lines are ignored.
        corpus_fname: output path; overwritten.
        feats_dict: maps segment id -> sequence of source tokens. Tokens that
            are not strings are converted with ``str`` before joining.
        en_w_key: key of the alignment entry holding the target-side words.
        alignments: nested dict ``{file_id: {seg_id: {key: [Align, ...]}}}``
            where ``file_id`` is the part of ``seg_id`` before the first '.'.
            Defaults to the notebook-level ``align_dict``.

    Raises:
        KeyError: if a listed segment is missing from ``feats_dict`` or from
            ``alignments``.
    """
    if alignments is None:
        # Fall back to the notebook global so existing calls keep working.
        alignments = align_dict

    text_types = (str, type(u''))
    total_errors = 0
    with open(corpus_fname, "w") as out_f, open(segment_list_fname) as in_f:
        for line in in_f:
            seg_id = line.strip()
            if not seg_id:
                # A blank line would otherwise be looked up as the key ''.
                continue

            # " ".join() only accepts strings, so stringify anything else
            # (e.g. integer cluster ids) and leave real strings untouched.
            es_words = [w if isinstance(w, text_types) else str(w)
                        for w in feats_dict[seg_id]]

            seg_align = alignments[seg_id.split('.')[0]][seg_id]
            en_words = [w.word for w in seg_align[en_w_key]]
            if not en_words:
                en_words = [w.word for w in seg_align['en']]

            # ['-1'] is the sentinel for "no source tokens for this segment".
            if not es_words or es_words == ['-1']:
                total_errors += 1
                continue

            out_f.write(" ".join(es_words) + " ||| " + " ".join(en_words) + "\n")
    print("Finished generating MT corpus")
    print("Not found for: %d utterances" % total_errors)

In [123]:
# Flatten all feature lists, then check whether the sentinel '-1' shows up
# among the first ten flattened items.
lol = []
for lst in feats_dict.values():
    for item in lst:
        lol.append(item)
any(str(entry) == '-1' for entry in lol[:10])

True

In [124]:
# Gold corpus: source side from gold_feats_dict, target side from 'en_cnt'.
create_parallel_corpus(
    segment_list_fname=train_segment_list_fname,
    corpus_fname=gold_corpus_fname,
    feats_dict=gold_feats_dict,
    en_w_key='en_cnt',
)

Finished generating MT corpus
Not found for: 0 utterances


In [125]:
# Pseudotext corpus: source side from feats_dict, target side from 'en_cnt'.
create_parallel_corpus(
    segment_list_fname=train_segment_list_fname,
    corpus_fname=pseudo_corpus_fname,
    feats_dict=feats_dict,
    en_w_key='en_cnt',
)

Finished generating MT corpus
Not found for: 5506 utterances


# Train MT model using *fast_align*

In [126]:
def train_mt(corpus_fname, probs_fname):
    """Run the configured fast_align binary on a parallel corpus.

    The executable path is read from ``config['base']['fast_align']`` and is
    invoked as ``<fast_align> -i corpus_fname -v -N -p probs_fname``.

    Args:
        corpus_fname: path of the parallel corpus passed via ``-i``.
        probs_fname: path passed via ``-p``.
            NOTE(review): per the fast_align usage text, ``-p`` is the output
            file for the conditional probability table, ``-v`` enables the
            Dirichlet prior and ``-N`` disables the null word; confirm against
            the installed version.

    Raises:
        subprocess.CalledProcessError: if fast_align exits with a non-zero
            status. Previously the exit status was ignored, so a failed run
            went unnoticed and later cells could read a stale or missing
            probability file.
        OSError: if the executable cannot be started.
    """
    fast_align_bin = config['base']['fast_align']
    cmd = [fast_align_bin, "-i", corpus_fname, "-v", "-N", "-p", probs_fname]
    # check_call behaves like call but raises on a non-zero exit status.
    subprocess.check_call(cmd)
    

In [127]:
# Train on the gold corpus; probabilities go to gold_probs_fname.
train_mt(corpus_fname=gold_corpus_fname, probs_fname=gold_probs_fname)

In [128]:
# Train on the pseudotext corpus; probabilities go to pseudo_probs_fname.
train_mt(corpus_fname=pseudo_corpus_fname, probs_fname=pseudo_probs_fname)

In [129]:
def save_mt_probs(probs_fname, probs_dict_fname):
    """Parse a translation-probability table and cache it as a pickled dict.

    Each non-blank line of ``probs_fname`` is split on whitespace. The first
    field is the source word, the second the target word, and the third is
    parsed as a float, giving ``probs_dict[source][target] = value``.

    Args:
        probs_fname: path of the text table to read.
        probs_dict_fname: path the resulting dict is pickled to (overwritten).

    Returns:
        dict: ``{source_word: {target_word: float_value}}``.

    Raises:
        IndexError: if a non-blank line has fewer than three fields.
        ValueError: if the third field of a line is not a valid float.
    """
    probs_dict = {}
    with open(probs_fname, "r") as in_f:
        for line in in_f:
            line_items = line.split()
            if not line_items:
                # Tolerate blank lines instead of failing with IndexError.
                continue
            es_w = line_items[0]
            en_w = line_items[1]
            probs_dict.setdefault(es_w, {})[en_w] = float(line_items[2])
    print("Finished reading mt probs file: %s" % os.path.basename(probs_fname))
    # Use a context manager so the pickle is flushed and closed before returning.
    with open(probs_dict_fname, "wb") as out_f:
        pickle.dump(probs_dict, out_f)
    print("Finished saving mt probs dictionary: %s" % os.path.basename(probs_dict_fname))
    return probs_dict
            

In [130]:
# Parse the gold probability table and cache it as a pickled dict.
gold_probs_dict = save_mt_probs(
    probs_fname=gold_probs_fname,
    probs_dict_fname=gold_probs_dict_fname,
)

Finished reading mt probs file: mt_probs_gold.txt
Finished saving mt probs dictionary: mt_probs_dict_gold.p


In [131]:
# Parse the pseudotext probability table and cache it as a pickled dict.
pseudo_probs_dict = save_mt_probs(
    probs_fname=pseudo_probs_fname,
    probs_dict_fname=pseudo_probs_dict_fname,
)

Finished reading mt probs file: mt_probs_pseudo.txt
Finished saving mt probs dictionary: mt_probs_dict_pseudo.p


In [132]:
# Peek at the first five source-side keys of each table, then show the
# target-word entries stored for the pseudotext key '1869'.
display(list(gold_probs_dict)[:5], list(pseudo_probs_dict)[:5])
pseudo_probs_dict['1869']

['INTERVENCI\xc3\xb3N',
 'AGRADECIMIENTO',
 '\xc3\xa9XITO',
 'BUSQUE',
 'DILUY\xc3\xb3']

['1869', '13357', '13356', '19719', '11542']

{'HOMEWORK': -1.0866}

# Make predictions

In [97]:
# FIXME: unfinished draft; this cell does not parse as written.
#  - `probs_dict` has no default but follows `pseudotext=False` in the signature.
#  - The final assignment has no right-hand side.
#  - `es_w_key`, `en_w_key`, `k`, `pred_dict` and `se` are not defined in this function.
#  - `pred_fname`, `pred_dict_fname` and `probs_dict` are never used.
def mt_predict(pred_fname, pred_dict_fname, pseudotext=False, probs_dict):
    
    # Collect source-side (es) and target-side (en) words per segment, as in
    # the corpus-building code; the results are overwritten on every iteration
    # and never used afterwards.
    for fid in align_dict:
        for seg_id in align_dict[fid]:
            if pseudotext:
                es_words = map(str, feats_dict[seg_id])
            else:
                es_words = [w.word for w in align_dict[seg_id.split('.')[0]][seg_id][es_w_key]]
                if not es_words:
                    es_words = ['-1']

            en_words = [w.word for w in align_dict[seg_id.split('.')[0]][seg_id][en_w_key]]
            if not en_words:
                en_words = [w.word for w in align_dict[seg_id.split('.')[0]][seg_id]['en']]
    
    if k not in pred_dict:
        pred_dict[k] = {}
    
    # TODO: the per-segment prediction was never written (assignment is incomplete).
    for fid in align_dict:
        for segid in align_dict[fid]:
            pred_dict[k][se] = 
            

IndentationError: unexpected indent (<ipython-input-97-3205508e7f86>, line 4)

In [35]:
# Sanity check: align_dict and segment_map must contain the same segment ids.
(
    sorted([seg for fid in align_dict for seg in align_dict[fid]])
    ==
    sorted(seg for fid in segment_map for seg in segment_map[fid])
)

True

In [50]:
# Inspect a single segment's alignment entry; the output shows the keys
# 'en', 'en_cnt', 'es' and 'es_cnt', each holding a list of Align tuples.
align_dict['041']['041.093']

{'en': [Align(word='AND', start=3, end=11),
  Align(word='THAT', start=11, end=39)],
 'en_cnt': [],
 'es': [Align(word='Y', start=3, end=11), Align(word='YA', start=11, end=39)],
 'es_cnt': []}

In [60]:
# Count segments whose 'en_cnt' word list is empty vs. non-empty.
en_cnt_lists = [align_dict[fid][segid]['en_cnt']
                for fid in align_dict for segid in align_dict[fid]]
n_empty_en_cnt = sum(1 for words in en_cnt_lists if words == [])
n_nonempty_en_cnt = sum(1 for words in en_cnt_lists if words != [])
print(n_empty_en_cnt)
print(n_nonempty_en_cnt)
# Derive the total from the counts instead of hard-coding numbers copied from
# a previous run, so the cell stays correct when the data changes.
print(n_empty_en_cnt + n_nonempty_en_cnt)

485
16909
17394


In [18]:
# FIXME: unfinished cell; the inner loop has no body, so this does not parse.
# NOTE(review): the list shown as this cell's output is not produced by this
# loop; presumably it is left over from an earlier version of the cell.
for fid in segment_map:
    for seg_id in segment_map[fid]:
        

['090',
 '091',
 '092',
 '093',
 '094',
 '095',
 '096',
 '097',
 '010',
 '011',
 '012',
 '013',
 '014',
 '015',
 '018',
 '025',
 '024',
 '027',
 '026',
 '021',
 '023',
 '022',
 '029',
 '028',
 '115',
 '114',
 '038',
 '116',
 '111',
 '110',
 '113',
 '112',
 '032',
 '033',
 '030',
 '031',
 '036',
 '037',
 '034',
 '035',
 '108',
 '049',
 '048',
 '047',
 '046',
 '100',
 '101',
 '043',
 '042',
 '104',
 '105',
 '058',
 '059',
 '103',
 '054',
 '056',
 '057',
 '050',
 '051',
 '052',
 '053',
 '044',
 '106',
 '107',
 '041',
 '040',
 '061',
 '060',
 '063',
 '062',
 '065',
 '064',
 '067',
 '066',
 '117',
 '039',
 '076',
 '077',
 '075',
 '072',
 '073',
 '070',
 '071',
 '045',
 '078',
 '079',
 '119',
 '118',
 '089',
 '088',
 '083',
 '082',
 '081',
 '087',
 '086',
 '085',
 '084',
 '002',
 '001',
 '007',
 '006',
 '005',
 '009',
 '120']

# Evaluate

# Search speech

- Take a list of English words as input and, for each word, output the speech utterances that contain it
- Calculate precision and recall for the retrieved documents