In [21]:
from __future__ import print_function
from __future__ import division
import os
import cPickle as pickle
import json
import subprocess
from IPython.display import display
from IPython.display import Audio
import bisect
from collections import namedtuple
import numpy as np
import pandas as pd
from collections import Counter
from prettytable import PrettyTable
import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate
import random
from matplotlib.ticker import MultipleLocator, \
     FormatStrFormatter, AutoMinorLocator
%matplotlib inline

In [22]:
# Parse config.json (resolved relative to the current working directory)
# into the `config` dict that the path constants below are read from.
with open("config.json") as cfg_file:
    config = json.load(cfg_file)

In [23]:
# Every path used in this notebook comes from the "es" section of config.json.
_es_cfg = config["es"]

# Node files
nodes_fname = _es_cfg["nodes_fname"]
seg_nodes_fname = _es_cfg["seg_nodes_fname"]
nodes_dict_fname = _es_cfg["nodes_dict_fname"]

# Edge files
edges_utd_fname = _es_cfg["edges_utd_fname"]
edges_olap_fname = _es_cfg["edges_olap_fname"]
edges_all_fname = _es_cfg["edges_all_fname"]
edges_score_fname = _es_cfg["edges_score_fname"]

# Cluster files
clusters_utd_fname = _es_cfg["clusters_utd_fname"]
clusters_fname = _es_cfg["clusters_fname"]
clusters_stats_fname = _es_cfg["clusters_stats_fname"]

# Scored / evaluated pairs
pairs_fname = _es_cfg["score_pairs_fname"]
eval_fname = _es_cfg["eval_pairs_fname"]

feats_fname = _es_cfg["feats_fname"]

# Gold feats
gold_feats_dict_fname = _es_cfg["gold_feats"]
# Pseudo feats
feats_dict_fname = _es_cfg["feats_dict_fname"]

# MT probabilities (gold / pseudo)
gold_probs_fname = _es_cfg["mt_probs_gold"]
gold_probs_dict_fname = _es_cfg["mt_probs_dict_gold"]

pseudo_probs_fname = _es_cfg["mt_probs_pseudo"]
pseudo_probs_dict_fname = _es_cfg["mt_probs_dict_pseudo"]

# Train / dev segment lists
train_segment_list_fname = _es_cfg["mt_train_files"]
dev_segment_list_fname = _es_cfg["mt_dev_files"]

# MT training corpora (gold / pseudo)
gold_corpus_fname = _es_cfg["mt_corpus_train_gold"]
pseudo_corpus_fname = _es_cfg["mt_corpus_train_pseudo"]

# MT prediction and evaluation dictionaries (gold / pseudo)
mt_gold_pred_dict_fname = _es_cfg["mt_gold_pred_dict"]
mt_pseudo_pred_dict_fname = _es_cfg["mt_pseudo_pred_dict"]

mt_gold_eval_dict_fname = _es_cfg["mt_gold_eval_dict"]
mt_pseudo_eval_dict_fname = _es_cfg["mt_pseudo_eval_dict"]

# Wav directories
es_merge_wavs_path = _es_cfg["es_merge_wavs"]
utd_wavs_path = _es_cfg["utd_wavs"]

# NOTE(review): this reads the same "utd_wavs" key as utd_wavs_path above, so
# both names hold the same value -- confirm whether a separate tmp-wav key was
# intended.
utd_tmp_wav_path = _es_cfg["utd_wavs"]

In [24]:
# Lightweight record types used by the pickled data structures.
# Field names are given as whitespace-separated strings, which namedtuple
# treats the same as a list of names.
Align = namedtuple('Align', 'word start end')
Node = namedtuple('Node', 'file seg start end es es_cnt')
Eval = namedtuple('Eval', 'n1 n2 dtw es_sim es_cnt_sim en_j_sim')

In [25]:
def _load_pickle(path):
    """Return the object unpickled from ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes, instead of being left open until garbage collection.
    """
    # NOTE: unpickling can execute arbitrary code -- only load files that were
    # produced by this project's own pipeline.
    with open(path, "rb") as pkl_file:
        return pickle.load(pkl_file)


# Data that must exist: a missing file raises immediately.
segment_map = _load_pickle(config['es']['segment_dict_fname'])
align_dict = _load_pickle(config['es']['align_dict_fname'])
nodes_dict = _load_pickle(nodes_dict_fname)
pairs_list = _load_pickle(pairs_fname)
eval_dict = _load_pickle(eval_fname)
clusters = _load_pickle(clusters_fname)
clusters_stats = _load_pickle(clusters_stats_fname)
feats_dict = _load_pickle(feats_dict_fname)
gold_feats_dict = _load_pickle(gold_feats_dict_fname)

# Optional data: each name is only bound when its file is present on disk,
# so code using these names must cope with them being undefined.
if os.path.exists(gold_probs_dict_fname):
    gold_probs_dict = _load_pickle(gold_probs_dict_fname)

if os.path.exists(pseudo_probs_dict_fname):
    pseudo_probs_dict = _load_pickle(pseudo_probs_dict_fname)

if os.path.exists(mt_gold_pred_dict_fname):
    mt_gold_pred_dict = _load_pickle(mt_gold_pred_dict_fname)

if os.path.exists(mt_pseudo_pred_dict_fname):
    mt_pseudo_pred_dict = _load_pickle(mt_pseudo_pred_dict_fname)

if os.path.exists(mt_gold_eval_dict_fname):
    mt_gold_eval_dict = _load_pickle(mt_gold_eval_dict_fname)

if os.path.exists(mt_pseudo_eval_dict_fname):
    mt_pseudo_eval_dict = _load_pickle(mt_pseudo_eval_dict_fname)

## Create train/dev set split between calls

In [45]:
# One in every `split_factor` items goes to the dev set; only calls whose
# numeric id lies in [start_fid, end_fid] are used.
split_factor = 10
start_fid, end_fid = 63, 120

# The same (start, end, factor) triple is embedded in all four output names.
_split_tag = (start_fid, end_fid, split_factor)
train_fname = "../files-train-segments-%d-%d-%d.txt" % _split_tag
dev_fname = "../files-dev-segments-%d-%d-%d.txt" % _split_tag
train_call_fname = "../files-train-calls-%d-%d-%d.txt" % _split_tag
dev_call_fname = "../files-dev-calls-%d-%d-%d.txt" % _split_tag

In [52]:
# Inspect the alignment entry stored for segment '001.001' of call '001'.
align_dict['001']['001.001']

{'en': [Align(word='MECHITA', start=12, end=50),
  Align(word='WHAT', start=50, end=73),
  Align(word='SENT', start=129, end=169),
  Align(word='IT', start=126, end=129),
  Align(word='TO', start=169, end=176),
  Align(word='WHOM', start=176, end=192),
  Align(word='TO', start=192, end=198),
  Align(word='POCHO', start=198, end=225)],
 'en_cnt': [Align(word='MECHITA', start=12, end=50),
  Align(word='SENT', start=129, end=169),
  Align(word='POCHO', start=198, end=225)],
 'es': [Align(word='MECHITA', start=12, end=50),
  Align(word='QU\xc3\xa9', start=50, end=73),
  Align(word='LAS', start=109, end=126),
  Align(word='HA', start=126, end=129),
  Align(word='MANDADO', start=129, end=169),
  Align(word='A', start=169, end=176),
  Align(word='QUI\xc3\xa9N', start=176, end=192),
  Align(word='A', start=192, end=198),
  Align(word='POCHO', start=198, end=225)],
 'es_cnt': [Align(word='MECHITA', start=12, end=50),
  Align(word='MANDADO', start=129, end=169),
  Align(word='QUI\xc3\xa9N', star

In [53]:
def create_segment_level_sets(t_fname, d_fname, start_fid, end_fid, split_factor,
                              alignments=None, seed=None):
    """Split the segments of every selected call into train and dev id lists.

    For each call whose numeric id lies in ``[start_fid, end_fid]`` the
    segment ids are shuffled; the first ``len(segments) // split_factor`` of
    them go to the dev list and the rest to the train list. Ids are written
    one per line to ``d_fname`` / ``t_fname``, and segment counts plus the
    summed 'es' alignment durations are printed.

    Args:
        t_fname: path of the train id list to write.
        d_fname: path of the dev id list to write.
        start_fid: lowest call id (int, inclusive) to include.
        end_fid: highest call id (int, inclusive) to include.
        split_factor: positive int; 1 in ``split_factor`` segments of each
            call is assigned to dev.
        alignments: mapping ``call id -> segment id -> {'es': [Align, ...]}``.
            Defaults to the notebook-level ``align_dict``.
        seed: optional seed for a private random generator, making the split
            reproducible. When ``None`` the global ``random`` state is used.

    Returns:
        None. Results are written to the two files and printed.
    """
    if split_factor <= 0:
        print("Invalid split factor")
        return
    if alignments is None:
        alignments = align_dict
    # A dedicated generator keeps a seeded split independent of other code
    # that touches the global random state.
    rng = random.Random(seed) if seed is not None else random

    print("creating %s, %s" %(t_fname, d_fname))

    sel_fids = sorted(fid for fid in alignments
                      if start_fid <= int(fid) <= end_fid)

    t_count = 0
    d_count = 0
    t_dur = 0
    d_dur = 0

    # Write to the paths given as arguments (not to notebook globals).
    with open(t_fname, "w") as t_f, open(d_fname, "w") as d_f:
        for fid in sel_fids:
            call_segments = alignments[fid]
            # Sort into a list first: shuffling needs a mutable sequence and a
            # fixed starting order makes a seeded shuffle repeatable.
            sids = sorted(call_segments)
            rng.shuffle(sids)

            dev_num = len(sids) // split_factor
            dev_sids = sids[:dev_num]
            train_sids = sids[dev_num:]

            d_dur += sum(a.end - a.start
                         for s in dev_sids for a in call_segments[s]['es'])
            t_dur += sum(a.end - a.start
                         for s in train_sids for a in call_segments[s]['es'])
            t_count += len(train_sids)
            d_count += len(dev_sids)

            # One id per line; an empty list writes nothing (no blank lines).
            d_f.writelines(sid + '\n' for sid in dev_sids)
            t_f.writelines(sid + '\n' for sid in train_sids)

    print(t_count, d_count)
    # Dividing by 100 and 3600 treats alignment units as 1/100 s
    # (presumably 10 ms frames -- confirm against the alignment producer).
    print("training: %0.2f (hrs), dev: %0.2f (hrs)"
          % ((t_dur / 100.0 / 3600.0), (d_dur / 100.0 / 3600.0)))

In [58]:
def create_call_level_sets(t_fname, d_fname, start_fid, end_fid, split_factor,
                           alignments=None, seed=None):
    """Split whole calls into train and dev sets and write their segment ids.

    Calls whose numeric id lies in ``[start_fid, end_fid]`` are shuffled; the
    first ``len(calls) // split_factor`` calls form the dev set and the rest
    the train set, so no call contributes segments to both sets. Segment ids
    are written one per line to ``d_fname`` / ``t_fname``, and segment counts,
    summed 'es' alignment durations and call counts are printed.

    Args:
        t_fname: path of the train id list to write.
        d_fname: path of the dev id list to write.
        start_fid: lowest call id (int, inclusive) to include.
        end_fid: highest call id (int, inclusive) to include.
        split_factor: positive int; 1 in ``split_factor`` calls is assigned
            to dev.
        alignments: mapping ``call id -> segment id -> {'es': [Align, ...]}``.
            Defaults to the notebook-level ``align_dict``.
        seed: optional seed for a private random generator, making the split
            reproducible. When ``None`` the global ``random`` state is used.

    Returns:
        None. Results are written to the two files and printed.
    """
    if split_factor <= 0:
        print("Invalid split factor")
        return
    if alignments is None:
        alignments = align_dict
    # A dedicated generator keeps a seeded split independent of other code
    # that touches the global random state.
    rng = random.Random(seed) if seed is not None else random

    print("creating %s, %s" %(t_fname, d_fname))

    # Sorted before shuffling so that a seeded shuffle is repeatable.
    sel_fids = sorted(fid for fid in alignments
                      if start_fid <= int(fid) <= end_fid)
    rng.shuffle(sel_fids)

    num_dev_calls = len(sel_fids) // split_factor
    d_fids = sel_fids[:num_dev_calls]
    t_fids = sel_fids[num_dev_calls:]

    t_count = 0
    d_count = 0
    t_dur = 0
    d_dur = 0

    # Write to the paths given as arguments (not to notebook globals).
    with open(t_fname, "w") as t_f, open(d_fname, "w") as d_f:
        for i, fid in enumerate(sel_fids):
            call_segments = alignments[fid]
            sids = sorted(call_segments)
            call_dur = sum(a.end - a.start
                           for s in sids for a in call_segments[s]['es'])
            if i < num_dev_calls:
                d_count += len(sids)
                d_dur += call_dur
                out_f = d_f
            else:
                t_count += len(sids)
                t_dur += call_dur
                out_f = t_f
            # One id per line; a call without segments writes nothing.
            out_f.writelines(sid + '\n' for sid in sids)

    print(t_count, d_count)
    # Dividing by 100 and 3600 treats alignment units as 1/100 s
    # (presumably 10 ms frames -- confirm against the alignment producer).
    print("training: %0.2f (hrs), dev: %0.2f (hrs)"
          % ((t_dur / 100.0 / 3600.0), (d_dur / 100.0 / 3600.0)))
    print("# files training: %d, test: %d" %(len(t_fids), len(d_fids)))

In [59]:
# Number of calls in align_dict whose numeric id is 63 or higher.
sum(1 for call_id in align_dict if int(call_id) >= 63)

50

In [60]:
# Show the segment-level train/dev output paths built from the split parameters.
train_fname, dev_fname

('../files-train-segments-63-120-10.txt',
 '../files-dev-segments-63-120-10.txt')

In [61]:
# Build the segment-level split, passing the segment-level output paths.
create_segment_level_sets(train_fname, dev_fname, start_fid, end_fid, split_factor)

creating ../files-train-segments-63-120-10.txt, ../files-dev-segments-63-120-10.txt
8094 872
training: 6.06 (hrs), dev: 0.63 (hrs)


In [62]:
# Build the call-level split, passing the call-level output paths.
create_call_level_sets(train_call_fname, dev_call_fname, start_fid, end_fid, split_factor)

creating ../files-train-calls-63-120-10.txt, ../files-dev-calls-63-120-10.txt
8188 778
training: 6.03 (hrs), dev: 0.66 (hrs)
# files training: 45, test: 5


In [63]:
# Show the train/dev segment-list paths read from config.json
# (config['es']['mt_train_files'] and config['es']['mt_dev_files']).
train_segment_list_fname, dev_segment_list_fname

(u'../files-train-segments-1-120-10.txt',
 u'../files-dev-segments-1-120-10.txt')