In [1]:
from __future__ import print_function
from __future__ import division
import os
import cPickle as pickle
import json
import subprocess
from IPython.display import display
from IPython.display import Audio
import bisect
from collections import namedtuple
import numpy as np
import pandas as pd
from collections import Counter
from prettytable import PrettyTable
import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate
import random
from matplotlib.ticker import MultipleLocator, \
     FormatStrFormatter, AutoMinorLocator
%matplotlib inline

In [2]:
with open("config.json") as json_data_file:
    config = json.load(json_data_file)

In [3]:
nodes_fname = config["es"]['nodes_fname']
seg_nodes_fname = config["es"]['seg_nodes_fname']
nodes_dict_fname = config["es"]['nodes_dict_fname']

edges_utd_fname = config["es"]['edges_utd_fname']
edges_olap_fname = config["es"]['edges_olap_fname']
edges_all_fname = config["es"]['edges_all_fname']
edges_score_fname = config["es"]['edges_score_fname']

clusters_utd_fname = config['es']['clusters_utd_fname']
clusters_fname = config['es']['clusters_fname']
clusters_stats_fname = config['es']['clusters_stats_fname']

pairs_fname = config['es']['score_pairs_fname']
eval_fname = config['es']['eval_pairs_fname']

feats_fname = config['es']['feats_fname']

# Gold feats
gold_feats_dict_fname = config['es']['gold_feats']
# Pseudo feats
feats_dict_fname = config['es']['feats_dict_fname']

gold_probs_fname = config['es']['mt_probs_gold']
gold_probs_dict_fname = config['es']['mt_probs_dict_gold']

pseudo_probs_fname = config['es']['mt_probs_pseudo']
pseudo_probs_dict_fname = config['es']['mt_probs_dict_pseudo']

train_segment_list_fname = config['es']['mt_train_files']
dev_segment_list_fname = config['es']['mt_dev_files']

gold_corpus_fname = config['es']['mt_corpus_train_gold']
pseudo_corpus_fname = config['es']['mt_corpus_train_pseudo']

mt_gold_pred_dict_fname = config['es']['mt_gold_pred_dict']
mt_pseudo_pred_dict_fname = config['es']['mt_pseudo_pred_dict']

mt_gold_eval_dict_fname = config['es']['mt_gold_eval_dict']
mt_pseudo_eval_dict_fname = config['es']['mt_pseudo_eval_dict']

es_merge_wavs_path = config['es']['es_merge_wavs']
utd_wavs_path = config['es']['utd_wavs']

utd_tmp_wav_path = config['es']['utd_wavs']

In [4]:
Align = namedtuple('Align', ['word', 'start', 'end'])
Node = namedtuple('Node', ['file', 'seg', 'start', 'end', 'es', 'es_cnt'])
Eval = namedtuple('Eval', ['n1', 'n2', 'dtw', 'es_sim', 'es_cnt_sim', 'en_j_sim'])

In [5]:
segment_map = pickle.load(open(config['es']['segment_dict_fname'], "rb"))
align_dict = pickle.load(open(config['es']['align_dict_fname'], "rb"))
nodes_dict = pickle.load(open(nodes_dict_fname, "rb"))
pairs_list = pickle.load(open(pairs_fname, "rb"))
eval_dict = pickle.load(open(eval_fname, "rb"))
clusters = pickle.load(open(clusters_fname, "rb"))
clusters_stats = pickle.load(open(clusters_stats_fname, "rb"))
feats_dict = pickle.load(open(feats_dict_fname, "rb"))
gold_feats_dict = pickle.load(open(gold_feats_dict_fname, "rb"))

if os.path.exists(gold_probs_dict_fname):
    gold_probs_dict = pickle.load(open(gold_probs_dict_fname, "rb"))

if os.path.exists(pseudo_probs_dict_fname):
    pseudo_probs_dict = pickle.load(open(pseudo_probs_dict_fname, "rb"))

if os.path.exists(mt_gold_pred_dict_fname):
    mt_gold_pred_dict = pickle.load(open(mt_gold_pred_dict_fname, "rb"))
    
if os.path.exists(mt_pseudo_pred_dict_fname):
    mt_pseudo_pred_dict = pickle.load(open(mt_pseudo_pred_dict_fname, "rb"))
    
if os.path.exists(mt_gold_eval_dict_fname):
    mt_gold_eval_dict = pickle.load(open(mt_gold_eval_dict_fname, "rb"))
    
if os.path.exists(mt_pseudo_eval_dict_fname):
    mt_pseudo_eval_dict = pickle.load(open(mt_pseudo_eval_dict_fname, "rb"))    

## Create train/dev set split between calls

In [26]:
split_factor = 10
start_fid = 81
end_fid = 120
train_fname = "../files-train-segments-sf-%d.txt" % split_factor
dev_fname = "../files-dev-segments-sf-%d.txt" % split_factor

In [27]:
int("031")

31

In [28]:
def create_segment_level_sets(t_fname, d_fname, start_fid, end_fid, split_factor):
    if split_factor == 0:
        print("Invalid split factor")
        return
    print("creating %s, %s" %(t_fname, d_fname))
    s_fid = "%03d" % start_fid
    e_fid = "%03d" % end_fid
    
    sel_fids = sorted([fid for fid in align_dict.keys() if \
                int(fid) >= start_fid and int(fid) <= end_fid])
    
    t_count = 0
    d_count = 0

    with open(train_fname, "w") as t_f, open(dev_fname, "w") as d_f:
        for fid in sel_fids:
            sids = align_dict[fid].keys()
            dev_num = len(sids) // split_factor
            random.shuffle(sids)
            dev_sids = sids[:dev_num]
            train_sids = sids[dev_num:]
            t_count += len(train_sids)
            d_count += len(dev_sids)
            d_f.write('\n'.join(dev_sids))
            d_f.write('\n')
            t_f.write('\n'.join(train_sids))
            t_f.write('\n')
    print(t_count, d_count)

In [29]:
# sel_fids = sorted([fid for fid in align_dict.keys() if \
#                 int(fid) >= start_fid and int(fid) <= end_fid])

# sids = align_dict['001'].keys()
# dev_num = len(sids) // split_factor
# print(dev_num)
# random.shuffle(sids)
# dev_sids = sids[:dev_num]
# train_sids = sids[dev_num:]
# print(','.join(dev_sids))
# print('\n')
# print(','.join(train_sids))
# print('\n')
# print(len(train_sids), len(dev_sids))
# len(sel_fids)

In [30]:
create_segment_level_sets(train_fname, dev_fname, start_fid, end_fid, split_factor)

creating ../files-train-segments-sf-10.txt, ../files-dev-segments-sf-10.txt
5930 639


In [36]:
train_segment_list_fname, dev_segment_list_fname

(u'../files-train-segments-sf-10.txt', u'../files-dev-segments-sf-10.txt')

In [13]:
train_fids = []
train_sids = []
with open(train_segment_list_fname, "r") as in_f:
    for line in in_f:
        train_fids.append(line.strip().split(".")[0])
        train_sids.append(line.strip())
train_fids = set(train_fids)
train_sids = set(train_sids)

In [14]:
dev_fids = []
dev_sids = []
with open(dev_segment_list_fname, "r") as in_f:
    for line in in_f:
        dev_fids.append(line.strip().split(".")[0])
        dev_sids.append(line.strip())
dev_fids = set(dev_fids)
dev_sids = set(dev_sids)

In [24]:
len(dev_sids)

1282

In [37]:
clusters_stats['is_dev'] = []
clusters_stats['is_train'] = []
clusters_stats['oov'] = []
for sids in clusters_stats['sids']:
    is_in_dev = len(dev_sids & set(sids)) > 0
    is_in_train = len(train_sids & set(sids)) > 0
    clusters_stats['is_dev'].append(is_in_dev)
    clusters_stats['is_train'].append(is_in_train)
    clusters_stats['oov'].append(is_in_dev and not is_in_train)

clusters_stats['dev_depth'] = []
clusters_stats['train_depth'] = []
for nlist in clusters_stats['sids']:
    clusters_stats['dev_depth'].append(len([i for i in nlist if i in dev_sids]))
    clusters_stats['train_depth'].append(len([i for i in nlist if i in train_sids]))


In [38]:
num_oov = sum([i for i in clusters_stats['oov']])
print(num_oov)
sum_oov = sum([d for i, d in enumerate(clusters_stats['dev_depth']) if clusters_stats['oov'][i]])
total_pwords = sum([d for i, d in enumerate(clusters_stats['dev_depth']) if clusters_stats['is_dev'][i]])
print(sum_oov)
print(total_pwords)

511
620
2674


In [40]:
dev_pwords = []
train_pwords = []
oov_pwords = []
for did in dev_sids:
    dev_pwords.extend(feats_dict[did])
for tid in train_sids:
    train_pwords.extend(feats_dict[tid])
set_train_pwords = set(train_pwords)
oov_pwords = [cid for cid in dev_pwords if cid not in set_train_pwords and cid != "-1"]
missing_words = [i for cid in dev_pwords if cid == "-1"]
out_line = PrettyTable(["Total words", "oov", "Missing", "oov + missing"])
out_line.add_row([len(dev_pwords), len(oov_pwords), \
                  len(missing_words), \
                 len(oov_pwords) + len(missing_words)])
print(out_line)

out_line = PrettyTable(["total vocab", "oov vocab"])
out_line.add_row([len(set(dev_pwords)), len(set(oov_pwords))])
print(out_line)

+-------------+-----+---------+---------------+
| Total words | oov | Missing | oov + missing |
+-------------+-----+---------+---------------+
|     3324    | 620 |   650   |      1270     |
+-------------+-----+---------+---------------+
+-------------+-----------+
| total vocab | oov vocab |
+-------------+-----------+
|     2335    |    511    |
+-------------+-----------+


In [41]:
dev_pwords = []
train_pwords = []
oov_pwords = []
for did in dev_sids:
    dev_pwords.extend(feats_dict[did])
for tid in train_sids:
    train_pwords.extend(feats_dict[tid])
set_train_pwords = set(train_pwords)
oov_pwords = [cid for cid in dev_pwords if cid not in set_train_pwords or cid == "-1"]
out_line = PrettyTable(["Total words", "oov", "Missing", "oov + missing"])
out_line.add_row([len(dev_pwords), len(oov_pwords), \
                  len([i for i in dev_pwords if i == "-1"]), \
                 len(oov_pwords) + len([i for i in dev_pwords if i == "-1"])])
print(out_line)

out_line = PrettyTable(["total vocab", "oov vocab"])
out_line.add_row([len(set(dev_pwords)), len(set(oov_pwords))])
print(out_line)

+-------------+------+---------+---------------+
| Total words | oov  | Missing | oov + missing |
+-------------+------+---------+---------------+
|     3324    | 1270 |   650   |      1920     |
+-------------+------+---------+---------------+
+-------------+-----------+
| total vocab | oov vocab |
+-------------+-----------+
|     2335    |    512    |
+-------------+-----------+


In [23]:
len(dev_pwords), len([i for i in dev_pwords if i != '-1']), len(set([i for i in dev_pwords if i != '-1']))

(3324, 2674, 2334)

In [42]:
pseudo_mt_eval_dict

NameError: name 'pseudo_mt_eval_dict' is not defined