In [9]:
from __future__ import print_function
from __future__ import division
import os
import cPickle as pickle
import json
import subprocess
from IPython.display import display
from IPython.display import Audio
import bisect
from collections import namedtuple
import numpy as np
import pandas as pd
from collections import Counter
from prettytable import PrettyTable
import matplotlib.pyplot as plt
import seaborn as sns
import textwrap
import nltk
from nltk.corpus import stopwords

from matplotlib.ticker import MultipleLocator, \
     FormatStrFormatter, AutoMinorLocator
%matplotlib inline

In [10]:
# Parse the experiment settings from config.json, looked up relative to the
# current working directory.
with open("config.json") as config_file:
    config = json.load(config_file)

In [13]:
# Every path below is read from the "es" section of the config.
es_cfg = config["es"]

# Node files
nodes_fname = es_cfg['nodes_fname']
seg_nodes_fname = es_cfg['seg_nodes_fname']
nodes_dict_fname = es_cfg['nodes_dict_fname']

# Edge files
edges_utd_fname = es_cfg['edges_utd_fname']
edges_olap_fname = es_cfg['edges_olap_fname']
edges_all_fname = es_cfg['edges_all_fname']
edges_score_fname = es_cfg['edges_score_fname']

# Cluster files
clusters_utd_fname = es_cfg['clusters_utd_fname']
clusters_fname = es_cfg['clusters_fname']
clusters_stats_fname = es_cfg['clusters_stats_fname']

# Scored / evaluated pairs
pairs_fname = es_cfg['score_pairs_fname']
eval_fname = es_cfg['eval_pairs_fname']

feats_fname = es_cfg['feats_fname']

# Gold feats
gold_feats_dict_fname = es_cfg['gold_feats']
# Pseudo feats
feats_dict_fname = es_cfg['feats_dict_fname']

# MT probabilities (gold / pseudo)
gold_probs_fname = es_cfg['mt_probs_gold']
gold_probs_dict_fname = es_cfg['mt_probs_dict_gold']

pseudo_probs_fname = es_cfg['mt_probs_pseudo']
pseudo_probs_dict_fname = es_cfg['mt_probs_dict_pseudo']

# MT train / dev file lists
train_segment_list_fname = es_cfg['mt_train_files']
dev_segment_list_fname = es_cfg['mt_dev_files']

# MT training corpora (gold / pseudo)
gold_corpus_fname = es_cfg['mt_corpus_train_gold']
pseudo_corpus_fname = es_cfg['mt_corpus_train_pseudo']

# MT prediction and evaluation dictionaries (gold / pseudo)
mt_gold_pred_dict_fname = es_cfg['mt_gold_pred_dict']
mt_pseudo_pred_dict_fname = es_cfg['mt_pseudo_pred_dict']

mt_gold_eval_dict_fname = es_cfg['mt_gold_eval_dict']
mt_pseudo_eval_dict_fname = es_cfg['mt_pseudo_eval_dict']

# Wav locations
es_merge_wavs_path = es_cfg['es_merge_wavs']
utd_wavs_path = es_cfg['utd_wavs']

# NOTE(review): this reads the same 'utd_wavs' key as utd_wavs_path above, so
# both names hold the same value -- confirm whether a separate temp-wav key
# was intended.
utd_tmp_wav_path = es_cfg['utd_wavs']

In [14]:
# Record types declared in this notebook. The typenames and field order are
# unchanged, so existing attribute and positional access keep working.
Align = namedtuple(
    typename='Align',
    field_names=('word', 'start', 'end'),
)
Node = namedtuple(
    typename='Node',
    field_names=('file', 'seg', 'start', 'end', 'es', 'es_cnt'),
)
Eval = namedtuple(
    typename='Eval',
    field_names=('n1', 'n2', 'dtw', 'es_sim', 'es_cnt_sim', 'en_j_sim'),
)

In [15]:
def _load_pickle(path):
    """Unpickle and return the object stored at `path`.

    The file is opened in binary mode and closed as soon as loading finishes,
    instead of leaving the handle open until garbage collection.

    NOTE(review): unpickling can execute arbitrary code, so only point this at
    trusted, locally generated files.
    """
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)


# Required inputs: loading fails with an IOError if any of these is missing.
segment_map = _load_pickle(config['es']['segment_dict_fname'])
align_dict = _load_pickle(config['es']['align_dict_fname'])
nodes_dict = _load_pickle(nodes_dict_fname)
pairs_list = _load_pickle(pairs_fname)
eval_dict = _load_pickle(eval_fname)
clusters = _load_pickle(clusters_fname)
clusters_stats = _load_pickle(clusters_stats_fname)
feats_dict = _load_pickle(feats_dict_fname)
gold_feats_dict = _load_pickle(gold_feats_dict_fname)

# Optional inputs: each is loaded only when its file exists. When the file is
# absent the corresponding name is NOT assigned by this cell.
if os.path.exists(gold_probs_dict_fname):
    gold_probs_dict = _load_pickle(gold_probs_dict_fname)

if os.path.exists(pseudo_probs_dict_fname):
    pseudo_probs_dict = _load_pickle(pseudo_probs_dict_fname)

if os.path.exists(mt_gold_pred_dict_fname):
    mt_gold_pred_dict = _load_pickle(mt_gold_pred_dict_fname)

if os.path.exists(mt_pseudo_pred_dict_fname):
    mt_pseudo_pred_dict = _load_pickle(mt_pseudo_pred_dict_fname)

if os.path.exists(mt_gold_eval_dict_fname):
    mt_gold_eval_dict = _load_pickle(mt_gold_eval_dict_fname)

if os.path.exists(mt_pseudo_eval_dict_fname):
    mt_pseudo_eval_dict = _load_pickle(mt_pseudo_eval_dict_fname)

In [16]:
# Fisher corpus locations, given relative to the current working directory.
fish_path = "../../fisher/"
fish_flist_fname = "../../fisher/goodfiles-gdfa.txt"
fish_es_align_path = os.path.join(fish_path, "wav2es-word-final")
# One alignment subfolder per index 0..7, named by the index itself.
fish_subfolders = [os.path.join(fish_es_align_path, str(idx)) for idx in range(8)]

In [17]:
def _words_in_tier(alignments, tier):
    """Return the `.word` of every entry stored under `tier`.

    Walks `alignments[fid][sid][tier]` for each file id and segment id, in the
    mapping's own iteration order, and flattens the words into one list.
    """
    words = []
    for fid in alignments:
        segments = alignments[fid]
        for sid in segments:
            words.extend(entry.word for entry in segments[sid][tier])
    return words


es_words = _words_in_tier(align_dict, 'es')
es_cnt_words = _words_in_tier(align_dict, 'es_cnt')
en_words = _words_in_tier(align_dict, 'en')
en_cnt_words = _words_in_tier(align_dict, 'en_cnt')

In [18]:
# For each word list print its total length and its number of distinct words,
# one line per list in the order es, es_cnt, en, en_cnt.
for word_list in (es_words, es_cnt_words, en_words, en_cnt_words):
    print(len(word_list), len(set(word_list)))

168195 10674
77134 10449
159777 6723
84466 6596


In [19]:
def fisher_get_details(transcript_path, min_dur=0.5):
    """Collect words, file ids and durations from the ``.es`` files in a folder.

    Every non-blank line of a ``.es`` file is parsed as whitespace-separated
    ``<word> <start> <end>``, with ``start`` and ``end`` converted to float
    (presumably seconds -- the calling cell divides the totals by 3600 to
    report hours; confirm against the data).

    Args:
        transcript_path: Directory scanned (non-recursively) for files whose
            name ends in ``.es``.
        min_dur: Entries whose ``end - start`` is at least this value are
            added to the second duration total. Defaults to 0.5, the value
            that was previously hard-coded.

    Returns:
        A tuple ``(es_words, fids, dur, dur_500ms)``:
          * ``es_words`` -- first field of every parsed line, in sorted-file
            then line order.
          * ``fids`` -- third ``"_"``-separated component of each file's base
            name, one entry per file.
          * ``dur`` -- sum of ``end - start`` over all parsed lines.
          * ``dur_500ms`` -- the same sum restricted to entries with
            ``end - start >= min_dur``.

    Raises:
        ValueError: If a non-blank line does not have exactly two float fields
            after the word.
        IndexError: If a ``.es`` file name has fewer than three ``"_"``-separated
            components.
    """
    es_words = []
    fids = []
    dur = 0
    dur_500ms = 0
    # os.listdir returns entries in arbitrary order; sort so the order of the
    # returned lists is reproducible across machines and runs.
    es_fnames = sorted(f for f in os.listdir(transcript_path) if f.endswith(".es"))
    for fname in es_fnames:
        fids.append(os.path.splitext(fname)[0].split("_")[2])
        with open(os.path.join(transcript_path, fname), "r") as in_f:
            for line in in_f:
                line_items = line.strip().split()
                if not line_items:
                    # Blank line (e.g. trailing newline at end of file):
                    # nothing to parse, and unpacking below would fail.
                    continue
                start, end = map(float, line_items[1:])
                es_words.append(line_items[0])
                curr_dur = end - start
                dur += curr_dur
                if curr_dur >= min_dur:
                    dur_500ms += curr_dur
    print("finished")
    return es_words, fids, dur, dur_500ms
                
            

In [20]:
# fish_es, fish_fids, fish_dur, fish_dur_500ms = fisher_get_details(fish_subfolders[0])
# Accumulate words, file ids and durations over the selected Fisher subfolders.
# The [2:3] slice limits this run to the single subfolder at index 2.
fish_es = []
fish_fids = set()
fish_dur = 0
fish_dur_500ms = 0
for subfolder in fish_subfolders[2:3]:
    temp_es, temp_fids, temp_dur, temp_dur_500ms = fisher_get_details(subfolder)
    fish_es += temp_es
    fish_fids.update(temp_fids)
    fish_dur = fish_dur + temp_dur
    fish_dur_500ms = fish_dur_500ms + temp_dur_500ms
    # Per-subfolder report: notebook words absent from this subfolder, and
    # the subfolder's durations divided by 3600.
    missing_in_subfolder = set(es_words) - set(temp_es)
    print("missing es: %d" % len(missing_in_subfolder))
    print("dur, total: %.2f, 500ms: %.2f" % (temp_dur / 3600, temp_dur_500ms / 3600))

finished
missing es: 7111
dur, total: 14.56, 500ms: 4.69


In [21]:
# Build the Spanish stopword set once. The previous version re-created it
# inside the comprehension, i.e. once for every candidate word.
spanish_stopwords = set(stopwords.words('spanish'))

# Distinct Fisher words that are not Spanish stopwords. Each word is
# lowercased and decoded from UTF-8 before the stopword lookup.
fish_es_cnt = [w for w in set(fish_es) if w.lower().decode("utf-8") not in spanish_stopwords]

print("#es: %d, set: %d, #fids: %d" % (len(fish_es), len(set(fish_es)), len(set(fish_fids))))
print("tots dur: %.2f, min 500ms dur: %.2f" %(fish_dur / 3600, fish_dur_500ms / 3600))
print("# es cnt: %d," % (len(set(fish_es_cnt))))

#es: 180912, set: 9473, #fids: 116
tots dur: 14.56, min 500ms dur: 4.69
# es cnt: 9296,


In [22]:
# Compare the notebook's Spanish vocabulary with the Fisher vocabulary:
# how many notebook words are absent from Fisher, and what share is covered.
es_vocab = set(es_words)
fish_vocab = set(fish_es)
missing_es = es_vocab - fish_vocab
overlap_pct = len(es_vocab & fish_vocab) / len(es_vocab) * 100
print("missing es\t%d, overlap: %.1f%%" % (len(missing_es), overlap_pct))

# Same "missing" count for the *_cnt word lists.
missing_es_cnt = set(es_cnt_words) - set(fish_es_cnt)
print("missing es cnt\t%d" % len(missing_es_cnt))

missing es	7111, overlap: 33.4%
missing es cnt	7051


In [23]:
# Number of distinct Fisher file ids gathered by the accumulation loop above.
len(fish_fids)

116

In [24]:
# Peek at the first five entries of alignment subfolder "0", with the file
# extension stripped. os.listdir order is arbitrary, so the sample may vary.
sample_entries = os.listdir(os.path.join(fish_es_align_path, "0"))[:5]
[os.path.splitext(entry)[0] for entry in sample_entries]

['20050908_191808_23_fsp_0_0020',
 '20050926_220244_130_fsp_0_0045',
 '20050909_221657_28_fsp_0_0086_plus',
 '20050908_182943_22_fsp_0_0026',
 '20050917_211045_76_fsp_0_0054']