In [1]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, modified_precision
from nltk.translate.chrf_score import sentence_chrf, corpus_chrf
from nltk.metrics import scores
import scipy.io.wavfile
from IPython.display import Audio
from IPython.display import display
from nltk.stem import *
# from nltk.stem.snowball import SnowballStemmer
from stemming.porter2 import stem
import stemming
from nltk.metrics.scores import recall

from nltk.corpus import stopwords

%matplotlib inline

In [2]:
from bow_run import *

In [3]:
%run utils.ipynb

In [4]:
smooth_fun = nltk.translate.bleu_score.SmoothingFunction()

In [5]:
def bow_basic_precision_recall(r, h, display=False):
    p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref.
    r_numerators = Counter() # Key = ngram order, and value = no. of ngram matches.
    r_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref.
    metrics = {"rc": 0, "rt": 0, "tp": 0, "tc": 0, "word": {}}

    if display:
        print("total utts={0:d}".format(len(r)))

    i=1

    for references, hypothesis in zip(r, h):
        if min([len(any_ref) for any_ref in references]) > 0:
            if len(hypothesis) > 0:
                p_i = modified_precision(references, hypothesis, i)
                p_numerators[i] += p_i.numerator
                p_denominators[i] += p_i.denominator

                metrics["tc"] += p_i.numerator
                metrics["tp"] += p_i.denominator
            else:
                p_numerators[i] += 0
                p_denominators[i] += 0

                metrics["tc"] += 0
                metrics["tp"] += 0

            #print(p_i.numerator, p_i.denominator)

            tot_match = 0
            tot_count = 0

            common_ref_words = set(references[0])
            for curr_ref in references[1:]:
                common_ref_words &= set(curr_ref)
            
            common_words = common_ref_words & set(hypothesis)
            for w in common_ref_words:
                if w not in metrics["word"]:
                    metrics["word"][w] = {"t": 0, "tp": 0, "tc": 0}
                metrics["word"][w]["t"] += 1
            
            for w in set(hypothesis):
                if w not in metrics["word"]:
                    metrics["word"][w] = {"t": 0, "tp": 0, "tc": 0}
                metrics["word"][w]["tp"] += 1
                
            for w in common_words:
                if w not in metrics["word"]:
                    metrics["word"][w] = {"t": 0, "tp": 0, "tc": 0}
                metrics["word"][w]["tc"] += 1
            
            r_numerators[i] += len(common_words)
            r_denominators[i] += len(common_ref_words)
            metrics["rc"] += len(common_words)
            metrics["rt"] += len(common_ref_words)
            

#             max_recall_match, max_tp, max_t, max_word_level_details = count_match(list(common_ref_words), list(set(hypothesis)))
#             max_recall = max_recall_match / max_t if max_t > 0 else 0

            # max_recall_match, max_tp, max_t, max_word_level_details = count_match(references[0], hypothesis)
            # max_recall = max_recall_match / max_t if max_t > 0 else 0

            # for curr_ref in references:
            #     curr_match, curr_tp, curr_t, curr_word_level_details = count_match(curr_ref, hypothesis)
            #     curr_recall = curr_match / curr_t if curr_t > 0 else 0

            #     if curr_recall > max_recall:
            #         max_recall_match = curr_match
            #         max_t = curr_t
            #         max_recall = curr_recall
            #         max_word_level_details = curr_word_level_details
            

#             r_numerators[i] += max_recall_match
#             r_denominators[i] += max_t
#             metrics["rc"] += max_recall_match
#             metrics["rt"] += max_t
#             for key in {"t","tp","tc"}:
#                 for w in max_word_level_details[key]:
#                     if w not in metrics["word"]:
#                         metrics["word"][w] = {"t": 0, "tp": 0, "tc": 0}
#                     metrics["word"][w][key] += max_word_level_details[key][w]

            
    
    
    

    prec = [(n / d) * 100 if d > 0 else 0 for n,d in zip(p_numerators.values(), p_denominators.values())]
    rec = [(n / d) * 100 if d > 0 else 0 for n,d in zip(r_numerators.values(), r_denominators.values())]

    if display:
        print("{0:10s} | {1:>8s}".format("metric", "1-gram"))
        print("-"*54)
        print("{0:10s} | {1:8.2f}".format("precision", *prec))
        print("{0:10s} | {1:8.2f}".format("recall", *rec))

    return prec[0], rec[0], metrics

In [6]:
def nmt_basic_precision_recall(r, h, display=False):
    p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref.
    r_numerators = Counter() # Key = ngram order, and value = no. of ngram matches.
    r_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref.
    metrics = {"rc": 0, "rt": 0, "tp": 0, "tc": 0, "word": {}}

    if display:
        print("total utts={0:d}".format(len(r)))

    i=1

    for references, hypothesis in zip(r, h):
        if min([len(any_ref) for any_ref in references]) > 0:
            if len(hypothesis) > 0:
                p_i = modified_precision(references, hypothesis, i)
                p_numerators[i] += p_i.numerator
                p_denominators[i] += p_i.denominator

                metrics["tc"] += p_i.numerator
                metrics["tp"] += p_i.denominator
            else:
                p_numerators[i] += 0
                p_denominators[i] += 0

                metrics["tc"] += 0
                metrics["tp"] += 0

            #print(p_i.numerator, p_i.denominator)

            tot_match = 0
            tot_count = 0

            max_recall_match, max_tp, max_t, max_word_level_details = count_match(references[0], hypothesis)
            max_recall = max_recall_match / max_t if max_t > 0 else 0

            for curr_ref in references:
                curr_match, curr_tp, curr_t, curr_word_level_details = count_match(curr_ref, hypothesis)
                curr_recall = curr_match / curr_t if curr_t > 0 else 0

                if curr_recall > max_recall:
                    max_recall_match = curr_match
                    max_t = curr_t
                    max_recall = curr_recall
                    max_word_level_details = curr_word_level_details

            r_numerators[i] += max_recall_match
            r_denominators[i] += max_t
            metrics["rc"] += max_recall_match
            metrics["rt"] += max_t
            for key in {"t","tp","tc"}:
                for w in max_word_level_details[key]:
                    if w not in metrics["word"]:
                        metrics["word"][w] = {"t": 0, "tp": 0, "tc": 0}
                    metrics["word"][w][key] += max_word_level_details[key][w]

    prec = [(n / d) * 100 if d > 0 else 0 for n,d in zip(p_numerators.values(), p_denominators.values())]
    rec = [(n / d) * 100 if d > 0 else 0 for n,d in zip(r_numerators.values(), r_denominators.values())]

    if display:
        print("{0:10s} | {1:>8s}".format("metric", "1-gram"))
        print("-"*54)
        print("{0:10s} | {1:8.2f}".format("precision", *prec))
        print("{0:10s} | {1:8.2f}".format("recall", *rec))

    return prec[0], rec[0], metrics

In [7]:
def get_model_data(nmt_path, use_google=False):
    if use_google:
        google_s2t_hyps, google_s2t_refs, nmt_4refs = get_google_data()
        nmt_hyps = google_s2t_hyps['fisher_dev_r0']
        nmt_refs = google_s2t_refs['fisher_dev_ref_0']
    else:
        nmt_refs = pickle.load(open(os.path.join(nmt_path, 
                                                 "model_s2t_refs.dict"), "rb"))
        nmt_hyps = pickle.load(open(os.path.join(nmt_path, 
                                                 "model_s2t_hyps.dict"), "rb"))
        nmt_4refs = pickle.load(open(os.path.join(nmt_path,
                                                  "model_s2t_refs_for_eval.dict"), "rb"))
    
    return nmt_refs, nmt_hyps, nmt_4refs

In [8]:
def eval_nmt_model(nmt_path, use_google=False, min_len=10):
    smooth_fun = nltk.translate.bleu_score.SmoothingFunction()
    
    nmt_refs, nmt_hyps, nmt_4refs = get_model_data(nmt_path, use_google=use_google)

    nmt_preds_bow = {}
    nmt_1_ref = {}
    nmt_refs_bow = {}

    dev_utt_ids = nmt_hyps.keys()

    for u in dev_utt_ids:
        nmt_preds_bow[u] = list(get_words_in_bow_vocab(nmt_hyps[u], bow_dict))
        nmt_refs_bow[u] = []
        nmt_1_ref[u] = [list(get_words_in_bow_vocab(nmt_refs[u], bow_dict))]
        for r in nmt_4refs[u]:
            nmt_refs_bow[u].append(list(get_words_in_bow_vocab(r, bow_dict)))
    
    
    p_bow, r_bow, metrics_1_bow = bow_basic_precision_recall(nmt_1_ref.values(), 
                                                       nmt_preds_bow.values())
    print("-"*80)
    print("BOW - using 1 reference")
    print("-"*20)
    print("precision={0:.2f}, recall={1:.2f}".format(p_bow, r_bow))
    num_1correct = len([item for item in metrics_1_bow["word"].items() 
                        if item[1]['tc'] > 0])
    
    num_all = len([item for item in metrics_1_bow["word"].items() 
                        if item[1]['t'] > 0])

    print("-"*20)
    print("{0:d} out of {1:d} retrieved with atleast 1 correct instance".format(num_1correct, num_all))
    

    p_bow, r_bow, metrics_bow = bow_basic_precision_recall(nmt_refs_bow.values(), 
                                                       nmt_preds_bow.values())
    
    print("-"*80)
    print("BOW - using all 4 references")
    print("-"*20)
    print("precision={0:.2f}, recall={1:.2f}".format(p_bow, r_bow))
    
    num_1correct = len([item for item in metrics_bow["word"].items() 
                        if item[1]['tc'] > 0])
    
    num_all = len([item for item in metrics_bow["word"].items() 
                        if item[1]['t'] > 0])

    print("-"*20)
    print("{0:d} out of {1:d} retrieved with atleast 1 correct instance".format(num_1correct, num_all))
    
    # MT PRECISION RECALL - NOOOT BOW
    p_nmt, r_nmt, metrics_nmt = nmt_basic_precision_recall(nmt_4refs.values(), 
                                                       nmt_hyps.values())
    print("-"*80)
    print("MT task - using all 4 references")
    print("-"*80)
    print("precision={0:.2f}, recall={1:.2f}".format(p_nmt, r_nmt))

    nmt_bleu = corpus_bleu(nmt_4refs.values(), 
                           nmt_hyps.values(),
                           smoothing_function=smooth_fun.method2)

    print("-"*80)
    print("4 references bleu={0:2f}".format(nmt_bleu*100))
    
    one_ref_list = []
    one_hyp_list = []
    
#     for u in nmt_refs:
#         one_ref_list.append([nmt_refs[u]])
#         one_hyp_list.append(nmt_hyps[u])
        
#     p_nmt_one, r_nmt_one, metrics_nmt_one = nmt_basic_precision_recall(one_ref_list, 
#                                                            one_hyp_list)
#     print("-"*80)
#     print("MT task - using single references")
#     print("-"*80)
#     print("precision={0:.2f}, recall={1:.2f}".format(p_nmt_one, r_nmt_one))
    
#     nmt_bleu = corpus_bleu(one_ref_list, 
#                            one_hyp_list,
#                            smoothing_function=smooth_fun.method2)

#     print("-"*80)
#     print("single reference bleu={0:2f}".format(nmt_bleu*100))
    print("-"*80)
    print("using min len filter")
    print("-"*20)
    check_bleu_with_len_filter(nmt_4refs, nmt_hyps, min_len=min_len)
    
    return metrics_1_bow, metrics_bow

In [9]:
def eval_prec_recall_for_words(nmt_path, bow_dict, use_google=False):
    smooth_fun = nltk.translate.bleu_score.SmoothingFunction()
    
    nmt_refs, nmt_hyps, nmt_4refs = get_model_data(nmt_path, use_google=use_google)

    nmt_preds_bow = {}
    nmt_1_ref = {}
    nmt_refs_bow = {}

    dev_utt_ids = nmt_hyps.keys()

    for u in dev_utt_ids:
        pred = get_out_str(nmt_hyps[u], use_google=use_google).strip().split()
        nmt_preds_bow[u] = list(get_words_in_bow_vocab(pred, bow_dict))
        nmt_refs_bow[u] = []
        ref1 = get_out_str(nmt_refs[u], use_google=use_google).strip().split()
        nmt_1_ref[u] = [list(get_words_in_bow_vocab(ref1, bow_dict))]
        for r in nmt_4refs[u]:
            curr_ref = get_out_str(r, use_google=use_google).strip().split()
            nmt_refs_bow[u].append(list(get_words_in_bow_vocab(curr_ref, bow_dict)))
    
    
    p_bow, r_bow, metrics_1_bow = bow_basic_precision_recall(nmt_1_ref.values(), 
                                                       nmt_preds_bow.values())
    
#     print("-"*80)
#     print("Using word list: \n{0:s}".format(" -- ".join([w.decode() for w in bow_dict["w2i"].keys()])))
#     print("number of words: {0:d}".format(len(bow_dict["w2i"])))
    print("-"*80)
    print("-"*20)
    p_bow, r_bow, metrics_bow = bow_basic_precision_recall(nmt_refs_bow.values(), 
                                                       nmt_preds_bow.values())
    
    print("-"*80)
    print("BOW - using all 4 references")
    print("-"*20)
    print("precision={0:.2f}, recall={1:.2f}".format(p_bow, r_bow))
    
    num_1correct = len([item for item in metrics_bow["word"].items() 
                        if item[1]['tc'] > 0])
    
    words_present = [item[0] for item in metrics_bow["word"].items() 
                        if item[1]['t'] > 0]
    num_all = len(words_present)
    print("-"*80)
    print("Using word list: \n{0:s}".format(" -- ".join(words_present)))
    print("number of words: {0:d}".format(num_all))
    top_five = [w[0] for w in sorted([(item[0], item[1]['t']) for item in metrics_bow["word"].items() 
                        if item[1]['t'] > 0], reverse=True, key= lambda t: t[1])[:5]]
    print("Top 5 words present: \n{0:s}".format(" -- ".join(top_five)))

    print("-"*20)
    print("{0:d} out of {1:d} retrieved with atleast 1 correct instance".format(num_1correct, num_all))
    
    
    return metrics_bow

In [10]:
# "crisis": os.path.join(m_cfg['data_path'], 
#                                                     "bow_crises_vocab.dict")

In [11]:
def eval_all_word_lists(nmt_path, use_google=False):
    metrics = {}
    eval_word_lists = {"en_freq": os.path.join(m_cfg['data_path'], "eval_en_freq_vocab.dict"),
                       "en_rare": os.path.join(m_cfg['data_path'], "eval_en_rare_vocab.dict"),
#                        "en_es_common": os.path.join(m_cfg['data_path'], 
#                                                     "eval_en_es_common_vocab.dict"),
                       "crisis": os.path.join(m_cfg['data_path'], 
                                                    "eval_en_crisis_vocab.dict")}
    for key, word_list in eval_word_lists.items():
        words = pickle.load(open(word_list, "rb"))
        metrics[key] = eval_prec_recall_for_words(nmt_path, words, use_google=use_google)
    return metrics

In [12]:
def check_bleu_with_len_filter(refs, hyps, min_len):
    sel_refs, sel_hyps = [], []
    for u in refs:
        len_ref = min([len(r) for r in refs[u]])
        if len_ref >= min_len:
            sel_refs.append(refs[u])
            sel_hyps.append(hyps[u])
    print("{0:d} out of {1:d} have len >= {2:d}".format(len(sel_refs), len(refs), min_len))
    bleu_score = corpus_bleu(sel_refs, sel_hyps, smoothing_function=smooth_fun.method2)*100
    print("BLEU={0:.2f}".format(bleu_score))
    sel_p, sel_r, _ = nmt_basic_precision_recall(sel_refs, sel_hyps)
    print("precision={0:.2f}, recall={1:.2f}".format(sel_p, sel_r))

## Edin model

In [13]:
cfg_path = "sp2bagwords/sp_0.50_trial-A/"

In [14]:
last_epoch, model, optimizer, m_cfg, t_cfg = check_model(cfg_path)

cnn_out_dim = rnn_in_units =  640




using ADAM optimizer
--------------------------------------------------------------------------------
model not found


In [15]:
%%capture
train_key = m_cfg['train_set']
dev_key = m_cfg['dev_set']
batch_size=t_cfg['batch_size']
enc_key=m_cfg['enc_key']
dec_key=m_cfg['dec_key']
input_path = os.path.join(m_cfg['data_path'], m_cfg['dev_set'])
# -------------------------------------------------------------------------
# get data dictionaries
# -------------------------------------------------------------------------
map_dict, vocab_dict, bucket_dict, bow_dict = get_data_dicts(m_cfg)
if os.path.exists(os.path.join(m_cfg['model_dir'], "model_s2t_dev_out.dict")):
    dev_utts = pickle.load(open(os.path.join(m_cfg['model_dir'], "model_s2t_dev_out.dict"), "rb"))

if os.path.exists(os.path.join(m_cfg['model_dir'], "model_s2t_train_out.dict")):
    train_utts = pickle.load(open(os.path.join(m_cfg['model_dir'], "model_s2t_train_out.dict"), "rb"))
# batch_size = {'max': 128, 'med': 128, 'min': 128, 'scale': 1}
batch_size = {'max': 64, 'med': 64, 'min': 64, 'scale': 1}
batch_size = t_cfg['batch_size']

edin_s2t_refs_for_eval_path = os.path.join("../chainer2/speech2text/both_fbank_out/", 
                                           "edin_s2t_refs_for_eval.dict")
edin_s2t_refs_for_eval = pickle.load(open(edin_s2t_refs_for_eval_path, "rb"))
single_dev_ref = [[i[0]] for i in dev_utts["refs"]]

In [None]:
map_dict["fisher_dev"]['20051009_182032_217_fsp-B-1'].keys()

In [None]:
input_path = os.path.join(m_cfg['data_path'],
                                      m_cfg['train_set'])
train_utts, train_loss = feed_model(model,
                              optimizer=optimizer,
                              m_dict=map_dict[train_key],
                              b_dict=bucket_dict[train_key],
                              vocab_dict=vocab_dict,
                              bow_dict=bow_dict,
                              batch_size=batch_size,
                              x_key=enc_key,
                              y_key=dec_key,
                              train=False,
                              input_path=input_path,
                              max_dec=m_cfg['max_en_pred'],
                              t_cfg=t_cfg,
                              use_y=True,
                              get_probs=True)

In [None]:
# pickle.dump(train_utts, open(os.path.join(m_cfg['model_dir'], "model_s2t_train_out.dict"), "wb"))

In [None]:
mean_pos_scores = np.array([0.0 for _ in bow_dict["i2w"]], dtype="f")
mean_neg_scores = np.array([0.0 for _ in bow_dict["i2w"]], dtype="f")


for i_w in range(4, len(bow_dict["i2w"])):
    this_word = bow_dict["i2w"][i_w]
    pos_indx = [i_w in r[0] for r in train_utts["refs"]]
    neg_indx = [i_w not in r[0] for r in train_utts["refs"]]
    mean_pos_scores[i_w] = np.mean(train_utts["probs"][:,i_w][pos_indx])
    mean_neg_scores[i_w] = np.mean(train_utts["probs"][:,i_w][neg_indx])

In [None]:
xp.mean(mean_pos_scores), xp.mean(mean_neg_scores)

In [None]:
train_avg_p, _ = compute_avg_precision(train_utts["probs"],
                                                     0.0, 1.0, 5,
                                                     m_cfg['max_en_pred'],
                                                     train_utts["refs"])
train_avg_p

In [None]:
THRESH = m_cfg["pred_thresh"]
train_pred_words = get_pred_words_from_probs(train_utts["probs"],
#                                              mean_pos_scores,
                                               0.5,
                                               m_cfg['max_en_pred'])

train_prec, train_rec, _ = basic_precision_recall(train_utts["refs"], train_pred_words)
train_prec, train_rec

In [None]:
input_path = os.path.join(m_cfg['data_path'], m_cfg['dev_set'])

dev_utts, dev_loss = feed_model(model,
                                optimizer=optimizer,
                                m_dict=map_dict[dev_key],
                                b_dict=bucket_dict[dev_key],
                                vocab_dict=vocab_dict,
                                bow_dict=bow_dict,
                                batch_size=batch_size,
                                x_key=enc_key,
                                y_key=dec_key,
                                train=False,
                                input_path=input_path,
                                max_dec=m_cfg['max_en_pred'],
                                t_cfg=t_cfg,
                                use_y=True,
                                get_probs=True)

In [None]:
single_dev_ref = [[i[0]] for i in dev_utts["refs"]]

In [None]:
np.min(dev_utts["probs"]), np.max(dev_utts["probs"])

In [None]:
mean_dev_pos_scores = np.array([0.0 for _ in bow_dict["i2w"]], dtype="f")
mean_dev_neg_scores = np.array([0.0 for _ in bow_dict["i2w"]], dtype="f")


for i_w in range(4, len(bow_dict["i2w"])):
    this_word = bow_dict["i2w"][i_w]
    pos_indx = [i_w in r[0] for r in dev_utts["refs"]]
    neg_indx = [i_w not in r[0] for r in dev_utts["refs"]]
    mean_dev_pos_scores[i_w] = np.mean(dev_utts["probs"][:,i_w][pos_indx])
    mean_dev_neg_scores[i_w] = np.mean(dev_utts["probs"][:,i_w][neg_indx])

In [None]:
PRED_THRESH = 0.2
print("using prediction threshold={0:.2f}".format(PRED_THRESH))
dev_pred_words = get_pred_words_from_probs(dev_utts["probs"],
                                           PRED_THRESH,
                                           len(bow_dict['i2w']))
p, r, haha = basic_precision_recall(dev_utts["refs"], dev_pred_words)
print(p,r)
print("using mean positive prediction threshold")
dev_pred_words = get_pred_words_from_probs(dev_utts["probs"],
                                               mean_dev_pos_scores,
                                               m_cfg['max_en_pred'])
p, r, _ = basic_precision_recall(dev_utts["refs"], dev_pred_words)
print(p,r)

In [None]:
PRED_THRESH = 0.1
print("using prediction threshold={0:.2f}".format(PRED_THRESH))
dev_pred_words = get_pred_words_from_probs(dev_utts["probs"],
                                           PRED_THRESH,
                                           len(bow_dict['i2w']))
p, r, _ = basic_precision_recall(single_dev_ref, dev_pred_words)
print(p,r)
print("using mean positive prediction threshold")
dev_pred_words = get_pred_words_from_probs(dev_utts["probs"],
                                               mean_dev_pos_scores,
                                               m_cfg['max_en_pred'])
p, r, _ = basic_precision_recall(single_dev_ref, dev_pred_words)
print(p,r)

In [None]:
min_prob, max_prob = float(xp.min(dev_utts["probs"])), float(xp.max(dev_utts["probs"]))
min_prob, max_prob

### Precision-Recall Plot - word level threshold

In [None]:
np.arange(-0.5, 0.5, 0.1)

In [None]:
mean_pos_scores[4:14]*1.3

In [None]:
thresh_deltas = np.asarray([0.7,0.8,0.9,1,1.1,1.2,1.3], dtype="f")

In [None]:
p_r_thresh = {}
thresh_delta = 0.05
for thresh in tqdm(np.arange(-0.5, 0.5+thresh_delta, thresh_delta)):
# for thresh in tqdm(thresh_deltas):
# for thresh in tqdm(np.linspace(min_prob, max_prob,num=20,endpoint=True)):
    p_r_thresh[thresh] = {}
    dev_pred_words_at_thresh = get_pred_words_from_probs(dev_utts["probs"],
                                                           mean_pos_scores + thresh,
                                                           len(bow_dict['i2w']))
    p_r_thresh[thresh]['p'], p_r_thresh[thresh]['r'], _ = basic_precision_recall(dev_utts["refs"], 
                                                                              dev_pred_words_at_thresh)
    

In [None]:
thresh_labels, p_vals, r_vals = [], [], []
for l in p_r_thresh:
    p_vals.append(p_r_thresh[l]["p"])
    r_vals.append(p_r_thresh[l]["r"])
    thresh_labels.append("{0:.2f}".format(l))

In [None]:
fig, ax = plt.subplots(figsize=(10,8))
ax.plot(r_vals, p_vals, label="Precision/Recall")
for i,j,k in zip(r_vals, p_vals, thresh_labels):
    ax.annotate(k,xy=(i,j), fontsize=12)
ax.set_ylabel("Precision", fontsize=28)
ax.set_xlabel("Recall", fontsize=26)
ax.legend(loc='upper center', bbox_to_anchor=(1.3, 0.9),
                  ncol=1, fancybox=True, shadow=True, fontsize=26)

plt.yticks(rotation=0, size=18)
plt.xticks(rotation=0, size=18)

### Precision-Recall Plot - fixed threshold

In [None]:
len(dev_utts["probs"][0]), len(bow_dict['i2w'])

In [None]:
def compute_avg_precision(probs, min_prob, max_prob, num_points, max_words, refs):
    p_r_thresh = {}
    for thresh in tqdm(np.linspace(min_prob, max_prob, num=num_points, endpoint=True)):
        p_r_thresh[thresh] = {}
        words_at_thresh = get_pred_words_from_probs(probs, thresh, max_words)
        p_r_thresh[thresh]['p'], p_r_thresh[thresh]['r'], _ = basic_precision_recall(refs, words_at_thresh)
    
    precision_array = np.array([p_r_thresh[i]['p']/100 for i in p_r_thresh], dtype="f")
    recall_array = np.array([p_r_thresh[i]['r']/100 for i in p_r_thresh], dtype="f")
    avg_p = np.trapz(precision_array[::-1], recall_array[::-1])
    return avg_p, p_r_thresh
    

In [None]:
avg_p, p_r_thresh = compute_avg_precision(dev_utts["probs"], 0.0, 1.0, 50, 104, dev_utts["refs"])

In [None]:
avg_p

In [None]:
# p_r_thresh = {}
# thresh_delta = 0.01
# # for thresh in tqdm(np.arange(min_prob, max_prob+thresh_delta,thresh_delta)):
# for thresh in tqdm(np.linspace(min_prob, max_prob, num=30,endpoint=True)):
#     p_r_thresh[thresh] = {}
#     dev_pred_words_at_thresh = get_pred_words_from_probs(dev_utts["probs"],
#                                                            thresh,
#                                                            len(bow_dict['i2w']))
#     p_r_thresh[thresh]['p'], p_r_thresh[thresh]['r'], _ = basic_precision_recall(dev_utts["refs"], 
#                                                                               dev_pred_words_at_thresh)
    

In [None]:
thresh_labels, p_vals, r_vals = [], [], []
for l in p_r_thresh:
    p_vals.append(p_r_thresh[l]["p"])
    r_vals.append(p_r_thresh[l]["r"])
    thresh_labels.append("{0:.2f}".format(l))

In [None]:
fig, ax = plt.subplots(figsize=(10,8))
ax.plot(r_vals, p_vals, label="Precision/Recall")
for i,j,k in zip(r_vals, p_vals, thresh_labels):
    ax.annotate(k,xy=(i,j), fontsize=12)
ax.set_ylabel("Precision", fontsize=28)
ax.set_xlabel("Recall", fontsize=26)
ax.legend(loc='upper center', bbox_to_anchor=(1.3, 0.9),
                  ncol=1, fancybox=True, shadow=True, fontsize=26)

plt.yticks(rotation=0, size=18)
plt.xticks(rotation=0, size=18)

In [None]:
PRED_THRESH = 0.15
print("using prediction threshold={0:.2f}".format(PRED_THRESH))
dev_pred_words = get_pred_words_from_probs(dev_utts["probs"],
                                           PRED_THRESH,
                                           len(bow_dict['i2w']))
p, r, _ = basic_precision_recall(single_dev_ref, dev_pred_words)
print(p,r)
# print("using mean positive prediction threshold")
# dev_mean_pred_words = get_pred_words_from_probs(dev_utts["probs"],
#                                            mean_pos_scores,
#                                            m_cfg['max_en_pred'])
# p, r, _ = basic_precision_recall(single_dev_ref, dev_mean_pred_words)
# print(p,r)

In [None]:
PRED_THRESH = 0.01
print("using prediction threshold={0:.2f}".format(PRED_THRESH))
dev_pred_words = get_pred_words_from_probs(dev_utts["probs"],
                                           PRED_THRESH,
                                           len(bow_dict['i2w']))
p, r, _ = basic_precision_recall(dev_utts["refs"], dev_pred_words)
print(p,r)
# print("using mean positive prediction threshold")
# dev_mean_pred_words = get_pred_words_from_probs(dev_utts["probs"],
#                                            mean_pos_scores,
#                                            m_cfg['max_en_pred'])
# p, r, _ = basic_precision_recall(single_dev_ref, dev_mean_pred_words)
# print(p,r)

In [None]:
precision_array = np.array([p_r_thresh[i]['p']/100 for i in p_r_thresh], dtype="f")
recall_array = np.array([p_r_thresh[i]['r']/100 for i in p_r_thresh], dtype="f")

In [None]:
precision_array[::-1]

In [None]:
np.trapz(precision_array[::-1], recall_array[::-1])

### get preds and refs in words

In [None]:
list(zip(train_pred_words, train_utts["refs"]))[:10]

In [None]:
dev_utt_preds_words = {}
dev_utt_refs_words = {}
for u, p, refs in zip(dev_utts['ids'], dev_pred_words, dev_utts["refs"]):
    dev_utt_preds_words[u] = list(set([bow_dict['i2w'][i].decode() for i in p]))
    dev_utt_refs_words[u] = []
    for r in refs:
        #print(r)
        dev_utt_refs_words[u].append([bow_dict['i2w'][i].decode() for i in set(r)])
single_dev_ref_words = {u: [dev_utt_refs_words[u][0]] for u in dev_utt_refs_words}

In [None]:
p, r, metric = basic_precision_recall(list(dev_utt_refs_words.values()), list(dev_utt_preds_words.values()))
p, r

In [None]:
ps, rs, _ = basic_precision_recall(single_dev_ref_words.values(), dev_utt_preds_words.values())
ps, rs

In [None]:
[(k, metric[k]) for k in ['rc', 'rt', 'tp', 'tc']]

In [None]:
words_correctly_predicted = [item for item in metric["word"].items() if item[1]['tc'] > 0]
print(len(words_correctly_predicted))
display(words_correctly_predicted)

In [None]:
# most common train words
[w.decode() for w, f in sorted(bow_dict['freq'].items(), reverse=True, key=lambda t: t[1])][:10]

In [None]:
list(single_dev_ref_words.items())[:5]

In [None]:
display_bow_words(single_dev_ref_words, 
                  dev_utt_preds_words, 
                  bow_dict, 
                  map_dict["fisher_dev"], display_num=100)

## Google model

In [None]:
_ = eval_nmt_model("", use_google=True, min_len=1)

In [None]:
metrics = eval_all_word_lists("", use_google=True)

## Edin 150 hours model

In [None]:
sim_dict_path = os.path.join(m_cfg['data_path'], "mix_sim.dict")
sim_dict = pickle.load(open(sim_dict_path, "rb"))

In [None]:
len(bow_dict_es['freq_dev'])

In [None]:
# for w in sim_dict["w"]:
#     if len(sim_dict["w"][w]) > 1 and w in bow_dict_es["w2i"]:
#         print(w)
#         print(sim_dict["w"][w])
#         print(bow_dict_es["w2i"][w])

In [None]:
_ = eval_nmt_model("sp2enw_hyp_search/sp_1.0_l2e-4_rnn-3_drpt-0.5_cnn_96-2-2", min_len=1)

In [None]:
metrics = eval_all_word_lists("sp2enw_hyp_search/sp_1.0_l2e-4_rnn-3_drpt-0.5_cnn_96-2-2", 
                              use_google=False)

### BLEU script
```
[bonnybridge]s1444673: export BLEU_SCRIPT=/afs/inf.ed.ac.uk/group/project/lowres/work/installs/mosesdecoder/scripts/generic/multi-bleu.perl
[bonnybridge]s1444673: export PREDS=sp2enw_hyp_search/sp_1.0_l2e-4_rnn-3_drpt-0.5_cnn_96-2-2
[bonnybridge]s1444673: perl $BLEU_SCRIPT $PREDS/fsh_dev_fisher_dev_en.ref* < $PREDS/fsh_dev_fisher_dev_hyp
BLEU = 29.44, 65.1/38.4/22.8/13.7 (BP=0.991, ratio=0.991, hyp_len=39719, ref_len=40096)
[bonnybridge]s1444673: perl $BLEU_SCRIPT $PREDS/fsh_test_fisher_test_en.ref* < $PREDS/fsh_test_fisher_test_hyp
BLEU = 29.64, 66.2/38.4/22.7/13.5 (BP=0.999, ratio=0.999, hyp_len=39201, ref_len=39257)
[bonnybridge]s1444673: perl $BLEU_SCRIPT $PREDS/fsh_dev_fisher_dev_en.ref[0,1,2]* < $PREDS/fsh_dev_fisher_dev_hyp

BLEU = 27.03, 62.7/35.8/20.7/12.1 (BP=0.987, ratio=0.987, hyp_len=39719, ref_len=40242)
[bonnybridge]s1444673: perl $BLEU_SCRIPT $PREDS/fsh_dev_fisher_dev_en.ref[1,2,3]* < $PREDS/fsh_dev_fisher_dev_hyp
BLEU = 27.03, 62.6/35.8/20.8/12.2 (BP=0.984, ratio=0.984, hyp_len=39719, ref_len=40353)
[bonnybridge]s1444673: perl $BLEU_SCRIPT $PREDS/fsh_dev_fisher_dev_en.ref[2,3,0]* < $PREDS/fsh_dev_fisher_dev_hyp
BLEU = 27.00, 62.8/35.9/20.8/12.1 (BP=0.984, ratio=0.984, hyp_len=39719, ref_len=40346)
[bonnybridge]s1444673: perl $BLEU_SCRIPT $PREDS/fsh_dev_fisher_dev_en.ref[1,3,0]* < $PREDS/fsh_dev_fisher_dev_hyp
BLEU = 27.10, 62.9/35.9/20.8/12.2 (BP=0.985, ratio=0.985, hyp_len=39719, ref_len=40339)
```

## Edin 50 hours model

In [None]:
old_metrics = eval_nmt_model("./sp2enw_mel-80_vocab-nltk/sp_0.33_h-256_e-128_l2e-3_lstm_drpt-0.3_cnn-32-2-2_rnn-3_b-40-50")

## Edin 25 hours model

In [None]:
_ = eval_nmt_model("./sp2enw_mel-80_vocab-nltk/sp_0.16_h-256_e-128_l2e-3_lstm_drpt-0.3_cnn-32-2-2_rnn-3_b-80-25_no-ln-bn")

## Edin 15 hours model

In [None]:
_ = eval_nmt_model("./sp2enw/sp_.10/")

## Edin 50 hours model - sample word embeddings

### seed: 0.33

In [None]:
_ = eval_nmt_model("sp2enw_mel-80_vocab-nltk/sp_0.33_h-300_e-128_l2e-6_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln")

In [None]:
_ = eval_nmt_model("sp2enw_mel-80_vocab-nltk/sp_0.33_h-300_e-128_l2e-6_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln_sample/")

### seed: AA

In [None]:
new50_metrics = eval_nmt_model("emb_sp2enw/sp_0.33_seed-AA")

In [None]:
new50_metrics = eval_nmt_model("emb_sp2enw/sp_0.33_seed-AA_mix-0.5")

## Edin 80 hours model - sample word embeddings

In [None]:
_ = eval_nmt_model("sp2enw_mel-80_vocab-nltk/sp_0.50_h-300_e-128_l2e-6_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln")

In [None]:
_ = eval_nmt_model("sp2enw_mel-80_vocab-nltk/sp_0.50_h-300_e-128_l2e-6_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln_sample/")

### Interspeech results

In [None]:
_ = eval_nmt_model("./sp2enw_interspeech/sp_0.33_seed-AB_l2e-4_drpt-0.5", min_len=1)

In [None]:
_ = eval_all_word_lists("./sp2enw_interspeech/sp_0.33_seed-AB_l2e-4_drpt-0.5")

In [None]:
model_50_da = eval_nmt_model("./sp2enw_interspeech/sp_0.33_seed-AB_l2e-4_drpt-0.5_sample-mix-0.5", 
                             min_len=1)

In [None]:
_ = eval_all_word_lists("./sp2enw_interspeech/sp_0.33_seed-AB_l2e-4_drpt-0.5_sample-mix-0.5")

In [None]:
_ = eval_nmt_model("./sp2enw_interspeech/sp_0.50_h-300_e-128_l2e-4_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln", min_len=1)

In [None]:
_ = eval_all_word_lists("./sp2enw_interspeech/sp_0.50_h-300_e-128_l2e-4_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln")

In [None]:
model_80_da = eval_nmt_model("./sp2enw_interspeech/sp_0.50_h-300_e-128_l2e-4_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln_mix-0.5/", min_len=1)

In [None]:
_ = eval_all_word_lists("./sp2enw_interspeech/sp_0.50_h-300_e-128_l2e-4_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln_mix-0.5")

#### View translations

In [16]:
def clean_out_str(out_str):
    out_str = out_str.replace("`", "")
    out_str = out_str.replace('"', '')
    out_str = out_str.replace('¿', '')
    out_str = out_str.replace("''", "")
    out_str = out_str.strip()
    return out_str
    

In [17]:
def get_out_str(h, use_google):
    if use_google:
        out_str = " ".join(h)
    else:
        out_str = ""
        for w in h:
            out_str += "{0:s}".format(w) if (w.startswith("'") or w=="n't") else " {0:s}".format(w)

        out_str = clean_out_str(out_str)
    return out_str

In [18]:
def write_to_file_len_filtered_preds(nmt_path, 
                                     set_key="fisher_dev", 
                                     min_len=0, max_len=300, 
                                     use_google=False):
    refs, hyps, allrefs = get_model_data(nmt_path, use_google=use_google)
    filt_utts = []
    for u in refs:
        if (len(map_dict[set_key][u]["es_w"]) >= min_len and 
           len(map_dict[set_key][u]["es_w"]) <= max_len):
            filt_utts.append(u)
    
    filt_utts = sorted(filt_utts)
    print("Utts matching len filter={0:d}".format(len(filt_utts)))
    hyp_path = os.path.join(nmt_path, "hyps_min-{0:d}_max-{1:d}.en".format(min_len, max_len))
    print("writing hyps to: {0:s}".format(hyp_path))
    with open(hyp_path, "w") as out_f:
        for u in filt_utts:
            out_str = get_out_str(hyps[u], use_google=use_google)
#             if use_google:
#                 out_str = " ".join(hyps[u])
#             else:
#                 out_str = ""
#                 for w in hyps[u]:
#                     out_str += "{0:s}".format(w) if (w.startswith("'") or w=="n't") else " {0:s}".format(w)
                
#                 out_str = clean_out_str(out_str)
                
            out_f.write("{0:s}\n".format(out_str))
    
    for i in range(len(list(allrefs.values())[0])):
        refs_path = os.path.join(nmt_path, "ref_min-{0:d}_max-{1:d}.en{2:d}".format(min_len, 
                                                                                    max_len,
                                                                                    i))
        print("writing ref {0:d} to: {1:s}".format(i, refs_path))
        with open(refs_path, "w") as out_f:
            for u in filt_utts:
                out_str = get_out_str(allrefs[u][i], use_google=use_google)
#                 if use_google:
#                     out_str = " ".join(allrefs[u][i])
#                 else:
#                     out_str = ""
#                     for w in allrefs[u][i]:
#                         out_str += "{0:s}".format(w) if (w.startswith("'") or w=="n't") else " {0:s}".format(w)
#                     out_str = clean_out_str(out_str)
                out_f.write("{0:s}\n".format(out_str))
    print("all done")

In [19]:
# nmt_path = "google"
# write_to_file_len_filtered_preds(nmt_path, 
#                                  set_key="fisher_dev", 
#                                  min_len=MIN_LEN, max_len=MAX_LEN, 
#                                  use_gooogle=True)

# !paste -d"\n" google/ref_min-0_max-2.en* > google/all_ref_min-0_max-2_meteor
# # !paste -d"\n" google/ref_min-{$MIN_LEN}_max-{$MAX_LEN}.en* > google/all_ref_min-{$MIN_LEN}_max-{$MAX_LEN}_meteor
# # !paste -d"\n" $nmt_path/ref_min-{$MIN_LEN}_max-{$MAX_LEN}.en* > google/$meteor_out

In [20]:
MINLEN = 0
MAXLEN = 300

In [21]:
nmt_models = ["google",
              "sp2enw_hyp_search/sp_1.0_l2e-4_rnn-3_drpt-0.5_cnn_96-2-2",
              "sp2enw_hyp_search/sp_1.0_l2e-4_rnn-3_drpt-0.5_cnn_96-2-2_mix-0.5/",
              "./sp2enw_interspeech/sp_0.50_h-300_e-128_l2e-4_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln",
              "./sp2enw_interspeech/sp_0.50_h-300_e-128_l2e-4_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln_mix-0.5/",
              "./sp2enw_interspeech/sp_0.33_seed-AB_l2e-4_drpt-0.5",
              "./sp2enw_interspeech/sp_0.33_seed-AB_l2e-4_drpt-0.5_sample-mix-0.5"
              
             ]

In [None]:
for nmt_path in nmt_models:
    write_to_file_len_filtered_preds(nmt_path, 
                                     set_key="fisher_dev", 
                                     min_len=MINLEN, max_len=MAXLEN, 
                                     use_google = nmt_path == "google")
    print(nmt_path == "google")
    
    meteor_out = os.path.join(nmt_path, "meteor_4refs_min-{0:d}_max-{1:d}.en".format(MINLEN, MAXLEN))
    meteor_in = os.path.join(nmt_path, "ref_min-{0:d}_max-{1:d}.en".format(MINLEN, MAXLEN))
    
    !paste -d"\n" $meteor_in* > $meteor_out

```
export PREDS=haha
perl $BLEU_SCRIPT $PREDS/ref_min-0_max-300.* < $PREDS/hyps_min-0_max-300.en

java -Xmx2G -jar ../installs/meteor-1.5/meteor-*.jar $PREDS/hyps_min-0_max-300.en $PREDS/meteor_4refs_min-0_max-300.en -r 4 -l en -norm
```

In [27]:
model_keys = ["google",
              "sp_160",
              "sp_160_sample",
              "sp_80",
              "sp_80_sample",
              "sp_50",
              "sp_50_sample"
             ]

In [28]:
model_map = {model_keys[i]: nmt_models[i] for i in range(len(model_keys))}

In [29]:
len_filts = [(0,2), (3,5), (6,20), (21,40), (41,300)]

In [30]:
model_data = {}

In [31]:
for m in model_map:
    model_data[m] = get_model_data(model_map[m], use_google= m == "google")

eval refs found, loading


In [32]:
u_bucks = {i:[] for i in range(len(len_filts))}
for u in map_dict["fisher_dev"]:
    es_w_len = len(map_dict["fisher_dev"][u]["es_w"])
    for i, f in enumerate(len_filts):
        if es_w_len >= f[0] and es_w_len <= f[1]:
            u_bucks[i].append(u)

In [33]:
[(i,len(v)) for i, v in u_bucks.items()]

[(0, 1193), (1, 628), (2, 1447), (3, 687), (4, 24)]

In [34]:
random.seed("hmm")
selected_utts = []
for i in u_bucks:
    sub_set = random.sample(u_bucks[i], min(10,len(u_bucks[i])))
    selected_utts.extend(sub_set)

In [35]:
len(selected_utts)

50

In [36]:
play_utt(utt, m_dict)

NameError: name 'utt' is not defined

In [None]:
with open("all_model_preds.txt", "w") as out_f:
    for u in selected_utts:
        out_f.write("------{0:s}------\n".format(u))
        es_words = " ".join([w.decode() for w in map_dict["fisher_dev"][u]["es_w"]])
        out_f.write("{0:20s} : {1:s}\n".format("es reference", es_words))
        out_f.write("{0:20s} : {1:s}\n".format("en reference", " ".join(model_data["google"][0][u])))
        for m in model_data:
            if m == "google":
                out_str =  " ".join(model_data[m][1][u])
            else:
                out_str = ""
                for w in model_data[m][1][u]:
                    out_str += "{0:s}".format(w) if (w.startswith("'") or w=="n't") else " {0:s}".format(w)
            out_f.write("{0:20s} : {1:s}\n".format(m, out_str))
        out_f.write("--------------")

In [38]:
wavs_path = os.path.join("../chainer2/speech2text/both_fbank_out/", "wavs")

STARTHERE

In [48]:
model_data["google"][2][u][0]

['then']

In [56]:
for u in selected_utts:
    print("------{0:s}------".format(u))
    play_utt(u, map_dict["fisher_dev"])
    es_words = " ".join([w.decode() for w in map_dict["fisher_dev"][u]["es_w"]])
    print("{0:20s} : {1:s}".format("es reference", es_words))
    for i in model_data["google"][2][u]:
        print("{0:20s} : {1:s}".format("en reference", " ".join(i)))
    for m in model_data:
        if m == "google":
            out_str =  " ".join(model_data[m][1][u])
        else:
            out_str = ""
            for w in model_data[m][1][u]:
                out_str += "{0:s}".format(w) if (w.startswith("'") or w=="n't") else " {0:s}".format(w)
        print("{0:20s} : {1:s}".format(m, out_str))
    print("--------------")

------20051026_211309_346_fsp-B-32------
225.49 226.76


es reference         : entonces
en reference         : then
en reference         : then
en reference         : then
en reference         : then
google               : then
sp_160               :  so
sp_160_sample        :  then
sp_80                :  so
sp_80_sample         :  so
sp_50                :  then
sp_50_sample         :  is so
--------------
------20051016_210626_267_fsp-B-31------
286.96 287.62


es reference         : sí
en reference         : yes
en reference         : yes
en reference         : yes
en reference         : yes
google               : yes
sp_160               :  yes
sp_160_sample        :  yes
sp_80                :  yes
sp_80_sample         :  yes
sp_50                :  yes
sp_50_sample         :  yes
--------------
------20051017_180712_270_fsp-B-33------
215.8 217.16


es reference         : hm mm
en reference         : hm mm
en reference         : um
en reference         : hm mm
en reference         : hmm
google               : hm mm
sp_160               :  mm
sp_160_sample        :  mm
sp_80                :  mmm
sp_80_sample         :  uh um
sp_50                :  mhm
sp_50_sample         :  mhm
--------------
------20051025_212334_337_fsp-A-80------
582.03 583.27


es reference         : sí
en reference         : yes
en reference         : yes
en reference         : yes
en reference         : yes
google               : yes
sp_160               :  yes
sp_160_sample        :  yes
sp_80                :  yes
sp_80_sample         :  eh
sp_50                :  yes
sp_50_sample         :  yes
--------------
------20051010_212418_225_fsp-A-26------
223.89 224.82


es reference         : mhm
en reference         : mhm
en reference         : mhm
en reference         : mhm
en reference         : mhm
google               : mhm
sp_160               :  mhm
sp_160_sample        :  mm
sp_80                :  hmm
sp_80_sample         :  mm
sp_50                :  um uh
sp_50_sample         :  mhm
--------------
------20051022_180817_311_fsp-B-28------
178.61 181.12


es reference         : sí
en reference         : yes
en reference         : yes
en reference         : yes
en reference         : yes
google               : yes yes i
sp_160               :  yes
sp_160_sample        :  yes yes
sp_80                :  and you
sp_80_sample         :  yes yes
sp_50                :  and yes
sp_50_sample         :  yes
--------------
------20051019_190221_288_fsp-B-81------
552.64 553.65


es reference         : sí
en reference         : yes
en reference         : yes
en reference         : yes
en reference         : yes
google               : yes
sp_160               :  yes
sp_160_sample        :  yes
sp_80                :  yes
sp_80_sample         :  yes
sp_50                :  yes
sp_50_sample         :  yes
--------------
------20051009_182032_217_fsp-B-64------
261.08 261.99


es reference         : ajá
en reference         : aha
en reference         : yeah
en reference         : aha
en reference         : aha
google               : aha
sp_160               :  aha
sp_160_sample        :  aha
sp_80                :  uh uh
sp_80_sample         :  aha
sp_50                :  aha
sp_50_sample         :  aha
--------------
------20051017_234550_276_fsp-B-65------
317.84 319.01


es reference         : no no
en reference         : no no
en reference         : no no
en reference         : no no
en reference         : no no
google               : no no
sp_160               :  no no
sp_160_sample        :  no no
sp_80                :  no no
sp_80_sample         :  no no
sp_50                :  no no
sp_50_sample         :  no
--------------
------20051017_180712_270_fsp-A-64------
452.32 453.38


es reference         : ajá
en reference         : aha
en reference         : aha
en reference         : yes
en reference         : uh huh
google               : aha
sp_160               :  aha
sp_160_sample        :  aha
sp_80                :  oh
sp_80_sample         :  aha
sp_50                :  right
sp_50_sample         :  right
--------------
------20051025_212334_337_fsp-A-39------
319.36 323.08


es reference         : petty oh ¿sessy o jessy
en reference         : petty oh sessy or jessy
en reference         : petty oh sessy or jessy
en reference         : petty or sessy or jessy
en reference         : petty oh sessy or jessy
google               : yes oh celsius jesus
sp_160               :  thirteen oh i'm joe
sp_160_sample        :  it's cool oh eight
sp_80                :  yes yes oh yes yes yes
sp_80_sample         :  yes yes oh yes yes
sp_50                :  yes oh yes i hear you
sp_50_sample         :  yes oh oh i didn't know if
--------------
------20051022_180817_311_fsp-B-34------
223.12 226.93


es reference         : sí sí sí sí
en reference         : yeah yeah yeah yeah
en reference         : yes yes yes yes
en reference         : yes yes
en reference         : yes yes yes yes
google               : yes yes yes yes
sp_160               :  yes yes yes yes
sp_160_sample        :  yes yes yes yes
sp_80                :  yes yes yes yes
sp_80_sample         :  yes yes yes yes
sp_50                :  yes yes yes yes yes
sp_50_sample         :  yes yes yes yes
--------------
------20051024_180453_327_fsp-A-39------
287.02 288.69


es reference         : ah o k
en reference         : ahok
en reference         : ah ok
en reference         : ah ok
en reference         : ah ok
google               : ah okay
sp_160               :  ah okay
sp_160_sample        :  ah okay
sp_80                :  ah okay
sp_80_sample         :  ah okay
sp_50                :  ah okay
sp_50_sample         :  ah okay
--------------
------20051019_190221_288_fsp-B-27------
110.29 113.33


es reference         : y así nada más
en reference         : and so nothing more
en reference         : just like that
en reference         : and yeah that's all
en reference         : and nothing else
google               : and that's it
sp_160               :  and that's just more
sp_160_sample        :  and that's how it is
sp_80                :  and like that no
sp_80_sample         :  and like that
sp_50                :  yes so more
sp_50_sample         :  and like
--------------
------20051017_234550_276_fsp-A-113------
616.98 620.42


es reference         : problemas programas de conversación
en reference         : problems conversation programs
en reference         : problems converstation programs
en reference         : problems conversational programs
en reference         : problems conversation programs
google               : problems ah conversation programs
sp_160               :  problems programs of conversations
sp_160_sample        :  problems we programs of
sp_80                :  problems problems problems
sp_80_sample         :  we all we were problems
sp_50                :  problems we have problems with this
sp_50_sample         :  are you going to to the problem of this
--------------
------20051018_210744_280_fsp-A-41------
270.13 276.6


es reference         : oh oh boy
en reference         : oh boy
en reference         : ohoh chico
en reference         : oh oh boy
en reference         : oh oh boy
google               : oh oh boy
sp_160               :  oh boy boy
sp_160_sample        :  oh boy
sp_80                :  oh boy
sp_80_sample         :  oh oh boy
sp_50                :  oh boy and
sp_50_sample         :  oh oh boy
--------------
------20051017_234550_276_fsp-A-24------
86.28 87.55


es reference         : así que no sé dónde
en reference         : so i don't know where
en reference         : so i'm not sure then
en reference         : so i don't know where
en reference         : so i don't know where
google               : so i don't know where
sp_160               :  so i don't know
sp_160_sample        :  so i don't know
sp_80                :  so i don't know
sp_80_sample         :  so i don't know
sp_50                :  so i don't know
sp_50_sample         :  so i'm
--------------
------20051025_212334_337_fsp-B-57------
424.18 426.87


es reference         : sí de washington ya
en reference         : yes from washington yeah
en reference         : yes from washington
en reference         : yes from washington
en reference         : yes washington already
google               : well yes no from washington day
sp_160               :  well yes no washington
sp_160_sample        :  well yes i'm from washington yeah
sp_80                :  well yes no washington
sp_80_sample         :  well yes from washington
sp_50                :  well yes no washington
sp_50_sample         :  well yes no washington washington
--------------
------20051019_190221_288_fsp-B-44------
202.54 204.26


es reference         : me gustaría estar allá
en reference         : i would like to be there
en reference         : i would like to be there
en reference         : i would like to be over there
en reference         : i would like to be there
google               : i would like to be there
sp_160               :  you like it over there
sp_160_sample        :  i would like to go to
sp_80                :  i wouldn't like it
sp_80_sample         :  i don't like it
sp_50                :  i like to go there
sp_50_sample         :  i like the r
--------------
------20051022_180817_311_fsp-A-3------
9.0 11.13


es reference         : ¿de dónde tu eres arturo
en reference         : and where are you from arturo
en reference         : where are you from arturo
en reference         : where are you from arturo
en reference         : where are you from arturo
google               : where were you from right
sp_160               :  where are you from
sp_160_sample        :  where are you from
sp_80                :  where are you from
sp_80_sample         :  where are you from
sp_50                :  where are you from
sp_50_sample         :  where are you from
--------------
------20051019_190221_288_fsp-B-13------
46.35 49.33


es reference         : ah de vacaciones no de trabajo de vacaciones
en reference         : uh on vacations not from work on vacations
en reference         : uh on vacations not work on vacations
en reference         : ahh from vacations not to work from vacations
en reference         : ah on vacation not work on vacation
google               : ah of vacations no of work on vacations
sp_160               :  ah vacations right from work or something
sp_160_sample        :  ah vacations the work of the
sp_80                :  ah vacations no not work in the other things
sp_80_sample         :  ah education doesn't work or work
sp_50                :  ah i was not from the work i work in work
sp_50_sample         :  ah i almost almost i was not working
--------------
------20051016_180547_265_fsp-A-70------
637.64 641.02


es reference         : pero a veces di es una pe erre jota
en reference         : but at times it is a a j
en reference         : but sometimes di is a per jay
en reference         : but sometimes say it is a r j
en reference         : but sometimes is a p r j
google               : but sometimes it's a short one
sp_160               :  at the same time it's a pg
sp_160_sample        :  at the time it's a worse
sp_80                :  sometimes it's a disaster
sp_80_sample         :  but sometimes it's a a a
sp_50                :  at the time is a holiday
sp_50_sample         :  but sometimes it's a sad
--------------
------20051017_180712_270_fsp-B-44------
284.47 286.0


es reference         : ah bueno esa es buena idea
en reference         : ah good that is a good idea
en reference         : ah well that is a good idea
en reference         : oh well thas a good idea
en reference         : oh good that's a good idea
google               : ah well that's a good idea
sp_160               :  ah well it's a good idea
sp_160_sample        :  ah well it's a good idea
sp_80                :  ah good bolivia
sp_80_sample         :  ah well how nice
sp_50                :  oh good a good idea
sp_50_sample         :  ah good is good
--------------
------20051009_210519_219_fsp-B-45------
445.28 449.62


es reference         : judía y yo creo me imagino que tal vez por eso ya yo era la segunda ya no
en reference         : jewish and i think i imagine that maybe that is why i was the second one it did not
en reference         : jewish and i believe that i imagine that maybe because i was the second no longer
en reference         : jewish and i think i imagine that maybe that's why right i was the second one i wasn't
en reference         : jewish and i think i imagine that maybe yeah i was the second one they didn't
google               : jewish and maybe that's why i was the second one not anymore
sp_160               :  jewish and i was that maybe i was over there and i don't
sp_160_sample        :  i could say and i told him that maybe i already the second one i didn't
sp_80                :  i could tell me that i was that's why i already already already already already
sp_80_sample         :  and what was my friend that was talking about that and i already already already already
sp

es reference         : y se cambian bien bonitas y están jóvenes hay que ponerse bonita
en reference         : and they dress very nicely and they're young and one has to dress nicely
en reference         : and they change very pretty and they are young they have to get pretty
en reference         : they dress up really pretty they are young and you have to get really pretty
en reference         : and then dress up very beautiful and they are young they dress up beautifully
google               : and they change really beautiful if they're young you have to put on beautiful
sp_160               :  and it's very beautiful and they are young there are that that's why it's pretty
sp_160_sample        :  it's change it's really pretty and it's young there's be a nice
sp_80                :  and it's very nice now they're young that you can be supposed to be
sp_80_sample         :  yes it's change it's also young that you can be nice
sp_50                :  i'm also very young in young peop

es reference         : si ella es de oaxaca méxico
en reference         : yes she is from oaxaca mexico
en reference         : yes she is from oaxaca mexico
en reference         : yeah she's from oaxaca mexico
en reference         : yes she is from oaxaca mexico
google               : yes she's born here in mexico
sp_160               :  yes i'm from mexico
sp_160_sample        :  yes i'm in mexico
sp_80                :  yes i have a lot of mexico
sp_80_sample         :  yes i'm in mexico mexico
sp_50                :  yes i'm here in mexico
sp_50_sample         :  yes that's also mexico
--------------
------20051009_182032_217_fsp-A-73------
304.3 307.63


es reference         : así porque no así no es es como yo digo
en reference         : like this because no that's not the way it is it is the way is say it is
en reference         : yeah why right that's not right it's what i say
en reference         : really because is not like that is like i say
en reference         : so why not it's not it is as i say
google               : like that because it's not like i say
sp_160               :  so because it's not like that is not like i say
sp_160_sample        :  so well it's not like i say
sp_80                :  like that because it's not like i'm like i'm like
sp_80_sample         :  like that because it doesn't do it's like i say
sp_50                :  ah yes but i have not like like i say it
sp_50_sample         :  yes so i don't like it is like i say
--------------
------20051023_232057_325_fsp-B-109------
711.29 715.17


es reference         : claro cuando uno no tiene niños por lo menos uno se cuida solo no pero
en reference         : of course when you don't have kids at least you take care of yourself no but
en reference         : of course when you don't have children at least you take care of yourself right but
en reference         : clearly when someone doesn't have kids they only have to worry about themselves but
en reference         : sure when you have kids at least you can take care yourself but
google               : sure when one doesn't have kids at least one takes care of it alone right but
sp_160               :  sure when you don't have kids at least i don't know what it is but
sp_160_sample        :  sure when you don't have kids at least i don't remember right but
sp_80                :  sure when you don't have kids but i don't remember i don't remember but
sp_80_sample         :  sure when you don't have children but you can't remember it but
sp_50                :  sure when you h

es reference         : mm yo creo que yo yo lo que creo es que que el
en reference         : mm i think that i what i think is that that the
en reference         : mm i think that i i think that it is what it is
en reference         : mm i believe that i i that which i think is that that the
en reference         : mm i believe that i what i believe is that that
google               : mm i think that i what i think is that
sp_160               :  mm i think that i believe that
sp_160_sample        :  i think that i think that that
sp_80                :  mm i think that i think that i think that
sp_80_sample         :  mm i think that i think that is that
sp_50                :  mm i think that i believe that what
sp_50_sample         :  mm i think that i see that
--------------
------20051017_234550_276_fsp-B-115------
602.88 606.77


es reference         : tu ah ¿ cuando vas guiando oyes música tu o no
en reference         : you ah when you are guiding do you listen to music or not
en reference         : you ah when you're leading do you listen music or you not
en reference         : you ah when you're driving do you listen to music or not
en reference         : you ah when you are driving do you listen music or not
google               : when you drive you listen to music or not
sp_160               :  you you are going to go out in a place or not you
sp_160_sample        :  you go when you are going to listen to music you don't
sp_80                :  to your parents playing listening to music or not
sp_80_sample         :  to your daughter when you go to the music you know
sp_50                :  entire food when you were playing music or you
sp_50_sample         :  more or when you go out when you go
--------------
------20051024_180453_327_fsp-B-82------
583.03 592.76


es reference         : sí porque sí sí uno va a salir si uno va a filadelfia pero eso uno va a old city pero entonces en old city hay tanta gente que sale ahí que tienen y policías y todo
en reference         : yes because if if on is going to go out if on goes to philadelphia but if one's going to old city but then in old city there's so many people that goes out that there are many policemen there and everything
en reference         : yes because if if you are going to go out if you are going to philadelphia but that you go to old city but then in old city there are so many people that go there that they even have police there and everything
en reference         : yes because if if one is going to go out if one goes to philadelphia but if one goes to old city but then in old city there is so much people that go out that they have police and everything
en reference         : yes why yes if one goes out one goes to philadelphia but that's an old city so they have cops and everything
go

es reference         : en estados unidos en méxico en toda la onda están imitando y nada más están agarrandola como un o sea ya se ha convertido como un género global
en reference         : in united states in mexico in everything is an imitation and nothing else is taking like or is has convert in a global type
en reference         : in the united states in mexico in every wave they are only imitating like if its turned into something global
en reference         : in the united state in mexico in all wave are imitating and are just taking like one or rather it has become a global common
en reference         : in the united states in mexico they are they are just copying it they are using as a i mean it's become like a global gender
google               : in the united states in mexico in all the sides they are limiting and they are just taking them like a i mean that converted like a global gender
sp_160               :  in the united states in mexico in all the are also talking and t

es reference         : el mall es muy chiquitito no hay lugares para ir a visitar o lugares ballet o conciertos y cosas así esas cosas no hay acá
en reference         : the mall is very small there are no places to visit or places ballet or concerts and things like that there are no such things here
en reference         : the mall is very tiny no places to visit or places ballet concerts and stuff there are no such things here
en reference         : the mall is very small there are no places to go visit or places for ballet or concerts and things like that these things are not here
en reference         : the mall is very small there are no places to visit or places ballet or concerts and things like that those things are not around here
google               : mall is very small there are no places to go visit or places or concerts and things like that those things right here
sp_160               :  and it's very small not to visit the place or places or places or to to concerts and thi

es reference         : no eso es lo mejor eso es lo mejor porque le digo el punto de vista de una persona que fue educadora
en reference         : no that's the best that's the best because i tell you from the point of view of a person that was an educator
en reference         : no that is the best that is the best because i tell you the point of view of a person who was an educator
en reference         : no that's the better because i tell my point of view of a person that was a teacher
en reference         : no that's the best it's the best because i'm telling you the point of view of a person that was a teacher
google               : no it's the best thing that's not better because i tell the point of view of a person who was educating
sp_160               :  nothing is the best that's the best because i tell you the point of view of a person who was a lawyer
sp_160_sample        :  and that's the best that's the best because i tell you the point of view of view that was a lawyer
sp

es reference         : así que agarró todo mi esposo fue allá y agarró toda las cosas de él y ya no hay nada de él en ese apartamento
en reference         : so he got everything my husband went over there and grabbed all of his things and there is nothing of his anymore in that apartment
en reference         : so he took his stuff and my husband went there and helped him pick up his stuff and he left nothing in that apartment
en reference         : so he took everything my husband went there and took all his things and now there is nothing of his in that apartment
en reference         : so he got all of his things my husband went over there and took all of his things and there isn't anything else of his in that apartment anymore
google               : so he did clothes my husband was there and caught everything from him and there's nothing in this apartment
sp_160               :  yes that a car and my husband was there the all the things of day in nothing
sp_160_sample        :  yes t

es reference         : sí sí a mi me encanta viajar yo espero ahora que empiezo a trabajar en enfermera y que empecé a a tener dinero poder viajar
en reference         : yes yes i love traveling i hope now that i start working on aa nurse and i started to have money to travel
en reference         : yes yes i love to travel i hope now that i start to work as a nurse and that i started to have money to be able to travel
en reference         : yes yes i love travelling i hope that nos that i'm starting to work as a nurse and start having money i cant travel
en reference         : yes yes i love traveling i hope now that i start working as a nurse and i start to to have money to be able to travel
google               : yes yes i love traveling but now that the nurse starts to have money to travel
sp_160               :  yes yes i love to travel i hope but now i start to work nurse that started to have money to travel
sp_160_sample        :  yes yes i've lived i love it i i but now i start 

es reference         : fue importantísimo porque él con toda su música hacía los movimientos de revolución o sea la gente no no no lo veía a él como un múscio sino más bien como a un líder político
en reference         : he was really important because he with all his music made the movements of the revolution that is the people didn't didn't didn't see him as a musician but more like a political leader
en reference         : it was very important because him with all his music he made the revolution's movements i mean people didn't see him as a musician but as a political leader
en reference         : it was important because with his music he made the movements of revolution i mean people didn't see him as a musician but as a political leader
en reference         : he was very important because him with all his music he made revolutionary movements i mean people didn't didn't didn't see him as a musician but rather as a political leader
google               : it was important because

es reference         : no te quieren ayudar porque tu haces mucho supuestamente y mucho no es tanto no puedes sobrevivir en eso y tienes que dejar de trabajar para que poder que ellos te ayuden
en reference         : they don't want to help you because you do a lot supposedly and a lot is not much you can't survive and you have to stop working so that they can help you
en reference         : they don't want to help you because you make too much supposedly and too much is not so much they cannot survive with that and you have to stop working for them to help you
en reference         : they do not want to help because you make a lot supposely it is not that much you can not survive with that and you would have to stop working for them to help
en reference         : they don't want to help you because you do a lot supposedly and a lot isn't enough you can't survive on this and you have to work so that you can make enough for them to help you
google               : they don't want to help 

es reference         : del estudio tu sabes siempre todo estaba fue enfocado sí mi mamá y papá eramos pobres pero sí uno de nosotros quería tocar música mi papá y mi mamá hacían lo que podían para llevarnos a coger lesiones de música
en reference         : take me away from my studies i was always focused and my mom and dad we were poor but if one of us wanted to play music my dad and mom would do whatever they could so that we could get music lessons
en reference         : of studies you know that everything was always was focused my mother and father were poor but if one of us wanted to play music my mom and dad did whatever they could so i could go to take music lessons
en reference         : from school you know everything was always was focused yes my mother and father we were poor but yes one of us wanted to play music and my father and mother did whatever they could to take music lessons
en reference         : from my studies you know it was always it was focused yea my mom and 

es reference         : verdad entonces al fin pasó caminando todo el semestre y y en el invierno había no se veinte grados y s se ponía a caminar cuarenta minutos para llegar a la clase es que
en reference         : true then the last step and walking throughout the semester and in the winter it wasn't twenty degrees and we began to walk forty minutes to get to class it is
en reference         : right so in the end he spent the whole semester walking and and in winter it was i don't know twenty degrees and h he walked forty minutes to get to class it's that
en reference         : true so in the end he spent the whole semester walking and and in the winter there was i don't know twenty degrees and h he would walk forty minutes to get to class is that
en reference         : right then he he spend the whole semester walking and during the winter it was like twenty degrees and he walked he walked forty minutes to get to class
google               : yes but the truth is that there was the w

es reference         : es terrible pero pues este pero pues la verdad osea últimante también ya ha habido mucho más conciencia social en cuanto a que ya l lo los ciudadanos este ahora sí están empezando digamos a oponerse a eso ¿me entiendes entonces
en reference         : it's terrible but well uh but well really i mean lately there's been much more social awareness towards the the citizens i mean now they're starting to let's say fight again'st that do you understand so
en reference         : it's terrible but well this but well to tell you the truth i mean lately also there have been much more social conscience about what th the th citizens this now they're really starting let's say to oppose it you get me then
en reference         : terrible but well but well actually i mean lately there has been a lot more social awareness regarding that the the citizens well now they're starting to lets say to oppose that do you understand me so
en reference         : it is terrible but well this

es reference         : no no yo le dije a tu hermano que le dijera que que yo fui quien envió ese email y que y que no lo yo no lo había enviado por ella lo había hablado por esa otra señora que no hablaba pero nada de español muy pocas palabras lo que sabía decir en espa
en reference         : no no i told your brother to tell him that that i was the one that sent that email and that that i hadn't sent it because of her i had sent it because of that other person that didn't speak any spanish she only knew very little words in spa
en reference         : no no i told your brother that said that that i was who sent that email and that and that i hadn't sent it because of her i had spoken with that other lady that didn't speak any spanish very few words that she knew how to say in spa
en reference         : no no i told your brother to tell him that i was the one sending that email and that i didn't send it for her it was for the other lady that she didn't talk spanish too much at all few

es reference         : y la mayoría de esta de esta algunas personas que están en el evangelio o en los testigos de jeováh y hasta tratan de meter las cosas por el ojos y quieren venir a tu casa a a temprano por la mañana a meterse a vender cuanta cosa hay
en reference         : and the majority of this of this some people that are in the gospel or in the jehova witnesses and they even try to get things in through your eyes and they want to come to your house to to early in the morning to come and sell every thing they can think of
en reference         : and the majority of this of this some people who are in the gospel or in jehova's witnesses and they even try to stick things through the eyes and want to come to your house to to early in the morning to go and sell whatever there is
en reference         : majority of the people are of them are in gospel or jehovah's witnesses nad they want to force you believe they come to your home in the morning and try to sell their stuff
en refere

es reference         : mi hijo está de vuelta el grande que se mudó ésta en mi casa de nuevo ya se peleo con la novia y y la muchacha le hizo algo malo y ese hombre mi hijo cuando tu le haces algo malo te dice adiós y así nunca te ve más
en reference         : my son is back the big one who move is in my house again he already got in a fight with the girlfriend and and the girl did something bad to him and that man my son when you do something to him he tells you goodbye and like that he never sees you again
en reference         : my son is now back to our house the older one moved back with us he had a fight with his girlfriend and the girlfriend did something bad to him and this man my son when you do something bad to him he just waives goodbye to you and he will never see you again
en reference         : my sone is back the older one moved he is at my home again he fought with his girlfriend and the girl did something bad to him and this man my son when you do something bad to him h

es reference         : Él estaba hablando ahí y está la esposa hablando ahí y y salió algo que él juró que y que bandera y yo no sabía que es eso y ellos dijeron que él juró en la bandera que no es dominicano y eso y esto y se hizo
en reference         : he was speaking there and the wife was speaking there and and he right that and that banner and i didn't know what that was and he said that the right on the banner was not dominican and that and that and he made
en reference         : and he was there his wife was talking and something happened and he said bag words to the flag i don't do that and he told me that he was swearing to the flag because he is not dominican he's this and he did
en reference         : he was sitting there and his wife is there talking and something came up about he swore that and the flag and i didn't know that and they said that he swore on the flag that he's not dominican and this and that and he made
en reference         : he was talking there and the wif

es reference         : pero antes sí era eh pues era en inglés pero también lo tenían en árabe y y español y lo que sea pero sí sí e eso ya es una historia vieja ya lo han hecho varias veces entonces no sé o por lo menos una vez
en reference         : but before yes it waseh it was in english but there was too in arabian and in spanish and whatever but yes yes that is an old story they have make it many times then i don´t know or at least one time
en reference         : but that was before that was in english but they did it in arabic and and spanish and whatever but that is an old story they have done it several times then i do not know at least once
en reference         : but before yes was eh like was in english but also they had it in arab and and spanish and whatever but yesyes t that is old history they have done it a many times then i don't know or al least one time
en reference         : but before yes it was uh well it was in english but they also had it in arabic and and span

es reference         : que que la persona tomó un poco de tiempo para hacer eso ¿verdad entonces si uno solo estas mandando dos emails no importa entonces llega el email y sabe que no es spam pero si uno esta mandando millones de emails
en reference         : that the person took a little time to make right so if you're only sending two mails it doesn't matter then the mail arrives and it knows it's not spam but if you're sending millions of emails
en reference         : that the person wasted a little time doing that right so if you are just sending two emails it doesn't matter the email gets through and you know it's not spam but if you are sending millions of emails
en reference         : that that the person took a little time to do that right then if you are only sending two emails it doesn't matter then the email arrives and you know it is spam but if you are sending millions of emails
en reference         : that that the person took a little of time to make that right then if on

es reference         : y pues este también digo como viviendo aquí como universitario pues este pues tam todo mundo está viviendo por aquí entonces a la vez como pues no no quieres estar tampoco muy lejos de aquí porque está también muy desconectado de la gente y de la universidad entonces
en reference         : and then this too i say like living here as a student then this like all the people is living here then at the same time like then they don't want to be too far from here because you are like disconnected from the university people then
en reference         : well i am living as a university student i don´t want to be far away from here i want to be in touch with the people and the university
en reference         : and then also i said like living in here as a college guy then this then whole world in living in here then at the same time howthen no don't even want to be very far from here because is very disconnected from the people and the university then
en reference         

es reference         : y y no sé a mi no me ha pasado nunca nada y no y no me siento no me siento en peligro ni siento que sea peligroso pero bueno conozco gente gente que realmente sí ¿no le le da mucho miedo west philly y
en reference         : and and i don't know nothing has ever happened to me and no i don't feel i don't feel in danger nor do i feel it is dangerous but well i know people people that really are scared of west philly and
en reference         : and i don't know nothing has happen to me and i don't feel threaten i don't think this is a dangerous place but i know people who are really scared of west philly and
en reference         : and and i don't know nothing has ever happened to me and no and i don't feel i don't feel in danger nor do i feel it is dangerous but well i know people people who really yes no they they are really scared of west philly and
en reference         : and i don't know i say nothing bad happened to me ever and i would say i feel safe i don't fee

es reference         : todo lo que uno consume es uno uno va a comprar un carro mejor y uno tal vez hace esto y hace lo otro y ya cuando uno ve ya se voló el dinero ¿verdad y uno solo se queda me parece a mi solo con las responsabilidades ¿verdad
en reference         : everything that one consumes its one one goes to buy a better car and one maybe does this and does that and so when one looks money flew away right and one only stays with that's what it looks like to me only with the responsibilities right
en reference         : all that we consumeis one one get better car and one maybe and do another and that is it and when we come and see we had spend all the money right and we just say it appears to me with all the responsabilitys right
en reference         : everything that one consumes it one one goes to buy a better car and one maybe does this and does that and does the other and that's it when you pay attention the money's gone right and one stays it looks to me only with respons

In [None]:
!paste -d"\n" $nmt_path/ref_min-0_max-2.en* > $nmt_path/all_ref_min-0_max-2_meteor

In [None]:
en_data = {}

In [None]:
len(list(en_data['google']['4refs'].values())[0])

In [None]:
en_data['google'] = {}
en_data['google']['refs'], en_data['google']['hyps'], en_data['google']['4refs'] = get_model_data("", 
                                                                                                  use_google=True)

In [None]:
nmt_path = "sp2enw_hyp_search/sp_1.0_l2e-4_rnn-3_drpt-0.5_cnn_96-2-2_mix-0.5/"
en_data['160hrs_da'] = {}
en_data['160hrs_da']['refs'], en_data['160hrs_da']['hyps'], en_data['160hrs_da']['4refs'] = get_model_data(nmt_path, 
                                                                                               use_google=False)

In [None]:
nmt_path = "./sp2enw_interspeech/sp_0.50_h-300_e-128_l2e-4_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln"
en_data['80hrs'] = {}
en_data['80hrs']['refs'], en_data['80hrs']['hyps'], en_data['80hrs']['4refs'] = get_model_data(nmt_path, 
                                                                                               use_google=False)

In [None]:
nmt_path = "./sp2enw_interspeech/sp_0.50_h-300_e-128_l2e-4_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln_mix-0.5"
en_data['80hrs_da'] = {}
en_data['80hrs_da']['refs'], en_data['80hrs_da']['hyps'], en_data['80hrs_da']['4refs'] = get_model_data(nmt_path, 
                                                                                               use_google=False)

In [None]:
nmt_path = "./sp2enw_interspeech/sp_0.33_seed-AB_l2e-4_drpt-0.5"
en_data['50hrs'] = {}
en_data['50hrs']['refs'], en_data['50hrs']['hyps'], en_data['50hrs']['4refs'] = get_model_data(nmt_path, 
                                                                                               use_google=False)

In [None]:
nmt_path = "./sp2enw_interspeech/sp_0.33_seed-AB_l2e-4_drpt-0.5_sample-mix-0.5"
en_data['50hrs_da'] = {}
en_data['50hrs_da']['refs'], en_data['50hrs_da']['hyps'], en_data['50hrs_da']['4refs'] = get_model_data(nmt_path, 
                                                                                               use_google=False)

In [None]:
utt = '20051023_232057_325_fsp-A-3'
utt = '20051017_180712_270_fsp-B-2'
utt = '20051009_182032_217_fsp-B-149'

for m in en_data:
    print(m, ' & ', " ".join(en_data[m]['hyps'][utt]), ' \\\\')
    print()
    
print(" ".join(en_data[m]['refs'][utt]))

In [None]:
", ".join([w.decode() for w in bow_dict['w2i'].keys()])

In [None]:
len(bow_dict['freq'])-4

In [None]:
keyword = "help"
data_key = "80hrs_da"

print(bow_dict["freq"][keyword.encode()], bow_dict["freq_dev"][keyword.encode()])

t_count = 0
c_count = 0
tp_count = 0
corr_utts = []

for u in en_data[data_key]["hyps"]:
    common_ref_words = set(en_data[data_key]["4refs"][u][0])
    for curr_ref in en_data[data_key]["4refs"][u][1:]:
        common_ref_words &= set(curr_ref)
#     if sum([1 if keyword in set(r) else 0 for r in en_data['50hrs_da']["4refs"][u]]) >= 4:
    if keyword in common_ref_words:
        t_count += 1
    if keyword in en_data[data_key]["hyps"][u]:
        tp_count += 1
    if keyword in en_data[data_key]["hyps"][u] and keyword in common_ref_words:
        c_count += 1
        corr_utts.append(u)

print(t_count, c_count, tp_count)
        
for u in corr_utts:
    print(" ".join(en_data[data_key]["hyps"][u]))
    for r in en_data[data_key]["4refs"][u]:
        if keyword in r:
            print(" ".join(r))
            break

In [None]:
model_50_da[1]['word'][keyword]

In [None]:
tc = sum([model_50_da[1]['word'][w]['tc'] for w in model_50_da[1]['word']])

In [None]:
tp = sum([model_50_da[1]['word'][w]['t'] for w in model_50_da[1]['word']])

In [None]:
tc / tp

In [None]:
for w in model_50_da[1]['word']:
    if w in model_50_da[1]['word']: 
        p_50 = model_50_da[1]['word'][w]['tc'] / model_50_da[1]['word'][w]['t'] if model_50_da[1]['word'][w]['t'] > 0 else 0
    else:
        p_50 = 0
    if w in model_80_da[1]['word']:
        p_80 = model_80_da[1]['word'][w]['tc'] / model_80_da[1]['word'][w]['t'] if model_80_da[1]['word'][w]['t'] > 0 else 0
    else:
        p_80 = 0
    print(w, 
          "{0:20.1f}".format(p_50 * 100),
          "{0:20.1f}".format(p_80 * 100))

In [None]:
for w in model_50_da[1]['word']:
    if model_50_da[1]['word'][w]['t'] > 0 and model_50_da[1]['word'][w]['tc'] / model_50_da[1]['word'][w]['t'] >= 0:
        print(w, "{0:.1f}".format(model_50_da[1]['word'][w]['tc'] / model_50_da[1]['word'][w]['t'] * 100))

In [None]:
for w in model_80_da[1]['word']:
    if model_80_da[1]['word'][w]['t'] > 0 and model_80_da[1]['word'][w]['tc'] / model_80_da[1]['word'][w]['t'] >= 0.4:
        print(w, "{0:.1f}".format(model_80_da[1]['word'][w]['tc'] / model_80_da[1]['word'][w]['t'] * 100))

### Dummy baseline

In [None]:
freq_sorted_words = [w.decode() for w,f in sorted(bow_dict['freq'].items(), reverse=True, key=lambda t: t[1])]

In [None]:
predict_top_K = 5
top_K_words = freq_sorted_words[:predict_top_K]

In [None]:
" --- ".join(top_K_words)

In [None]:
dummy_preds = [top_K_words for u in google_hyp_r0]

In [None]:
basic_precision_recall(google_utt_refs_words_bow.values(), dummy_preds)[:2]

In [None]:
basic_precision_recall(google_single_ref.values(), dummy_preds)[:2]

In [None]:
max_pred = 10

In [None]:
dummy_p_vals = np.zeros((max_pred), dtype="f")
dummy_r_vals = np.zeros((max_pred), dtype="f")

In [None]:
dummy_p_vals

In [None]:
p_r_dummy = {}
# for thresh in tqdm(np.arange(min_prob, max_prob+thresh_delta,thresh_delta)):
for num_pred in tqdm(range(0,max_pred)):
    top_K_words = freq_sorted_words[:num_pred+1]
    dummy_preds = [top_K_words for u in google_hyp_r0]
    dummy_p_vals[num_pred], dummy_r_vals[num_pred] = basic_precision_recall(google_utt_refs_words_bow.values(), 
                                                                            dummy_preds)[:2]
    

In [None]:
dummy_p_vals /= 100.0
dummy_r_vals /= 100.0

In [None]:
np.trapz(dummy_p_vals, dummy_r_vals)

In [None]:
thresh_labels = range(1,max_pred+1)

In [None]:
fig, ax = plt.subplots(figsize=(10,8))
ax.plot(dummy_r_vals*100, dummy_p_vals*100, label="Precision/Recall")
for i,j,k in zip(dummy_r_vals*100, dummy_p_vals*100, thresh_labels):
    ax.annotate(k,xy=(i,j), fontsize=12)
ax.set_ylabel("Precision", fontsize=28)
ax.set_xlabel("Recall", fontsize=26)
ax.legend(loc='upper center', bbox_to_anchor=(1.3, 0.9),
                  ncol=1, fancybox=True, shadow=True, fontsize=26)

plt.yticks(rotation=0, size=18)
plt.xticks(rotation=0, size=18)

In [None]:
# basic_precision_recall(single_dev_ref_words.values(), dummy_preds)[:2]

## Google refs vs Edin refs

In [None]:
sum([1 if len(r[0]) > 0 else 0 for r in single_dev_ref_words.values()])

In [None]:
sum([len(set(r[0])-{'_UNK'})for r in single_dev_ref_words.values()])

In [None]:
mismatch_count = 0
for u in set(google_single_ref.keys()) & set(single_dev_ref_words.keys()):
    if set(single_dev_ref_words[u][0]) - {'_UNK'} != set(google_single_ref[u][0]):
        mismatch_count += max(len(set(single_dev_ref_words[u][0]) - set(google_single_ref[u][0])), 
                              len(set(google_single_ref[u][0]) - set(single_dev_ref_words[u][0])))
        print(u, single_dev_ref_words[u], google_single_ref[u])
        print((set(single_dev_ref_words[u][0]) - set(google_single_ref[u][0])), 
                              (set(google_single_ref[u][0]) - set(single_dev_ref_words[u][0])))

In [None]:
print(mismatch_count)

In [None]:
[(set(r[0])-{'_UNK'})for r in single_dev_ref_words.values()]

In [None]:
bow_dict["w2i"]