In [1]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, modified_precision
from nltk.translate.chrf_score import sentence_chrf, corpus_chrf
from nltk.metrics import scores
import scipy.io.wavfile
from IPython.display import Audio
from IPython.display import display
from nltk.stem import *
# from nltk.stem.snowball import SnowballStemmer
from stemming.porter2 import stem
import stemming
from nltk.metrics.scores import recall

from nltk.corpus import stopwords

%matplotlib inline

In [2]:
from bow_run import *

In [3]:
%run utils.ipynb

In [4]:
# Notebook-level BLEU smoothing helper; check_bleu_with_len_filter reads this
# global (it passes smooth_fun.method2 to corpus_bleu).
smooth_fun = nltk.translate.bleu_score.SmoothingFunction()

In [5]:
def bow_basic_precision_recall(r, h, display=False):
    """Corpus-level unigram precision/recall for bag-of-words predictions.

    Precision accumulates the numerator and denominator returned by
    ``modified_precision`` (order 1) over all scored utterances. Recall is
    measured against the words that occur in *every* reference of an
    utterance (the intersection of the reference word sets).

    An utterance is skipped entirely when it has no references or when any
    of its references is empty.

    Args:
        r: per-utterance reference lists; each reference is a sequence of
            tokens. Must support ``len`` when ``display`` is True.
        h: per-utterance hypotheses (sequences of tokens), aligned with ``r``.
        display: when True, print the utterance count and a small table.

    Returns:
        ``(precision, recall, metrics)``. Precision and recall are
        percentages (0 when their denominator is 0). ``metrics`` holds the
        raw totals: ``tc``/``tp`` (precision numerator/denominator),
        ``rc``/``rt`` (recall numerator/denominator) and ``word``, a dict
        mapping each word to per-utterance counts ``t`` (in the common
        reference set), ``tp`` (in the hypothesis) and ``tc`` (in both).
    """
    unigram = 1
    metrics = {"rc": 0, "rt": 0, "tp": 0, "tc": 0, "word": {}}
    word_stats = metrics["word"]

    def _count(word, key):
        # Create the per-word record lazily, then bump one of its counters.
        word_stats.setdefault(word, {"t": 0, "tp": 0, "tc": 0})[key] += 1

    if display:
        print("total utts={0:d}".format(len(r)))

    for references, hypothesis in zip(r, h):
        # Skip utterances without usable references (the previous min([...])
        # form raised ValueError when the reference list itself was empty).
        if len(references) == 0 or any(len(ref) == 0 for ref in references):
            continue

        # An empty hypothesis contributes nothing to precision.
        if len(hypothesis) > 0:
            p_i = modified_precision(references, hypothesis, unigram)
            metrics["tc"] += p_i.numerator
            metrics["tp"] += p_i.denominator

        # Only words present in all references count towards recall.
        common_ref_words = set(references[0]).intersection(*references[1:])
        hyp_words = set(hypothesis)
        common_words = common_ref_words & hyp_words

        for w in common_ref_words:
            _count(w, "t")
        for w in hyp_words:
            _count(w, "tp")
        for w in common_words:
            _count(w, "tc")

        metrics["rc"] += len(common_words)
        metrics["rt"] += len(common_ref_words)

    # Guarded divisions: with no scored utterance both values are 0 instead
    # of the IndexError the list-based version raised on prec[0].
    prec = (metrics["tc"] / metrics["tp"]) * 100 if metrics["tp"] > 0 else 0
    rec = (metrics["rc"] / metrics["rt"]) * 100 if metrics["rt"] > 0 else 0

    if display:
        print("{0:10s} | {1:>8s}".format("metric", "1-gram"))
        print("-"*54)
        print("{0:10s} | {1:8.2f}".format("precision", prec))
        print("{0:10s} | {1:8.2f}".format("recall", rec))

    return prec, rec, metrics

In [6]:
def nmt_basic_precision_recall(r, h, display=False):
    """Corpus-level unigram precision/recall for translation output.

    Precision accumulates the numerator and denominator returned by
    ``modified_precision`` (order 1). For recall, each hypothesis is scored
    against every reference with ``count_match`` and the reference giving
    the highest recall is kept (the first one wins ties).

    An utterance is skipped entirely when it has no references or when any
    of its references is empty.

    Args:
        r: per-utterance reference lists; each reference is a sequence of
            tokens. Must support ``len`` when ``display`` is True.
        h: per-utterance hypotheses, aligned with ``r``.
        display: when True, print the utterance count and a small table.

    Returns:
        ``(precision, recall, metrics)``. Precision and recall are
        percentages (0 when their denominator is 0). ``metrics`` holds
        ``tc``/``tp`` (precision numerator/denominator), ``rc``/``rt``
        (recall numerator/denominator) and ``word``, per-word ``t``/``tp``/
        ``tc`` totals taken from the word-level details of the chosen
        reference.
    """
    unigram = 1
    metrics = {"rc": 0, "rt": 0, "tp": 0, "tc": 0, "word": {}}
    word_stats = metrics["word"]

    if display:
        print("total utts={0:d}".format(len(r)))

    for references, hypothesis in zip(r, h):
        # Skip utterances without usable references (the previous min([...])
        # form raised ValueError when the reference list itself was empty).
        if len(references) == 0 or any(len(ref) == 0 for ref in references):
            continue

        # An empty hypothesis contributes nothing to precision.
        if len(hypothesis) > 0:
            p_i = modified_precision(references, hypothesis, unigram)
            metrics["tc"] += p_i.numerator
            metrics["tp"] += p_i.denominator

        # count_match is defined outside this cell; it is unpacked here as
        # (matches, tp, total, word_level_details).
        # NOTE(review): each reference is now scored once; this assumes
        # count_match has no side effects -- confirm.
        best_recall = None
        best_match = 0
        best_total = 0
        best_details = None
        for curr_ref in references:
            curr_match, _, curr_total, curr_details = count_match(curr_ref, hypothesis)
            curr_recall = curr_match / curr_total if curr_total > 0 else 0
            # Strict ">" keeps the earliest reference on ties.
            if best_recall is None or curr_recall > best_recall:
                best_recall = curr_recall
                best_match = curr_match
                best_total = curr_total
                best_details = curr_details

        metrics["rc"] += best_match
        metrics["rt"] += best_total
        for key in ("t", "tp", "tc"):
            for w in best_details[key]:
                if w not in word_stats:
                    word_stats[w] = {"t": 0, "tp": 0, "tc": 0}
                word_stats[w][key] += best_details[key][w]

    # Guarded divisions: with no scored utterance both values are 0 instead
    # of the IndexError the list-based version raised on prec[0].
    prec = (metrics["tc"] / metrics["tp"]) * 100 if metrics["tp"] > 0 else 0
    rec = (metrics["rc"] / metrics["rt"]) * 100 if metrics["rt"] > 0 else 0

    if display:
        print("{0:10s} | {1:>8s}".format("metric", "1-gram"))
        print("-"*54)
        print("{0:10s} | {1:8.2f}".format("precision", prec))
        print("{0:10s} | {1:8.2f}".format("recall", rec))

    return prec, rec, metrics

In [7]:
def get_model_data(nmt_path, use_google=False):
    """Load references and hypotheses for one translation system.

    Args:
        nmt_path: directory holding ``model_s2t_refs.dict``,
            ``model_s2t_hyps.dict`` and ``model_s2t_refs_for_eval.dict``.
            Ignored when ``use_google`` is True.
        use_google: when True, take the data from ``get_google_data()``
            (defined outside this cell) using its ``fisher_dev_r0`` /
            ``fisher_dev_ref_0`` entries.

    Returns:
        ``(nmt_refs, nmt_hyps, nmt_4refs)``.
    """
    if use_google:
        google_s2t_hyps, google_s2t_refs, nmt_4refs = get_google_data()
        nmt_hyps = google_s2t_hyps['fisher_dev_r0']
        nmt_refs = google_s2t_refs['fisher_dev_ref_0']
        return nmt_refs, nmt_hyps, nmt_4refs

    def _load(file_name):
        # Close the handle deterministically instead of leaking it.
        # NOTE: pickle can execute arbitrary code -- only load trusted files.
        with open(os.path.join(nmt_path, file_name), "rb") as handle:
            return pickle.load(handle)

    nmt_refs = _load("model_s2t_refs.dict")
    nmt_hyps = _load("model_s2t_hyps.dict")
    nmt_4refs = _load("model_s2t_refs_for_eval.dict")

    return nmt_refs, nmt_hyps, nmt_4refs

In [8]:
def eval_nmt_model(nmt_path, use_google=False, min_len=10):
    """Print bag-of-words and full-translation scores for one system.

    Reports, in order: BOW precision/recall against the single reference,
    BOW precision/recall against all references, unigram precision/recall
    and smoothed corpus BLEU on the unfiltered text, and finally the
    length-filtered scores from ``check_bleu_with_len_filter``.

    Reads the notebook-level ``bow_dict`` (it is not a parameter), so the
    cell that defines it must have run first.

    Args:
        nmt_path: model output directory, forwarded to ``get_model_data``.
        use_google: forwarded to ``get_model_data``.
        min_len: minimum reference length for the length-filtered scores.

    Returns:
        ``(metrics_1_bow, metrics_bow)``: the metrics dicts of the
        single-reference and all-reference BOW evaluations.
    """
    smooth_fun = nltk.translate.bleu_score.SmoothingFunction()

    nmt_refs, nmt_hyps, nmt_4refs = get_model_data(nmt_path, use_google=use_google)

    # Fix one utterance order and build every sequence from it. The earlier
    # code zipped nmt_4refs.values() with nmt_hyps.values(), which pairs the
    # right utterances only if both dicts share the same insertion order.
    dev_utt_ids = list(nmt_hyps.keys())

    nmt_preds_bow = [list(get_words_in_bow_vocab(nmt_hyps[u], bow_dict))
                     for u in dev_utt_ids]
    nmt_1_ref = [[list(get_words_in_bow_vocab(nmt_refs[u], bow_dict))]
                 for u in dev_utt_ids]
    nmt_refs_bow = [[list(get_words_in_bow_vocab(ref, bow_dict)) for ref in nmt_4refs[u]]
                    for u in dev_utt_ids]

    def _report_bow(title, precision, recall, bow_metrics):
        # Print one BOW result section plus the per-word retrieval summary.
        print("-"*80)
        print(title)
        print("-"*20)
        print("precision={0:.2f}, recall={1:.2f}".format(precision, recall))
        num_1correct = sum(1 for stats in bow_metrics["word"].values() if stats['tc'] > 0)
        num_all = sum(1 for stats in bow_metrics["word"].values() if stats['t'] > 0)
        print("-"*20)
        print("{0:d} out of {1:d} retrieved with atleast 1 correct instance".format(num_1correct, num_all))

    p_bow, r_bow, metrics_1_bow = bow_basic_precision_recall(nmt_1_ref, nmt_preds_bow)
    _report_bow("BOW - using 1 reference", p_bow, r_bow, metrics_1_bow)

    p_bow, r_bow, metrics_bow = bow_basic_precision_recall(nmt_refs_bow, nmt_preds_bow)
    _report_bow("BOW - using all 4 references", p_bow, r_bow, metrics_bow)

    # Full translation scores (not restricted to the BOW vocabulary).
    all_refs = [nmt_4refs[u] for u in dev_utt_ids]
    all_hyps = [nmt_hyps[u] for u in dev_utt_ids]

    p_nmt, r_nmt, _ = nmt_basic_precision_recall(all_refs, all_hyps)
    print("-"*80)
    print("MT task - using all 4 references")
    print("-"*80)
    print("precision={0:.2f}, recall={1:.2f}".format(p_nmt, r_nmt))

    nmt_bleu = corpus_bleu(all_refs,
                           all_hyps,
                           smoothing_function=smooth_fun.method2)

    print("-"*80)
    # ".2f" (two decimals) replaces "2f", which meant width 2 and printed
    # six decimals, unlike every other score in this report.
    print("4 references bleu={0:.2f}".format(nmt_bleu*100))

    print("-"*80)
    print("using min len filter")
    print("-"*20)
    check_bleu_with_len_filter(nmt_4refs, nmt_hyps, min_len=min_len)

    return metrics_1_bow, metrics_bow

In [23]:
def eval_prec_recall_for_words(nmt_path, bow_dict, use_google=False):
    """Print BOW precision/recall restricted to one evaluation word list.

    Hypotheses and all references are filtered with
    ``get_words_in_bow_vocab(..., bow_dict)`` and scored with
    ``bow_basic_precision_recall``. The report also lists the words that
    occur in the common reference set of at least one utterance and the
    five most frequent of them.

    Args:
        nmt_path: model output directory, forwarded to ``get_model_data``.
        bow_dict: vocabulary object accepted by ``get_words_in_bow_vocab``.
        use_google: forwarded to ``get_model_data``.

    Returns:
        The metrics dict of the all-reference BOW evaluation.
    """
    # The single-reference evaluation that used to run here was computed and
    # discarded (it prints nothing), so it and its inputs are gone.
    _, nmt_hyps, nmt_4refs = get_model_data(nmt_path, use_google=use_google)

    dev_utt_ids = list(nmt_hyps.keys())
    nmt_preds_bow = [list(get_words_in_bow_vocab(nmt_hyps[u], bow_dict))
                     for u in dev_utt_ids]
    nmt_refs_bow = [[list(get_words_in_bow_vocab(ref, bow_dict)) for ref in nmt_4refs[u]]
                    for u in dev_utt_ids]

    print("-"*80)
    print("-"*20)
    p_bow, r_bow, metrics_bow = bow_basic_precision_recall(nmt_refs_bow, nmt_preds_bow)

    print("-"*80)
    print("BOW - using all 4 references")
    print("-"*20)
    print("precision={0:.2f}, recall={1:.2f}".format(p_bow, r_bow))

    word_stats = metrics_bow["word"]
    num_1correct = sum(1 for stats in word_stats.values() if stats['tc'] > 0)

    words_present = [word for word, stats in word_stats.items() if stats['t'] > 0]
    num_all = len(words_present)
    print("-"*80)
    print("Using word list: \n{0:s}".format(" -- ".join(words_present)))
    print("number of words: {0:d}".format(num_all))
    # Stable sort: words with equal counts keep their first-seen order.
    top_five = sorted(words_present, key=lambda word: word_stats[word]['t'], reverse=True)[:5]
    print("Top 5 words present: \n{0:s}".format(" -- ".join(top_five)))

    print("-"*20)
    print("{0:d} out of {1:d} retrieved with atleast 1 correct instance".format(num_1correct, num_all))

    return metrics_bow

In [10]:
# "crisis": os.path.join(m_cfg['data_path'], 
#                                                     "bow_crises_vocab.dict")

In [31]:
def eval_all_word_lists(nmt_path, use_google=False):
    """Run ``eval_prec_recall_for_words`` for each evaluation word list.

    The word-list pickles are looked up under ``m_cfg['data_path']``, where
    ``m_cfg`` is a notebook-level global that must already be defined.

    Args:
        nmt_path: model output directory, forwarded unchanged.
        use_google: forwarded unchanged.

    Returns:
        Dict mapping the list name (``en_freq``, ``en_rare``, ``crisis``)
        to the metrics dict returned for that list.
    """
    metrics = {}
    eval_word_lists = {"en_freq": os.path.join(m_cfg['data_path'], "eval_en_freq_vocab.dict"),
                       "en_rare": os.path.join(m_cfg['data_path'], "eval_en_rare_vocab.dict"),
#                        "en_es_common": os.path.join(m_cfg['data_path'],
#                                                     "eval_en_es_common_vocab.dict"),
                       "crisis": os.path.join(m_cfg['data_path'],
                                              "eval_en_crisis_vocab.dict")}
    for key, word_list in eval_word_lists.items():
        # Close each file deterministically instead of leaking the handle.
        # NOTE: pickle can execute arbitrary code -- only load trusted files.
        with open(word_list, "rb") as word_list_file:
            words = pickle.load(word_list_file)
        metrics[key] = eval_prec_recall_for_words(nmt_path, words, use_google=use_google)
    return metrics

In [12]:
def check_bleu_with_len_filter(refs, hyps, min_len):
    """Print BLEU and unigram precision/recall for sufficiently long utterances.

    An utterance is kept when its shortest reference has at least
    ``min_len`` tokens. Utterances with no references are never kept.
    Uses the notebook-level ``smooth_fun`` for BLEU smoothing.

    Args:
        refs: dict mapping utterance id to a list of references.
        hyps: dict mapping utterance id to a hypothesis; must contain every
            selected id.
        min_len: minimum length of the shortest reference.

    Returns:
        None. Results are printed.
    """
    sel_refs, sel_hyps = [], []
    for u, utt_refs in refs.items():
        # min() over an empty list raises ValueError, so skip such entries.
        if len(utt_refs) == 0:
            continue
        if min(len(ref) for ref in utt_refs) >= min_len:
            sel_refs.append(utt_refs)
            sel_hyps.append(hyps[u])
    print("{0:d} out of {1:d} have len >= {2:d}".format(len(sel_refs), len(refs), min_len))

    if not sel_refs:
        # Nothing to score; report it instead of scoring an empty corpus.
        print("no utterances selected, skipping BLEU and precision/recall")
        return

    bleu_score = corpus_bleu(sel_refs, sel_hyps, smoothing_function=smooth_fun.method2)*100
    print("BLEU={0:.2f}".format(bleu_score))
    sel_p, sel_r, _ = nmt_basic_precision_recall(sel_refs, sel_hyps)
    print("precision={0:.2f}, recall={1:.2f}".format(sel_p, sel_r))

## Edin model

In [13]:
# Relative experiment directory; passed to check_model in the next cell.
cfg_path = "sp2bagwords/sp_0.50_trial-A/"

In [14]:
# check_model is not defined in the visible cells (presumably it comes from
# bow_run / utils.ipynb). m_cfg and t_cfg are indexed like dicts by later cells.
# NOTE(review): the recorded output of this cell ends with "model not found" --
# confirm trained weights were loaded before trusting downstream numbers.
last_epoch, model, optimizer, m_cfg, t_cfg = check_model(cfg_path)

cnn_out_dim = rnn_in_units =  640




using ADAM optimizer
--------------------------------------------------------------------------------
model not found


In [15]:
%%capture
train_key = m_cfg['train_set']
dev_key = m_cfg['dev_set']
batch_size=t_cfg['batch_size']
enc_key=m_cfg['enc_key']
dec_key=m_cfg['dec_key']
input_path = os.path.join(m_cfg['data_path'], m_cfg['dev_set'])
# -------------------------------------------------------------------------
# get data dictionaries
# -------------------------------------------------------------------------
map_dict, vocab_dict, bucket_dict, bow_dict = get_data_dicts(m_cfg)
if os.path.exists(os.path.join(m_cfg['model_dir'], "model_s2t_dev_out.dict")):
    dev_utts = pickle.load(open(os.path.join(m_cfg['model_dir'], "model_s2t_dev_out.dict"), "rb"))

if os.path.exists(os.path.join(m_cfg['model_dir'], "model_s2t_train_out.dict")):
    train_utts = pickle.load(open(os.path.join(m_cfg['model_dir'], "model_s2t_train_out.dict"), "rb"))
# batch_size = {'max': 128, 'med': 128, 'min': 128, 'scale': 1}
batch_size = {'max': 64, 'med': 64, 'min': 64, 'scale': 1}
batch_size = t_cfg['batch_size']

edin_s2t_refs_for_eval_path = os.path.join("../chainer2/speech2text/both_fbank_out/", 
                                           "edin_s2t_refs_for_eval.dict")
edin_s2t_refs_for_eval = pickle.load(open(edin_s2t_refs_for_eval_path, "rb"))
single_dev_ref = [[i[0]] for i in dev_utts["refs"]]

In [16]:
# Inspect which fields a single dev-utterance entry of map_dict carries.
map_dict["fisher_dev"]['20051009_182032_217_fsp-B-1'].keys()

dict_keys(['es_w', 'es_c', 'en_w', 'en_c', 'seg'])

In [None]:
# Run feed_model (defined outside this notebook's visible cells) over the
# training split with train=False and get_probs=True; later cells read the
# "probs" and "refs" entries of train_utts.
# NOTE(review): presumably train=False means a forward-only pass -- confirm
# against feed_model.
input_path = os.path.join(m_cfg['data_path'],
                                      m_cfg['train_set'])
train_utts, train_loss = feed_model(model,
                              optimizer=optimizer,
                              m_dict=map_dict[train_key],
                              b_dict=bucket_dict[train_key],
                              vocab_dict=vocab_dict,
                              bow_dict=bow_dict,
                              batch_size=batch_size,
                              x_key=enc_key,
                              y_key=dec_key,
                              train=False,
                              input_path=input_path,
                              max_dec=m_cfg['max_en_pred'],
                              t_cfg=t_cfg,
                              use_y=True,
                              get_probs=True)

In [None]:
# pickle.dump(train_utts, open(os.path.join(m_cfg['model_dir'], "model_s2t_train_out.dict"), "wb"))

In [None]:
# Per-word mean score on the training outputs: mean_pos_scores averages over
# utterances whose first reference contains the word index, mean_neg_scores
# over all remaining utterances.
vocab_size = len(bow_dict["i2w"])
mean_pos_scores = np.zeros(vocab_size, dtype="f")
mean_neg_scores = np.zeros(vocab_size, dtype="f")

# Build each first-reference index set once instead of scanning the raw
# reference sequence twice for every vocabulary entry.
train_first_ref_sets = [set(r[0]) for r in train_utts["refs"]]

# Indices 0-3 are skipped and stay at 0.0 (presumably reserved/special
# entries -- TODO confirm).
for i_w in range(4, vocab_size):
    pos_indx = [i_w in ref_ids for ref_ids in train_first_ref_sets]
    # The negative mask is exactly the complement of the positive one.
    neg_indx = [not is_pos for is_pos in pos_indx]
    word_probs = train_utts["probs"][:, i_w]
    # NOTE: np.mean over an empty selection yields nan (with a warning), e.g.
    # for a word that never occurs in any first reference.
    mean_pos_scores[i_w] = np.mean(word_probs[pos_indx])
    mean_neg_scores[i_w] = np.mean(word_probs[neg_indx])

In [None]:
# Overall averages of the per-word means.
# NOTE(review): both vectors are averaged in full, including indices 0-3, which
# the cell that fills them leaves at 0.0 -- confirm this is intended.
xp.mean(mean_pos_scores), xp.mean(mean_neg_scores)

In [None]:
# Average precision on the training outputs, sweeping 5 thresholds in [0, 1].
# NOTE(review): compute_avg_precision is defined in a later cell of this
# notebook; on a fresh kernel that definition (or one from utils.ipynb, if any)
# must exist before this cell runs.
train_avg_p, _ = compute_avg_precision(train_utts["probs"],
                                                     0.0, 1.0, 5,
                                                     m_cfg['max_en_pred'],
                                                     train_utts["refs"])
train_avg_p

In [None]:
# Precision/recall of the training predictions at a fixed threshold.
# NOTE(review): THRESH is read from the config but never used in this cell --
# the literal 0.5 is what is passed below. Confirm which one is intended.
THRESH = m_cfg["pred_thresh"]
train_pred_words = get_pred_words_from_probs(train_utts["probs"],
#                                              mean_pos_scores,
                                               0.5,
                                               m_cfg['max_en_pred'])

train_prec, train_rec, _ = basic_precision_recall(train_utts["refs"], train_pred_words)
train_prec, train_rec

In [None]:
# Same feed_model call as for the training split, but over the dev split
# (dev_key dictionaries, dev input path). Rebinds dev_utts, replacing any
# cached copy loaded earlier.
input_path = os.path.join(m_cfg['data_path'], m_cfg['dev_set'])

dev_utts, dev_loss = feed_model(model,
                                optimizer=optimizer,
                                m_dict=map_dict[dev_key],
                                b_dict=bucket_dict[dev_key],
                                vocab_dict=vocab_dict,
                                bow_dict=bow_dict,
                                batch_size=batch_size,
                                x_key=enc_key,
                                y_key=dec_key,
                                train=False,
                                input_path=input_path,
                                max_dec=m_cfg['max_en_pred'],
                                t_cfg=t_cfg,
                                use_y=True,
                                get_probs=True)

In [None]:
# Keep only the first reference of every dev utterance, still wrapped in a
# list so it has the same nesting as the multi-reference structure.
single_dev_ref = [[i[0]] for i in dev_utts["refs"]]

In [None]:
# Smallest and largest score in the dev output matrix.
np.min(dev_utts["probs"]), np.max(dev_utts["probs"])

In [None]:
# Per-word mean score on the dev outputs: mean_dev_pos_scores averages over
# utterances whose first reference contains the word index,
# mean_dev_neg_scores over all remaining utterances.
vocab_size = len(bow_dict["i2w"])
mean_dev_pos_scores = np.zeros(vocab_size, dtype="f")
mean_dev_neg_scores = np.zeros(vocab_size, dtype="f")

# Build each first-reference index set once instead of scanning the raw
# reference sequence twice for every vocabulary entry.
dev_first_ref_sets = [set(r[0]) for r in dev_utts["refs"]]

# Indices 0-3 are skipped and stay at 0.0 (presumably reserved/special
# entries -- TODO confirm).
for i_w in range(4, vocab_size):
    pos_indx = [i_w in ref_ids for ref_ids in dev_first_ref_sets]
    # The negative mask is exactly the complement of the positive one.
    neg_indx = [not is_pos for is_pos in pos_indx]
    word_probs = dev_utts["probs"][:, i_w]
    # NOTE: np.mean over an empty selection yields nan (with a warning), e.g.
    # for a word that never occurs in any first reference.
    mean_dev_pos_scores[i_w] = np.mean(word_probs[pos_indx])
    mean_dev_neg_scores[i_w] = np.mean(word_probs[neg_indx])

In [None]:
# Dev precision/recall against all references, first with a fixed threshold of
# 0.2, then with the per-word mean positive dev scores as thresholds.
# NOTE(review): the two calls pass different third arguments
# (len(bow_dict['i2w']) vs m_cfg['max_en_pred']) -- confirm this is intended.
PRED_THRESH = 0.2
print("using prediction threshold={0:.2f}".format(PRED_THRESH))
dev_pred_words = get_pred_words_from_probs(dev_utts["probs"],
                                           PRED_THRESH,
                                           len(bow_dict['i2w']))
# "haha" receives the metrics value; it is not used again in this cell.
p, r, haha = basic_precision_recall(dev_utts["refs"], dev_pred_words)
print(p,r)
print("using mean positive prediction threshold")
dev_pred_words = get_pred_words_from_probs(dev_utts["probs"],
                                               mean_dev_pos_scores,
                                               m_cfg['max_en_pred'])
p, r, _ = basic_precision_recall(dev_utts["refs"], dev_pred_words)
print(p,r)

In [None]:
# Same comparison as the previous cell, but scored against the first reference
# only (single_dev_ref) and with a fixed threshold of 0.1.
PRED_THRESH = 0.1
print("using prediction threshold={0:.2f}".format(PRED_THRESH))
dev_pred_words = get_pred_words_from_probs(dev_utts["probs"],
                                           PRED_THRESH,
                                           len(bow_dict['i2w']))
p, r, _ = basic_precision_recall(single_dev_ref, dev_pred_words)
print(p,r)
print("using mean positive prediction threshold")
dev_pred_words = get_pred_words_from_probs(dev_utts["probs"],
                                               mean_dev_pos_scores,
                                               m_cfg['max_en_pred'])
p, r, _ = basic_precision_recall(single_dev_ref, dev_pred_words)
print(p,r)

In [None]:
# Score range of the dev outputs as plain Python floats; referenced by the
# commented-out threshold sweeps further down.
min_prob, max_prob = float(xp.min(dev_utts["probs"])), float(xp.max(dev_utts["probs"]))
min_prob, max_prob

### Precision-Recall Plot - word level threshold

In [None]:
# Scratch check: preview a grid of threshold offsets.
np.arange(-0.5, 0.5, 0.1)

In [None]:
# Scratch check: the first ten computed mean positive scores scaled by 1.3.
mean_pos_scores[4:14]*1.3

In [None]:
# Multiplicative factors; only referenced by a commented-out loop header in the
# next cell.
thresh_deltas = np.asarray([0.7,0.8,0.9,1,1.1,1.2,1.3], dtype="f")

In [None]:
# Sweep an additive offset on the per-word thresholds and record dev
# precision ('p') and recall ('r') for each offset in p_r_thresh.
# NOTE(review): the thresholds are mean_pos_scores (computed on the training
# outputs), not mean_dev_pos_scores -- confirm this is intended.
p_r_thresh = {}
thresh_delta = 0.05
for thresh in tqdm(np.arange(-0.5, 0.5+thresh_delta, thresh_delta)):
# for thresh in tqdm(thresh_deltas):
# for thresh in tqdm(np.linspace(min_prob, max_prob,num=20,endpoint=True)):
    p_r_thresh[thresh] = {}
    dev_pred_words_at_thresh = get_pred_words_from_probs(dev_utts["probs"],
                                                           mean_pos_scores + thresh,
                                                           len(bow_dict['i2w']))
    p_r_thresh[thresh]['p'], p_r_thresh[thresh]['r'], _ = basic_precision_recall(dev_utts["refs"], 
                                                                              dev_pred_words_at_thresh)
    

In [None]:
# Flatten p_r_thresh into parallel lists for plotting: precision, recall and
# the threshold formatted with two decimals, in dict iteration order.
thresh_labels, p_vals, r_vals = [], [], []
for l in p_r_thresh:
    p_vals.append(p_r_thresh[l]["p"])
    r_vals.append(p_r_thresh[l]["r"])
    thresh_labels.append("{0:.2f}".format(l))

In [None]:
# Precision-recall curve for the word-level threshold sweep; every point is
# annotated with the threshold label it was produced with.
fig, ax = plt.subplots(figsize=(10,8))
ax.plot(r_vals, p_vals, label="Precision/Recall")
for i,j,k in zip(r_vals, p_vals, thresh_labels):
    ax.annotate(k,xy=(i,j), fontsize=12)
ax.set_ylabel("Precision", fontsize=28)
ax.set_xlabel("Recall", fontsize=26)
# bbox_to_anchor x=1.3 places the legend to the right of the axes area.
ax.legend(loc='upper center', bbox_to_anchor=(1.3, 0.9),
                  ncol=1, fancybox=True, shadow=True, fontsize=26)

plt.yticks(rotation=0, size=18)
plt.xticks(rotation=0, size=18)

### Precision-Recall Plot - fixed threshold

In [None]:
# Compare the length of one row of dev scores with the size of bow_dict['i2w'].
len(dev_utts["probs"][0]), len(bow_dict['i2w'])

In [None]:
def compute_avg_precision(probs, min_prob, max_prob, num_points, max_words, refs):
    """Sweep a fixed threshold and integrate precision over recall.

    For ``num_points`` evenly spaced thresholds in ``[min_prob, max_prob]``
    (both ends included), words are predicted with
    ``get_pred_words_from_probs`` and scored with ``basic_precision_recall``.

    Args:
        probs: scores forwarded to ``get_pred_words_from_probs``.
        min_prob: lowest threshold of the sweep.
        max_prob: highest threshold of the sweep.
        num_points: number of thresholds.
        max_words: forwarded to ``get_pred_words_from_probs``.
        refs: references forwarded to ``basic_precision_recall``.

    Returns:
        ``(avg_p, p_r_thresh)``: the trapezoidal integral of precision over
        recall (both divided by 100 first), and a dict mapping each
        threshold to ``{'p': precision, 'r': recall}``.
    """
    thresholds = np.linspace(min_prob, max_prob, num=num_points, endpoint=True)

    p_r_thresh = {}
    for cutoff in tqdm(thresholds):
        predicted = get_pred_words_from_probs(probs, cutoff, max_words)
        precision, recall, _ = basic_precision_recall(refs, predicted)
        p_r_thresh[cutoff] = {'p': precision, 'r': recall}

    sweep = list(p_r_thresh.values())
    precision_array = np.array([point['p']/100 for point in sweep], dtype="f")
    recall_array = np.array([point['r']/100 for point in sweep], dtype="f")

    # Integrate from the highest threshold down to the lowest.
    avg_p = np.trapz(np.flip(precision_array), np.flip(recall_array))
    return avg_p, p_r_thresh
    

In [None]:
# Average precision on the dev outputs over 50 thresholds in [0, 1]; also
# rebinds p_r_thresh, which the plotting cells below read.
# NOTE(review): max_words is the literal 104 here, while other cells pass
# len(bow_dict['i2w']) or m_cfg['max_en_pred'] -- confirm the value.
avg_p, p_r_thresh = compute_avg_precision(dev_utts["probs"], 0.0, 1.0, 50, 104, dev_utts["refs"])

In [None]:
# Show the dev average precision.
avg_p

In [None]:
# p_r_thresh = {}
# thresh_delta = 0.01
# # for thresh in tqdm(np.arange(min_prob, max_prob+thresh_delta,thresh_delta)):
# for thresh in tqdm(np.linspace(min_prob, max_prob, num=30,endpoint=True)):
#     p_r_thresh[thresh] = {}
#     dev_pred_words_at_thresh = get_pred_words_from_probs(dev_utts["probs"],
#                                                            thresh,
#                                                            len(bow_dict['i2w']))
#     p_r_thresh[thresh]['p'], p_r_thresh[thresh]['r'], _ = basic_precision_recall(dev_utts["refs"], 
#                                                                               dev_pred_words_at_thresh)
    

In [None]:
# Flatten the fixed-threshold p_r_thresh into parallel lists for plotting
# (same code as the word-level section, run on the rebinding of p_r_thresh).
thresh_labels, p_vals, r_vals = [], [], []
for l in p_r_thresh:
    p_vals.append(p_r_thresh[l]["p"])
    r_vals.append(p_r_thresh[l]["r"])
    thresh_labels.append("{0:.2f}".format(l))

In [None]:
# Precision-recall curve for the fixed-threshold sweep; every point is
# annotated with the threshold label it was produced with.
fig, ax = plt.subplots(figsize=(10,8))
ax.plot(r_vals, p_vals, label="Precision/Recall")
for i,j,k in zip(r_vals, p_vals, thresh_labels):
    ax.annotate(k,xy=(i,j), fontsize=12)
ax.set_ylabel("Precision", fontsize=28)
ax.set_xlabel("Recall", fontsize=26)
# bbox_to_anchor x=1.3 places the legend to the right of the axes area.
ax.legend(loc='upper center', bbox_to_anchor=(1.3, 0.9),
                  ncol=1, fancybox=True, shadow=True, fontsize=26)

plt.yticks(rotation=0, size=18)
plt.xticks(rotation=0, size=18)

In [None]:
# Dev precision/recall at a fixed threshold of 0.15, scored against the first
# reference only. Rebinds dev_pred_words.
PRED_THRESH = 0.15
print("using prediction threshold={0:.2f}".format(PRED_THRESH))
dev_pred_words = get_pred_words_from_probs(dev_utts["probs"],
                                           PRED_THRESH,
                                           len(bow_dict['i2w']))
p, r, _ = basic_precision_recall(single_dev_ref, dev_pred_words)
print(p,r)
# print("using mean positive prediction threshold")
# dev_mean_pred_words = get_pred_words_from_probs(dev_utts["probs"],
#                                            mean_pos_scores,
#                                            m_cfg['max_en_pred'])
# p, r, _ = basic_precision_recall(single_dev_ref, dev_mean_pred_words)
# print(p,r)

In [None]:
# Dev precision/recall at a fixed threshold of 0.01, scored against all
# references. Rebinds dev_pred_words, which the "preds and refs in words"
# cells below read.
PRED_THRESH = 0.01
print("using prediction threshold={0:.2f}".format(PRED_THRESH))
dev_pred_words = get_pred_words_from_probs(dev_utts["probs"],
                                           PRED_THRESH,
                                           len(bow_dict['i2w']))
p, r, _ = basic_precision_recall(dev_utts["refs"], dev_pred_words)
print(p,r)
# print("using mean positive prediction threshold")
# dev_mean_pred_words = get_pred_words_from_probs(dev_utts["probs"],
#                                            mean_pos_scores,
#                                            m_cfg['max_en_pred'])
# p, r, _ = basic_precision_recall(single_dev_ref, dev_mean_pred_words)
# print(p,r)

In [None]:
# Rebuild the precision/recall vectors (as fractions, not percentages) from
# p_r_thresh, mirroring what compute_avg_precision does internally.
precision_array = np.array([p_r_thresh[i]['p']/100 for i in p_r_thresh], dtype="f")
recall_array = np.array([p_r_thresh[i]['r']/100 for i in p_r_thresh], dtype="f")

In [None]:
# Precision values in reversed sweep order, as fed to the integration below.
precision_array[::-1]

In [None]:
# Trapezoidal area under the precision-recall points; the same expression
# compute_avg_precision returns as avg_p.
np.trapz(precision_array[::-1], recall_array[::-1])

### get preds and refs in words

In [None]:
# Peek at the first ten (predicted words, references) pairs of the training set.
list(zip(train_pred_words, train_utts["refs"]))[:10]

In [None]:
# Map predicted and reference word indices back to strings, keyed by utterance
# id. Entries of bow_dict['i2w'] are decoded with .decode(), so they are
# expected to be bytes. Duplicates are removed via set() on both sides.
# NOTE: this loop rebinds the notebook-level names p and r, which earlier
# cells used for precision and recall.
dev_utt_preds_words = {}
dev_utt_refs_words = {}
for u, p, refs in zip(dev_utts['ids'], dev_pred_words, dev_utts["refs"]):
    dev_utt_preds_words[u] = list(set([bow_dict['i2w'][i].decode() for i in p]))
    dev_utt_refs_words[u] = []
    for r in refs:
        #print(r)
        dev_utt_refs_words[u].append([bow_dict['i2w'][i].decode() for i in set(r)])
# First reference only, still wrapped in a list.
single_dev_ref_words = {u: [dev_utt_refs_words[u][0]] for u in dev_utt_refs_words}

In [None]:
# Precision/recall on the word-string dictionaries, using all references.
# basic_precision_recall is not defined in the visible cells (presumably it
# comes from bow_run / utils.ipynb).
p, r, metric = basic_precision_recall(list(dev_utt_refs_words.values()), list(dev_utt_preds_words.values()))
p, r

In [None]:
# Same evaluation against the first reference only.
ps, rs, _ = basic_precision_recall(single_dev_ref_words.values(), dev_utt_preds_words.values())
ps, rs

In [None]:
# Show the four aggregate counters of the all-reference evaluation.
[(k, metric[k]) for k in ['rc', 'rt', 'tp', 'tc']]

In [None]:
# Words whose 'tc' counter is positive, i.e. predicted correctly at least once.
words_correctly_predicted = [item for item in metric["word"].items() if item[1]['tc'] > 0]
print(len(words_correctly_predicted))
display(words_correctly_predicted)

In [None]:
# most common train words: sort bow_dict['freq'] by count (descending), decode
# the keys and keep the first ten.
[w.decode() for w, f in sorted(bow_dict['freq'].items(), reverse=True, key=lambda t: t[1])][:10]

In [None]:
# Peek at the first five single-reference entries.
list(single_dev_ref_words.items())[:5]

In [None]:
# Show references next to predictions with display_num=100.
# display_bow_words is not defined in the visible cells (presumably it comes
# from bow_run / utils.ipynb).
display_bow_words(single_dev_ref_words, 
                  dev_utt_preds_words, 
                  bow_dict, 
                  map_dict["fisher_dev"], display_num=100)

## Google model

In [17]:
# Full report for the Google outputs; nmt_path is unused when use_google=True,
# and min_len=1 keeps every utterance whose shortest reference is non-empty.
_ = eval_nmt_model("", use_google=True, min_len=1)

eval refs found, loading
--------------------------------------------------------------------------------
BOW - using 1 reference
--------------------
precision=84.85, recall=69.54
--------------------
39 out of 40 retrieved with atleast 1 correct instance
--------------------------------------------------------------------------------
BOW - using all 4 references
--------------------
precision=92.82, recall=87.38
--------------------
36 out of 39 retrieved with atleast 1 correct instance
--------------------------------------------------------------------------------
MT task - using all 4 references
--------------------------------------------------------------------------------
precision=76.60, recall=67.74
--------------------------------------------------------------------------------
4 references bleu=45.204494
--------------------------------------------------------------------------------
using min len filter
--------------------
3977 out of 3979 have len >= 1
BLEU=45.21
precisi

In [32]:
metrics = eval_all_word_lists("", use_google=True)

eval refs found, loading
--------------------------------------------------------------------------------
--------------------
--------------------------------------------------------------------------------
BOW - using all 4 references
--------------------
precision=90.70, recall=83.11
--------------------------------------------------------------------------------
Using word list: 
always -- chicago -- colombia -- beautiful -- going -- thinking -- close -- topic -- coming -- nothing -- united -- states -- married -- example -- things -- spanish -- school -- seeing -- brothers -- stayed -- still -- living -- brother -- father -- college -- hours -- problems -- thing -- husband -- friends -- speak -- general -- everything -- happened -- never -- child -- whatever -- asked -- understand -- leave -- matter -- different -- wanted -- without -- mexico -- lives -- study -- usually -- email -- terrible -- classes -- class -- studying -- getting -- hello -- interested -- spend -- reason -- st

## Edin 150 hours model

In [None]:
sim_dict_path = os.path.join(m_cfg['data_path'], "mix_sim.dict")
# Load the pickled dictionary through a context manager so the file handle is
# closed as soon as loading finishes (a bare open() inside the call leaks it).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(sim_dict_path, "rb") as sim_dict_f:
    sim_dict = pickle.load(sim_dict_f)

In [None]:
len(bow_dict_es['freq_dev'])

In [None]:
# for w in sim_dict["w"]:
#     if len(sim_dict["w"][w]) > 1 and w in bow_dict_es["w2i"]:
#         print(w)
#         print(sim_dict["w"][w])
#         print(bow_dict_es["w2i"][w])

In [None]:
_ = eval_nmt_model("sp2enw_hyp_search/sp_1.0_l2e-4_rnn-3_drpt-0.5_cnn_96-2-2", min_len=1)

In [33]:
metrics = eval_all_word_lists("sp2enw_hyp_search/sp_1.0_l2e-4_rnn-3_drpt-0.5_cnn_96-2-2", 
                              use_google=False)

--------------------------------------------------------------------------------
--------------------
--------------------------------------------------------------------------------
BOW - using all 4 references
--------------------
precision=82.44, recall=65.59
--------------------------------------------------------------------------------
Using word list: 
spanish -- school -- going -- chicago -- brother -- college -- colombia -- friends -- everything -- things -- always -- beautiful -- thinking -- nothing -- married -- united -- states -- example -- thing -- brothers -- stayed -- still -- husband -- coming -- living -- father -- hours -- problems -- wanted -- topic -- seeing -- never -- asked -- matter -- understand -- different -- country -- happened -- child -- whatever -- leave -- words -- lives -- places -- mexico -- study -- close -- getting -- spend -- hello -- email -- speak -- class -- classes -- terrible -- general -- without -- studying -- spain -- phone -- interested -- 

### BLEU script
```
[bonnybridge]s1444673: export BLEU_SCRIPT=/afs/inf.ed.ac.uk/group/project/lowres/work/installs/mosesdecoder/scripts/generic/multi-bleu.perl
[bonnybridge]s1444673: export PREDS=sp2enw_hyp_search/sp_1.0_l2e-4_rnn-3_drpt-0.5_cnn_96-2-2
[bonnybridge]s1444673: perl $BLEU_SCRIPT $PREDS/fsh_dev_fisher_dev_en.ref* < $PREDS/fsh_dev_fisher_dev_hyp
BLEU = 29.44, 65.1/38.4/22.8/13.7 (BP=0.991, ratio=0.991, hyp_len=39719, ref_len=40096)
[bonnybridge]s1444673: perl $BLEU_SCRIPT $PREDS/fsh_test_fisher_test_en.ref* < $PREDS/fsh_test_fisher_test_hyp
BLEU = 29.64, 66.2/38.4/22.7/13.5 (BP=0.999, ratio=0.999, hyp_len=39201, ref_len=39257)
[bonnybridge]s1444673: perl $BLEU_SCRIPT $PREDS/fsh_dev_fisher_dev_en.ref[0,1,2]* < $PREDS/fsh_dev_fisher_dev_hyp

BLEU = 27.03, 62.7/35.8/20.7/12.1 (BP=0.987, ratio=0.987, hyp_len=39719, ref_len=40242)
[bonnybridge]s1444673: perl $BLEU_SCRIPT $PREDS/fsh_dev_fisher_dev_en.ref[1,2,3]* < $PREDS/fsh_dev_fisher_dev_hyp
BLEU = 27.03, 62.6/35.8/20.8/12.2 (BP=0.984, ratio=0.984, hyp_len=39719, ref_len=40353)
[bonnybridge]s1444673: perl $BLEU_SCRIPT $PREDS/fsh_dev_fisher_dev_en.ref[2,3,0]* < $PREDS/fsh_dev_fisher_dev_hyp
BLEU = 27.00, 62.8/35.9/20.8/12.1 (BP=0.984, ratio=0.984, hyp_len=39719, ref_len=40346)
[bonnybridge]s1444673: perl $BLEU_SCRIPT $PREDS/fsh_dev_fisher_dev_en.ref[1,3,0]* < $PREDS/fsh_dev_fisher_dev_hyp
BLEU = 27.10, 62.9/35.9/20.8/12.2 (BP=0.985, ratio=0.985, hyp_len=39719, ref_len=40339)
```

## Edin 50 hours model

In [None]:
old_metrics = eval_nmt_model("./sp2enw_mel-80_vocab-nltk/sp_0.33_h-256_e-128_l2e-3_lstm_drpt-0.3_cnn-32-2-2_rnn-3_b-40-50")

## Edin 25 hours model

In [None]:
_ = eval_nmt_model("./sp2enw_mel-80_vocab-nltk/sp_0.16_h-256_e-128_l2e-3_lstm_drpt-0.3_cnn-32-2-2_rnn-3_b-80-25_no-ln-bn")

## Edin 15 hours model

In [None]:
_ = eval_nmt_model("./sp2enw/sp_.10/")

## Edin 50 hours model - sample word embeddings

### seed: 0.33

In [None]:
_ = eval_nmt_model("sp2enw_mel-80_vocab-nltk/sp_0.33_h-300_e-128_l2e-6_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln")

In [None]:
_ = eval_nmt_model("sp2enw_mel-80_vocab-nltk/sp_0.33_h-300_e-128_l2e-6_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln_sample/")

### seed: AA

In [None]:
new50_metrics = eval_nmt_model("emb_sp2enw/sp_0.33_seed-AA")

In [None]:
new50_metrics = eval_nmt_model("emb_sp2enw/sp_0.33_seed-AA_mix-0.5")

## Edin 80 hours model - sample word embeddings

In [None]:
_ = eval_nmt_model("sp2enw_mel-80_vocab-nltk/sp_0.50_h-300_e-128_l2e-6_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln")

In [None]:
_ = eval_nmt_model("sp2enw_mel-80_vocab-nltk/sp_0.50_h-300_e-128_l2e-6_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln_sample/")

### Interspeech results

In [None]:
_ = eval_nmt_model("./sp2enw_interspeech/sp_0.33_seed-AB_l2e-4_drpt-0.5", min_len=1)

In [34]:
_ = eval_all_word_lists("./sp2enw_interspeech/sp_0.33_seed-AB_l2e-4_drpt-0.5")

--------------------------------------------------------------------------------
--------------------
--------------------------------------------------------------------------------
BOW - using all 4 references
--------------------
precision=62.70, recall=42.57
--------------------------------------------------------------------------------
Using word list: 
spanish -- school -- chicago -- brother -- phone -- college -- going -- married -- colombia -- friends -- everything -- things -- always -- thinking -- nothing -- united -- states -- example -- brothers -- still -- husband -- stayed -- coming -- lives -- hours -- problems -- country -- understand -- wanted -- topic -- interested -- thing -- seeing -- never -- matter -- different -- happened -- child -- living -- mexico -- study -- studying -- close -- hello -- email -- travel -- speak -- class -- classes -- terrible -- general -- without -- spain -- place -- words -- spend -- reason -- state -- parents -- neither -- whatever -- sl

In [None]:
model_50_da = eval_nmt_model("./sp2enw_interspeech/sp_0.33_seed-AB_l2e-4_drpt-0.5_sample-mix-0.5", 
                             min_len=1)

In [None]:
_ = eval_all_word_lists("./sp2enw_interspeech/sp_0.33_seed-AB_l2e-4_drpt-0.5_sample-mix-0.5")

In [None]:
_ = eval_nmt_model("./sp2enw_interspeech/sp_0.50_h-300_e-128_l2e-4_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln", min_len=1)

In [35]:
_ = eval_all_word_lists("./sp2enw_interspeech/sp_0.50_h-300_e-128_l2e-4_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln")

--------------------------------------------------------------------------------
--------------------
--------------------------------------------------------------------------------
BOW - using all 4 references
--------------------
precision=71.50, recall=52.60
--------------------------------------------------------------------------------
Using word list: 
spanish -- school -- chicago -- brother -- college -- colombia -- friends -- things -- thing -- always -- thinking -- nothing -- married -- united -- states -- example -- brothers -- going -- still -- stayed -- wanted -- coming -- living -- hours -- problems -- husband -- topic -- everything -- speak -- seeing -- never -- asked -- matter -- understand -- different -- happened -- child -- without -- phone -- lives -- places -- mexico -- study -- studying -- close -- spend -- hello -- email -- class -- classes -- words -- terrible -- general -- spain -- interested -- place -- spoke -- country -- reason -- state -- parents -- neither

In [None]:
model_80_da = eval_nmt_model("./sp2enw_interspeech/sp_0.50_h-300_e-128_l2e-4_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln_mix-0.5/", min_len=1)

In [None]:
_ = eval_all_word_lists("./sp2enw_interspeech/sp_0.50_h-300_e-128_l2e-4_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln_mix-0.5")

#### View translations

In [None]:
def clean_out_str(out_str):
    """Remove quoting artefacts from a sentence and trim it.

    Deletes, in this order, every backtick, every double quote, every
    inverted question mark and every pair of adjacent single quotes, then
    strips leading/trailing whitespace and returns the result.
    """
    # The order is significant: deleting a backtick or a double quote can put
    # two single quotes next to each other, which the last pattern removes.
    for unwanted in ("`", '"', '¿', "''"):
        out_str = out_str.replace(unwanted, "")
    return out_str.strip()
    

In [None]:
def _detokenize(words, use_gooogle):
    """Turn one token list into a single output line.

    When use_gooogle is true the tokens are joined with single spaces and
    returned unchanged. Otherwise tokens that start with an apostrophe, and
    the token "n't", are attached to the preceding token without a space,
    and the joined string is passed through clean_out_str.
    """
    if use_gooogle:
        return " ".join(words)
    pieces = [w if (w.startswith("'") or w == "n't") else " " + w for w in words]
    return clean_out_str("".join(pieces))


def write_to_file_len_filtered_preds(nmt_path, 
                                     set_key="fisher_dev", 
                                     min_len=0, max_len=300, 
                                     use_gooogle=False):
    """Write hypotheses and references of length-filtered utterances to text files.

    Utterances are kept when the length of their "es_w" entry in
    map_dict[set_key] lies in [min_len, max_len] (both inclusive). The kept
    utterances are written in sorted id order, one per line, to

    * <nmt_path>/hyps_min-<min_len>_max-<max_len>.en       (hypotheses)
    * <nmt_path>/ref_min-<min_len>_max-<max_len>.en<i>     (i-th reference)

    Args:
        nmt_path: model directory; passed to get_model_data and used as the
            output directory.
        set_key: key into map_dict selecting the data split.
        min_len: minimum "es_w" length to keep.
        max_len: maximum "es_w" length to keep.
        use_gooogle: forwarded to get_model_data as use_google; also selects
            plain space-joining instead of clitic-aware detokenization.
            (The parameter name is kept as-is for existing callers.)
    """
    # get_model_data returns three mappings keyed by utterance id:
    # single references, hypotheses and the list of all references.
    refs, hyps, allrefs = get_model_data(nmt_path, use_google=use_gooogle)
    filt_utts = sorted(
        u for u in refs
        if min_len <= len(map_dict[set_key][u]["es_w"]) <= max_len
    )
    print("Utts matching len filter={0:d}".format(len(filt_utts)))

    hyp_path = os.path.join(nmt_path, "hyps_min-{0:d}_max-{1:d}.en".format(min_len, max_len))
    print("writing hyps to: {0:s}".format(hyp_path))
    with open(hyp_path, "w") as out_f:
        for u in filt_utts:
            out_f.write("{0:s}\n".format(_detokenize(hyps[u], use_gooogle)))

    # The number of references is read from the first entry of allrefs; with
    # no entries there is nothing to write (previously this raised IndexError).
    num_refs = len(next(iter(allrefs.values()))) if allrefs else 0
    for i in range(num_refs):
        refs_path = os.path.join(nmt_path, "ref_min-{0:d}_max-{1:d}.en{2:d}".format(min_len, 
                                                                                    max_len,
                                                                                    i))
        print("writing ref {0:d} to: {1:s}".format(i, refs_path))
        with open(refs_path, "w") as out_f:
            for u in filt_utts:
                out_f.write("{0:s}\n".format(_detokenize(allrefs[u][i], use_gooogle)))
    print("all done")

In [None]:
# nmt_path = "google"
# write_to_file_len_filtered_preds(nmt_path, 
#                                  set_key="fisher_dev", 
#                                  min_len=MIN_LEN, max_len=MAX_LEN, 
#                                  use_gooogle=True)

# !paste -d"\n" google/ref_min-0_max-2.en* > google/all_ref_min-0_max-2_meteor
# # !paste -d"\n" google/ref_min-{$MIN_LEN}_max-{$MAX_LEN}.en* > google/all_ref_min-{$MIN_LEN}_max-{$MAX_LEN}_meteor
# # !paste -d"\n" $nmt_path/ref_min-{$MIN_LEN}_max-{$MAX_LEN}.en* > google/$meteor_out

In [None]:
MINLEN = 25
MAXLEN = 300

In [None]:
nmt_models = ["google",
              "sp2enw_hyp_search/sp_1.0_l2e-4_rnn-3_drpt-0.5_cnn_96-2-2",
              "sp2enw_hyp_search/sp_1.0_l2e-4_rnn-3_drpt-0.5_cnn_96-2-2_mix-0.5/",
              "./sp2enw_interspeech/sp_0.50_h-300_e-128_l2e-4_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln",
              "./sp2enw_interspeech/sp_0.50_h-300_e-128_l2e-4_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln_mix-0.5/",
              "./sp2enw_interspeech/sp_0.33_seed-AB_l2e-4_drpt-0.5",
              "./sp2enw_interspeech/sp_0.33_seed-AB_l2e-4_drpt-0.5_sample-mix-0.5"
              
             ]

In [None]:
# For every model: write the length-filtered hypothesis/reference files, then
# interleave the per-reference files line by line into a single file.
for nmt_path in nmt_models:
    # Only the "google" entry uses the Google code path
    write_to_file_len_filtered_preds(nmt_path, 
                                     set_key="fisher_dev", 
                                     min_len=MINLEN, max_len=MAXLEN, 
                                     use_gooogle = nmt_path == "google")
    print(nmt_path == "google")
    
    # meteor_in is a filename prefix; the shell glob below expands it to every
    # ref_min-..._max-....en<i> file written by the call above
    meteor_out = os.path.join(nmt_path, "meteor_4refs_min-{0:d}_max-{1:d}.en".format(MINLEN, MAXLEN))
    meteor_in = os.path.join(nmt_path, "ref_min-{0:d}_max-{1:d}.en".format(MINLEN, MAXLEN))
    
    # paste with a newline delimiter emits line k of each reference file in turn
    !paste -d"\n" $meteor_in* > $meteor_out

```
export PREDS=haha
perl $BLEU_SCRIPT $PREDS/ref_min-0_max-300.* < $PREDS/hyps_min-0_max-300.en

java -Xmx2G -jar ../installs/meteor-1.5/meteor-*.jar $PREDS/hyps_min-0_max-300.en $PREDS/meteor_4refs_min-0_max-300.en -r 4 -l en -norm
```

In [None]:
model_keys = ["google",
              "sp_160",
              "sp_160_sample",
              "sp_80",
              "sp_80_sample",
              "sp_50",
              "sp_50_sample"
             ]

In [None]:
model_map = {model_keys[i]: nmt_models[i] for i in range(len(model_keys))}

In [None]:
len_filts = [(0,2), (3,5), (6,20), (21,40), (41,300)]

In [None]:
model_data = {}

In [None]:
for m in model_map:
    model_data[m] = get_model_data(model_map[m], use_google= m == "google")

In [None]:
# Group the fisher_dev utterance ids by the length of their "es_w" entry:
# bucket k receives every utterance whose length falls inside len_filts[k]
# (both bounds inclusive).
fisher_dev_utts = map_dict["fisher_dev"]
u_bucks = {}
for bucket_id in range(len(len_filts)):
    u_bucks[bucket_id] = []
for utt_id in fisher_dev_utts:
    n_es_words = len(fisher_dev_utts[utt_id]["es_w"])
    for bucket_id, len_range in enumerate(len_filts):
        if len_range[0] <= n_es_words <= len_range[1]:
            u_bucks[bucket_id].append(utt_id)

In [None]:
[(i,len(v)) for i, v in u_bucks.items()]

In [None]:
# Draw at most 10 utterances from every length bucket, in bucket order.
# The generator is seeded first so that the draw starts from a fixed state.
random.seed("hmm")
selected_utts = []
for bucket_utts in u_bucks.values():
    n_pick = min(10, len(bucket_utts))
    selected_utts += random.sample(bucket_utts, n_pick)

In [None]:
len(selected_utts)

In [None]:
# NOTE(review): the only visible assignment of `utt` is in a later cell, and
# `m_dict` is not assigned in any visible cell — confirm both are defined
# before this cell when running the notebook top to bottom.
play_utt(utt, m_dict)

In [None]:
# Write, for every selected utterance, the Spanish reference, the English
# reference and each model's hypothesis to all_model_preds.txt.
with open("all_model_preds.txt", "w") as out_f:
    for u in selected_utts:
        out_f.write("------{0:s}------\n".format(u))
        es_words = " ".join([w.decode() for w in map_dict["fisher_dev"][u]["es_w"]])
        out_f.write("{0:20s} : {1:s}\n".format("es reference", es_words))
        out_f.write("{0:20s} : {1:s}\n".format("en reference", " ".join(model_data["google"][0][u])))
        for m in model_data:
            if m == "google":
                out_str =  " ".join(model_data[m][1][u])
            else:
                # Attach apostrophe tokens and "n't" to the preceding token
                out_str = ""
                for w in model_data[m][1][u]:
                    out_str += "{0:s}".format(w) if (w.startswith("'") or w=="n't") else " {0:s}".format(w)
            out_f.write("{0:20s} : {1:s}\n".format(m, out_str))
        # End the separator with a newline; without it the next utterance
        # header was appended to the same line as the separator.
        out_f.write("--------------\n")

In [None]:
wavs_path = os.path.join("../chainer2/speech2text/both_fbank_out/", "wavs")

STARTHERE

In [None]:
# For each selected utterance: play it, then print the Spanish reference, the
# English reference and every model's hypothesis as aligned "label : text" rows.
for u in selected_utts:
    print("------{0:s}------".format(u))
    play_utt(u, map_dict["fisher_dev"])
    es_tokens = map_dict["fisher_dev"][u]["es_w"]
    es_words = " ".join(tok.decode() for tok in es_tokens)
    print("{0:20s} : {1:s}".format("es reference", es_words))
    en_ref = " ".join(model_data["google"][0][u])
    print("{0:20s} : {1:s}".format("en reference", en_ref))
    for m in model_data:
        hyp_words = model_data[m][1][u]
        if m != "google":
            # Apostrophe tokens and "n't" attach to the previous token
            out_str = "".join(
                "{0:s}".format(w) if (w.startswith("'") or w == "n't") else " {0:s}".format(w)
                for w in hyp_words
            )
        else:
            out_str = " ".join(hyp_words)
        print("{0:20s} : {1:s}".format(m, out_str))
    print("--------------")

In [None]:
!paste -d"\n" $nmt_path/ref_min-0_max-2.en* > $nmt_path/all_ref_min-0_max-2_meteor

In [None]:
en_data = {}

In [None]:
# Number of references stored for the first utterance of the google entry.
# NOTE(review): in cell order en_data is still {} here; en_data['google'] is
# only filled in by the next cell, so this cell depends on out-of-order execution.
len(list(en_data['google']['4refs'].values())[0])

In [None]:
en_data['google'] = {}
en_data['google']['refs'], en_data['google']['hyps'], en_data['google']['4refs'] = get_model_data("", 
                                                                                                  use_google=True)

In [None]:
nmt_path = "sp2enw_hyp_search/sp_1.0_l2e-4_rnn-3_drpt-0.5_cnn_96-2-2_mix-0.5/"
en_data['160hrs_da'] = {}
en_data['160hrs_da']['refs'], en_data['160hrs_da']['hyps'], en_data['160hrs_da']['4refs'] = get_model_data(nmt_path, 
                                                                                               use_google=False)

In [None]:
nmt_path = "./sp2enw_interspeech/sp_0.50_h-300_e-128_l2e-4_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln"
en_data['80hrs'] = {}
en_data['80hrs']['refs'], en_data['80hrs']['hyps'], en_data['80hrs']['4refs'] = get_model_data(nmt_path, 
                                                                                               use_google=False)

In [None]:
nmt_path = "./sp2enw_interspeech/sp_0.50_h-300_e-128_l2e-4_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln_mix-0.5"
en_data['80hrs_da'] = {}
en_data['80hrs_da']['refs'], en_data['80hrs_da']['hyps'], en_data['80hrs_da']['4refs'] = get_model_data(nmt_path, 
                                                                                               use_google=False)

In [None]:
nmt_path = "./sp2enw_interspeech/sp_0.33_seed-AB_l2e-4_drpt-0.5"
en_data['50hrs'] = {}
en_data['50hrs']['refs'], en_data['50hrs']['hyps'], en_data['50hrs']['4refs'] = get_model_data(nmt_path, 
                                                                                               use_google=False)

In [None]:
nmt_path = "./sp2enw_interspeech/sp_0.33_seed-AB_l2e-4_drpt-0.5_sample-mix-0.5"
en_data['50hrs_da'] = {}
en_data['50hrs_da']['refs'], en_data['50hrs_da']['hyps'], en_data['50hrs_da']['4refs'] = get_model_data(nmt_path, 
                                                                                               use_google=False)

In [None]:
# Only the last assignment takes effect; the two ids above it are overwritten
# immediately (kept as alternative examples to switch between).
utt = '20051023_232057_325_fsp-A-3'
utt = '20051017_180712_270_fsp-B-2'
utt = '20051009_182032_217_fsp-B-149'

# One row per model: "<model> & <hypothesis> \\" (looks like a LaTeX table row)
for m in en_data:
    print(m, ' & ', " ".join(en_data[m]['hyps'][utt]), ' \\\\')
    print()
    
# `m` still holds the last key iterated above, so the reference text is read
# from that model's entry
print(" ".join(en_data[m]['refs'][utt]))

In [None]:
", ".join([w.decode() for w in bow_dict['w2i'].keys()])

In [None]:
len(bow_dict['freq'])-4

In [None]:
# Single-keyword check on one model's output: count the utterances where the
# keyword appears in every reference (t_count), in the hypothesis (tp_count)
# and in both (c_count), then print the hypotheses of the "both" cases.
keyword = "help"
data_key = "80hrs_da"

keyword_bytes = keyword.encode()
print(bow_dict["freq"][keyword_bytes], bow_dict["freq_dev"][keyword_bytes])

hyps_by_utt = en_data[data_key]["hyps"]
refs_by_utt = en_data[data_key]["4refs"]

t_count, c_count, tp_count = 0, 0, 0
corr_utts = []

for u in hyps_by_utt:
    # Words that occur in every reference of this utterance
    common_ref_words = set(refs_by_utt[u][0]).intersection(*refs_by_utt[u][1:])
    in_all_refs = keyword in common_ref_words
    in_hyp = keyword in hyps_by_utt[u]
    if in_all_refs:
        t_count += 1
    if in_hyp:
        tp_count += 1
        if in_all_refs:
            c_count += 1
            corr_utts.append(u)

print(t_count, c_count, tp_count)

for u in corr_utts:
    print(" ".join(hyps_by_utt[u]))
    # Show only the first reference that contains the keyword
    matching_ref = next((r for r in refs_by_utt[u] if keyword in r), None)
    if matching_ref is not None:
        print(" ".join(matching_ref))

In [None]:
model_50_da[1]['word'][keyword]

In [None]:
tc = sum([model_50_da[1]['word'][w]['tc'] for w in model_50_da[1]['word']])

In [None]:
tp = sum([model_50_da[1]['word'][w]['t'] for w in model_50_da[1]['word']])

In [None]:
tc / tp

In [None]:
# Print, for every word in model_50_da's per-word statistics, the ratio
# tc / t (as a percentage) for model_50_da and model_80_da side by side.
# A word with t == 0, or one absent from model_80_da's statistics, prints 0.
words_50 = model_50_da[1]['word']
words_80 = model_80_da[1]['word']
for w in words_50:
    # w is drawn from words_50 itself, so it is always present there; the
    # former membership test on this side could never take its else branch.
    stats_50 = words_50[w]
    p_50 = stats_50['tc'] / stats_50['t'] if stats_50['t'] > 0 else 0
    if w in words_80:
        stats_80 = words_80[w]
        p_80 = stats_80['tc'] / stats_80['t'] if stats_80['t'] > 0 else 0
    else:
        p_80 = 0
    print(w, 
          "{0:20.1f}".format(p_50 * 100),
          "{0:20.1f}".format(p_80 * 100))

In [None]:
for w in model_50_da[1]['word']:
    if model_50_da[1]['word'][w]['t'] > 0 and model_50_da[1]['word'][w]['tc'] / model_50_da[1]['word'][w]['t'] >= 0:
        print(w, "{0:.1f}".format(model_50_da[1]['word'][w]['tc'] / model_50_da[1]['word'][w]['t'] * 100))

In [None]:
for w in model_80_da[1]['word']:
    if model_80_da[1]['word'][w]['t'] > 0 and model_80_da[1]['word'][w]['tc'] / model_80_da[1]['word'][w]['t'] >= 0.4:
        print(w, "{0:.1f}".format(model_80_da[1]['word'][w]['tc'] / model_80_da[1]['word'][w]['t'] * 100))

### Dummy baseline

In [None]:
freq_sorted_words = [w.decode() for w,f in sorted(bow_dict['freq'].items(), reverse=True, key=lambda t: t[1])]

In [None]:
predict_top_K = 5
top_K_words = freq_sorted_words[:predict_top_K]

In [None]:
" --- ".join(top_K_words)

In [None]:
dummy_preds = [top_K_words for u in google_hyp_r0]

In [None]:
basic_precision_recall(google_utt_refs_words_bow.values(), dummy_preds)[:2]

In [None]:
basic_precision_recall(google_single_ref.values(), dummy_preds)[:2]

In [None]:
max_pred = 10

In [None]:
dummy_p_vals = np.zeros((max_pred), dtype="f")
dummy_r_vals = np.zeros((max_pred), dtype="f")

In [None]:
dummy_p_vals

In [None]:
p_r_dummy = {}
# for thresh in tqdm(np.arange(min_prob, max_prob+thresh_delta,thresh_delta)):
for num_pred in tqdm(range(0,max_pred)):
    top_K_words = freq_sorted_words[:num_pred+1]
    dummy_preds = [top_K_words for u in google_hyp_r0]
    dummy_p_vals[num_pred], dummy_r_vals[num_pred] = basic_precision_recall(google_utt_refs_words_bow.values(), 
                                                                            dummy_preds)[:2]
    

In [None]:
# Scale both arrays down by 100 (presumably percent -> fraction; the plot
# below multiplies by 100 again).
# NOTE(review): `/=` mutates the arrays in place, so re-running this cell
# without re-running the loop above divides the values by 100 a second time.
dummy_p_vals /= 100.0
dummy_r_vals /= 100.0

In [None]:
np.trapz(dummy_p_vals, dummy_r_vals)

In [None]:
thresh_labels = range(1,max_pred+1)

In [None]:
fig, ax = plt.subplots(figsize=(10,8))
ax.plot(dummy_r_vals*100, dummy_p_vals*100, label="Precision/Recall")
for i,j,k in zip(dummy_r_vals*100, dummy_p_vals*100, thresh_labels):
    ax.annotate(k,xy=(i,j), fontsize=12)
ax.set_ylabel("Precision", fontsize=28)
ax.set_xlabel("Recall", fontsize=26)
ax.legend(loc='upper center', bbox_to_anchor=(1.3, 0.9),
                  ncol=1, fancybox=True, shadow=True, fontsize=26)

plt.yticks(rotation=0, size=18)
plt.xticks(rotation=0, size=18)

In [None]:
# basic_precision_recall(single_dev_ref_words.values(), dummy_preds)[:2]

## Google refs vs Edin refs

In [None]:
sum([1 if len(r[0]) > 0 else 0 for r in single_dev_ref_words.values()])

In [None]:
sum([len(set(r[0])-{'_UNK'})for r in single_dev_ref_words.values()])

In [None]:
# Count the words on which the Edin and Google single references disagree,
# over the utterances present in both.
# '_UNK' is removed from the Edin word set once, so the inequality test, the
# count and the printed differences all ignore it; previously only the test
# did, and '_UNK' inflated the count and the diffs.
mismatch_count = 0
# sorted() gives a stable print order across runs (set order is arbitrary)
for u in sorted(set(google_single_ref.keys()) & set(single_dev_ref_words.keys())):
    edin_words = set(single_dev_ref_words[u][0]) - {'_UNK'}
    google_words = set(google_single_ref[u][0])
    if edin_words != google_words:
        only_edin = edin_words - google_words
        only_google = google_words - edin_words
        # An utterance contributes the size of the larger one-sided difference
        mismatch_count += max(len(only_edin), len(only_google))
        print(u, single_dev_ref_words[u], google_single_ref[u])
        print(only_edin, only_google)

In [None]:
print(mismatch_count)

In [None]:
[(set(r[0])-{'_UNK'})for r in single_dev_ref_words.values()]

In [28]:
bow_dict["w2i"]

{b'_EOS': 2,
 b'_GO': 1,
 b'_PAD': 0,
 b'_UNK': 3,
 b'another': 10,
 b'case': 27,
 b'change': 28,
 b'city': 13,
 b'coming': 38,
 b'even': 9,
 b'first': 17,
 b'found': 35,
 b'gets': 31,
 b'girl': 22,
 b'give': 15,
 b'help': 25,
 b'high': 37,
 b'home': 24,
 b'house': 11,
 b'huge': 42,
 b'leave': 29,
 b'life': 18,
 b'live': 8,
 b'lives': 33,
 b'love': 21,
 b'make': 16,
 b'morning': 40,
 b'name': 12,
 b'need': 20,
 b'news': 39,
 b'people': 4,
 b'remember': 19,
 b'saying': 26,
 b'send': 32,
 b'someone': 14,
 b'stay': 30,
 b'terrible': 36,
 b'time': 5,
 b'town': 41,
 b'waiting': 43,
 b'want': 7,
 b'watch': 34,
 b'women': 23,
 b'years': 6}