In [1]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, modified_precision
from nltk.translate.chrf_score import sentence_chrf, corpus_chrf
from nltk.metrics import scores
import scipy.io.wavfile
from IPython.display import Audio
from IPython.display import display
from nltk.stem import *
# from nltk.stem.snowball import SnowballStemmer
from stemming.porter2 import stem
import stemming
from nltk.metrics.scores import recall

from nltk.corpus import stopwords

%matplotlib inline

In [2]:
from bow_run import *

In [3]:
%run utils.ipynb

In [4]:
smooth_fun = nltk.translate.bleu_score.SmoothingFunction()

In [5]:
def nmt_basic_precision_recall(r, h, display=False):
    """Corpus-level unigram precision and recall of hypotheses vs. references.

    Precision pools NLTK's ``modified_precision`` (order 1) over utterances.
    Recall is taken, per utterance, from the single reference with the highest
    ``count_match`` recall (the first such reference on ties) and pooled the
    same way.

    Utterances with no references, or with at least one empty reference, are
    skipped entirely. If nothing is scored the function returns zeros instead
    of raising.

    Args:
        r: one entry per utterance; each entry is a sequence of references
            (token sequences) for that utterance.
        h: hypotheses (token sequences), positionally aligned with ``r``.
        display: when true, print the utterance count and a small results
            table. Inside this function the name shadows
            ``IPython.display.display``.

    Returns:
        ``(precision, recall, metrics)``. Precision and recall are
        percentages (0 when the corresponding denominator is 0). ``metrics``
        holds the pooled counts: "tc"/"tp" are the precision numerator and
        denominator, "rc"/"rt" the recall numerator and denominator, and
        "word" maps each word to its summed "t", "tp" and "tc" counts as
        reported by ``count_match`` for the selected reference.
    """
    ngram_order = 1  # only unigrams are scored here

    prec_matches = 0  # clipped hypothesis unigrams found in some reference
    prec_total = 0    # hypothesis unigrams
    rec_matches = 0   # matches against the best-recall reference
    rec_total = 0     # size of the best-recall reference, per count_match
    metrics = {"rc": 0, "rt": 0, "tp": 0, "tc": 0, "word": {}}

    if display:
        print("total utts={0:d}".format(len(r)))

    for references, hypothesis in zip(r, h):
        # Skip utterances that cannot be scored: no references at all, or an
        # empty reference among them.
        if len(references) == 0 or min(len(ref) for ref in references) == 0:
            continue

        # An empty hypothesis contributes nothing to precision, but it still
        # counts against recall below.
        if len(hypothesis) > 0:
            p_i = modified_precision(references, hypothesis, ngram_order)
            prec_matches += p_i.numerator
            prec_total += p_i.denominator

        # Keep the reference with the highest recall; a strict '>' means the
        # earliest reference wins ties.
        best = None
        for curr_ref in references:
            curr_match, _, curr_t, curr_words = count_match(curr_ref, hypothesis)
            curr_recall = curr_match / curr_t if curr_t > 0 else 0
            if best is None or curr_recall > best[0]:
                best = (curr_recall, curr_match, curr_t, curr_words)
        _, best_match, best_t, best_words = best

        rec_matches += best_match
        rec_total += best_t
        for key in ("t", "tp", "tc"):
            for w in best_words[key]:
                if w not in metrics["word"]:
                    metrics["word"][w] = {"t": 0, "tp": 0, "tc": 0}
                metrics["word"][w][key] += best_words[key][w]

    metrics["tc"], metrics["tp"] = prec_matches, prec_total
    metrics["rc"], metrics["rt"] = rec_matches, rec_total

    prec = (prec_matches / prec_total) * 100 if prec_total > 0 else 0
    rec = (rec_matches / rec_total) * 100 if rec_total > 0 else 0

    if display:
        print("{0:10s} | {1:>8s}".format("metric", "1-gram"))
        print("-"*54)
        print("{0:10s} | {1:8.2f}".format("precision", prec))
        print("{0:10s} | {1:8.2f}".format("recall", rec))

    return prec, rec, metrics

In [6]:
def get_model_data(nmt_path, use_google=False):
    """Fetch single-reference, hypothesis and evaluation-reference data.

    Args:
        nmt_path: directory holding the pickle files ``model_s2t_refs.dict``,
            ``model_s2t_hyps.dict`` and ``model_s2t_refs_for_eval.dict``.
            Not used when ``use_google`` is true.
        use_google: when true, take the data from ``get_google_data()``
            (entries ``'fisher_dev_ref_0'`` / ``'fisher_dev_r0'``) instead of
            reading the pickles.

    Returns:
        ``(nmt_refs, nmt_hyps, nmt_4refs)``, in that order.
    """
    if use_google:
        google_s2t_hyps, google_s2t_refs, nmt_4refs = get_google_data()
        return (google_s2t_refs['fisher_dev_ref_0'],
                google_s2t_hyps['fisher_dev_r0'],
                nmt_4refs)

    def load_pickle(file_name):
        # NOTE: unpickling can execute arbitrary code; only point this at
        # model directories you trust.
        # The context manager closes the handle as soon as the load finishes.
        with open(os.path.join(nmt_path, file_name), "rb") as pickle_file:
            return pickle.load(pickle_file)

    nmt_refs = load_pickle("model_s2t_refs.dict")
    nmt_hyps = load_pickle("model_s2t_hyps.dict")
    nmt_4refs = load_pickle("model_s2t_refs_for_eval.dict")

    return nmt_refs, nmt_hyps, nmt_4refs

In [29]:
def eval_nmt_model(nmt_path, use_google=False, min_len=10):
    """Print bag-of-words and MT-level scores for one system's output.

    Reports, in order:
      1. BOW precision/recall against the single reference,
      2. BOW precision/recall against all evaluation references,
         each followed by how many words were retrieved correctly at least
         once,
      3. unigram precision/recall and corpus BLEU of the full hypotheses
         against all evaluation references,
      4. BLEU/precision/recall restricted by ``check_bleu_with_len_filter``.

    The bag-of-words vocabulary comes from the notebook-level ``bow_dict``,
    which must already be defined when this function is called.

    Args:
        nmt_path: model output directory, forwarded to ``get_model_data``.
        use_google: forwarded to ``get_model_data``.
        min_len: minimum reference length for the length-filtered report.

    Returns:
        ``(metrics_1_bow, metrics_bow)``: the metrics returned by
        ``basic_precision_recall`` for the single-reference and the
        all-references BOW evaluations.
    """
    smooth_fun = nltk.translate.bleu_score.SmoothingFunction()

    nmt_refs, nmt_hyps, nmt_4refs = get_model_data(nmt_path, use_google=use_google)

    # One fixed utterance order, reused wherever references and hypotheses are
    # paired, so the pairing never depends on how each dict happens to be
    # ordered.
    dev_utt_ids = list(nmt_hyps.keys())

    # Project hypotheses and references onto the bag-of-words vocabulary.
    nmt_preds_bow = {}
    nmt_1_ref = {}
    nmt_refs_bow = {}
    for u in dev_utt_ids:
        nmt_preds_bow[u] = list(get_words_in_bow_vocab(nmt_hyps[u], bow_dict))
        nmt_1_ref[u] = [list(get_words_in_bow_vocab(nmt_refs[u], bow_dict))]
        nmt_refs_bow[u] = [list(get_words_in_bow_vocab(ref, bow_dict))
                           for ref in nmt_4refs[u]]

    def report_bow(title, refs_by_utt):
        """Score the BOW predictions against ``refs_by_utt`` and print a summary."""
        p_bow, r_bow, metrics = basic_precision_recall(refs_by_utt.values(),
                                                       nmt_preds_bow.values())
        print("-"*80)
        print(title)
        print("-"*20)
        print("precision={0:.2f}, recall={1:.2f}".format(p_bow, r_bow))

        word_counts = metrics["word"].values()
        num_1correct = sum(1 for counts in word_counts if counts['tc'] > 0)
        num_all = sum(1 for counts in word_counts if counts['t'] > 0)

        print("-"*20)
        print("{0:d} out of {1:d} retrieved with atleast 1 correct instance".format(num_1correct, num_all))
        return metrics

    metrics_1_bow = report_bow("BOW - using 1 reference", nmt_1_ref)
    metrics_bow = report_bow("BOW - using all 4 references", nmt_refs_bow)

    # MT precision/recall on the full hypotheses (not the BOW projection).
    # References and hypotheses are looked up by utterance id rather than
    # zipping the two dicts' .values().
    aligned_refs = [nmt_4refs[u] for u in dev_utt_ids]
    aligned_hyps = [nmt_hyps[u] for u in dev_utt_ids]

    p_nmt, r_nmt, _ = nmt_basic_precision_recall(aligned_refs, aligned_hyps)
    print("-"*80)
    print("MT task - using all 4 references")
    print("-"*80)
    print("precision={0:.2f}, recall={1:.2f}".format(p_nmt, r_nmt))

    nmt_bleu = corpus_bleu(aligned_refs,
                           aligned_hyps,
                           smoothing_function=smooth_fun.method2)

    print("-"*80)
    # '.2f' (two decimals), consistent with every other score printed here;
    # the previous '2f' only set a minimum field width.
    print("4 references bleu={0:.2f}".format(nmt_bleu*100))

    print("-"*80)
    print("using min len filter")
    print("-"*20)
    check_bleu_with_len_filter(nmt_4refs, nmt_hyps, min_len=min_len)

    return metrics_1_bow, metrics_bow

In [16]:
def check_bleu_with_len_filter(refs, hyps, min_len):
    """Print BLEU and unigram precision/recall for sufficiently long utterances.

    An utterance is kept when its shortest reference has at least ``min_len``
    tokens. Utterances without any reference are never kept. When nothing is
    kept, zero scores are printed instead of calling the scorers on empty
    input.

    Uses the notebook-level ``smooth_fun`` for BLEU smoothing.

    Args:
        refs: dict mapping utterance id -> list of references (token lists).
        hyps: dict mapping utterance id -> hypothesis (token list); must
            contain every selected utterance id.
        min_len: minimum length required of the shortest reference.

    Returns:
        None; results are printed.
    """
    sel_refs, sel_hyps = [], []
    for u, utt_refs in refs.items():
        if len(utt_refs) == 0:
            # min() over no references would raise; nothing to score anyway.
            continue
        if min(len(ref) for ref in utt_refs) >= min_len:
            sel_refs.append(utt_refs)
            sel_hyps.append(hyps[u])
    print("{0:d} out of {1:d} have len >= {2:d}".format(len(sel_refs), len(refs), min_len))

    if not sel_refs:
        # Nothing passed the filter: report zeros rather than scoring an
        # empty corpus.
        print("BLEU={0:.2f}".format(0.0))
        print("precision={0:.2f}, recall={1:.2f}".format(0.0, 0.0))
        return

    bleu_score = corpus_bleu(sel_refs, sel_hyps, smoothing_function=smooth_fun.method2)*100
    print("BLEU={0:.2f}".format(bleu_score))
    sel_p, sel_r, _ = nmt_basic_precision_recall(sel_refs, sel_hyps)
    print("precision={0:.2f}, recall={1:.2f}".format(sel_p, sel_r))

## Edin model

In [9]:
cfg_path = "sp2bagwords/sp_0.50_trial-A/"

In [10]:
last_epoch, model, optimizer, m_cfg, t_cfg = check_model(cfg_path)

cnn_out_dim = rnn_in_units =  640




using ADAM optimizer
--------------------------------------------------------------------------------
model not found


In [11]:
%%capture
train_key = m_cfg['train_set']
dev_key = m_cfg['dev_set']
batch_size=t_cfg['batch_size']
enc_key=m_cfg['enc_key']
dec_key=m_cfg['dec_key']
input_path = os.path.join(m_cfg['data_path'], m_cfg['dev_set'])
# -------------------------------------------------------------------------
# get data dictionaries
# -------------------------------------------------------------------------
map_dict, vocab_dict, bucket_dict, bow_dict = get_data_dicts(m_cfg)
if os.path.exists(os.path.join(m_cfg['model_dir'], "model_s2t_dev_out.dict")):
    dev_utts = pickle.load(open(os.path.join(m_cfg['model_dir'], "model_s2t_dev_out.dict"), "rb"))

if os.path.exists(os.path.join(m_cfg['model_dir'], "model_s2t_train_out.dict")):
    train_utts = pickle.load(open(os.path.join(m_cfg['model_dir'], "model_s2t_train_out.dict"), "rb"))
# batch_size = {'max': 128, 'med': 128, 'min': 128, 'scale': 1}
batch_size = {'max': 64, 'med': 64, 'min': 64, 'scale': 1}
batch_size = t_cfg['batch_size']

edin_s2t_refs_for_eval_path = os.path.join("../chainer2/speech2text/both_fbank_out/", 
                                           "edin_s2t_refs_for_eval.dict")
edin_s2t_refs_for_eval = pickle.load(open(edin_s2t_refs_for_eval_path, "rb"))
single_dev_ref = [[i[0]] for i in dev_utts["refs"]]

In [None]:
map_dict["fisher_dev"]['20051009_182032_217_fsp-B-1'].keys()

In [None]:
input_path = os.path.join(m_cfg['data_path'],
                                      m_cfg['train_set'])
train_utts, train_loss = feed_model(model,
                              optimizer=optimizer,
                              m_dict=map_dict[train_key],
                              b_dict=bucket_dict[train_key],
                              vocab_dict=vocab_dict,
                              bow_dict=bow_dict,
                              batch_size=batch_size,
                              x_key=enc_key,
                              y_key=dec_key,
                              train=False,
                              input_path=input_path,
                              max_dec=m_cfg['max_en_pred'],
                              t_cfg=t_cfg,
                              use_y=True,
                              get_probs=True)

In [None]:
# pickle.dump(train_utts, open(os.path.join(m_cfg['model_dir'], "model_s2t_train_out.dict"), "wb"))

In [None]:
# Per-word mean predicted probability on the training outputs, split by
# whether the word's index occurs in the first reference (r[0]) of each
# utterance ("pos") or not ("neg"). Both arrays have one slot per entry of
# bow_dict["i2w"] and start at 0.0.
mean_pos_scores = np.array([0.0 for _ in bow_dict["i2w"]], dtype="f")
mean_neg_scores = np.array([0.0 for _ in bow_dict["i2w"]], dtype="f")


# Indices 0-3 are skipped and keep their 0.0 entries -- presumably reserved
# vocabulary entries; TODO confirm against how bow_dict is built.
for i_w in range(4, len(bow_dict["i2w"])):
    # NOTE(review): this_word is assigned but not used below.
    this_word = bow_dict["i2w"][i_w]
    # Boolean masks over utterances; neg_indx is the exact complement of pos_indx.
    pos_indx = [i_w in r[0] for r in train_utts["refs"]]
    neg_indx = [i_w not in r[0] for r in train_utts["refs"]]
    # NOTE(review): if a mask selects no utterance, the mean of an empty
    # selection is NaN under NumPy semantics -- check before using these
    # arrays as thresholds.
    mean_pos_scores[i_w] = np.mean(train_utts["probs"][:,i_w][pos_indx])
    mean_neg_scores[i_w] = np.mean(train_utts["probs"][:,i_w][neg_indx])

In [None]:
xp.mean(mean_pos_scores), xp.mean(mean_neg_scores)

In [None]:
train_avg_p, _ = compute_avg_precision(train_utts["probs"],
                                                     0.0, 1.0, 5,
                                                     m_cfg['max_en_pred'],
                                                     train_utts["refs"])
train_avg_p

In [None]:
# Precision/recall of thresholded predictions on the training outputs.
# NOTE(review): THRESH is read from the config but never used; the call below
# passes the literal 0.5 instead. Confirm which threshold is intended.
THRESH = m_cfg["pred_thresh"]
train_pred_words = get_pred_words_from_probs(train_utts["probs"],
#                                              mean_pos_scores,
                                               0.5,
                                               m_cfg['max_en_pred'])

train_prec, train_rec, _ = basic_precision_recall(train_utts["refs"], train_pred_words)
train_prec, train_rec

In [None]:
input_path = os.path.join(m_cfg['data_path'], m_cfg['dev_set'])

dev_utts, dev_loss = feed_model(model,
                                optimizer=optimizer,
                                m_dict=map_dict[dev_key],
                                b_dict=bucket_dict[dev_key],
                                vocab_dict=vocab_dict,
                                bow_dict=bow_dict,
                                batch_size=batch_size,
                                x_key=enc_key,
                                y_key=dec_key,
                                train=False,
                                input_path=input_path,
                                max_dec=m_cfg['max_en_pred'],
                                t_cfg=t_cfg,
                                use_y=True,
                                get_probs=True)

In [None]:
single_dev_ref = [[i[0]] for i in dev_utts["refs"]]

In [None]:
np.min(dev_utts["probs"]), np.max(dev_utts["probs"])

In [None]:
# Same statistic as the training-set cell, computed on the dev outputs:
# per-word mean predicted probability over utterances whose first reference
# (r[0]) does ("pos") / does not ("neg") contain the word's index.
mean_dev_pos_scores = np.array([0.0 for _ in bow_dict["i2w"]], dtype="f")
mean_dev_neg_scores = np.array([0.0 for _ in bow_dict["i2w"]], dtype="f")


# Indices 0-3 are skipped and keep their 0.0 entries -- presumably reserved
# vocabulary entries; TODO confirm against how bow_dict is built.
for i_w in range(4, len(bow_dict["i2w"])):
    # NOTE(review): this_word is assigned but not used below.
    this_word = bow_dict["i2w"][i_w]
    # Boolean masks over utterances; neg_indx is the exact complement of pos_indx.
    pos_indx = [i_w in r[0] for r in dev_utts["refs"]]
    neg_indx = [i_w not in r[0] for r in dev_utts["refs"]]
    # NOTE(review): a word absent from every dev first-reference gives an
    # empty selection, whose mean is NaN under NumPy semantics.
    mean_dev_pos_scores[i_w] = np.mean(dev_utts["probs"][:,i_w][pos_indx])
    mean_dev_neg_scores[i_w] = np.mean(dev_utts["probs"][:,i_w][neg_indx])

In [None]:
PRED_THRESH = 0.2
print("using prediction threshold={0:.2f}".format(PRED_THRESH))
dev_pred_words = get_pred_words_from_probs(dev_utts["probs"],
                                           PRED_THRESH,
                                           len(bow_dict['i2w']))
p, r, haha = basic_precision_recall(dev_utts["refs"], dev_pred_words)
print(p,r)
print("using mean positive prediction threshold")
dev_pred_words = get_pred_words_from_probs(dev_utts["probs"],
                                               mean_dev_pos_scores,
                                               m_cfg['max_en_pred'])
p, r, _ = basic_precision_recall(dev_utts["refs"], dev_pred_words)
print(p,r)

In [None]:
PRED_THRESH = 0.1
print("using prediction threshold={0:.2f}".format(PRED_THRESH))
dev_pred_words = get_pred_words_from_probs(dev_utts["probs"],
                                           PRED_THRESH,
                                           len(bow_dict['i2w']))
p, r, _ = basic_precision_recall(single_dev_ref, dev_pred_words)
print(p,r)
print("using mean positive prediction threshold")
dev_pred_words = get_pred_words_from_probs(dev_utts["probs"],
                                               mean_dev_pos_scores,
                                               m_cfg['max_en_pred'])
p, r, _ = basic_precision_recall(single_dev_ref, dev_pred_words)
print(p,r)

In [None]:
min_prob, max_prob = float(xp.min(dev_utts["probs"])), float(xp.max(dev_utts["probs"]))
min_prob, max_prob

### Precision-Recall Plot - word level threshold

In [None]:
np.arange(-0.5, 0.5, 0.1)

In [None]:
mean_pos_scores[4:14]*1.3

In [None]:
thresh_deltas = np.asarray([0.7,0.8,0.9,1,1.1,1.2,1.3], dtype="f")

In [None]:
# Sweep an additive offset applied to the per-word thresholds and record dev
# precision/recall at each offset. p_r_thresh maps offset -> {'p': ..., 'r': ...};
# the keys are the float values produced by np.arange.
# NOTE(review): the per-word thresholds are mean_pos_scores (computed from
# train_utts), not mean_dev_pos_scores, while the probabilities scored are
# dev_utts["probs"] -- confirm this is intended.
p_r_thresh = {}
thresh_delta = 0.05
for thresh in tqdm(np.arange(-0.5, 0.5+thresh_delta, thresh_delta)):
# for thresh in tqdm(thresh_deltas):
# for thresh in tqdm(np.linspace(min_prob, max_prob,num=20,endpoint=True)):
    p_r_thresh[thresh] = {}
    dev_pred_words_at_thresh = get_pred_words_from_probs(dev_utts["probs"],
                                                           mean_pos_scores + thresh,
                                                           len(bow_dict['i2w']))
    p_r_thresh[thresh]['p'], p_r_thresh[thresh]['r'], _ = basic_precision_recall(dev_utts["refs"], 
                                                                              dev_pred_words_at_thresh)
    

In [None]:
thresh_labels, p_vals, r_vals = [], [], []
for l in p_r_thresh:
    p_vals.append(p_r_thresh[l]["p"])
    r_vals.append(p_r_thresh[l]["r"])
    thresh_labels.append("{0:.2f}".format(l))

In [None]:
fig, ax = plt.subplots(figsize=(10,8))
ax.plot(r_vals, p_vals, label="Precision/Recall")
for i,j,k in zip(r_vals, p_vals, thresh_labels):
    ax.annotate(k,xy=(i,j), fontsize=12)
ax.set_ylabel("Precision", fontsize=28)
ax.set_xlabel("Recall", fontsize=26)
ax.legend(loc='upper center', bbox_to_anchor=(1.3, 0.9),
                  ncol=1, fancybox=True, shadow=True, fontsize=26)

plt.yticks(rotation=0, size=18)
plt.xticks(rotation=0, size=18)

### Precision-Recall Plot - fixed threshold

In [None]:
len(dev_utts["probs"][0]), len(bow_dict['i2w'])

In [None]:
def compute_avg_precision(probs, min_prob, max_prob, num_points, max_words, refs):
    """Area under the precision-recall curve traced by a threshold sweep.

    ``num_points`` thresholds are spaced evenly over ``[min_prob, max_prob]``
    (both ends included). For each one, predictions come from
    ``get_pred_words_from_probs(probs, threshold, max_words)`` and are scored
    against ``refs`` with ``basic_precision_recall``.

    Returns:
        ``(avg_p, p_r_thresh)``: ``avg_p`` is the trapezoidal integral of
        precision over recall (both rescaled from percentages to fractions),
        taken with the sweep order reversed; ``p_r_thresh`` maps each
        threshold to ``{'p': precision, 'r': recall}`` in percent.
    """
    thresholds = np.linspace(min_prob, max_prob, num=num_points, endpoint=True)

    p_r_thresh = {}
    for thresh in tqdm(thresholds):
        words_at_thresh = get_pred_words_from_probs(probs, thresh, max_words)
        prec, rec, _ = basic_precision_recall(refs, words_at_thresh)
        p_r_thresh[thresh] = {'p': prec, 'r': rec}

    # Percent -> fraction, keeping the order in which thresholds were swept.
    scores = list(p_r_thresh.values())
    precision_array = np.array([s['p']/100 for s in scores], dtype="f")
    recall_array = np.array([s['r']/100 for s in scores], dtype="f")

    # Integrate from the last threshold back to the first.
    avg_p = np.trapz(precision_array[::-1], recall_array[::-1])
    return avg_p, p_r_thresh
    

In [None]:
avg_p, p_r_thresh = compute_avg_precision(dev_utts["probs"], 0.0, 1.0, 50, 104, dev_utts["refs"])

In [None]:
avg_p

In [None]:
# p_r_thresh = {}
# thresh_delta = 0.01
# # for thresh in tqdm(np.arange(min_prob, max_prob+thresh_delta,thresh_delta)):
# for thresh in tqdm(np.linspace(min_prob, max_prob, num=30,endpoint=True)):
#     p_r_thresh[thresh] = {}
#     dev_pred_words_at_thresh = get_pred_words_from_probs(dev_utts["probs"],
#                                                            thresh,
#                                                            len(bow_dict['i2w']))
#     p_r_thresh[thresh]['p'], p_r_thresh[thresh]['r'], _ = basic_precision_recall(dev_utts["refs"], 
#                                                                               dev_pred_words_at_thresh)
    

In [None]:
thresh_labels, p_vals, r_vals = [], [], []
for l in p_r_thresh:
    p_vals.append(p_r_thresh[l]["p"])
    r_vals.append(p_r_thresh[l]["r"])
    thresh_labels.append("{0:.2f}".format(l))

In [None]:
fig, ax = plt.subplots(figsize=(10,8))
ax.plot(r_vals, p_vals, label="Precision/Recall")
for i,j,k in zip(r_vals, p_vals, thresh_labels):
    ax.annotate(k,xy=(i,j), fontsize=12)
ax.set_ylabel("Precision", fontsize=28)
ax.set_xlabel("Recall", fontsize=26)
ax.legend(loc='upper center', bbox_to_anchor=(1.3, 0.9),
                  ncol=1, fancybox=True, shadow=True, fontsize=26)

plt.yticks(rotation=0, size=18)
plt.xticks(rotation=0, size=18)

In [None]:
PRED_THRESH = 0.15
print("using prediction threshold={0:.2f}".format(PRED_THRESH))
dev_pred_words = get_pred_words_from_probs(dev_utts["probs"],
                                           PRED_THRESH,
                                           len(bow_dict['i2w']))
p, r, _ = basic_precision_recall(single_dev_ref, dev_pred_words)
print(p,r)
# print("using mean positive prediction threshold")
# dev_mean_pred_words = get_pred_words_from_probs(dev_utts["probs"],
#                                            mean_pos_scores,
#                                            m_cfg['max_en_pred'])
# p, r, _ = basic_precision_recall(single_dev_ref, dev_mean_pred_words)
# print(p,r)

In [None]:
PRED_THRESH = 0.01
print("using prediction threshold={0:.2f}".format(PRED_THRESH))
dev_pred_words = get_pred_words_from_probs(dev_utts["probs"],
                                           PRED_THRESH,
                                           len(bow_dict['i2w']))
p, r, _ = basic_precision_recall(dev_utts["refs"], dev_pred_words)
print(p,r)
# print("using mean positive prediction threshold")
# dev_mean_pred_words = get_pred_words_from_probs(dev_utts["probs"],
#                                            mean_pos_scores,
#                                            m_cfg['max_en_pred'])
# p, r, _ = basic_precision_recall(single_dev_ref, dev_mean_pred_words)
# print(p,r)

In [None]:
precision_array = np.array([p_r_thresh[i]['p']/100 for i in p_r_thresh], dtype="f")
recall_array = np.array([p_r_thresh[i]['r']/100 for i in p_r_thresh], dtype="f")

In [None]:
precision_array[::-1]

In [None]:
np.trapz(precision_array[::-1], recall_array[::-1])

### get preds and refs in words

In [None]:
list(zip(train_pred_words, train_utts["refs"]))[:10]

In [None]:
# Convert dev predictions and references from vocabulary indices to decoded
# word strings, keyed by utterance id. Duplicates are removed via set(), so
# word order within each list is not meaningful.
# NOTE(review): the loop variables p and r overwrite the notebook-level p / r
# (precision / recall) assigned by earlier cells.
dev_utt_preds_words = {}
dev_utt_refs_words = {}
for u, p, refs in zip(dev_utts['ids'], dev_pred_words, dev_utts["refs"]):
    # bow_dict['i2w'] entries are decoded with .decode() -- presumably bytes.
    dev_utt_preds_words[u] = list(set([bow_dict['i2w'][i].decode() for i in p]))
    dev_utt_refs_words[u] = []
    for r in refs:
        #print(r)
        dev_utt_refs_words[u].append([bow_dict['i2w'][i].decode() for i in set(r)])
# Single-reference view: only the first reference of each utterance, still
# wrapped in a list so it has the same shape as the multi-reference dict.
single_dev_ref_words = {u: [dev_utt_refs_words[u][0]] for u in dev_utt_refs_words}

In [None]:
p, r, metric = basic_precision_recall(list(dev_utt_refs_words.values()), list(dev_utt_preds_words.values()))
p, r

In [None]:
ps, rs, _ = basic_precision_recall(single_dev_ref_words.values(), dev_utt_preds_words.values())
ps, rs

In [None]:
[(k, metric[k]) for k in ['rc', 'rt', 'tp', 'tc']]

In [None]:
words_correctly_predicted = [item for item in metric["word"].items() if item[1]['tc'] > 0]
print(len(words_correctly_predicted))
display(words_correctly_predicted)

In [None]:
# most common train words
[w.decode() for w, f in sorted(bow_dict['freq'].items(), reverse=True, key=lambda t: t[1])][:10]

In [None]:
list(single_dev_ref_words.items())[:5]

In [None]:
display_bow_words(single_dev_ref_words, 
                  dev_utt_preds_words, 
                  bow_dict, 
                  map_dict["fisher_dev"], display_num=100)

## Google model

In [30]:
_ = eval_nmt_model("", use_google=True, min_len=10)

eval refs found, loading
--------------------------------------------------------------------------------
BOW - using 1 reference
--------------------
precision=84.85, recall=69.54
--------------------
39 out of 40 retrieved with atleast 1 correct instance
--------------------------------------------------------------------------------
BOW - using all 4 references
--------------------
precision=92.82, recall=69.30
--------------------
38 out of 40 retrieved with atleast 1 correct instance
--------------------------------------------------------------------------------
MT task - using all 4 references
--------------------------------------------------------------------------------
precision=76.60, recall=67.74
--------------------------------------------------------------------------------
4 references bleu=45.204494
--------------------------------------------------------------------------------
using min len filter
--------------------
1390 out of 3979 have len >= 10
BLEU=46.59
precis

## Edin 150 hours model

In [32]:
_ = eval_nmt_model("./sp2enw/sp_1.0_h512_rnn4_l2e-4/", min_len=10)

--------------------------------------------------------------------------------
BOW - using 1 reference
--------------------
precision=67.70, recall=50.34
--------------------
37 out of 40 retrieved with atleast 1 correct instance
--------------------------------------------------------------------------------
BOW - using all 4 references
--------------------
precision=76.35, recall=49.95
--------------------
39 out of 40 retrieved with atleast 1 correct instance
--------------------------------------------------------------------------------
MT task - using all 4 references
--------------------------------------------------------------------------------
precision=57.43, recall=53.60
--------------------------------------------------------------------------------
4 references bleu=24.021382
--------------------------------------------------------------------------------
using min len filter
--------------------
1458 out of 3977 have len >= 10
BLEU=24.25
precision=57.86, recall=52.48


## Edin 50 hours model

In [None]:
old_metrics = eval_nmt_model("./sp2enw_mel-80_vocab-nltk/sp_0.33_h-256_e-128_l2e-3_lstm_drpt-0.3_cnn-32-2-2_rnn-3_b-40-50")

## Edin 25 hours model

In [None]:
_ = eval_nmt_model("./sp2enw_mel-80_vocab-nltk/sp_0.16_h-256_e-128_l2e-3_lstm_drpt-0.3_cnn-32-2-2_rnn-3_b-80-25_no-ln-bn")

## Edin 15 hours model

In [None]:
_ = eval_nmt_model("./sp2enw/sp_.10/")

## Edin 50 hours model - sample word embeddings

### seed: 0.33

In [33]:
_ = eval_nmt_model("sp2enw_mel-80_vocab-nltk/sp_0.33_h-300_e-128_l2e-6_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln")

--------------------------------------------------------------------------------
BOW - using 1 reference
--------------------
precision=55.92, recall=31.10
--------------------
31 out of 40 retrieved with atleast 1 correct instance
--------------------------------------------------------------------------------
BOW - using all 4 references
--------------------
precision=65.76, recall=30.30
--------------------
31 out of 40 retrieved with atleast 1 correct instance
--------------------------------------------------------------------------------
MT task - using all 4 references
--------------------------------------------------------------------------------
precision=46.71, recall=39.97
--------------------------------------------------------------------------------
4 references bleu=12.538613
--------------------------------------------------------------------------------
using min len filter
--------------------
1430 out of 3977 have len >= 10
BLEU=12.10
precision=46.53, recall=38.32


In [34]:
_ = eval_nmt_model("sp2enw_mel-80_vocab-nltk/sp_0.33_h-300_e-128_l2e-6_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln_sample/")

--------------------------------------------------------------------------------
BOW - using 1 reference
--------------------
precision=60.66, recall=33.91
--------------------
32 out of 40 retrieved with atleast 1 correct instance
--------------------------------------------------------------------------------
BOW - using all 4 references
--------------------
precision=69.80, recall=33.33
--------------------
31 out of 40 retrieved with atleast 1 correct instance
--------------------------------------------------------------------------------
MT task - using all 4 references
--------------------------------------------------------------------------------
precision=47.34, recall=41.31
--------------------------------------------------------------------------------
4 references bleu=13.377341
--------------------------------------------------------------------------------
using min len filter
--------------------
1430 out of 3977 have len >= 10
BLEU=13.02
precision=47.31, recall=39.86


### seed: AA

In [35]:
new50_metrics = eval_nmt_model("emb_sp2enw/sp_0.33_seed-AA")

--------------------------------------------------------------------------------
BOW - using 1 reference
--------------------
precision=56.43, recall=31.88
--------------------
32 out of 40 retrieved with atleast 1 correct instance
--------------------------------------------------------------------------------
BOW - using all 4 references
--------------------
precision=64.51, recall=30.20
--------------------
31 out of 40 retrieved with atleast 1 correct instance
--------------------------------------------------------------------------------
MT task - using all 4 references
--------------------------------------------------------------------------------
precision=46.74, recall=39.26
--------------------------------------------------------------------------------
4 references bleu=11.939563
--------------------------------------------------------------------------------
using min len filter
--------------------
1430 out of 3977 have len >= 10
BLEU=11.43
precision=46.31, recall=37.64


In [36]:
new50_metrics = eval_nmt_model("emb_sp2enw/sp_0.33_seed-AA_mix-0.5")

--------------------------------------------------------------------------------
BOW - using 1 reference
--------------------
precision=57.14, recall=33.72
--------------------
34 out of 40 retrieved with atleast 1 correct instance
--------------------------------------------------------------------------------
BOW - using all 4 references
--------------------
precision=63.39, recall=32.39
--------------------
31 out of 40 retrieved with atleast 1 correct instance
--------------------------------------------------------------------------------
MT task - using all 4 references
--------------------------------------------------------------------------------
precision=47.45, recall=40.50
--------------------------------------------------------------------------------
4 references bleu=13.049108
--------------------------------------------------------------------------------
using min len filter
--------------------
1430 out of 3977 have len >= 10
BLEU=12.53
precision=47.77, recall=39.10


## Edin 80 hours model - sample word embeddings

In [39]:
_ = eval_nmt_model("sp2enw_mel-80_vocab-nltk/sp_0.50_h-300_e-128_l2e-6_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln")

--------------------------------------------------------------------------------
BOW - using 1 reference
--------------------
precision=65.99, recall=41.18
--------------------
35 out of 40 retrieved with atleast 1 correct instance
--------------------------------------------------------------------------------
BOW - using all 4 references
--------------------
precision=74.42, recall=40.13
--------------------
34 out of 40 retrieved with atleast 1 correct instance
--------------------------------------------------------------------------------
MT task - using all 4 references
--------------------------------------------------------------------------------
precision=53.21, recall=45.37
--------------------------------------------------------------------------------
4 references bleu=17.151387
--------------------------------------------------------------------------------
using min len filter
--------------------
1430 out of 3977 have len >= 10
BLEU=17.01
precision=53.54, recall=44.13


In [40]:
_ = eval_nmt_model("sp2enw_mel-80_vocab-nltk/sp_0.50_h-300_e-128_l2e-6_lstm_drpt-0.5_cnn-64-2-2_rnn-3_b-80-25_no-bn-ln_sample/")

--------------------------------------------------------------------------------
BOW - using 1 reference
--------------------
precision=70.20, recall=43.60
--------------------
35 out of 40 retrieved with atleast 1 correct instance
--------------------------------------------------------------------------------
BOW - using all 4 references
--------------------
precision=77.99, recall=42.22
--------------------
34 out of 40 retrieved with atleast 1 correct instance
--------------------------------------------------------------------------------
MT task - using all 4 references
--------------------------------------------------------------------------------
precision=53.41, recall=45.68
--------------------------------------------------------------------------------
4 references bleu=17.622392
--------------------------------------------------------------------------------
using min len filter
--------------------
1430 out of 3977 have len >= 10
BLEU=17.48
precision=53.72, recall=44.56


### Dummy baseline

In [None]:
freq_sorted_words = [w.decode() for w,f in sorted(bow_dict['freq'].items(), reverse=True, key=lambda t: t[1])]

In [None]:
predict_top_K = 5
top_K_words = freq_sorted_words[:predict_top_K]

In [None]:
" --- ".join(top_K_words)

In [None]:
dummy_preds = [top_K_words for u in google_hyp_r0]

In [None]:
basic_precision_recall(google_utt_refs_words_bow.values(), dummy_preds)[:2]

In [None]:
basic_precision_recall(google_single_ref.values(), dummy_preds)[:2]

In [None]:
max_pred = 10

In [None]:
dummy_p_vals = np.zeros((max_pred), dtype="f")
dummy_r_vals = np.zeros((max_pred), dtype="f")

In [None]:
dummy_p_vals

In [None]:
# Dummy baseline: predict the same num_pred+1 most frequent words for every
# utterance and record precision/recall for each prediction count.
# NOTE(review): google_hyp_r0 and google_utt_refs_words_bow are not defined in
# any cell shown in this notebook -- presumably provided by utils.ipynb or a
# removed cell; this cell fails on a fresh kernel if they are not.
# NOTE(review): p_r_dummy is created but never filled or read here.
p_r_dummy = {}
# for thresh in tqdm(np.arange(min_prob, max_prob+thresh_delta,thresh_delta)):
for num_pred in tqdm(range(0,max_pred)):
    top_K_words = freq_sorted_words[:num_pred+1]
    # One identical prediction list per entry of google_hyp_r0.
    dummy_preds = [top_K_words for u in google_hyp_r0]
    dummy_p_vals[num_pred], dummy_r_vals[num_pred] = basic_precision_recall(google_utt_refs_words_bow.values(), 
                                                                            dummy_preds)[:2]
    

In [None]:
dummy_p_vals /= 100.0
dummy_r_vals /= 100.0

In [None]:
np.trapz(dummy_p_vals, dummy_r_vals)

In [None]:
thresh_labels = range(1,max_pred+1)

In [None]:
fig, ax = plt.subplots(figsize=(10,8))
ax.plot(dummy_r_vals*100, dummy_p_vals*100, label="Precision/Recall")
for i,j,k in zip(dummy_r_vals*100, dummy_p_vals*100, thresh_labels):
    ax.annotate(k,xy=(i,j), fontsize=12)
ax.set_ylabel("Precision", fontsize=28)
ax.set_xlabel("Recall", fontsize=26)
ax.legend(loc='upper center', bbox_to_anchor=(1.3, 0.9),
                  ncol=1, fancybox=True, shadow=True, fontsize=26)

plt.yticks(rotation=0, size=18)
plt.xticks(rotation=0, size=18)

In [None]:
# basic_precision_recall(single_dev_ref_words.values(), dummy_preds)[:2]

## Google refs vs Edin refs

In [None]:
sum([1 if len(r[0]) > 0 else 0 for r in single_dev_ref_words.values()])

In [None]:
sum([len(set(r[0])-{'_UNK'})for r in single_dev_ref_words.values()])

In [None]:
# Compare the first Edin reference with the first Google reference for every
# utterance present in both, as sets of words. '_UNK' is removed from the
# Edin side once and that same set is used for the equality test, the count
# and the printed differences, so '_UNK' never counts as a mismatch.
mismatch_count = 0
# sorted() gives a reproducible report order (set iteration order is arbitrary).
for u in sorted(set(google_single_ref.keys()) & set(single_dev_ref_words.keys())):
    edin_words = set(single_dev_ref_words[u][0]) - {'_UNK'}
    google_words = set(google_single_ref[u][0])
    if edin_words != google_words:
        only_in_edin = edin_words - google_words
        only_in_google = google_words - edin_words
        # An utterance contributes the larger of the two one-sided differences.
        mismatch_count += max(len(only_in_edin), len(only_in_google))
        print(u, single_dev_ref_words[u], google_single_ref[u])
        print(only_in_edin, only_in_google)

In [None]:
print(mismatch_count)

In [None]:
[(set(r[0])-{'_UNK'})for r in single_dev_ref_words.values()]