In [1]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, modified_precision
from nltk.translate.chrf_score import sentence_chrf, corpus_chrf
from nltk.metrics import scores
import scipy.io.wavfile
from IPython.display import Audio
from IPython.display import display
from nltk.stem import *
# from nltk.stem.snowball import SnowballStemmer
from stemming.porter2 import stem
import stemming
from nltk.metrics.scores import recall

from nltk.corpus import stopwords

%matplotlib inline

In [2]:
from nmt_run import *

In [3]:
def display_words(m_dict, v_dict, preds, utts, dec_key, key, play_audio=False, displayN=-1):
    """Print a comparison table (references, model output, Google output) per utterance.

    Relies on names from the notebook namespace: ``google_hyp_r0``,
    ``google_dev_ref_0``, ``smooth_fun``, ``PrettyTable``, ``textwrap`` and
    ``play_utt``.

    Args:
        m_dict: map dictionary for the data set; each ``m_dict[utt]`` holds
            byte-string token lists under ``'es_w'`` and ``'en_w'``.
        v_dict: vocabulary dictionary; ``v_dict['i2w'][i]`` is the byte-string
            token for id ``i``.
        preds: one entry per utterance; a list of token ids, or anything else
            (rendered as an empty prediction).
        utts: utterance ids, aligned with ``preds``.
        dec_key: decoder key; predictions are joined with a space when it ends
            in ``'_w'`` and with nothing otherwise.
        key: unused, kept for call compatibility.
        play_audio: if True, call ``play_utt`` after each table.
        displayN: number of utterances to show (``-1`` shows all).
    """
    if displayN == -1:
        displayN = len(utts)

    def _join_bytes(tokens):
        # tokens are stored as bytes; decode before joining
        return " ".join(w.decode() for w in tokens)

    es_ref = []
    en_ref = []
    google_ref = []
    google_pred = []
    for u in utts:
        es_ref.append(_join_bytes(m_dict[u]['es_w']))
        # a plain list means a single reference; otherwise take the first one
        if isinstance(m_dict[u][dec_key], list):
            en_ref.append(_join_bytes(m_dict[u]['en_w']))
        else:
            en_ref.append(_join_bytes(m_dict[u]['en_w'][0]))
        google_pred.append(" ".join(google_hyp_r0[u]))
        google_ref.append(" ".join(google_dev_ref_0[u]))

    en_pred = []
    join_str = ' ' if dec_key.endswith('_w') else ''

    for p in preds:
        if isinstance(p, list):
            t_str = join_str.join([v_dict['i2w'][i].decode() for i in p])
            # Cut at the end-of-sentence marker only if it is present:
            # str.find returns -1 when missing, and slicing with -1 would
            # silently drop the last character of the prediction.
            eos_pos = t_str.find('_EOS')
            if eos_pos != -1:
                t_str = t_str[:eos_pos]
            en_pred.append(t_str)
        else:
            en_pred.append("")

    rows = sorted(zip(utts, es_ref, en_ref, en_pred, google_pred, google_ref))
    for u, es, en, p, g, gr in rows[:displayN]:
        print("Utterance: {0:s}".format(u))
        display_pp = PrettyTable(["cat", "sent"], hrules=True)
        display_pp.align = "l"
        display_pp.header = False
        display_pp.add_row(["es ref", textwrap.fill(es, 50)])
        display_pp.add_row(["en ref", textwrap.fill(en, 50)])
        display_pp.add_row(["model pred", textwrap.fill(p, 50)])
        # NOTE(review): sentence_bleu is given raw strings here, so n-grams are
        # formed over characters rather than word tokens -- confirm intended.
        display_pp.add_row(["model bleu", "{0:.2f}".format(sentence_bleu([en], p, smoothing_function=smooth_fun.method2))])
        display_pp.add_row(["google pred", textwrap.fill(g, 50)])
        display_pp.add_row(["google bleu", "{0:.2f}".format(sentence_bleu([gr], g, smoothing_function=smooth_fun.method2))])

        print(display_pp)
        if play_audio:
            play_utt(u, m_dict)
    

In [4]:
def make_pred(utt, X, y=None, display_limit=10):
    """Encode ``X`` with the global ``model`` and greedily decode it.

    Args:
        utt: utterance id, forwarded to ``decode_display``.
        X: encoder input batch; its first dimension is used as the batch size.
        y: optional decoder labels; when given, axes 0 and 1 are swapped
            before being handed to ``decode_display``.
        display_limit: forwarded to ``decode_display``.

    Returns:
        Whatever ``decode_display`` returns.
    """
    n_in_batch = X.shape[0]

    # run the encoder, then seed the decoder from the encoder state
    model.forward_enc(X)
    model.set_decoder_state()

    dec_labels = y if y is None else F.swapaxes(y, 0, 1)

    return decode_display(utt,
                          batch_size=n_in_batch,
                          pred_limit=model.m_cfg['max_en_pred'],
                          y=dec_labels,
                          display_limit=display_limit)

In [5]:
def decode_display(utt, batch_size, pred_limit, y=None, display_limit=10):
    """Greedy-decode with the global ``model`` and record per-step log-probabilities.

    The encoder must already have been run and the decoder state initialised
    (see ``make_pred``). Uses ``model``, ``v_dict``, ``GO_ID`` and ``EOS_ID``
    from the notebook namespace.

    Args:
        utt: utterance id (not used by the decoding itself).
        batch_size: number of sequences decoded in parallel.
        pred_limit: maximum number of steps when no labels are given.
        y: optional labels indexed by time step first. When given, ``y[0]``
            starts the decoder, ``len(y) - 1`` steps are made and the
            cross-entropy against ``y[step + 1]`` is accumulated.
        display_limit: how many of the most probable tokens to report per step.

    Returns:
        Tuple ``(preds, loss, report)``: ``preds`` is an array of predicted ids
        with shape ``(batch_size, n_steps)``; ``loss`` is the accumulated loss
        (``0`` without labels); ``report`` is a printable string listing, for
        every step, the top tokens and their log-probabilities for the first
        sequence in the batch.
    """
    xp = cuda.cupy if model.gpuid >= 0 else np
    # max number of predictions to make (overridden when labels are provided)
    stop_limit = pred_limit
    npred = 0
    loss = 0
    compute_loss = y is not None
    if compute_loss:
        stop_limit = len(y) - 1
        # first label initialises the decoder
        curr_word = y[0]
    else:
        # initialise the starting word to the GO_ID symbol
        curr_word = Variable(xp.full((batch_size,), GO_ID, dtype=xp.int32))

    # Per-sequence "EOS already predicted" flags. Kept on the host so the same
    # code runs with and without a GPU (no device context required).
    all_eos = np.zeros((batch_size,), dtype=np.bool_)

    a_units = model.m_cfg['attn_units']
    ht = Variable(xp.zeros((batch_size, a_units), dtype=xp.float32))

    prob_print_str = []
    # predictions are collected per step and stacked once at the end
    step_preds = []
    while npred < stop_limit:
        pred_out, ht = model.decode(curr_word, ht)
        pred_word = F.argmax(pred_out, axis=1)

        # log-probabilities of the first sequence only; cuda.to_cpu accepts
        # both numpy and cupy arrays, unlike xp.asnumpy
        pred_probs = cuda.to_cpu(F.log_softmax(pred_out).data[0])
        top_n_probs = np.argsort(pred_probs)[-display_limit:]
        prob_print_str.append("-" * 60)
        prob_print_str.append("predicting word : {0:d}".format(npred))
        for pi in top_n_probs[::-1]:
            prob_print_str.append("{0:10s} = {1:5.3f}".format(v_dict['i2w'][pi].decode(), pred_probs[pi]))

        step_preds.append(pred_word.data)

        if compute_loss:
            loss += F.softmax_cross_entropy(pred_out, y[npred+1],
                                            class_weight=model.mask_pad_id)

        # greedy decoding: feed the prediction back in
        curr_word = pred_word

        # stop once every sequence in the batch has emitted EOS at least once
        all_eos[cuda.to_cpu(pred_word.data) == EOS_ID] = True
        if all_eos.all():
            break
        npred += 1

    if step_preds:
        # always 2-D (n_steps, batch), even after a single step
        pred_sents = xp.vstack(step_preds)
    else:
        # no step was made (stop_limit <= 0): return an empty prediction
        pred_sents = xp.zeros((0, batch_size), dtype=xp.int32)
    return pred_sents.T, loss, "\n".join(prob_print_str)

In [6]:
def get_utt_data(eg_utt, curr_set):
    """Build a single-utterance batch for ``eg_utt`` from data set ``curr_set``.

    Reads ``m_cfg``, ``bucket_dict``, ``dev_key``, ``map_dict``, ``enc_key``,
    ``dec_key`` and ``vocab_dict`` from the notebook namespace and returns the
    result of ``get_batch``.
    """
    set_input_path = os.path.join(m_cfg['data_path'], curr_set)

    # NOTE(review): bucket sizes are always taken from ``dev_key``, not from
    # ``curr_set`` -- confirm this is intended for non-dev sets.
    dev_buckets = bucket_dict[dev_key]
    bucket_width = dev_buckets["width_b"]
    bucket_count = dev_buckets["num_b"]

    # presumably the 6th/7th arguments are encoder/decoder length limits -- TODO confirm
    return get_batch(map_dict[curr_set],
                     enc_key,
                     dec_key,
                     [eg_utt],
                     vocab_dict,
                     bucket_count * bucket_width,
                     200,
                     input_path=set_input_path)

### Fisher dev

In [7]:
cfg_path = "interspeech/sp_80hrs"

In [8]:
last_epoch, model, optimizer, m_cfg, t_cfg = check_model(cfg_path)

cnn_out_dim = rnn_in_units =  1280
using ADAM optimizer
--------------------------------------------------------------------------------
model found = 
interspeech/sp_80hrs/seq2seq_83.model
finished loading ..
optimizer found = interspeech/sp_80hrs/train.opt
finished loading optimizer ...


In [9]:
%%capture
train_key = m_cfg['train_set']
dev_key = m_cfg['dev_set']
batch_size=t_cfg['batch_size']
enc_key=m_cfg['enc_key']
dec_key=m_cfg['dec_key']
input_path = os.path.join(m_cfg['data_path'], m_cfg['dev_set'])
# -------------------------------------------------------------------------
# get data dictionaries
# -------------------------------------------------------------------------
map_dict, vocab_dict, bucket_dict = get_data_dicts(m_cfg)
batch_size = {'max': 1, 'med': 1, 'min': 1, 'scale': 1}

In [10]:
random.seed("meh")
# random.seed("haha")

In [11]:
# Eval parameters
ref_index = -1
min_len, max_len= 0, m_cfg['max_en_pred']
# min_len, max_len = 0, 10
displayN = 50
m_dict=map_dict[dev_key]
# wavs_path = os.path.join(m_cfg['data_path'], "wavs")
wavs_path = os.path.join("../chainer2/speech2text/both_fbank_out/", "wavs")
v_dict = vocab_dict['en_w']
key = m_cfg['dev_set']

In [12]:
os.chdir("..")
os.chdir("/afs/inf.ed.ac.uk/group/project/lowres/work/speech2text")

### View model

In [13]:
model.rnn_dec

['L0_dec', 'L1_dec', 'L2_dec']

In [14]:
def get_encoder_states():
    """Collect the current states of the encoder RNN layers of the global ``model``.

    Returns:
        ``{"c": [...], "h": [...]}`` with one entry per layer. For a
        bidirectional encoder each entry is the concatenation of the forward
        and reverse layer state. ``"c"`` is only filled for LSTM units.
    """
    hidden, cell = [], []
    if model.m_cfg['bi_rnn']:
        # pair every forward layer with its reverse counterpart
        for fwd, bwd in zip(model.rnn_enc, model.rnn_rev_enc):
            hidden.append(F.concat((model[fwd].h, model[bwd].h)))
            if model.m_cfg['rnn_unit'] == RNN_LSTM:
                cell.append(F.concat((model[fwd].c, model[bwd].c)))
    else:
        # zipping with the decoder layers limits the result to as many
        # encoder layers as there are decoder layers
        for enc, _ in zip(model.rnn_enc, model.rnn_dec):
            hidden.append(model[enc].h)
            if model.m_cfg['rnn_unit'] == RNN_LSTM:
                cell.append(model[enc].c)
    return {"c": cell, "h": hidden}
    # ---------------------------------------------------------------------

In [15]:
def get_decoder_states():
    """Collect the current states of every decoder RNN layer of the global ``model``.

    Returns:
        ``{"c": [...], "h": [...]}`` with one entry per decoder layer;
        ``"c"`` is only filled for LSTM units.
    """
    states = {"c": [], "h": []}
    for layer_name in model.rnn_dec:
        layer = model[layer_name]
        states["h"].append(layer.h)
        if model.m_cfg['rnn_unit'] == RNN_LSTM:
            states["c"].append(layer.c)
    return states
    # ---------------------------------------------------------------------

In [16]:
def set_decoder_states(rnn_states):
    """Load ``rnn_states`` into the decoder RNN layers of the global ``model``.

    Args:
        rnn_states: dict with per-layer lists under ``"h"`` (and ``"c"`` for
            LSTM units), indexed in the order of ``model.rnn_dec``.
    """
    for idx, layer_name in enumerate(model.rnn_dec):
        layer = model[layer_name]
        if model.m_cfg['rnn_unit'] != RNN_LSTM:
            layer.set_state(rnn_states["h"][idx])
            continue
        # LSTM layers take the cell state first, then the hidden state
        layer.set_state(rnn_states["c"][idx], rnn_states["h"][idx])
        # end if
    # end for all layers
    # ---------------------------------------------------------------------

In [17]:
def encode_utt_data(X):
    """Run the encoder of the global ``model`` on the input batch ``X``.

    Nothing is returned; the call only has the effect of ``model.forward_enc``.
    """
    model.forward_enc(X)

In [18]:
def init_hyp():
    """Create the initial beam-search hypothesis.

    The encoder must already have been run on the utterance, since the
    hypothesis stores the states returned by ``get_encoder_states``.

    Returns:
        dict with keys ``"hyp"`` (``[GO_ID]``), ``"score"`` (``0``),
        ``"dec_state"`` (encoder states used to seed the decoder) and
        ``"attn_v"`` (zero vector of shape ``(1, attn_units)``).
    """
    # pick the array module the same way the other decoding helpers do,
    # instead of depending on a global ``xp``
    xp = cuda.cupy if model.gpuid >= 0 else np
    a_units = model.m_cfg['attn_units']
    return {
        "hyp": [GO_ID],
        "score": 0,
        "dec_state": get_encoder_states(),
        "attn_v": Variable(xp.zeros((1, a_units), dtype=xp.float32)),
    }
    

In [None]:
model.set_decoder_state()

In [None]:
model.L0_dec.c[0,:5]

In [None]:
dec_state["c"][0][0,:5]

In [23]:
# Debug cell: run the encoder on one utterance and perform a single decoder
# step by hand, printing intermediate values and the 3 most probable tokens.
# `eg_utt` must already be defined; in this notebook it is assigned in a
# later cell, so this cell does not run on a fresh kernel in top-to-bottom order.
with chainer.using_config('train', False):
    batch_data = get_utt_data(eg_utt, "fisher_dev")
    model.forward_enc(batch_data['X'])
    print(batch_data['X'].shape)
    print(model.L2_enc.h.data[0,:5])

    # start from the initial hypothesis: GO symbol, encoder states, zero attention vector
    decode_entry = init_hyp()
    word_id, dec_state, attn_v = (decode_entry["hyp"][-1], 
                                            decode_entry["dec_state"], 
                                            decode_entry["attn_v"])

    # set decoder state
    set_decoder_states(dec_state)
#     model.set_decoder_state()

    curr_word = Variable(xp.full((1,), word_id, dtype=xp.int32))
#     a_units = m_cfg['attn_units']
#     ht = Variable(xp.zeros((1, a_units), dtype=xp.float32))
    # the steps below are presumably what model.decode does internally -- TODO confirm
    embed_id = model.embed_dec(curr_word)
    # ---------------------------------------------------------------------
    # apply rnn - input feeding, use previous ht
    # ---------------------------------------------------------------------
    rnn_in = F.concat((embed_id, attn_v), axis=1)
    h = model.feed_rnn(rnn_in, model.rnn_dec)
    # ---------------------------------------------------------------------
    # compute context vector
    # ---------------------------------------------------------------------
    cv, _ = model.compute_context_vector(h)
    cv_hdec = F.concat((cv, h), axis=1)
    # ---------------------------------------------------------------------
    # compute attentional hidden state
    # ---------------------------------------------------------------------
    ht = F.tanh(model.context(cv_hdec))
    # ---------------------------------------------------------------------
    # make prediction
    # ---------------------------------------------------------------------
    predicted_out = model.out(ht)
    # ---------------------------------------------------------------------
    print(predicted_out.data[0,:5])
    print(h.data[0,:5])
    print(model.L2_enc.h[0,:5])
    # log-probabilities of the single sequence; the last 3 argsort indices are
    # the most probable tokens, in ascending order of probability
    pred_probs = xp.asnumpy(F.log_softmax(predicted_out).data[0])
    top_n_probs = xp.argsort(pred_probs)[-3:]
    print(pred_probs[top_n_probs])
    print([v_dict['i2w'][pi].decode() for pi in top_n_probs])

(1, 393, 80)
[ 0.07863344  0.32638404 -0.40837365 -0.2300286  -0.2424436 ]
[-0.47259966 -0.47259966  7.5808063  -0.47259963  2.0798721 ]
[ 0.03225665 -0.02414448  0.03373034  0.09144363 -0.13371721]
variable([ 0.07863344  0.32638404 -0.40837365 -0.2300286  -0.2424436 ])
[-7.9359665  -3.6814308  -0.02831078]
['it', 'and', 'i']


In [40]:
def decode_beam_step(decode_entry, beam_width=3):
    """Expand one beam hypothesis by a single decoder step.

    Restores the decoder state stored in ``decode_entry``, feeds the last
    token of the hypothesis to ``model.decode`` and creates one new hypothesis
    for each of the ``beam_width`` most probable next tokens.

    Args:
        decode_entry: dict with ``"hyp"`` (list of token ids), ``"score"``
            (accumulated log-probability), ``"dec_state"`` and ``"attn_v"``.
        beam_width: number of continuations to generate.

    Returns:
        List of new hypothesis dicts, most probable continuation first. All of
        them share the same ``"dec_state"`` and ``"attn_v"`` objects.
    """
    xp = cuda.cupy if model.gpuid >= 0 else np

    with chainer.using_config('train', False):
        word_id = decode_entry["hyp"][-1]

        # put the decoder back into the state this hypothesis was left in
        set_decoder_states(decode_entry["dec_state"])

        # initialise the input symbol with the last token of the hypothesis
        curr_word = Variable(xp.full((1,), word_id, dtype=xp.int32))

        pred_out, ht = model.decode(curr_word, decode_entry["attn_v"])

        # cuda.to_cpu accepts numpy and cupy arrays alike, so this also works
        # when the model runs on the CPU (xp.asnumpy does not exist in numpy)
        pred_probs = cuda.to_cpu(F.log_softmax(pred_out).data[0])
        top_ids = np.argsort(pred_probs)[-beam_width:][::-1]

        # state after this step, shared by all continuations
        curr_dec_state = get_decoder_states()

        new_entries = []
        for pi in top_ids:
            new_entries.append({
                # store plain ints so hypotheses hold the same type as GO_ID
                "hyp": decode_entry["hyp"] + [int(pi)],
                "score": decode_entry["score"] + pred_probs[pi],
                "dec_state": curr_dec_state,
                "attn_v": ht,
            })

    return new_entries

In [41]:
def decode_beam(utt, curr_set, stop_limit=10, max_n=5, beam_width=3):
    """Beam-search decode a single utterance with the global ``model``.

    Args:
        utt: utterance id.
        curr_set: data set the utterance belongs to.
        stop_limit: maximum number of decoding steps.
        max_n: number of hypotheses kept after every step.
        beam_width: continuations generated per unfinished hypothesis.

    Returns:
        List of hypothesis dicts sorted by descending ``"score"``.
    """
    def _ended(entry):
        # a hypothesis is finished once its last token is EOS
        return entry["hyp"][-1] == EOS_ID

    with chainer.using_config('train', False):
        utt_batch = get_utt_data(utt, curr_set)
        model.forward_enc(utt_batch['X'])

        n_best = [init_hyp()]

        for step in range(stop_limit):
            if all(_ended(entry) for entry in n_best):
                print("all eos at step={0:d}".format(step))
                break

            candidates = []
            for entry in n_best:
                if _ended(entry):
                    # finished hypotheses are carried over unchanged
                    candidates.append(entry)
                else:
                    candidates.extend(decode_beam_step(entry, beam_width=beam_width))

            candidates.sort(key=lambda t: t["score"], reverse=True)
            n_best = candidates[:max_n]
    return n_best

In [42]:
eg_utt = '20051023_232057_325_fsp-A-3'
print(" ".join(map(bytes.decode, m_dict[eg_utt]["en_w"][0])))

i 'm from puerto rico but i live here in denver colorado


In [50]:
with chainer.using_config('train', False):
    n_best = decode_beam(eg_utt, "fisher_dev", stop_limit=20, max_n=8, beam_width=3)

all eos at step=13


In [51]:
for e in n_best:
    print(" ".join([v_dict['i2w'][i].decode() for i in e["hyp"]]))
    print(e["score"])
#     print(e["hyp"])
#     print(" ".join([w.decode() for i in e["hyp"] for w in v_dict["i2w"][i]]))

_GO i 'm from puerto rico but i live here in denver colorado _EOS
-1.732635498046875
_GO i 'm from puerto rico but i live here in denver _EOS
-2.7813310623168945
_GO i 'm from puerto rico but i live here in canada _EOS
-2.857640266418457
_GO i 'm from puerto rico but i live here in toronto _EOS
-3.0372314453125
_GO i 'm from puerto rico but i live here in toronto colorado _EOS
-3.4406538009643555
_GO i am from puerto rico but i live here in denver colorado _EOS
-3.9291458129882812
_GO i 'm from puerto rico but i live here in denver canada _EOS
-3.9817543029785156
_GO i 'm from puerto rico but live here in denver colorado _EOS
-4.247453689575195


In [49]:
with chainer.using_config('train', False):
    batch_data = get_utt_data(eg_utt, curr_set="fisher_dev")
    _, _, ha = make_pred(eg_utt, batch_data['X'], y=None, display_limit=1)
    print(ha)

------------------------------------------------------------
predicting word : 0
i          = -0.028
------------------------------------------------------------
predicting word : 1
'm         = -0.128
------------------------------------------------------------
predicting word : 2
from       = -0.010
------------------------------------------------------------
predicting word : 3
puerto     = -0.022
------------------------------------------------------------
predicting word : 4
rico       = -0.003
------------------------------------------------------------
predicting word : 5
but        = -0.063
------------------------------------------------------------
predicting word : 6
i          = -0.098
------------------------------------------------------------
predicting word : 7
live       = -0.009
------------------------------------------------------------
predicting word : 8
here       = -0.085
------------------------------------------------------------
predicting word : 9
in        

In [None]:
batch_data = get_utt_data(eg_utt, curr_set="fisher_dev")

In [None]:
model.forward_enc(batch_data['X'])

In [None]:
rnn_states = get_encoder_states()

In [None]:
# rnn_states["c"][0]

In [None]:
rnn_states["c"][0].shape

In [None]:
set_decoder_states(rnn_states)

In [None]:
model.L0_enc.h[0,:5]

In [None]:
model.L0_dec.h[0,:5]

In [None]:
batch_data['X'].shape

### beam data structure

hidden states for decoder:

-- get_rnn_states(layer_names): returns list with decoder hidden states

-- set_rnn_states(layer_names, hidden_states)

-- get_e

decoder_layer_name: {"c": , "h" :}

In [None]:
def generate_translate_probs(eg_utt, curr_set="fisher_dev", display_limit=5, display_probs=True, play_audio=False):
    """Decode one utterance greedily and display the result.

    The batch is built from ``eg_utt`` and ``curr_set`` themselves (via
    ``get_utt_data``) rather than from notebook-level variables, so the
    function always shows the utterance it was asked for.

    Args:
        eg_utt: utterance id.
        curr_set: data set the utterance belongs to.
        display_limit: number of top tokens reported per decoding step.
        display_probs: if True, print the per-step log-probability report.
        play_audio: forwarded to ``display_words``.
    """
    utt_list = [eg_utt]
    batch_data = get_utt_data(eg_utt, curr_set)

    with chainer.using_config('train', False):
        cuda.get_device(t_cfg['gpuid']).use()
        preds, _, probs_str = make_pred(eg_utt, X=batch_data['X'], display_limit=display_limit)

    display_words(map_dict[curr_set], v_dict,
                  preds.tolist(),
                  utt_list, dec_key,
                  curr_set,
                  play_audio=play_audio,
                  displayN=-1)

    if display_probs:
        print(probs_str)

In [None]:
def find_utts_with_word(word, text_key="en_w", set_key="fisher_dev", show_max_found=10):
    """Print utterances of ``map_dict[set_key]`` whose text contains ``word``.

    The match is a substring test on the space-joined text stored under
    ``text_key``. The total number of matches is printed, followed by at most
    ``show_max_found`` lines of the form ``utt | text | Spanish text``.
    """
    def _as_text(tokens):
        return " ".join([tok.decode() for tok in tokens])

    # training sets and Spanish text hold a flat token list; otherwise the
    # first entry of a list of token lists is used
    flat_tokens = "train" in set_key or text_key == "es_w"

    hits = []
    for utt, entry in map_dict[set_key].items():
        tokens = entry[text_key] if flat_tokens else entry[text_key][0]
        utt_text = _as_text(tokens)
        es_text = _as_text(entry["es_w"])
        if word in utt_text:
            hits.append("{0:s} | {1:s} | {2:s}".format(utt, utt_text, es_text))

    rule = "-" * 80
    print(rule)
    print("total instances found = {0:d}".format(len(hits)))
    print(rule)
    print("\n".join(hits[:show_max_found]))

In [None]:
find_utts_with_word("claro", text_key="es_w", set_key="fisher_dev")

In [None]:
eg_utt = "20051010_212418_225_fsp-A-32"
generate_translate_probs(eg_utt)

In [None]:
eg_utt = "20051023_232057_325_fsp-A-3"
generate_translate_probs(eg_utt)

In [None]:
MIN_LEN = 5

In [None]:
random.seed("aha")
sel_utts = random.sample([u for u in google_dev_ref_0.keys() if len(google_dev_ref_0[u]) > MIN_LEN], 20)

In [None]:
sel_utts

In [None]:
len(sel_utts)

In [None]:
for i, u in enumerate(sel_utts):
    print("-"*80)
    print("{0:d}".format(i))
    print("-"*80)
    generate_translate_probs(u, curr_set='fisher_dev', display_limit=3, display_probs=True)
    loss_v, loss_by_w = check_loss(u, curr_set='fisher_dev')
    #print("{0:20s} ||| {1:5.2f} ||| {2:5.2f} ||| {3:5.2f}".format(u, l, loss_v, loss_by_w))
    print("{0:20s} ||| {1:5.2f}".format(u, loss_by_w))

In [None]:
for i, u in enumerate(sel_utts):
    print("-"*80)
    print("{0:d}".format(i))
    print("-"*80)
    generate_translate_probs(u, curr_set='fisher_dev', display_limit=3, display_probs=True)
    loss_v, loss_by_w = check_loss(u, curr_set='fisher_dev')
    #print("{0:20s} ||| {1:5.2f} ||| {2:5.2f} ||| {3:5.2f}".format(u, l, loss_v, loss_by_w))
    print("{0:20s} ||| {1:5.2f}".format(u, loss_by_w))

In [None]:
prob_fname = os.path.join(m_cfg['model_dir'], "{0:s}_probs.json".format(eg_utt))

In [None]:
eg_utt = "20051017_234550_276_fsp-B-34"
print(check_loss(eg_utt, curr_set='fisher_dev'))
generate_translate_probs(eg_utt)

In [None]:
vocab_dict['en_w']['i2w'][2]

In [None]:
eg_utt = "20051026_180724_341_fsp-A-26"
generate_translate_probs(eg_utt)
check_loss(eg_utt, curr_set='fisher_dev')

In [None]:
eg_utt = "20051017_234550_276_fsp-A-13"
generate_translate_probs(eg_utt)
check_loss(eg_utt, curr_set='fisher_dev')

In [None]:
eg_utt = "20051018_210220_279_fsp-A-26"
generate_translate_probs(eg_utt)
check_loss(eg_utt, curr_set='fisher_dev')

In [None]:
find_utts_with_word("mhm", set_key="fisher_dev")

In [None]:
eg_utt = "20051019_210146_289_fsp-A-54"
generate_translate_probs(eg_utt, curr_set='fisher_dev')
check_loss(eg_utt, curr_set='fisher_dev')

In [None]:
utt_loss = {}

In [None]:
eg_utt = "20051017_220530_275_fsp-B-21.npy"
try:
    check_loss(eg_utt, curr_set='fisher_dev')
except Exception as err:
    # report the utterance that was actually tried, together with the real
    # error, instead of labelling every failure as "not found"
    print("{0:s} not found ({1!r})".format(eg_utt, err))

In [None]:
i = 0
utt_loss = {}
for utt in tqdm(map_dict['fisher_dev'], ncols=50):
    if utt not in utt_loss:
        try:
            # NOTE(review): other cells unpack check_loss() into two values,
            # while here .data is read from the result directly -- confirm
            # which form check_loss returns.
            loss = check_loss(utt, curr_set='fisher_dev')
            utt_loss[utt] = loss.data.tolist()
        except Exception as err:
            # keep going, but show the real error rather than assuming the
            # utterance is missing
            print("{0:s} failed: {1!r}".format(utt, err))

In [None]:
# normalize by length
utt_loss_normalize = {}
for utt in tqdm(utt_loss, ncols=50):
    utt_loss_normalize[utt] = utt_loss[utt] / (len(map_dict['fisher_dev'][utt]['en_w'][0])+1)

In [None]:
list(utt_loss.items())[:10]

In [None]:
# use a context manager so the file is flushed and closed deterministically
with open(os.path.join(cfg_path, "dev_utts_loss.dict"), "wb") as out_f:
    pickle.dump(utt_loss, out_f)

In [None]:
# use a context manager so the file is flushed and closed deterministically
with open(os.path.join(cfg_path, "dev_utts_loss_normalized.dict"), "wb") as out_f:
    pickle.dump(utt_loss_normalize, out_f)

In [None]:
# pickle can execute arbitrary code on load: only read files written by this notebook
with open(os.path.join(cfg_path, "dev_utts_loss.dict"), "rb") as in_f:
    utt_loss = pickle.load(in_f)

In [None]:
# pickle can execute arbitrary code on load: only read files written by this notebook
with open(os.path.join(cfg_path, "dev_utts_loss_normalized.dict"), "rb") as in_f:
    utt_loss_normalize = pickle.load(in_f)

In [None]:
bad_utts = sorted(utt_loss_normalize.items(), reverse=True, key=lambda t: t[1])

In [None]:
N_BAD_UTTS = 30

In [None]:
u = '20051026_180724_341_fsp-A-26'
# inspect the utterance selected above (the calls previously used whatever
# eg_utt happened to hold from an earlier cell)
generate_translate_probs(u, curr_set='fisher_dev')
check_loss(u, curr_set='fisher_dev')

In [None]:
bad_utts[:10]

In [None]:
bad_utts[-10:]

In [None]:
x, y = zip(*utt_loss_normalize.items())

In [None]:
sum([1 if i < 1 else 0 for i in y]), sum([1 if i > 5 else 0 for i in y]), len(y)

### dev utts - avg loss per word in utt

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(8,5)

ax = sns.distplot(y, kde=False, rug=False, ax=ax, color=tableau20[0]);
ax.set_xlabel("dev utts - avg loss per word in utt", size=20)

In [None]:
for i, (u, l) in enumerate(sorted(utt_loss_normalize.items(), reverse=True, key=lambda t: t[1])[:50]):
    print("-"*80)
    print("{0:d}".format(i))
    print("-"*80)
    generate_translate_probs(u, curr_set='fisher_dev', display_limit=3, display_probs=True)
    loss_v, loss_by_w = check_loss(u, curr_set='fisher_dev')
    #print("{0:20s} ||| {1:5.2f} ||| {2:5.2f} ||| {3:5.2f}".format(u, l, loss_v, loss_by_w))
    print("{0:20s} ||| {3:5.2f}".format(u, l, loss_v, loss_by_w))

To share:
Several utterance labels have typos, which gives a misleading signal about the prediction quality.

"20051017_180712_270_fsp-B-62"
dogs barking

"20051018_210220_279_fsp-A-71"
monopoly money -- monopoly occurs only 5 times in the train set, and never in the context of the game

"20051017_220530_275_fsp-B-61"
The decoding probabilities suggest that beam decoding (or, more likely, a language model) could help us catch up with Google. The Google model outputs "Texas"; our model has "Texas" as the second most probable word according to the acoustic model.


In [None]:
x = np.array([[ .759,  0.141,  .053]], dtype=np.float32)
t = np.array([1]).astype('i')
y = F.softmax_cross_entropy(x, t)
y

In [None]:
batch_data = get_utt_data(eg_utt, curr_set='fisher_dev')

In [None]:
X, y = batch_data['X'], batch_data['y']

In [None]:
[vocab_dict['en_w']['i2w'][i] for i in xp.asnumpy(y.data[0])]

### multilabel classification

In [None]:
bucket_dict['fisher_train']['buckets'][0][:5]

In [None]:
utt_list = bucket_dict['fisher_train']['buckets'][0][:5]
width_b = bucket_dict['fisher_train']['width_b']
local_input_path = os.path.join(m_cfg['data_path'], m_cfg['train_set'])

In [None]:
batch_data = get_batch(map_dict["fisher_train"], 
                       enc_key,
                       dec_key,
                       utt_list,
                       vocab_dict,
                       (0+1) * width_b,
                       200,
                       input_path=local_input_path)
    
X, y = batch_data['X'], batch_data['y']

In [None]:
y.shape, X.shape

In [None]:
# encode input
model.forward_enc(X)

In [None]:
y_t = F.swapaxes(y, 0, 1)

In [None]:
y_t.shape

In [None]:
len(y_t)

In [None]:
next_word.data

In [None]:
t = np.zeros(shape=(len(next_word),10), dtype='i')

In [None]:
t

In [None]:
t[0,[1]] = 1

In [None]:
sim_dict = pickle.load(open("../speech2text/fbanks_80dim_nltk/sim.dict", "rb"))

In [None]:
sim_dict['i'][4]

In [None]:
for next_word in y_t:
    print(next_word)
    t = np.zeros(shape=(len(next_word.data), 17000), dtype='i')
    print(next_word.data.tolist())
    for i,w in enumerate(next_word.data.tolist()):
        t[i,sim_dict['i'][w]] = 1
        print(t[i,[4,1044, 1045, 2477]])
    #print(t)

In [None]:
labels = xp.zeros((5,10)).astype('i')

In [None]:
next_word = [1,2,3,4,5]

In [None]:
for i, w in enumerate(next_word):
    print(i, w)
    labels[i,[w]] = 1

In [None]:
labels

In [None]:
next_word.data.tolist()

In [None]:
x = np.random.randn(1,17000).astype('f')
x = np.zeros((1,17000)).astype('f')

In [None]:
x[0,2] = 10.0
x[0,0] = -10.0

In [None]:
x

In [None]:
F.log_softmax(x).data

In [None]:
t = 0 * np.ones((1,17000), dtype='i')

In [None]:
t[0,2] = 1

In [None]:
x, t

In [None]:
F.sigmoid_cross_entropy(x, t, reduce='no', normalize=False)

In [None]:
F.sigmoid_cross_entropy(x, t, reduce='no', normalize=True)

In [None]:
F.sigmoid_cross_entropy(x, t, reduce='mean', normalize=True)

In [None]:
F.sigmoid_cross_entropy(x, t, reduce='mean', normalize=False)

In [None]:
F.sigmoid_cross_entropy(x, t, normalize=True)

In [None]:
x = np.array([[-2.0, 3.0, -2.0], [-2.0, 3.0, -2.0]]).astype('f')
t = np.array([[-1, -1, -1], [0, 1, 0]]).astype('i')

In [None]:
F.sigmoid_cross_entropy(x, t)

In [None]:
x = np.zeros((1,10), dtype="f")

In [None]:
x[0,[2]] = 10.0

In [None]:
x[:,list(range(2))+list(range(3,10))] = -.2

In [None]:
x[:,:10]

In [None]:
t = np.zeros((1,10), dtype="i")
t[0,5] = 1

In [None]:
F.sigmoid_cross_entropy(x, t)

In [None]:
predicted_out[:1], xp.argmax(predicted_out[:1].data)

In [None]:
labels = 0 * xp.ones(predicted_out[:1].shape).astype('i')

In [None]:
labels.shape

In [None]:
t = xp.zeros(predicted_out[:1].shape).astype('f')

In [None]:
t.shape

In [None]:
t[0,[2,5]] = 50.0

In [None]:
labels[0,[2,5]] = 1

In [None]:
F.sigmoid_cross_entropy(t, labels)

In [None]:
F.sigmoid_cross_entropy(F.softmax(predicted_out[:1]), xp.expand_dims(labels, axis=0))

In [None]:
from gensim.models import KeyedVectors

In [None]:
# Look up the sim_dict entry for the word b'rico' and for word id 4.
# presumably sim_dict['w'] is keyed by word bytes and sim_dict['i'] by word id
# -- TODO confirm against where sim_dict is built.
sim_dict['w'][b'rico'], sim_dict['i'][4]

In [None]:
# Draw one random element from the entry for id 4.
# NOTE(review): `w` is not defined in this cell; it is whatever an earlier
# loop left bound, so the second value depends on run order.
xp.random.choice(sim_dict['i'][4], 1), sim_dict['i'][w]

In [None]:
# Debug walk over t_alt.
# NOTE(review): `t_alt` is assigned inside the decoding-loop cell further down;
# unless it is also defined earlier, this cell and the next depend on run order.
for i in range(len(t_alt)):
    print(t_alt, i)
    print(t_alt[i])
    #t_alt[i] = xp.random.sample(sim_dict['i'][t_alt[i]])

In [None]:
# For each entry of t_alt, print one random draw from its sim_dict['i'] entry,
# followed by the raw value, its type and its int() conversion (the int() cast
# is what is used as the dictionary key).
print(t_alt)
for i in range(len(t_alt)):
    print(xp.random.choice(sim_dict['i'][int(t_alt[i])],1))
    print(t_alt[i],type(t_alt[i]), int(t_alt[i]))

In [None]:
# Teacher-forced pass of the decoder over one batch (y_t). At every step the
# loss is computed against a target sampled at random from sim_dict['i'] for
# the gold next word, and the loss against the gold word is printed alongside
# for comparison.
#
# Changes relative to the earlier version of this cell:
#   * the per-step average divides by the number of decoding steps of the batch
#     actually being decoded, instead of reading the shape of the unrelated
#     global `y`;
#   * the second value in the per-step print was labelled "sigmoid" although it
#     is a softmax cross entropy; the label now says what is printed;
#   * the sampled id is unwrapped to a scalar before being stored, rather than
#     assigning a length-1 array into a single element.
decoder_batch = y_t
batch_size = decoder_batch.shape[1]
loss = 0
# ---------------------------------------------------------------------
# initialize hidden states as a zero vector
# ---------------------------------------------------------------------
a_units = model.m_cfg['attn_units']
ht = Variable(xp.zeros((batch_size, a_units), dtype=xp.float32))
# ---------------------------------------------------------------------
# feed the gold tokens one step at a time (teacher forcing)
for curr_word, next_word in zip(decoder_batch, decoder_batch[1:]):
    print(curr_word, next_word)
    decoder_input = curr_word
    # -----------------------------------------------------------------
    # decode one step
    # -----------------------------------------------------------------
    predicted_out, ht = model.decode(decoder_input, ht)
    # Greedy prediction, kept bound for inspection only: the next iteration
    # resets decoder_input to the gold token, so it is never fed back.
    decoder_input = F.argmax(predicted_out, axis=1)
    # -----------------------------------------------------------------
    # compute loss
    # -----------------------------------------------------------------
    # Replace every gold id by one id drawn from its sim_dict['i'] entry.
    t_alt = xp.copy(next_word.data)
    print(t_alt)
    for i in range(len(t_alt)):
        t_alt[i] = xp.random.choice(sim_dict['i'][int(t_alt[i])], 1)[0]
    print(t_alt)

    # (An alternative multi-label target -- one row per item with a 1 for every
    # id in sim_dict['i'][w], -1 for PAD rows, scored with
    # F.sigmoid_cross_entropy -- was tried here previously.)
    loss_arr = F.softmax_cross_entropy(predicted_out, t_alt, normalize=True)
    print("softmax cross entropy:", F.softmax_cross_entropy(predicted_out, next_word), "sampled-target softmax cross entropy:", loss_arr.data.tolist())
    loss += loss_arr

    # -----------------------------------------------------------------
# Total loss and loss averaged over the decoding steps taken above.
print(loss.data.tolist(), (loss / (decoder_batch.shape[0]-1)).data.tolist())

### Compare BLEU scores at utterance level

In [None]:
# Sentence-level BLEU (smoothing method2) for one hand-picked utterance:
# first the google_* reference/hypothesis pair, then the model_s2t_* pair.
eg_utt = "20051023_232057_325_fsp-A-3"
print(sentence_bleu([google_dev_ref_0[eg_utt]], google_hyp_r0[eg_utt], smoothing_function=smooth_fun.method2))
print(sentence_bleu([model_s2t_refs[eg_utt]], model_s2t_hyps[eg_utt], smoothing_function=smooth_fun.method2))

In [None]:
# Same comparison for a second utterance, this time also printing the
# reference and hypothesis of each pair.
eg_utt = "20051019_190221_288_fsp-B-1"
print(sentence_bleu([google_dev_ref_0[eg_utt]], google_hyp_r0[eg_utt], smoothing_function=smooth_fun.method2))
print(sentence_bleu([model_s2t_refs[eg_utt]], model_s2t_hyps[eg_utt], smoothing_function=smooth_fun.method2))
print(google_dev_ref_0[eg_utt], google_hyp_r0[eg_utt])
print(model_s2t_refs[eg_utt], model_s2t_hyps[eg_utt])

In [None]:
# Seed the global RNG and shuffle the utterance ids so the comparison loops
# below visit them in a fixed pseudo-random order.
# NOTE(review): the order is only repeatable if the key order of
# model_s2t_refs is itself the same from run to run.
random.seed("haha")
dev_utts = list(model_s2t_refs.keys())
random.shuffle(dev_utts)

In [None]:
# Create <model_dir>/probs; exist_ok=True keeps the cell re-runnable instead of
# raising FileExistsError once the directory is there.
os.makedirs(os.path.join(m_cfg['model_dir'], "probs"), exist_ok=True)

In [None]:
# Walk the shuffled dev utterances and, for those whose google_* sentence BLEU
# is at least 0.5 and at least twice the model's, print a running counter and
# call generate_translate_probs on the utterance.
print("google beats model by factor of 2")

count = 0
for utt in dev_utts:
    # Only references shorter than 10 are considered.
    if not len(model_s2t_refs[utt]) < 10:
        continue
    google_utt_bleu = sentence_bleu(
        [google_dev_ref_0[utt]],
        google_hyp_r0[utt],
        smoothing_function=smooth_fun.method2,
    )
    model_utt_bleu = sentence_bleu(
        [model_s2t_refs[utt]],
        model_s2t_hyps[utt],
        smoothing_function=smooth_fun.method2,
    )
    if google_utt_bleu >= 0.5 and google_utt_bleu >= (2 * model_utt_bleu):
        count += 1
        print("-"*80, count, "-"*80, sep="\n")
        generate_translate_probs(utt)
    # Stop once more than 50 utterances have been reported.
    if count > 50:
        break


In [None]:
# Which English word has id 494 (hard-coded id picked by hand).
vocab_dict['en_w']['i2w'][494]

In [None]:
# Run generate_translate_probs for one hand-picked utterance.
eg_utt = "20051009_210519_219_fsp-A-16"
generate_translate_probs(eg_utt)

In [None]:
# Walk the shuffled dev utterances and, for those whose model sentence BLEU is
# at least 0.5 and at least 1.5x the google_* one, print a running counter and
# call generate_translate_probs on the utterance.
# The banner used to say "factor of 2" while the filter applies 1.5; it now
# states the ratio that is actually used.
print("model beats google by factor of 1.5")

count = 0
for utt in dev_utts:
    # Only references with length strictly between 3 and 20 are considered.
    if not (3 < len(model_s2t_refs[utt]) < 20):
        continue
    google_utt_bleu = sentence_bleu([google_dev_ref_0[utt]], google_hyp_r0[utt], smoothing_function=smooth_fun.method2)
    model_utt_bleu = sentence_bleu([model_s2t_refs[utt]], model_s2t_hyps[utt], smoothing_function=smooth_fun.method2)
    if model_utt_bleu >= (1.5 * google_utt_bleu) and model_utt_bleu >= 0.5:
        count += 1
        print("-"*80)
        print(count)
        print("-"*80)
        generate_translate_probs(utt)
    # Stop once more than 50 utterances have been reported.
    if count > 50:
        break


In [None]:
# Sizes of the 'es_w' and 'en_w' word-to-id tables.
len(vocab_dict['es_w']['w2i']), len(vocab_dict['en_w']['w2i'])

In [None]:
# Key sets of the two tables.
es_words = set(vocab_dict['es_w']['w2i'].keys())
en_words = set(vocab_dict['en_w']['w2i'].keys())

In [None]:
len(es_words), len(en_words)

In [None]:
# Keys present in both tables.
common_words = es_words & en_words

In [None]:
len(common_words)

In [None]:
# Frequency of every shared word in each vocabulary, plus a combined table that
# maps each shared word to an (en_w count, es_w count) pair.
freq_common_es = dict((word, vocab_dict['es_w']['freq'][word]) for word in common_words)
freq_common_en = dict((word, vocab_dict['en_w']['freq'][word]) for word in common_words)
freq_common_both = dict(
    (word, (vocab_dict['en_w']['freq'][word], vocab_dict['es_w']['freq'][word]))
    for word in common_words
)


In [None]:
len(freq_common_es), len(freq_common_en)

In [None]:
# Total frequency mass of the shared words in each table.
sum(freq_common_es.values()), sum(freq_common_en.values())

In [None]:
# Entry count and total frequency of the full 'es_w' and 'en_w' tables.
len(vocab_dict['es_w']['freq']), sum(vocab_dict['es_w']['freq'].values()), len(vocab_dict['en_w']['freq']), sum(vocab_dict['en_w']['freq'].values())

In [None]:
# NOTE(review): hard-coded numbers, presumably copied from the outputs of the
# two cells above (shared-word mass / total mass per table) -- TODO confirm and
# compute from the variables so the ratios cannot go stale.
844202 / 1496796, 1282482 / 1497356

In [None]:
# Spot check: count of b'que' in the 'en_w' frequency table.
freq_common_en[b'que']

In [None]:
# Shared words sorted by descending 'es_w' frequency.
# NOTE(review): this and the next two cells display the full list unsliced.
sorted(freq_common_es.items(), reverse=True, key= lambda t: t[1])

In [None]:
# Shared words sorted by descending 'en_w' frequency.
sorted(freq_common_en.items(), reverse=True, key= lambda t: t[1])

In [None]:
# Shared words sorted by descending (en_w count, es_w count) pair; tuples
# compare element-wise, so the en_w count is the primary key.
sorted(freq_common_both.items(), reverse=True, key= lambda t: t[1])

In [None]:
# Minimum count a shared word must reach in BOTH tables to be kept.
C = 20
common_in_both = [
    word
    for word, counts in freq_common_both.items()
    if counts[0] >= C and counts[1] >= C
]

In [None]:
# How many shared words pass the count threshold C in both tables ...
len(common_in_both)

In [None]:
# ... and the words themselves.
common_in_both