In [1]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, modified_precision
from nltk.translate.chrf_score import sentence_chrf, corpus_chrf
from nltk.metrics import scores
import scipy.io.wavfile
from IPython.display import Audio
from IPython.display import display
from nltk.stem import *
# from nltk.stem.snowball import SnowballStemmer
from stemming.porter2 import stem
import stemming
from nltk.metrics.scores import recall

from nltk.corpus import stopwords

%matplotlib inline

In [2]:
tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),    
             (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),    
             (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),    
             (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),    
             (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]    
# Scale the RGB values to the [0, 1] range, which is the format matplotlib accepts.    
for i in range(len(tableau20)):    
    r, g, b = tableau20[i]    
    tableau20[i] = (r / 255., g / 255., b / 255.)

In [3]:
smooth_fun = nltk.translate.bleu_score.SmoothingFunction()

In [4]:
from nmt_run import *

In [5]:
def play_utt(utt, m_dict):
    sr, y = scipy.io.wavfile.read(os.path.join(wavs_path, utt.rsplit("-",1)[0]+'.wav'))
    start_t = min(seg['start'] for seg in m_dict[utt]['seg'])
    end_t = max(seg['end'] for seg in m_dict[utt]['seg'])
    print(start_t, end_t)
    start_t_samples, end_t_samples = int(start_t*sr), int(end_t*sr)
    display(Audio(y[start_t_samples:end_t_samples], rate=sr))

In [6]:
def display_words(m_dict, v_dict, preds, utts, dec_key, key, play_audio=False, displayN=-1):
    if displayN == -1:
        displayN = len(utts)
    es_ref = []
    en_ref = []
    google_pred = []
    for u in utts:
        es_ref.append(" ".join([w.decode() for w in m_dict[u]['es_w']]))
        if type(m_dict[u][dec_key]) == list:
            en_ref.append(" ".join([w.decode() for w in m_dict[u]['en_w']]))
        else:
            en_ref.append(" ".join([w.decode() for w in m_dict[u]['en_w'][0]]))
        google_pred.append(" ".join(google_hyp_r0[u]))

    en_pred = []
    join_str = ' ' if dec_key.endswith('_w') else ''

    for p in preds:
        if type(p) == list:
            t_str = join_str.join([v_dict['i2w'][i].decode() for i in p])
            t_str = t_str[:t_str.find('_EOS')]
            en_pred.append(t_str)
        else:
            en_pred.append("")
        

    for u, es, en, p, g in sorted(list(zip(utts, es_ref, en_ref, en_pred, google_pred)))[:displayN]:
        # for reference, 1st word is GO_ID, no need to display
        print("Utterance: {0:s}".format(u))
        display_pp = PrettyTable(["cat","sent"], hrules=True)
        display_pp.align = "l"
        display_pp.header = False
        display_pp.add_row(["es ref", textwrap.fill(es,50)])
        display_pp.add_row(["en ref", textwrap.fill(en,50)])
        display_pp.add_row(["en pred", textwrap.fill(p,50)])
        display_pp.add_row(["google pred", textwrap.fill(g,50)])
        

        print(display_pp)
        if play_audio:
            play_utt(u, m_dict)
    

In [7]:
def make_pred(utt, X, y=None, display_limit=10):
    # get shape
    batch_size = X.shape[0]
    # encode input
    model.forward_enc(X)
    # ---------------------------------------------------------------------
    # initialize decoder LSTM to final encoder state
    # ---------------------------------------------------------------------
    model.set_decoder_state()
    # ---------------------------------------------------------------------
    # swap axes of the decoder batch
    if y is not None:
        y = F.swapaxes(y, 0, 1)
    # -----------------------------------------------------------------
    # predict
    # -----------------------------------------------------------------
    # make return statements consistent
    return(decode_display(utt, batch_size=batch_size,
                          pred_limit=model.m_cfg['max_en_pred'],
#                           pred_limit=20,
                          y=y, display_limit=display_limit))

In [8]:
def decode_display(utt, batch_size, pred_limit, y=None, display_limit=10):
    xp = cuda.cupy if model.gpuid >= 0 else np
    # max number of predictions to make
    # if labels are provided, this variable is not used
    stop_limit = pred_limit
    # to track number of predictions made
    npred = 0
    # to store loss
    loss = 0
    # if labels are provided, use them for computing loss
    compute_loss = True if y is not None else False
    # ---------------------------------------------------------------------
    if compute_loss:
        stop_limit = len(y)-1
        # get starting word to initialize decoder
        curr_word = y[0]
    else:
        # intialize starting word to GO_ID symbol
        curr_word = Variable(xp.full((batch_size,), GO_ID, dtype=xp.int32))
    # ---------------------------------------------------------------------
    # flag to track if all sentences in batch have predicted EOS
    # ---------------------------------------------------------------------
    with cupy.cuda.Device(model.gpuid):
        check_if_all_eos = xp.full((batch_size,), False, dtype=xp.bool_)
    # ---------------------------------------------------------------------
    a_units = m_cfg['attn_units']
    ht = Variable(xp.zeros((batch_size, a_units), dtype=xp.float32))
    # ---------------------------------------------------------------------
    prob_out = {}
    prob_print_str = []
    while npred < (stop_limit):
        # -----------------------------------------------------------------
        # decode and predict
        pred_out, ht = model.decode(curr_word, ht)
        pred_word = F.argmax(pred_out, axis=1)
        # -----------------------------------------------------------------
        # printing conditional probabilities
        # -----------------------------------------------------------------
        pred_probs = xp.asnumpy(F.softmax(pred_out).data[0])
        top_n_probs = np.argsort(pred_probs)[-display_limit:]
        #print("-"*60)
        #print("predicting word : {0:d}".format(npred))
        prob_print_str.append("-" * 60)
        prob_print_str.append("predicting word : {0:d}".format(npred))
        
        prob_out[npred] = {}
        for pi in top_n_probs[::-1]:
            prob_out[npred][v_dict['i2w'][pi].decode()] = "{0:.3f}".format(pred_probs[pi])
            #print("{0:10s} = {1:5.3f}".format(v_dict['i2w'][pi].decode(), pred_probs[pi]))
            prob_print_str.append("{0:10s} = {1:5.3f}".format(v_dict['i2w'][pi].decode(), pred_probs[pi]))
            
        # -----------------------------------------------------------------
        # save prediction at this time step
        # -----------------------------------------------------------------
        if npred == 0:
            pred_sents = pred_word.data
        else:
            pred_sents = xp.vstack((pred_sents, pred_word.data))
        # -----------------------------------------------------------------
        if compute_loss:
            # compute loss
            loss += F.softmax_cross_entropy(pred_out, y[npred+1],
                                               class_weight=model.mask_pad_id)
        # -----------------------------------------------------------------
        curr_word = pred_word
        # -----------------------------------------------------------------
        # check if EOS is predicted for all sentences
        # -----------------------------------------------------------------
        check_if_all_eos[pred_word.data == EOS_ID] = True
        if xp.all(check_if_all_eos):
            break
        # -----------------------------------------------------------------
        # increment number of predictions made
        npred += 1
        # -----------------------------------------------------------------
    
    out_fname = os.path.join(m_cfg['model_dir'], "probs", "{0:s}_probs.json".format(utt))
    with open(out_fname, "w") as out_f:
        json.dump(prob_out, out_f, indent=4)
    print("saved probs in : {0:s}".format(out_fname))
    return pred_sents.T, loss, "\n".join(prob_print_str)

In [9]:
def check_loss(eg_utt, curr_set='fisher_dev', teacher_ratio=1.0):
    # get shape
    if "train" in curr_set:
        local_input_path = os.path.join(m_cfg['data_path'], m_cfg['train_set'])
        play_audio = False
    else:
        local_input_path = os.path.join(m_cfg['data_path'], m_cfg['dev_set'])
        play_audio = True
        
    eg_utt_bucket = -1
    for i, bucket in enumerate(bucket_dict[curr_set]["buckets"]):
        if eg_utt in bucket:
            eg_utt_bucket = i
            #print("found")
        # end if
    # end for
    #print("found in bucket : {0:d}".format(eg_utt_bucket))
    width_b = bucket_dict[dev_key]["width_b"]
    utt_list = [eg_utt]
    
    batch_data = get_batch(map_dict[curr_set], 
                           enc_key,
                           dec_key,
                           utt_list,
                           vocab_dict,
                           (eg_utt_bucket+1) * width_b,
                           200,
                           input_path=local_input_path)
    
    X, y = batch_data['X'], batch_data['y']
    
    batch_size = X.shape[0]
    # encode input
    model.forward_enc(X)
    # ---------------------------------------------------------------------
    # initialize decoder LSTM to final encoder state
    # ---------------------------------------------------------------------
    model.set_decoder_state()
    # ---------------------------------------------------------------------
    y = F.swapaxes(y, 0, 1)
        
    xp = cuda.cupy if model.gpuid >= 0 else np
    
    decoder_batch = y 
    batch_size = decoder_batch.shape[1]
    loss = 0
    # ---------------------------------------------------------------------
    # initialize hidden states as a zero vector
    # ---------------------------------------------------------------------
    a_units = model.m_cfg['attn_units']
    ht = Variable(xp.zeros((batch_size, a_units), dtype=xp.float32))
    # ---------------------------------------------------------------------
    decoder_input = decoder_batch[0]
    # for all sequences in the batch, feed the characters one by one
    for curr_word, next_word in zip(decoder_batch, decoder_batch[1:]):
        #print(curr_word, next_word)
        # -----------------------------------------------------------------
        # teacher forcing logic
        # -----------------------------------------------------------------
        use_label = True if random.random() < teacher_ratio else False
        if use_label:
            decoder_input = curr_word
        # -----------------------------------------------------------------
        # encode tokens
        # -----------------------------------------------------------------
        predicted_out, ht = model.decode(decoder_input, ht)
        decoder_input = F.argmax(predicted_out, axis=1)
        #print(decoder_input)
        # -----------------------------------------------------------------
        # compute loss
        # -----------------------------------------------------------------
        loss_arr = F.softmax_cross_entropy(predicted_out, next_word,
                                           class_weight=model.mask_pad_id)
        #print(loss_arr.data.tolist())
        loss += loss_arr
        # -----------------------------------------------------------------
    #print(loss, loss / (y.shape[0]-2), y.shape)
    return loss.data.tolist(), (loss / (y.shape[0]-1)).data.tolist()

In [10]:
def get_utt_data(eg_utt, curr_set='fisher_dev'):
    # get shape
    if "train" in curr_set:
        local_input_path = os.path.join(m_cfg['data_path'], m_cfg['train_set'])
        play_audio = False
    else:
        local_input_path = os.path.join(m_cfg['data_path'], m_cfg['dev_set'])
        play_audio = True
        
    eg_utt_bucket = -1
    for i, bucket in enumerate(bucket_dict[curr_set]["buckets"]):
        if eg_utt in bucket:
            eg_utt_bucket = i
            #print("found")
        # end if
    # end for
    #print("found in bucket : {0:d}".format(eg_utt_bucket))
    width_b = bucket_dict[dev_key]["width_b"]
    utt_list = [eg_utt]
    
    
    batch_data = get_batch(map_dict[curr_set], 
                           enc_key,
                           dec_key,
                           utt_list,
                           vocab_dict,
                           (eg_utt_bucket+1) * width_b,
                           200,
                           input_path=local_input_path)
    
    return batch_data

### Fisher dev

In [11]:
cfg_path = "./sp2enw_mel-80_vocab-nltk/sp_1.0"

In [12]:
last_epoch, model, optimizer, m_cfg, t_cfg = check_model(cfg_path)

cnn_out_dim = rnn_in_units =  640




using SGD optimizer
--------------------------------------------------------------------------------
model found = 
./sp2enw_mel-80_vocab-nltk/sp_1.0/seq2seq_83.model
finished loading ..
optimizer found = ./sp2enw_mel-80_vocab-nltk/sp_1.0/train.opt
finished loading optimizer ...


In [13]:
train_key = m_cfg['train_set']
dev_key = m_cfg['dev_set']
batch_size=t_cfg['batch_size']
enc_key=m_cfg['enc_key']
dec_key=m_cfg['dec_key']
input_path = os.path.join(m_cfg['data_path'], m_cfg['dev_set'])
# -------------------------------------------------------------------------
# get data dictionaries
# -------------------------------------------------------------------------
map_dict, vocab_dict, bucket_dict = get_data_dicts(m_cfg)
batch_size = {'max': 96, 'med': 128, 'min': 256, 'scale': 1}

--------------------------------------------------
loading dict: fbanks_80dim_nltk/map.dict


100%|██████████| 3977/3977 [00:00<00:00, 484553.29it/s]
100%|██████████| 3960/3960 [00:00<00:00, 351984.48it/s]
  0%|          | 0/3641 [00:00<?, ?it/s]

loading dict: fbanks_80dim_nltk/train_vocab.dict
--------------------------------------------------
--------------------------------------------------
loading info_dict from=fbanks_80dim_nltk/info.dict
--------------------------------------------------
creating buckets for: fisher_dev
creating buckets for key: sp
creating buckets for: fisher_dev2
creating buckets for key: sp
creating buckets for: fisher_test
creating buckets for key: sp


100%|██████████| 3641/3641 [00:00<00:00, 492675.45it/s]
 36%|███▌      | 49338/138720 [00:00<00:00, 493235.83it/s]

creating buckets for: fisher_train
creating buckets for key: sp


100%|██████████| 138720/138720 [00:00<00:00, 547085.19it/s]
100%|██████████| 3803/3803 [00:00<00:00, 498217.71it/s]
100%|██████████| 1824/1824 [00:00<00:00, 506381.42it/s]
100%|██████████| 14294/14294 [00:00<00:00, 503627.94it/s]


creating buckets for: callhome_devtest
creating buckets for key: sp
creating buckets for: callhome_evltest
creating buckets for key: sp
creating buckets for: callhome_train
creating buckets for key: sp
--------------------------------------------------
saving info dict in: fbanks_80dim_nltk/buckets_sp.dict
all done ...
loading dict: fbanks_80dim_nltk/buckets_sp.dict
--------------------------------------------------
utterances in fisher_dev = 3979
utterances in fisher_dev2 = 3961
utterances in fisher_test = 3641
utterances in fisher_train = 138819
utterances in callhome_devtest = 3966
utterances in callhome_evltest = 1829
utterances in callhome_train = 15080
vocab size for sp = 0
vocab size for en_w = 17834


In [14]:
random.seed("meh")
# random.seed("haha")

In [15]:
# Eval parameters
ref_index = -1
min_len, max_len= 0, m_cfg['max_en_pred']
# min_len, max_len = 0, 5
displayN = 50
m_dict=map_dict[dev_key]
# wavs_path = os.path.join(m_cfg['data_path'], "wavs")
wavs_path = os.path.join("../chainer2/speech2text/both_fbank_out/", "wavs")
v_dict = vocab_dict['en_w']
key = m_cfg['dev_set']

In [16]:
os.chdir("..")
os.chdir("/afs/inf.ed.ac.uk/group/project/lowres/work/speech2text")

### Load google refs and preds

In [17]:
google_s2t_refs_path = os.path.join("../chainer2/speech2text/both_fbank_out/", "google_s2t_refs.dict")
google_s2t_hyps_path = os.path.join("../chainer2/speech2text/both_fbank_out/", "google_s2t_hyps.dict")

In [18]:
google_s2t_hyps = pickle.load(open(google_s2t_hyps_path, "rb"))
google_hyp_r0 = google_s2t_hyps['fisher_dev_r0']

google_s2t_refs = pickle.load(open(google_s2t_refs_path, "rb"))
google_dev_ref_0 = google_s2t_refs['fisher_dev_ref_0']

In [19]:
model_s2t_refs = pickle.load(open(os.path.join(cfg_path, "model_s2t_refs.dict"), "rb"))
model_s2t_hyps = pickle.load(open(os.path.join(cfg_path, "model_s2t_hyps.dict"), "rb"))

### View model

In [20]:
def generate_translate_probs(eg_utt, curr_set="fisher_dev", display_limit=5, display_probs=True):
    if "train" in curr_set:
        local_input_path = os.path.join(m_cfg['data_path'], m_cfg['train_set'])
        play_audio = False
    else:
        local_input_path = os.path.join(m_cfg['data_path'], m_cfg['dev_set'])
        play_audio = True
        
    eg_utt_bucket = -1
    for i, bucket in enumerate(bucket_dict[curr_set]["buckets"]):
        if eg_utt in bucket:
            eg_utt_bucket = i
            #make_predprint("found")
        # end if
    # end for
    #print("found in bucket : {0:d}".format(eg_utt_bucket))
    width_b = bucket_dict[dev_key]["width_b"]
    utt_list = [eg_utt]
    
    
    batch_data = get_batch(map_dict[curr_set], 
                           enc_key,
                           dec_key,
                           utt_list,
                           vocab_dict,
                           (eg_utt_bucket+1) * width_b,
                           200,
                           input_path=local_input_path)
    
    with chainer.using_config('train', False):
        cuda.get_device(t_cfg['gpuid']).use()
        preds, _, probs_str = make_pred(eg_utt, X=batch_data['X'], display_limit=display_limit)
        #preds, _ = make_pred(eg_utt, X=batch_data['X'][:,-150:,:], display_limit=10)
        loss_val = 0.0
    
    display_words(map_dict[curr_set], v_dict, 
                  preds.tolist(), 
                  utt_list, dec_key, 
                  key, 
                  play_audio=play_audio, 
                  displayN=displayN)
    
    if display_probs:
        print(probs_str)

In [21]:
def find_utts_with_word(word, set_key="fisher_dev", show_max_found=10):
    total_found = 0
    out_str = []
    for utt, entry in map_dict[set_key].items():
        if "train" in set_key:
            words_in_utt = " ".join([w.decode() for w in entry['en_w']])
        else:
            words_in_utt = " ".join([w.decode() for w in entry['en_w'][0]])
        es_words_in_utt = " ".join([w.decode() for w in entry['es_w']])        
        #if "puerto" in words_in_utt:
        if word in words_in_utt:
            out_str.append("{0:s} | {1:s} | {2:s}".format(utt, words_in_utt, es_words_in_utt))
            total_found += 1
    
    print("-" * 80)
    print("total instances found = {0:d}".format(total_found))
    print("-" * 80)
    print("\n".join(out_str[:show_max_found]))

In [None]:
find_utts_with_word("puerto", set_key="fisher_dev")

In [None]:
eg_utt = "20051023_232057_325_fsp-A-3"
generate_translate_probs(eg_utt)

In [None]:
prob_fname = os.path.join(m_cfg['model_dir'], "{0:s}_probs.json".format(eg_utt))

In [None]:
eg_utt = "20051017_234550_276_fsp-B-34"
print(check_loss(eg_utt, curr_set='fisher_dev'))
generate_translate_probs(eg_utt)

In [None]:
vocab_dict['en_w']['i2w'][2]

In [None]:
eg_utt = "20051026_180724_341_fsp-A-26"
generate_translate_probs(eg_utt)
check_loss(eg_utt, curr_set='fisher_dev')

In [None]:
eg_utt = "20051017_234550_276_fsp-A-13"
generate_translate_probs(eg_utt)
check_loss(eg_utt, curr_set='fisher_dev')

In [None]:
eg_utt = "20051018_210220_279_fsp-A-26"
generate_translate_probs(eg_utt)
check_loss(eg_utt, curr_set='fisher_dev')

In [None]:
find_utts_with_word("mhm", set_key="fisher_dev")

In [None]:
eg_utt = "20051019_210146_289_fsp-A-54"
generate_translate_probs(eg_utt, curr_set='fisher_dev')
check_loss(eg_utt, curr_set='fisher_dev')

In [None]:
utt_loss = {}

In [None]:
eg_utt = "20051017_220530_275_fsp-B-21.npy"
try:
    check_loss(eg_utt, curr_set='fisher_dev')
except:
    print("{0:s} not found".format(utt))

In [None]:
i = 0
utt_loss = {}
for utt in tqdm(map_dict['fisher_dev'], ncols=50):
    if utt not in utt_loss:
        try:
            loss = check_loss(utt, curr_set='fisher_dev')
            utt_loss[utt] = loss.data.tolist()
        except:
            print("{0:s} not found".format(utt))
    #     print(utt, "{0:5.3f}".format(loss.data.tolist()))
#     i += 1
#     if i > 5:
#         break

In [None]:
# normalize by length
utt_loss_normalize = {}
for utt in tqdm(utt_loss, ncols=50):
    utt_loss_normalize[utt] = utt_loss[utt] / (len(map_dict['fisher_dev'][utt]['en_w'][0])+1)

In [None]:
list(utt_loss.items())[:10]

In [None]:
pickle.dump(utt_loss, open(os.path.join(cfg_path, "dev_utts_loss.dict"), "wb"))

In [None]:
pickle.dump(utt_loss_normalize, open(os.path.join(cfg_path, "dev_utts_loss_normalized.dict"), "wb"))

In [None]:
utt_loss = pickle.load(open(os.path.join(cfg_path, "dev_utts_loss.dict"), "rb"))

In [None]:
utt_loss_normalize = pickle.load(open(os.path.join(cfg_path, "dev_utts_loss_normalized.dict"), "rb"))

In [None]:
bad_utts = sorted(utt_loss_normalize.items(), reverse=True, key=lambda t: t[1])

In [None]:
N_BAD_UTTS = 30

In [None]:
u = '20051026_180724_341_fsp-A-26'
generate_translate_probs(eg_utt, curr_set='fisher_dev')
check_loss(eg_utt, curr_set='fisher_dev')

In [None]:
bad_utts[:10]

In [None]:
bad_utts[-10:]

In [None]:
x, y = zip(*utt_loss_normalize.items())

In [None]:
sum([1 if i < 1 else 0 for i in y]), sum([1 if i > 5 else 0 for i in y]), len(y)

### dev utts - avg loss per word in utt

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(8,5)

ax = sns.distplot(y, kde=False, rug=False, ax=ax, color=tableau20[0]);
ax.set_xlabel("dev utts - avg loss per word in utt", size=20)

In [None]:
for i, (u, l) in enumerate(sorted(utt_loss_normalize.items(), reverse=True, key=lambda t: t[1])[:50]):
    print("-"*80)
    print("{0:d}".format(i))
    print("-"*80)
    generate_translate_probs(u, curr_set='fisher_dev', display_limit=3, display_probs=True)
    loss_v, loss_by_w = check_loss(u, curr_set='fisher_dev')
    #print("{0:20s} ||| {1:5.2f} ||| {2:5.2f} ||| {3:5.2f}".format(u, l, loss_v, loss_by_w))
    print("{0:20s} ||| {3:5.2f}".format(u, l, loss_v, loss_by_w))

To share:
Several utterrance labels have typos, giving a misleading signal about the prediction quality

"20051017_180712_270_fsp-B-62"
dogs barking

"20051018_210220_279_fsp-A-71"
monopoly money -- monopoly occurs only 5 times in the train set, and never in the context of the game

"20051017_220530_275_fsp-B-61"
the decode probabilities show that maybe beam decoding (? or probably language model) will help catch up to Google. the Google model outputs Texas. We have Texas as the second most probable word as per the acoustic model. 


In [None]:
x = np.array([[ .759,  0.141,  .053]], dtype=np.float32)
t = np.array([1]).astype('i')
y = F.softmax_cross_entropy(x, t)
y

In [None]:
batch_data = get_utt_data(eg_utt, curr_set='fisher_dev')

In [None]:
X, y = batch_data['X'], batch_data['y']

In [None]:
[vocab_dict['en_w']['i2w'][i] for i in xp.asnumpy(y.data[0])]

### multilabel classification

In [22]:
bucket_dict['fisher_train']['buckets'][0][:5]

['20050908_182943_22_fsp-A-1',
 '20050908_182943_22_fsp-B-1',
 '20050908_182943_22_fsp-A-2',
 '20050908_182943_22_fsp-B-2',
 '20050908_182943_22_fsp-A-3']

In [23]:
utt_list = bucket_dict['fisher_train']['buckets'][0][:5]
width_b = bucket_dict['fisher_train']['width_b']
local_input_path = os.path.join(m_cfg['data_path'], m_cfg['train_set'])

In [24]:
batch_data = get_batch(map_dict["fisher_train"], 
                       enc_key,
                       dec_key,
                       utt_list,
                       vocab_dict,
                       (0+1) * width_b,
                       200,
                       input_path=local_input_path)
    
X, y = batch_data['X'], batch_data['y']

In [25]:
y.shape, X.shape

((5, 7), (5, 150, 80))

In [26]:
# encode input
model.forward_enc(X)

In [27]:
y_t = F.swapaxes(y, 0, 1)

In [28]:
y_t.shape

(7, 5)

In [29]:
len(y_t)

7

In [30]:
next_word.data

NameError: name 'next_word' is not defined

In [47]:
t = np.zeros(shape=(len(next_word),10), dtype='i')

In [48]:
t

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)

In [49]:
t[0,[1]] = 1

In [32]:
sim_dict = pickle.load(open("../speech2text/fbanks_80dim_nltk/sim.dict", "rb"))

In [59]:
sim_dict['i'][4]

[4, 1044, 2477, 13760, 3510, 842, 1148, 3382, 3305, 1345, 2846]

In [66]:
for next_word in y_t:
    print(next_word)
    t = np.zeros(shape=(len(next_word.data), 17000), dtype='i')
    print(next_word.data.tolist())
    for i,w in enumerate(next_word.data.tolist()):
        t[i,sim_dict['i'][w]] = 1
        print(t[i,[4,1044, 1045, 2477]])
    #print(t)

variable([1 1 1 1 1])
[1, 1, 1, 1, 1]
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]
variable([4 4 4 4 5])
[4, 4, 4, 4, 5]
[1 1 0 1]
[1 1 0 1]
[1 1 0 1]
[1 1 0 1]
[0 0 0 0]
variable([2 2 2 2 6])
[2, 2, 2, 2, 6]
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]
variable([0 0 0 0 7])
[0, 0, 0, 0, 7]
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]
variable([0 0 0 0 8])
[0, 0, 0, 0, 8]
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]
variable([0 0 0 0 9])
[0, 0, 0, 0, 9]
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]
variable([0 0 0 0 2])
[0, 0, 0, 0, 2]
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]


In [29]:
labels = xp.zeros((5,10)).astype('i')

In [30]:
next_word = [1,2,3,4,5]

In [31]:
for i, w in enumerate(next_word):
    print(i, w)
    labels[i,[w]] = 1

0 1
1 2
2 3
3 4
4 5


In [32]:
labels

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]], dtype=int32)

In [33]:
next_word.data.tolist()

AttributeError: 'list' object has no attribute 'data'

In [158]:
x = np.random.randn(1,17000).astype('f')
x = np.zeros((1,17000)).astype('f')

In [194]:
x[0,2] = 10.0
x[0,0] = -10.0

In [164]:
x

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32)

In [165]:
F.log_softmax(x).data

array([[-9.7409687, -9.7409687, -9.7409687, ..., -9.7409687, -9.7409687,
        -9.7409687]], dtype=float32)

In [182]:
t = 0 * np.ones((1,17000), dtype='i')

In [211]:
t[0,2] = 1

In [215]:
x, t

(array([[-2.,  3., -2.],
        [-2.,  3., -2.]], dtype=float32), array([[-1, -1, -1],
        [ 0,  1,  0]], dtype=int32))

In [218]:
F.sigmoid_cross_entropy(x, t, reduce='no', normalize=False)

variable([[-0.        ,  0.        , -0.        ],
          [ 0.126928  ,  0.04858735,  0.126928  ]])

In [219]:
F.sigmoid_cross_entropy(x, t, reduce='no', normalize=True)

variable([[-0.        ,  0.        , -0.        ],
          [ 0.126928  ,  0.04858735,  0.126928  ]])

In [220]:
F.sigmoid_cross_entropy(x, t, reduce='mean', normalize=True)

variable(0.10081445425748825)

In [221]:
F.sigmoid_cross_entropy(x, t, reduce='mean', normalize=False)

variable(0.15122167766094208)

In [223]:
F.sigmoid_cross_entropy(x, t, normalize=True)

variable(0.10081445425748825)

In [214]:
x = np.array([[-2.0, 3.0, -2.0], [-2.0, 3.0, -2.0]]).astype('f')
t = np.array([[-1, -1, -1], [0, 1, 0]]).astype('i')

In [209]:
F.sigmoid_cross_entropy(x, t)

variable(0.10081445425748825)

In [42]:
x = np.zeros((1,10), dtype="f")

In [45]:
x[0,[2]] = 10.0

In [380]:
x[:,list(range(2))+list(range(3,10))] = -.2

In [381]:
x[:,:10]

array([[ -0.2,  -0.2,  10. ,  -0.2,  -0.2,  -0.2,  -0.2,  -0.2,  -0.2,
         -0.2]], dtype=float32)

In [384]:
t = np.zeros((1,10), dtype="i")
t[0,5] = 1

In [385]:
F.sigmoid_cross_entropy(x, t)

variable(1.558329463005066)

In [241]:
predicted_out[:1], xp.argmax(predicted_out[:1].data)

(variable([[ -0.54293811,  -0.54772395,  13.52175999, ...,  -0.53887445,
             -0.40857357,  -0.45140707]]), array(2, dtype=int64))

In [294]:
labels = 0 * xp.ones(predicted_out[:1].shape).astype('i')

In [295]:
labels.shape

(1, 17834)

In [316]:
t = xp.zeros(predicted_out[:1].shape).astype('f')

In [317]:
t.shape

(1, 17834)

In [318]:
t[0,[2,5]] = 50.0

In [319]:
labels[0,[2,5]] = 1

In [320]:
F.sigmoid_cross_entropy(t, labels)

variable(0.6930694580078125)

In [321]:
labels[:,:10]

array([[0, 0, 1, 0, 0, 1, 0, 0, 0, 0]], dtype=int32)

In [322]:
t[:,:10]

array([[  0.,   0.,  50.,   0.,   0.,  50.,   0.,   0.,   0.,   0.]], dtype=float32)

In [219]:
F.sigmoid_cross_entropy(F.softmax(predicted_out[:1]), xp.expand_dims(labels, axis=0))

variable(0.6931415796279907)

In [196]:
xp.where(labels > 0)

(array([0]), array([2]))

In [204]:
xp.where(labels > 0)

(array([0]), array([2]))

In [386]:
from gensim.models import KeyedVectors

In [47]:
sim_dict['w'][b'rico'], sim_dict['i'][4]

([b'rico', b'puerto', b'puertorican', b'ricardo', b'rica'],
 [4, 1044, 2477, 13760, 3510, 842, 1148, 3382, 3305, 1345, 2846])

In [48]:
xp.random.choice(sim_dict['i'][4], 1), sim_dict['i'][w]

(array([3305]), [2])

In [54]:
for i in range(len(t_alt)):
    print(t_alt, i)
    print(t_alt[i])
    #t_alt[i] = xp.random.sample(sim_dict['i'][t_alt[i]])

[4 4 4 4 5] 0
4
[4 4 4 4 5] 1
4
[4 4 4 4 5] 2
4
[4 4 4 4 5] 3
4
[4 4 4 4 5] 4
5


In [74]:
print(t_alt)
for i in range(len(t_alt)):
    print(xp.random.choice(sim_dict['i'][int(t_alt[i])],1))
    print(t_alt[i],type(t_alt[i]), int(t_alt[i]))

[4 4 4 4 5]
[1044]
4 <class 'cupy.core.core.ndarray'> 4
[3510]
4 <class 'cupy.core.core.ndarray'> 4
[842]
4 <class 'cupy.core.core.ndarray'> 4
[2477]
4 <class 'cupy.core.core.ndarray'> 4
[5]
5 <class 'cupy.core.core.ndarray'> 5


In [76]:
decoder_batch = y_t
batch_size = decoder_batch.shape[1]
loss = 0
# ---------------------------------------------------------------------
# initialize hidden states as a zero vector
# ---------------------------------------------------------------------
a_units = model.m_cfg['attn_units']
ht = Variable(xp.zeros((batch_size, a_units), dtype=xp.float32))
# ---------------------------------------------------------------------
decoder_input = decoder_batch[0]
# for all sequences in the batch, feed the characters one by one
for curr_word, next_word in zip(decoder_batch, decoder_batch[1:]):
    print(curr_word, next_word)
    decoder_input = curr_word
    # -----------------------------------------------------------------
    # encode tokens
    # -----------------------------------------------------------------
    predicted_out, ht = model.decode(decoder_input, ht)
    decoder_input = F.argmax(predicted_out, axis=1)
    #print(decoder_input)
    # -----------------------------------------------------------------
    # compute loss
    # -----------------------------------------------------------------
    t_alt = xp.copy(next_word.data)
    print(t_alt)
    for i in range(len(t_alt)):
        t_alt[i] = xp.random.choice(sim_dict['i'][int(t_alt[i])],1)
        #print(t[i,[4,1044, 1045, 2477]])
    print(t_alt)

#     t = xp.zeros(shape=predicted_out.shape, dtype='i')
#     print(next_word.data.tolist())
#     print(next_word.shape)
#     for i,w in enumerate(next_word.data.tolist()):
#         if w == PAD_ID:
#             t[i,:] = -1
#         else:
#             t[i,sim_dict['i'][w]] = 1
#         #print(t[i,[4,1044, 1045, 2477]])
#     loss_arr = F.sigmoid_cross_entropy(predicted_out, t, normalize=True)
    loss_arr = F.softmax_cross_entropy(predicted_out, t_alt, normalize=True)
    print("softmax cross entropy:", F.softmax_cross_entropy(predicted_out, next_word), "sigmoid:", loss_arr.data.tolist())
    loss += loss_arr
    
    # -----------------------------------------------------------------
#print(loss, loss / (y.shape[0]-2), y.shape)
print(loss.data.tolist(), (loss / (y.shape[0]-1)).data.tolist())

variable([1 1 1 1 1]) variable([4 4 4 4 5])
[4 4 4 4 5]
[2477  842  842 1148    5]
softmax cross entropy: variable(6.027211666107178) sigmoid: 8.526057243347168
variable([4 4 4 4 5]) variable([2 2 2 2 6])
[2 2 2 2 6]
[2 2 2 2 6]
softmax cross entropy: variable(0.9709686636924744) sigmoid: 0.9709686636924744
variable([2 2 2 2 6]) variable([0 0 0 0 7])
[0 0 0 0 7]
[ 0  0  0  0 29]
softmax cross entropy: variable(10.59409236907959) sigmoid: 11.633125305175781
variable([0 0 0 0 7]) variable([0 0 0 0 8])
[0 0 0 0 8]
[0 0 0 0 8]
softmax cross entropy: variable(10.959638595581055) sigmoid: 10.959638595581055
variable([0 0 0 0 8]) variable([0 0 0 0 9])
[0 0 0 0 9]
[0 0 0 0 9]
softmax cross entropy: variable(11.310018539428711) sigmoid: 11.310018539428711
variable([0 0 0 0 9]) variable([0 0 0 0 2])
[0 0 0 0 2]
[0 0 0 0 2]
softmax cross entropy: variable(11.700035095214844) sigmoid: 11.700035095214844
55.09984588623047 13.774961471557617


In [None]:
batch_size = X.shape[0]
# encode input
model.forward_enc(X)
# ---------------------------------------------------------------------
# initialize decoder LSTM to final encoder state
# ---------------------------------------------------------------------
model.set_decoder_state()
# ---------------------------------------------------------------------
y = F.swapaxes(y, 0, 1)

In [None]:
xp = cuda.cupy if model.gpuid >= 0 else np

In [None]:
decoder_batch = y 
batch_size = decoder_batch.shape[1]
loss = 0

In [None]:
for curr_word, next_word in zip(decoder_batch, decoder_batch[1:]):
    print(curr_word, next_word)

In [None]:
dummy_x = Variable(xp.asarray([[0,1,0]], dtype=xp.float32))

In [None]:
dummy_y = Variable(xp.asarray([[0,1,0]], dtype=xp.int32))

In [None]:
x = np.array([[-10, 5.0, -10.0]]).astype('f')
x

In [None]:
t = np.array([[0, 1, 0]]).astype('i')
t

In [None]:
F.softmax_cross_entropy(x, np.array([1]).astype('i'))

In [None]:
F.sigmoid_cross_entropy(x, t, normalize=True)

In [None]:
# ---------------------------------------------------------------------
# initialize hidden states as a zero vector
# ---------------------------------------------------------------------
a_units = model.m_cfg['attn_units']
ht = Variable(xp.zeros((batch_size, a_units), dtype=xp.float32))
# ---------------------------------------------------------------------
decoder_input = decoder_batch[0]
# for all sequences in the batch, feed the characters one by one
for curr_word, next_word in zip(decoder_batch, decoder_batch[1:]):
    # -----------------------------------------------------------------
    # teacher forcing logic
    # -----------------------------------------------------------------
    use_label = True if random.random() < teacher_ratio else False
    if use_label:
        decoder_input = curr_word
    # -----------------------------------------------------------------
    # encode tokens
    # -----------------------------------------------------------------
    predicted_out, ht = model.decode(decoder_input, ht)
    decoder_input = F.argmax(predicted_out, axis=1)
    # -----------------------------------------------------------------
    # compute loss
    # -----------------------------------------------------------------
    loss_arr = F.softmax_cross_entropy(predicted_out, next_word,
                                       class_weight=model.mask_pad_id)
    loss += loss_arr
    # -----------------------------------------------------------------
return loss

### Compare BLEU scores at utterance level

In [None]:
eg_utt = "20051023_232057_325_fsp-A-3"
print(sentence_bleu([google_dev_ref_0[eg_utt]], google_hyp_r0[eg_utt], smoothing_function=smooth_fun.method2))
print(sentence_bleu([model_s2t_refs[eg_utt]], model_s2t_hyps[eg_utt], smoothing_function=smooth_fun.method2))

In [None]:
eg_utt = "20051019_190221_288_fsp-B-1"
print(sentence_bleu([google_dev_ref_0[eg_utt]], google_hyp_r0[eg_utt], smoothing_function=smooth_fun.method2))
print(sentence_bleu([model_s2t_refs[eg_utt]], model_s2t_hyps[eg_utt], smoothing_function=smooth_fun.method2))
print(google_dev_ref_0[eg_utt], google_hyp_r0[eg_utt])
print(model_s2t_refs[eg_utt], model_s2t_hyps[eg_utt])

In [None]:
random.seed("haha")
dev_utts = list(model_s2t_refs.keys())
random.shuffle(dev_utts)

In [None]:
print("google beats model by factor of 2")

count = 0
# print("-"*80)
# print("{0:>5s} ||| {1:30s} ||| {2:>15s} || {3:>15s}".format("sn", "utt", "google utt bleu", "model utt bleu"))
# print("-"*80)
for utt in dev_utts:
    google_utt_bleu = sentence_bleu([google_dev_ref_0[utt]], google_hyp_r0[utt], smoothing_function=smooth_fun.method2)
    model_utt_bleu = sentence_bleu([model_s2t_refs[utt]], model_s2t_hyps[utt], smoothing_function=smooth_fun.method2)
    if google_utt_bleu >= (2 * model_utt_bleu) and google_utt_bleu >= 0.5:
        count += 1
        #print("{0:5d} ||| {1:30s} ||| {2:15.2f} || {3:15.2f}".format(count, utt, google_utt_bleu, model_utt_bleu))
        print("-"*80)
        print(count)
        print("-"*80)
        display_pp = PrettyTable(["cat","sent"], hrules=True)
        display_pp.align = "l"
        display_pp.header = False
        display_pp.add_row(["en ref", textwrap.fill(" ".join(model_s2t_refs[utt]),50)])
        display_pp.add_row(["model pred", textwrap.fill(" ".join(model_s2t_hyps[utt]),50)])
        display_pp.add_row(["model utt bleu", "{0:.2f}".format(model_utt_bleu)])
        display_pp.add_row(["google pred", textwrap.fill(" ".join(google_hyp_r0[utt]),50)])
        display_pp.add_row(["google utt bleu", "{0:.2f}".format(google_utt_bleu)])
        print(display_pp)
        #play_utt(utt, map_dict['fisher_dev'])
#         generate_translate_probs(utt)
#     if count > 100:
#         break
