In [1]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, modified_precision
from nltk.translate.chrf_score import sentence_chrf, corpus_chrf
from nltk.metrics import scores
import scipy.io.wavfile
from IPython.display import Audio
from IPython.display import display
from nltk.stem import *
# from nltk.stem.snowball import SnowballStemmer
from stemming.porter2 import stem
import stemming
from nltk.metrics.scores import recall

from nltk.corpus import stopwords

%matplotlib inline

In [2]:
tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),    
             (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),    
             (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),    
             (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),    
             (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]    
# Scale the RGB values to the [0, 1] range, which is the format matplotlib accepts.    
for i in range(len(tableau20)):    
    r, g, b = tableau20[i]    
    tableau20[i] = (r / 255., g / 255., b / 255.)

In [3]:
smooth_fun = nltk.translate.bleu_score.SmoothingFunction()

In [4]:
from nmt_run import *

In [5]:
def play_utt(utt, m_dict):
    sr, y = scipy.io.wavfile.read(os.path.join(wavs_path, utt.rsplit("-",1)[0]+'.wav'))
    start_t = min(seg['start'] for seg in m_dict[utt]['seg'])
    end_t = max(seg['end'] for seg in m_dict[utt]['seg'])
    print(start_t, end_t)
    start_t_samples, end_t_samples = int(start_t*sr), int(end_t*sr)
    display(Audio(y[start_t_samples:end_t_samples], rate=sr))

In [6]:
def display_words(m_dict, v_dict, preds, utts, dec_key, key, play_audio=False, displayN=-1):
    if displayN == -1:
        displayN = len(utts)
    es_ref = []
    en_ref = []
    google_ref = []
    google_pred = []
    for u in utts:
        es_ref.append(" ".join([w.decode() for w in m_dict[u]['es_w']]))
        if type(m_dict[u][dec_key]) == list:
            en_ref.append(" ".join([w.decode() for w in m_dict[u]['en_w']]))
        else:
            en_ref.append(" ".join([w.decode() for w in m_dict[u]['en_w'][0]]))
        google_pred.append(" ".join(google_hyp_r0[u]))
        google_ref.append(" ".join(google_dev_ref_0[u]))

    en_pred = []
    join_str = ' ' if dec_key.endswith('_w') else ''

    for p in preds:
        if type(p) == list:
            t_str = join_str.join([v_dict['i2w'][i].decode() for i in p])
            t_str = t_str[:t_str.find('_EOS')]
            en_pred.append(t_str)
        else:
            en_pred.append("")
        

    for u, es, en, p, g, gr in sorted(list(zip(utts, es_ref, en_ref, en_pred, google_pred, google_ref)))[:displayN]:
        # for reference, 1st word is GO_ID, no need to display
        print("Utterance: {0:s}".format(u))
        display_pp = PrettyTable(["cat","sent"], hrules=True)
        display_pp.align = "l"
        display_pp.header = False
        display_pp.add_row(["es ref", textwrap.fill(es,50)])
        display_pp.add_row(["en ref", textwrap.fill(en,50)])
        display_pp.add_row(["model pred", textwrap.fill(p,50)])
        display_pp.add_row(["model bleu", "{0:.2f}".format(sentence_bleu([en], p, smoothing_function=smooth_fun.method2))])
        display_pp.add_row(["google pred", textwrap.fill(g,50)])
        display_pp.add_row(["google bleu", "{0:.2f}".format(sentence_bleu([gr], g, smoothing_function=smooth_fun.method2))])
    

        print(display_pp)
        if play_audio:
            play_utt(u, m_dict)
    

In [7]:
def make_pred(utt, X, y=None, display_limit=10):
    # get shape
    batch_size = X.shape[0]
    # encode input
    model.forward_enc(X)
    # ---------------------------------------------------------------------
    # initialize decoder LSTM to final encoder state
    # ---------------------------------------------------------------------
    model.set_decoder_state()
    # ---------------------------------------------------------------------
    # swap axes of the decoder batch
    if y is not None:
        y = F.swapaxes(y, 0, 1)
    # -----------------------------------------------------------------
    # predict
    # -----------------------------------------------------------------
    # make return statements consistent
    return(decode_display(utt, batch_size=batch_size,
                          pred_limit=model.m_cfg['max_en_pred'],
#                           pred_limit=20,
                          y=y, display_limit=display_limit))

In [8]:
def decode_display(utt, batch_size, pred_limit, y=None, display_limit=10):
    xp = cuda.cupy if model.gpuid >= 0 else np
    # max number of predictions to make
    # if labels are provided, this variable is not used
    stop_limit = pred_limit
    # to track number of predictions made
    npred = 0
    # to store loss
    loss = 0
    # if labels are provided, use them for computing loss
    compute_loss = True if y is not None else False
    # ---------------------------------------------------------------------
    if compute_loss:
        stop_limit = len(y)-1
        # get starting word to initialize decoder
        curr_word = y[0]
    else:
        # intialize starting word to GO_ID symbol
        curr_word = Variable(xp.full((batch_size,), GO_ID, dtype=xp.int32))
    # ---------------------------------------------------------------------
    # flag to track if all sentences in batch have predicted EOS
    # ---------------------------------------------------------------------
    with cupy.cuda.Device(model.gpuid):
        check_if_all_eos = xp.full((batch_size,), False, dtype=xp.bool_)
    # ---------------------------------------------------------------------
    a_units = m_cfg['attn_units']
    ht = Variable(xp.zeros((batch_size, a_units), dtype=xp.float32))
    # ---------------------------------------------------------------------
    prob_out = {}
    prob_print_str = []
    while npred < (stop_limit):
        # -----------------------------------------------------------------
        # decode and predict
        #print("decoding with word: {0:s}".format(vocab_dict['en_w']['i2w'][curr_word.data[0].tolist()].decode()))
        pred_out, ht = model.decode(curr_word, ht)
        pred_word = F.argmax(pred_out, axis=1)
        # -----------------------------------------------------------------
        # printing conditional probabilities
        # -----------------------------------------------------------------
        pred_probs = xp.asnumpy(F.softmax(pred_out).data[0])
        top_n_probs = np.argsort(pred_probs)[-display_limit:]
        #print("-"*60)
        #print("predicting word : {0:d}".format(npred))
        prob_print_str.append("-" * 60)
        prob_print_str.append("predicting word : {0:d}".format(npred))
        
        # -----------------------------------------------------------------
#         if npred == 0:
#             sample_word = np.random.choice(range(len(pred_probs)), p=pred_probs)
#             sample_word = np.argsort(pred_probs)[-2]
#             print(np.argsort(pred_probs)[-2], np.argsort(pred_probs)[-1])
#             pred_word = Variable(xp.asarray([sample_word], dtype=xp.int32))
        # -----------------------------------------------------------------
        
        prob_out[npred] = {}
        for pi in top_n_probs[::-1]:
            prob_out[npred][v_dict['i2w'][pi].decode()] = "{0:.3f}".format(pred_probs[pi])
            #print("{0:10s} = {1:5.3f}".format(v_dict['i2w'][pi].decode(), pred_probs[pi]))
            prob_print_str.append("{0:10s} = {1:5.3f}".format(v_dict['i2w'][pi].decode(), pred_probs[pi]))
            
        # -----------------------------------------------------------------
        # save prediction at this time step
        # -----------------------------------------------------------------
        if npred == 0:
            pred_sents = pred_word.data
        else:
            pred_sents = xp.vstack((pred_sents, pred_word.data))
        # -----------------------------------------------------------------
        if compute_loss:
            # compute loss
            loss += F.softmax_cross_entropy(pred_out, y[npred+1],
                                               class_weight=model.mask_pad_id)
        # -----------------------------------------------------------------
        curr_word = pred_word
        # -----------------------------------------------------------------
        # check if EOS is predicted for all sentences
        # -----------------------------------------------------------------
        check_if_all_eos[pred_word.data == EOS_ID] = True
        if xp.all(check_if_all_eos):
            break
        # -----------------------------------------------------------------
        # increment number of predictions made
        npred += 1
        # -----------------------------------------------------------------
    
    out_fname = os.path.join(m_cfg['model_dir'], "probs", "{0:s}_probs.json".format(utt))
    with open(out_fname, "w") as out_f:
        json.dump(prob_out, out_f, indent=4)
    print("saved probs in : {0:s}".format(out_fname))
    return pred_sents.T, loss, "\n".join(prob_print_str)

In [9]:
def check_loss(eg_utt, curr_set='fisher_dev', teacher_ratio=1.0):
    # get shape
    if "train" in curr_set:
        local_input_path = os.path.join(m_cfg['data_path'], m_cfg['train_set'])
        play_audio = False
    else:
        local_input_path = os.path.join(m_cfg['data_path'], m_cfg['dev_set'])
        play_audio = True
        
    eg_utt_bucket = -1
    for i, bucket in enumerate(bucket_dict[curr_set]["buckets"]):
        if eg_utt in bucket:
            eg_utt_bucket = i
            #print("found")
        # end if
    # end for
    #print("found in bucket : {0:d}".format(eg_utt_bucket))
    width_b = bucket_dict[dev_key]["width_b"]
    utt_list = [eg_utt]
    
    batch_data = get_batch(map_dict[curr_set], 
                           enc_key,
                           dec_key,
                           utt_list,
                           vocab_dict,
                           (eg_utt_bucket+1) * width_b,
                           200,
                           input_path=local_input_path)
    
    X, y = batch_data['X'], batch_data['y']
    
    batch_size = X.shape[0]
    # encode input
    model.forward_enc(X)
    # ---------------------------------------------------------------------
    # initialize decoder LSTM to final encoder state
    # ---------------------------------------------------------------------
    model.set_decoder_state()
    # ---------------------------------------------------------------------
    y = F.swapaxes(y, 0, 1)
        
    xp = cuda.cupy if model.gpuid >= 0 else np
    
    decoder_batch = y 
    batch_size = decoder_batch.shape[1]
    loss = 0
    # ---------------------------------------------------------------------
    # initialize hidden states as a zero vector
    # ---------------------------------------------------------------------
    a_units = model.m_cfg['attn_units']
    ht = Variable(xp.zeros((batch_size, a_units), dtype=xp.float32))
    # ---------------------------------------------------------------------
    decoder_input = decoder_batch[0]
    # for all sequences in the batch, feed the characters one by one
    for curr_word, next_word in zip(decoder_batch, decoder_batch[1:]):
        #print(curr_word, next_word)
        # -----------------------------------------------------------------
        # teacher forcing logic
        # -----------------------------------------------------------------
        use_label = True if random.random() < teacher_ratio else False
        if use_label:
            decoder_input = curr_word
        # -----------------------------------------------------------------
        # encode tokens
        # -----------------------------------------------------------------
        predicted_out, ht = model.decode(decoder_input, ht)
        decoder_input = F.argmax(predicted_out, axis=1)
        #print(decoder_input)
        # -----------------------------------------------------------------
        # compute loss
        # -----------------------------------------------------------------
        loss_arr = F.softmax_cross_entropy(predicted_out, next_word,
                                           class_weight=model.mask_pad_id)
        #print(loss_arr.data.tolist())
        loss += loss_arr
        # -----------------------------------------------------------------
    #print(loss, loss / (y.shape[0]-2), y.shape)
    return loss.data.tolist(), (loss / (y.shape[0]-1)).data.tolist()

In [10]:
def get_utt_data(eg_utt, curr_set='fisher_dev'):
    # get shape
    if "train" in curr_set:
        local_input_path = os.path.join(m_cfg['data_path'], m_cfg['train_set'])
        play_audio = False
    else:
        local_input_path = os.path.join(m_cfg['data_path'], m_cfg['dev_set'])
        play_audio = True
        
    eg_utt_bucket = -1
    for i, bucket in enumerate(bucket_dict[curr_set]["buckets"]):
        if eg_utt in bucket:
            eg_utt_bucket = i
            #print("found")
        # end if
    # end for
    #print("found in bucket : {0:d}".format(eg_utt_bucket))
    width_b = bucket_dict[dev_key]["width_b"]
    utt_list = [eg_utt]
    
    
    batch_data = get_batch(map_dict[curr_set], 
                           enc_key,
                           dec_key,
                           utt_list,
                           vocab_dict,
                           (eg_utt_bucket+1) * width_b,
                           200,
                           input_path=local_input_path)
    
    return batch_data

### Fisher dev

In [11]:
cfg_path = "./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3"

In [12]:
last_epoch, model, optimizer, m_cfg, t_cfg = check_model(cfg_path)

cnn_out_dim = rnn_in_units =  640




using ADAM optimizer
--------------------------------------------------------------------------------
model found = 
./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/seq2seq_40.model
finished loading ..
optimizer not found


In [13]:
train_key = m_cfg['train_set']
dev_key = m_cfg['dev_set']
batch_size=t_cfg['batch_size']
enc_key=m_cfg['enc_key']
dec_key=m_cfg['dec_key']
input_path = os.path.join(m_cfg['data_path'], m_cfg['dev_set'])
# -------------------------------------------------------------------------
# get data dictionaries
# -------------------------------------------------------------------------
map_dict, vocab_dict, bucket_dict = get_data_dicts(m_cfg)
batch_size = {'max': 96, 'med': 128, 'min': 256, 'scale': 1}

--------------------------------------------------
loading dict: fbanks_80dim_nltk/map.dict


  0%|          | 0/3977 [00:00<?, ?it/s]

loading dict: fbanks_80dim_nltk/train_vocab.dict
--------------------------------------------------
--------------------------------------------------
loading info_dict from=fbanks_80dim_nltk/info.dict
--------------------------------------------------
creating buckets for: fisher_dev
creating buckets for key: sp


100%|██████████| 3977/3977 [00:00<00:00, 458678.11it/s]
100%|██████████| 3960/3960 [00:00<00:00, 410748.67it/s]
100%|██████████| 3641/3641 [00:00<00:00, 476146.94it/s]
 33%|███▎      | 45717/138720 [00:00<00:00, 457068.00it/s]

creating buckets for: fisher_dev2
creating buckets for key: sp
creating buckets for: fisher_test
creating buckets for key: sp
creating buckets for: fisher_train
creating buckets for key: sp


100%|██████████| 138720/138720 [00:00<00:00, 512106.91it/s]
100%|██████████| 3803/3803 [00:00<00:00, 483200.69it/s]
100%|██████████| 1824/1824 [00:00<00:00, 454488.83it/s]
100%|██████████| 14294/14294 [00:00<00:00, 461975.88it/s]


creating buckets for: callhome_devtest
creating buckets for key: sp
creating buckets for: callhome_evltest
creating buckets for key: sp
creating buckets for: callhome_train
creating buckets for key: sp
--------------------------------------------------
saving info dict in: fbanks_80dim_nltk/buckets_sp.dict
all done ...
loading dict: fbanks_80dim_nltk/buckets_sp.dict
--------------------------------------------------
utterances in fisher_dev = 3979
utterances in fisher_dev2 = 3961
utterances in fisher_test = 3641
utterances in fisher_train = 138819
utterances in callhome_devtest = 3966
utterances in callhome_evltest = 1829
utterances in callhome_train = 15080
vocab size for sp = 0
vocab size for en_w = 17834


In [14]:
random.seed("meh")
# random.seed("haha")

In [15]:
# Eval parameters
ref_index = -1
min_len, max_len= 0, m_cfg['max_en_pred']
# min_len, max_len = 0, 10
displayN = 50
m_dict=map_dict[dev_key]
# wavs_path = os.path.join(m_cfg['data_path'], "wavs")
wavs_path = os.path.join("../chainer2/speech2text/both_fbank_out/", "wavs")
v_dict = vocab_dict['en_w']
key = m_cfg['dev_set']

In [16]:
os.chdir("..")
os.chdir("/afs/inf.ed.ac.uk/group/project/lowres/work/speech2text")

### Load google refs and preds

In [17]:
google_s2t_refs_path = os.path.join("../chainer2/speech2text/both_fbank_out/", "google_s2t_refs.dict")
google_s2t_hyps_path = os.path.join("../chainer2/speech2text/both_fbank_out/", "google_s2t_hyps.dict")

In [18]:
google_s2t_hyps = pickle.load(open(google_s2t_hyps_path, "rb"))
google_hyp_r0 = google_s2t_hyps['fisher_dev_r0']

google_s2t_refs = pickle.load(open(google_s2t_refs_path, "rb"))
google_dev_ref_0 = google_s2t_refs['fisher_dev_ref_0']

In [19]:
model_s2t_refs = pickle.load(open(os.path.join(cfg_path, "model_s2t_refs.dict"), "rb"))
model_s2t_hyps = pickle.load(open(os.path.join(cfg_path, "model_s2t_hyps.dict"), "rb"))

### View model

In [20]:
def generate_translate_probs(eg_utt, curr_set="fisher_dev", display_limit=5, display_probs=True):
    if "train" in curr_set:
        local_input_path = os.path.join(m_cfg['data_path'], m_cfg['train_set'])
        play_audio = False
    else:
        local_input_path = os.path.join(m_cfg['data_path'], m_cfg['dev_set'])
        play_audio = True
        
    eg_utt_bucket = -1
    for i, bucket in enumerate(bucket_dict[curr_set]["buckets"]):
        if eg_utt in bucket:
            eg_utt_bucket = i
            #make_predprint("found")
        # end if
    # end for
    #print("found in bucket : {0:d}".format(eg_utt_bucket))
    width_b = bucket_dict[dev_key]["width_b"]
    utt_list = [eg_utt]
    
    
    batch_data = get_batch(map_dict[curr_set], 
                           enc_key,
                           dec_key,
                           utt_list,
                           vocab_dict,
                           (eg_utt_bucket+1) * width_b,
                           200,
                           input_path=local_input_path)
    
    with chainer.using_config('train', False):
        cuda.get_device(t_cfg['gpuid']).use()
        preds, _, probs_str = make_pred(eg_utt, X=batch_data['X'], display_limit=display_limit)
        #preds, _ = make_pred(eg_utt, X=batch_data['X'][:,-150:,:], display_limit=10)
        loss_val = 0.0
    
    display_words(map_dict[curr_set], v_dict, 
                  preds.tolist(), 
                  utt_list, dec_key, 
                  key, 
                  play_audio=play_audio, 
                  displayN=displayN)
    
    if display_probs:
        print(probs_str)

In [21]:
def find_utts_with_word(word, set_key="fisher_dev", show_max_found=10):
    total_found = 0
    out_str = []
    for utt, entry in map_dict[set_key].items():
        if "train" in set_key:
            words_in_utt = " ".join([w.decode() for w in entry['en_w']])
        else:
            words_in_utt = " ".join([w.decode() for w in entry['en_w'][0]])
        es_words_in_utt = " ".join([w.decode() for w in entry['es_w']])        
        #if "puerto" in words_in_utt:
        if word in words_in_utt:
            out_str.append("{0:s} | {1:s} | {2:s}".format(utt, words_in_utt, es_words_in_utt))
            total_found += 1
    
    print("-" * 80)
    print("total instances found = {0:d}".format(total_found))
    print("-" * 80)
    print("\n".join(out_str[:show_max_found]))

In [None]:
find_utts_with_word("puerto", set_key="fisher_dev")

In [None]:
eg_utt = "20051023_232057_325_fsp-A-3"
generate_translate_probs(eg_utt)

In [None]:
prob_fname = os.path.join(m_cfg['model_dir'], "{0:s}_probs.json".format(eg_utt))

In [None]:
eg_utt = "20051017_234550_276_fsp-B-34"
print(check_loss(eg_utt, curr_set='fisher_dev'))
generate_translate_probs(eg_utt)

In [None]:
vocab_dict['en_w']['i2w'][2]

In [None]:
eg_utt = "20051026_180724_341_fsp-A-26"
generate_translate_probs(eg_utt)
check_loss(eg_utt, curr_set='fisher_dev')

In [None]:
eg_utt = "20051017_234550_276_fsp-A-13"
generate_translate_probs(eg_utt)
check_loss(eg_utt, curr_set='fisher_dev')

In [None]:
eg_utt = "20051018_210220_279_fsp-A-26"
generate_translate_probs(eg_utt)
check_loss(eg_utt, curr_set='fisher_dev')

In [None]:
find_utts_with_word("mhm", set_key="fisher_dev")

In [None]:
eg_utt = "20051019_210146_289_fsp-A-54"
generate_translate_probs(eg_utt, curr_set='fisher_dev')
check_loss(eg_utt, curr_set='fisher_dev')

In [None]:
utt_loss = {}

In [None]:
eg_utt = "20051017_220530_275_fsp-B-21.npy"
try:
    check_loss(eg_utt, curr_set='fisher_dev')
except:
    print("{0:s} not found".format(utt))

In [None]:
i = 0
utt_loss = {}
for utt in tqdm(map_dict['fisher_dev'], ncols=50):
    if utt not in utt_loss:
        try:
            loss = check_loss(utt, curr_set='fisher_dev')
            utt_loss[utt] = loss.data.tolist()
        except:
            print("{0:s} not found".format(utt))
    #     print(utt, "{0:5.3f}".format(loss.data.tolist()))
#     i += 1
#     if i > 5:
#         break

In [None]:
# normalize by length
utt_loss_normalize = {}
for utt in tqdm(utt_loss, ncols=50):
    utt_loss_normalize[utt] = utt_loss[utt] / (len(map_dict['fisher_dev'][utt]['en_w'][0])+1)

In [None]:
list(utt_loss.items())[:10]

In [None]:
pickle.dump(utt_loss, open(os.path.join(cfg_path, "dev_utts_loss.dict"), "wb"))

In [None]:
pickle.dump(utt_loss_normalize, open(os.path.join(cfg_path, "dev_utts_loss_normalized.dict"), "wb"))

In [None]:
utt_loss = pickle.load(open(os.path.join(cfg_path, "dev_utts_loss.dict"), "rb"))

In [None]:
utt_loss_normalize = pickle.load(open(os.path.join(cfg_path, "dev_utts_loss_normalized.dict"), "rb"))

In [None]:
bad_utts = sorted(utt_loss_normalize.items(), reverse=True, key=lambda t: t[1])

In [None]:
N_BAD_UTTS = 30

In [None]:
u = '20051026_180724_341_fsp-A-26'
generate_translate_probs(eg_utt, curr_set='fisher_dev')
check_loss(eg_utt, curr_set='fisher_dev')

In [None]:
bad_utts[:10]

In [None]:
bad_utts[-10:]

In [None]:
x, y = zip(*utt_loss_normalize.items())

In [None]:
sum([1 if i < 1 else 0 for i in y]), sum([1 if i > 5 else 0 for i in y]), len(y)

### dev utts - avg loss per word in utt

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(8,5)

ax = sns.distplot(y, kde=False, rug=False, ax=ax, color=tableau20[0]);
ax.set_xlabel("dev utts - avg loss per word in utt", size=20)

In [None]:
for i, (u, l) in enumerate(sorted(utt_loss_normalize.items(), reverse=True, key=lambda t: t[1])[:50]):
    print("-"*80)
    print("{0:d}".format(i))
    print("-"*80)
    generate_translate_probs(u, curr_set='fisher_dev', display_limit=3, display_probs=True)
    loss_v, loss_by_w = check_loss(u, curr_set='fisher_dev')
    #print("{0:20s} ||| {1:5.2f} ||| {2:5.2f} ||| {3:5.2f}".format(u, l, loss_v, loss_by_w))
    print("{0:20s} ||| {3:5.2f}".format(u, l, loss_v, loss_by_w))

To share:
Several utterrance labels have typos, giving a misleading signal about the prediction quality

"20051017_180712_270_fsp-B-62"
dogs barking

"20051018_210220_279_fsp-A-71"
monopoly money -- monopoly occurs only 5 times in the train set, and never in the context of the game

"20051017_220530_275_fsp-B-61"
the decode probabilities show that maybe beam decoding (? or probably language model) will help catch up to Google. the Google model outputs Texas. We have Texas as the second most probable word as per the acoustic model. 


In [None]:
x = np.array([[ .759,  0.141,  .053]], dtype=np.float32)
t = np.array([1]).astype('i')
y = F.softmax_cross_entropy(x, t)
y

In [None]:
batch_data = get_utt_data(eg_utt, curr_set='fisher_dev')

In [None]:
X, y = batch_data['X'], batch_data['y']

In [None]:
[vocab_dict['en_w']['i2w'][i] for i in xp.asnumpy(y.data[0])]

### multilabel classification

In [None]:
bucket_dict['fisher_train']['buckets'][0][:5]

In [None]:
utt_list = bucket_dict['fisher_train']['buckets'][0][:5]
width_b = bucket_dict['fisher_train']['width_b']
local_input_path = os.path.join(m_cfg['data_path'], m_cfg['train_set'])

In [None]:
batch_data = get_batch(map_dict["fisher_train"], 
                       enc_key,
                       dec_key,
                       utt_list,
                       vocab_dict,
                       (0+1) * width_b,
                       200,
                       input_path=local_input_path)
    
X, y = batch_data['X'], batch_data['y']

In [None]:
y.shape, X.shape

In [None]:
# encode input
model.forward_enc(X)

In [None]:
y_t = F.swapaxes(y, 0, 1)

In [None]:
y_t.shape

In [None]:
len(y_t)

In [None]:
next_word.data

In [None]:
t = np.zeros(shape=(len(next_word),10), dtype='i')

In [None]:
t

In [None]:
t[0,[1]] = 1

In [None]:
sim_dict = pickle.load(open("../speech2text/fbanks_80dim_nltk/sim.dict", "rb"))

In [None]:
sim_dict['i'][4]

In [None]:
for next_word in y_t:
    print(next_word)
    t = np.zeros(shape=(len(next_word.data), 17000), dtype='i')
    print(next_word.data.tolist())
    for i,w in enumerate(next_word.data.tolist()):
        t[i,sim_dict['i'][w]] = 1
        print(t[i,[4,1044, 1045, 2477]])
    #print(t)

In [None]:
labels = xp.zeros((5,10)).astype('i')

In [None]:
next_word = [1,2,3,4,5]

In [None]:
for i, w in enumerate(next_word):
    print(i, w)
    labels[i,[w]] = 1

In [None]:
labels

In [None]:
next_word.data.tolist()

In [None]:
x = np.random.randn(1,17000).astype('f')
x = np.zeros((1,17000)).astype('f')

In [None]:
x[0,2] = 10.0
x[0,0] = -10.0

In [None]:
x

In [None]:
F.log_softmax(x).data

In [None]:
t = 0 * np.ones((1,17000), dtype='i')

In [None]:
t[0,2] = 1

In [None]:
x, t

In [None]:
F.sigmoid_cross_entropy(x, t, reduce='no', normalize=False)

In [None]:
F.sigmoid_cross_entropy(x, t, reduce='no', normalize=True)

In [None]:
F.sigmoid_cross_entropy(x, t, reduce='mean', normalize=True)

In [None]:
F.sigmoid_cross_entropy(x, t, reduce='mean', normalize=False)

In [None]:
F.sigmoid_cross_entropy(x, t, normalize=True)

In [None]:
x = np.array([[-2.0, 3.0, -2.0], [-2.0, 3.0, -2.0]]).astype('f')
t = np.array([[-1, -1, -1], [0, 1, 0]]).astype('i')

In [None]:
F.sigmoid_cross_entropy(x, t)

In [None]:
x = np.zeros((1,10), dtype="f")

In [None]:
x[0,[2]] = 10.0

In [None]:
x[:,list(range(2))+list(range(3,10))] = -.2

In [None]:
x[:,:10]

In [None]:
t = np.zeros((1,10), dtype="i")
t[0,5] = 1

In [None]:
F.sigmoid_cross_entropy(x, t)

In [None]:
predicted_out[:1], xp.argmax(predicted_out[:1].data)

In [None]:
labels = 0 * xp.ones(predicted_out[:1].shape).astype('i')

In [None]:
labels.shape

In [None]:
t = xp.zeros(predicted_out[:1].shape).astype('f')

In [None]:
t.shape

In [None]:
t[0,[2,5]] = 50.0

In [None]:
labels[0,[2,5]] = 1

In [None]:
F.sigmoid_cross_entropy(t, labels)

In [None]:
F.sigmoid_cross_entropy(F.softmax(predicted_out[:1]), xp.expand_dims(labels, axis=0))

In [None]:
from gensim.models import KeyedVectors

In [None]:
sim_dict['w'][b'rico'], sim_dict['i'][4]

In [None]:
xp.random.choice(sim_dict['i'][4], 1), sim_dict['i'][w]

In [None]:
for i in range(len(t_alt)):
    print(t_alt, i)
    print(t_alt[i])
    #t_alt[i] = xp.random.sample(sim_dict['i'][t_alt[i]])

In [None]:
print(t_alt)
for i in range(len(t_alt)):
    print(xp.random.choice(sim_dict['i'][int(t_alt[i])],1))
    print(t_alt[i],type(t_alt[i]), int(t_alt[i]))

In [None]:
decoder_batch = y_t
batch_size = decoder_batch.shape[1]
loss = 0
# ---------------------------------------------------------------------
# initialize hidden states as a zero vector
# ---------------------------------------------------------------------
a_units = model.m_cfg['attn_units']
ht = Variable(xp.zeros((batch_size, a_units), dtype=xp.float32))
# ---------------------------------------------------------------------
decoder_input = decoder_batch[0]
# for all sequences in the batch, feed the characters one by one
for curr_word, next_word in zip(decoder_batch, decoder_batch[1:]):
    print(curr_word, next_word)
    decoder_input = curr_word
    # -----------------------------------------------------------------
    # encode tokens
    # -----------------------------------------------------------------
    predicted_out, ht = model.decode(decoder_input, ht)
    decoder_input = F.argmax(predicted_out, axis=1)
    #print(decoder_input)
    # -----------------------------------------------------------------
    # compute loss
    # -----------------------------------------------------------------
    t_alt = xp.copy(next_word.data)
    print(t_alt)
    for i in range(len(t_alt)):
        t_alt[i] = xp.random.choice(sim_dict['i'][int(t_alt[i])],1)
        #print(t[i,[4,1044, 1045, 2477]])
    print(t_alt)

#     t = xp.zeros(shape=predicted_out.shape, dtype='i')
#     print(next_word.data.tolist())
#     print(next_word.shape)
#     for i,w in enumerate(next_word.data.tolist()):
#         if w == PAD_ID:
#             t[i,:] = -1
#         else:
#             t[i,sim_dict['i'][w]] = 1
#         #print(t[i,[4,1044, 1045, 2477]])
#     loss_arr = F.sigmoid_cross_entropy(predicted_out, t, normalize=True)
    loss_arr = F.softmax_cross_entropy(predicted_out, t_alt, normalize=True)
    print("softmax cross entropy:", F.softmax_cross_entropy(predicted_out, next_word), "sigmoid:", loss_arr.data.tolist())
    loss += loss_arr
    
    # -----------------------------------------------------------------
#print(loss, loss / (y.shape[0]-2), y.shape)
print(loss.data.tolist(), (loss / (y.shape[0]-1)).data.tolist())

### Compare BLEU scores at utterance level

In [24]:
eg_utt = "20051023_232057_325_fsp-A-3"
print(sentence_bleu([google_dev_ref_0[eg_utt]], google_hyp_r0[eg_utt], smoothing_function=smooth_fun.method2))
print(sentence_bleu([model_s2t_refs[eg_utt]], model_s2t_hyps[eg_utt], smoothing_function=smooth_fun.method2))

1.0
0.8432240968976374


In [25]:
eg_utt = "20051019_190221_288_fsp-B-1"
print(sentence_bleu([google_dev_ref_0[eg_utt]], google_hyp_r0[eg_utt], smoothing_function=smooth_fun.method2))
print(sentence_bleu([model_s2t_refs[eg_utt]], model_s2t_hyps[eg_utt], smoothing_function=smooth_fun.method2))
print(google_dev_ref_0[eg_utt], google_hyp_r0[eg_utt])
print(model_s2t_refs[eg_utt], model_s2t_hyps[eg_utt])

0
0
['hi'] ['hello']
['hi'] ['hello']


In [26]:
random.seed("haha")
dev_utts = list(model_s2t_refs.keys())
random.shuffle(dev_utts)

In [30]:
os.makedirs(os.path.join(m_cfg['model_dir'], "probs"))

In [87]:
print("google beats model by factor of 2")

count = 0
# print("-"*80)
# print("{0:>5s} ||| {1:30s} ||| {2:>15s} || {3:>15s}".format("sn", "utt", "google utt bleu", "model utt bleu"))
# print("-"*80)
for utt in dev_utts:
    if len(model_s2t_refs[utt]) < 10:
        google_utt_bleu = sentence_bleu([google_dev_ref_0[utt]], google_hyp_r0[utt], smoothing_function=smooth_fun.method2)
        model_utt_bleu = sentence_bleu([model_s2t_refs[utt]], model_s2t_hyps[utt], smoothing_function=smooth_fun.method2)
        if google_utt_bleu >= (2 * model_utt_bleu) and google_utt_bleu >= 0.5:
            count += 1
            #print("{0:5d} ||| {1:30s} ||| {2:15.2f} || {3:15.2f}".format(count, utt, google_utt_bleu, model_utt_bleu))
            print("-"*80)
            print(count)
            print("-"*80)
    #         display_pp = PrettyTable(["cat","sent"], hrules=True)
    #         display_pp.align = "l"
    #         display_pp.header = False
    #         display_pp.add_row(["en ref", textwrap.fill(" ".join(model_s2t_refs[utt]),50)])
    #         display_pp.add_row(["model pred", textwrap.fill(" ".join(model_s2t_hyps[utt]),50)])
    #         display_pp.add_row(["model utt bleu", "{0:.2f}".format(model_utt_bleu)])
    #         display_pp.add_row(["google pred", textwrap.fill(" ".join(google_hyp_r0[utt]),50)])
    #         display_pp.add_row(["google utt bleu", "{0:.2f}".format(google_utt_bleu)])
    #         print(display_pp)
    #         play_utt(utt, map_dict['fisher_dev'])
            generate_translate_probs(utt)
        if count > 50:
            break


google beats model by factor of 2
--------------------------------------------------------------------------------
1
--------------------------------------------------------------------------------
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051016_210626_267_fsp-A-51_probs.json
Utterance: 20051016_210626_267_fsp-A-51
+-------------+------+
| es ref      | sí   |
+-------------+------+
| en ref      | yes  |
+-------------+------+
| model pred  | oh   |
+-------------+------+
| model bleu  | 0.00 |
+-------------+------+
| google pred | yes  |
+-------------+------+
| google bleu | 0.84 |
+-------------+------+
377.84 378.6


------------------------------------------------------------
predicting word : 0
oh         = 0.463
ah         = 0.213
mm         = 0.055
uh         = 0.031
hmm        = 0.021
------------------------------------------------------------
predicting word : 1
_EOS       = 0.735
that       = 0.029
no         = 0.024
well       = 0.016
yes        = 0.015
--------------------------------------------------------------------------------
2
--------------------------------------------------------------------------------
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051019_230329_292_fsp-A-60_probs.json
Utterance: 20051019_230329_292_fsp-A-60
+-------------+------+
| es ref      | mhm  |
+-------------+------+
| en ref      | mm   |
+-------------+------+
| model pred  | mhm  |
+-------------+------+
| model bleu  | 0.40 |
+-------------+------+
| google pred | mm   |
+-------------+------+
| google bleu | 0.71 |
+-------------+------+
529.02 530.14


------------------------------------------------------------
predicting word : 0
mhm        = 0.291
mm         = 0.190
hmm        = 0.165
hm         = 0.142
uh         = 0.073
------------------------------------------------------------
predicting word : 1
_EOS       = 0.965
to         = 0.006
me         = 0.002
''         = 0.002
and        = 0.001
--------------------------------------------------------------------------------
3
--------------------------------------------------------------------------------
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051009_210519_219_fsp-A-16_probs.json
Utterance: 20051009_210519_219_fsp-A-16
+-------------+-------------------------------------+
| es ref      | yea la religión es como la política |
+-------------+-------------------------------------+
| en ref      | yeah religion is like politics      |
+-------------+-------------------------------------+
| model pred  | yes the majority is like politics   |

------------------------------------------------------------
predicting word : 0
yes        = 0.423
yeah       = 0.277
i          = 0.059
and        = 0.041
yea        = 0.029
------------------------------------------------------------
predicting word : 1
the        = 0.307
thank      = 0.124
religion   = 0.099
i          = 0.048
god        = 0.028
------------------------------------------------------------
predicting word : 2
majority   = 0.197
truth      = 0.121
religion   = 0.039
reality    = 0.035
radio      = 0.033
------------------------------------------------------------
predicting word : 3
is         = 0.575
how        = 0.098
of         = 0.083
like       = 0.073
are        = 0.046
------------------------------------------------------------
predicting word : 4
like       = 0.773
how        = 0.054
about      = 0.015
the        = 0.014
is         = 0.007
------------------------------------------------------------
predicting word : 5
politics   = 0.616
a          = 0.264
t

------------------------------------------------------------
predicting word : 0
yes        = 0.340
you        = 0.119
eh         = 0.054
yeah       = 0.052
so         = 0.048
------------------------------------------------------------
predicting word : 1
that       = 0.311
you        = 0.290
what       = 0.115
it         = 0.035
who        = 0.020
------------------------------------------------------------
predicting word : 2
you        = 0.695
has        = 0.049
he         = 0.034
they       = 0.026
have       = 0.025
------------------------------------------------------------
predicting word : 3
have       = 0.475
can        = 0.090
're        = 0.085
are        = 0.071
agree      = 0.034
------------------------------------------------------------
predicting word : 4
to         = 0.499
the        = 0.061
careful    = 0.057
in         = 0.040
a          = 0.033
------------------------------------------------------------
predicting word : 5
take       = 0.286
be         = 0.192
c

------------------------------------------------------------
predicting word : 0
mm         = 0.219
yes        = 0.179
um         = 0.128
hmm        = 0.108
eh         = 0.079
------------------------------------------------------------
predicting word : 1
_EOS       = 0.977
yes        = 0.003
no         = 0.002
but        = 0.002
and        = 0.001
--------------------------------------------------------------------------------
6
--------------------------------------------------------------------------------
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051018_210744_280_fsp-B-12_probs.json
Utterance: 20051018_210744_280_fsp-B-12
+-------------+----------------------------------------------+
| es ref      | y estoy esperando otro bebé en dos semanas   |
+-------------+----------------------------------------------+
| en ref      | and i am expecting another baby in two weeks |
+-------------+----------------------------------------------+
| model 

------------------------------------------------------------
predicting word : 0
i          = 0.491
and        = 0.270
that       = 0.121
what       = 0.018
who        = 0.013
------------------------------------------------------------
predicting word : 1
'm         = 0.694
am         = 0.267
went       = 0.018
was        = 0.003
left       = 0.003
------------------------------------------------------------
predicting word : 2
waiting    = 0.511
in         = 0.021
studying   = 0.017
giving     = 0.016
working    = 0.014
------------------------------------------------------------
predicting word : 3
for        = 0.356
another    = 0.274
to         = 0.086
other      = 0.058
eight      = 0.038
------------------------------------------------------------
predicting word : 4
another    = 0.500
other      = 0.188
the        = 0.044
a          = 0.035
eight      = 0.035
------------------------------------------------------------
predicting word : 5
other      = 0.089
one        = 0.069
t

------------------------------------------------------------
predicting word : 0
i          = 0.206
that       = 0.189
what       = 0.115
they       = 0.070
it         = 0.063
------------------------------------------------------------
predicting word : 1
know       = 0.333
mean       = 0.247
think      = 0.103
do         = 0.086
thought    = 0.038
------------------------------------------------------------
predicting word : 2
that       = 0.510
it         = 0.148
i          = 0.023
near       = 0.020
they       = 0.018
------------------------------------------------------------
predicting word : 3
it         = 0.209
's         = 0.158
he         = 0.044
would      = 0.037
they       = 0.036
------------------------------------------------------------
predicting word : 4
's         = 0.300
should     = 0.150
would      = 0.100
will       = 0.089
was        = 0.087
------------------------------------------------------------
predicting word : 5
true       = 0.341
really     = 0.058
g

------------------------------------------------------------
predicting word : 0
that       = 0.563
what       = 0.114
to         = 0.032
you        = 0.032
who        = 0.022
------------------------------------------------------------
predicting word : 1
you        = 0.407
's         = 0.085
they       = 0.062
it         = 0.051
is         = 0.047
------------------------------------------------------------
predicting word : 2
're        = 0.211
are        = 0.107
do         = 0.079
get        = 0.063
've        = 0.056
------------------------------------------------------------
predicting word : 3
going      = 0.395
doing      = 0.094
used       = 0.029
already    = 0.027
health     = 0.022
------------------------------------------------------------
predicting word : 4
to         = 0.962
out        = 0.009
used       = 0.004
on         = 0.003
for        = 0.001
------------------------------------------------------------
predicting word : 5
be         = 0.210
use        = 0.089
m

------------------------------------------------------------
predicting word : 0
so         = 0.523
then       = 0.467
and        = 0.005
that       = 0.000
well       = 0.000
------------------------------------------------------------
predicting word : 1
he         = 0.787
they       = 0.053
it         = 0.045
she        = 0.038
the        = 0.020
------------------------------------------------------------
predicting word : 2
says       = 0.868
said       = 0.062
tells      = 0.042
told       = 0.007
he         = 0.003
------------------------------------------------------------
predicting word : 3
that       = 0.716
he         = 0.096
they       = 0.025
it         = 0.019
she        = 0.017
------------------------------------------------------------
predicting word : 4
we         = 0.211
he         = 0.169
people     = 0.153
they       = 0.150
it         = 0.059
------------------------------------------------------------
predicting word : 5
're        = 0.385
are        = 0.379
s

------------------------------------------------------------
predicting word : 0
sure       = 0.586
of         = 0.064
how        = 0.056
yes        = 0.046
what       = 0.034
------------------------------------------------------------
predicting word : 1
sure       = 0.329
how        = 0.164
like       = 0.143
what       = 0.062
_EOS       = 0.036
------------------------------------------------------------
predicting word : 2
_EOS       = 0.989
sure       = 0.001
so         = 0.001
how        = 0.001
and        = 0.001
--------------------------------------------------------------------------------
11
--------------------------------------------------------------------------------
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051026_180724_341_fsp-A-57_probs.json
Utterance: 20051026_180724_341_fsp-A-57
+-------------+----------------+
| es ref      | exactly        |
+-------------+----------------+
| en ref      | exactly        |
+-------------

------------------------------------------------------------
predicting word : 0
in         = 0.369
exactly    = 0.049
the        = 0.036
uh         = 0.028
iraq       = 0.015
------------------------------------------------------------
predicting word : 1
connecticut = 0.131
english    = 0.116
iraq       = 0.087
the        = 0.071
canada     = 0.047
------------------------------------------------------------
predicting word : 2
_EOS       = 0.987
in         = 0.001
here       = 0.001
right      = 0.001
you        = 0.001
--------------------------------------------------------------------------------
12
--------------------------------------------------------------------------------
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051017_180712_270_fsp-B-66_probs.json
Utterance: 20051017_180712_270_fsp-B-66
+-------------+-------+
| es ref      | hm mm |
+-------------+-------+
| en ref      | hm mm |
+-------------+-------+
| model pred  | mhm   |
+

------------------------------------------------------------
predicting word : 0
mhm        = 0.229
hmm        = 0.205
mm         = 0.177
hm         = 0.165
uh         = 0.075
------------------------------------------------------------
predicting word : 1
_EOS       = 0.982
to         = 0.003
''         = 0.001
and        = 0.001
me         = 0.001
--------------------------------------------------------------------------------
13
--------------------------------------------------------------------------------
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051017_234550_276_fsp-B-109_probs.json
Utterance: 20051017_234550_276_fsp-B-109
+-------------+--------------------------------------------------+
| es ref      | el el título de lo que están hablando el tema    |
+-------------+--------------------------------------------------+
| en ref      | title of what they 're talking about the theme   |
+-------------+-------------------------------------

------------------------------------------------------------
predicting word : 0
the        = 0.797
eh         = 0.073
he         = 0.027
and        = 0.016
uh         = 0.015
------------------------------------------------------------
predicting word : 1
the        = 0.311
attitude   = 0.027
typical    = 0.018
eh         = 0.014
one        = 0.010
------------------------------------------------------------
predicting word : 2
the        = 0.236
one        = 0.025
attitude   = 0.022
typical    = 0.014
beginning  = 0.011
------------------------------------------------------------
predicting word : 3
the        = 0.181
one        = 0.059
attitude   = 0.016
place      = 0.014
last       = 0.013
------------------------------------------------------------
predicting word : 4
the        = 0.143
one        = 0.122
place      = 0.019
last       = 0.015
_EOS       = 0.011
------------------------------------------------------------
predicting word : 5
one        = 0.200
the        = 0.120
p

------------------------------------------------------------
predicting word : 0
this       = 0.169
is         = 0.097
it         = 0.056
in         = 0.049
i          = 0.044
------------------------------------------------------------
predicting word : 1
is         = 0.521
place      = 0.034
kind       = 0.023
university = 0.019
can        = 0.013
------------------------------------------------------------
predicting word : 2
the        = 0.115
a          = 0.040
good       = 0.040
going      = 0.025
divorced   = 0.024
------------------------------------------------------------
predicting word : 3
university = 0.169
same       = 0.150
west       = 0.041
place      = 0.024
system     = 0.020
------------------------------------------------------------
predicting word : 4
there      = 0.739
and        = 0.088
is         = 0.021
in         = 0.016
of         = 0.011
------------------------------------------------------------
predicting word : 5
's         = 0.472
are        = 0.297
i

------------------------------------------------------------
predicting word : 0
in         = 0.243
eh         = 0.211
but        = 0.182
oh         = 0.024
yes        = 0.019
------------------------------------------------------------
predicting word : 1
new        = 0.321
a          = 0.290
the        = 0.080
an         = 0.056
in         = 0.050
------------------------------------------------------------
predicting word : 2
university = 0.781
york       = 0.140
cities     = 0.011
city       = 0.006
college    = 0.004
------------------------------------------------------------
predicting word : 3
is         = 0.395
very       = 0.260
it         = 0.136
are        = 0.022
a          = 0.014
------------------------------------------------------------
predicting word : 4
very       = 0.866
really     = 0.031
not        = 0.017
a          = 0.013
because    = 0.010
------------------------------------------------------------
predicting word : 5
small      = 0.716
little     = 0.020
d

------------------------------------------------------------
predicting word : 0
we         = 0.691
eh         = 0.102
uh         = 0.052
have       = 0.029
and        = 0.020
------------------------------------------------------------
predicting word : 1
have       = 0.781
had        = 0.035
we         = 0.022
are        = 0.021
've        = 0.018
------------------------------------------------------------
predicting word : 2
two        = 0.497
a          = 0.069
three      = 0.042
to         = 0.038
been       = 0.016
------------------------------------------------------------
predicting word : 3
or         = 0.057
minutes    = 0.053
friends    = 0.035
years      = 0.031
bedrooms   = 0.027
------------------------------------------------------------
predicting word : 4
two        = 0.221
a          = 0.108
three      = 0.021
half       = 0.018
me         = 0.015
------------------------------------------------------------
predicting word : 5
_EOS       = 0.093
or         = 0.078
y

------------------------------------------------------------
predicting word : 0
ah         = 0.319
oh         = 0.176
eh         = 0.083
uh         = 0.068
yes        = 0.052
------------------------------------------------------------
predicting word : 1
_EOS       = 0.921
no         = 0.006
yes        = 0.005
that       = 0.004
and        = 0.004
--------------------------------------------------------------------------------
18
--------------------------------------------------------------------------------
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051022_180817_311_fsp-B-14_probs.json
Utterance: 20051022_180817_311_fsp-B-14
+-------------+----------------------------------+
| es ref      | no ¿a usted le gusta el reguetón |
+-------------+----------------------------------+
| en ref      | no do you like reggaeton         |
+-------------+----------------------------------+
| model pred  | no but you are from toronto      |
+-------------+-

------------------------------------------------------------
predicting word : 0
no         = 0.705
ah         = 0.085
oh         = 0.063
nothing    = 0.025
not        = 0.018
------------------------------------------------------------
predicting word : 1
but        = 0.187
the        = 0.115
at         = 0.073
to         = 0.053
in         = 0.041
------------------------------------------------------------
predicting word : 2
you        = 0.131
of         = 0.070
where      = 0.061
in         = 0.052
they       = 0.052
------------------------------------------------------------
predicting word : 3
are        = 0.248
're        = 0.202
were       = 0.055
try        = 0.036
have       = 0.031
------------------------------------------------------------
predicting word : 4
from       = 0.552
in         = 0.058
single     = 0.030
_EOS       = 0.020
here       = 0.018
------------------------------------------------------------
predicting word : 5
toronto    = 0.497
_EOS       = 0.111
w

------------------------------------------------------------
predicting word : 0
exactly    = 0.420
yes        = 0.224
that       = 0.046
how        = 0.035
houston    = 0.026
------------------------------------------------------------
predicting word : 1
_EOS       = 0.991
right      = 0.001
also       = 0.001
me         = 0.001
that       = 0.001
--------------------------------------------------------------------------------
20
--------------------------------------------------------------------------------
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051017_220530_275_fsp-A-26_probs.json
Utterance: 20051017_220530_275_fsp-A-26
+-------------+-----------------------------------+
| es ref      | eh no hables con esa persona      |
+-------------+-----------------------------------+
| en ref      | uhm do not talk to that person    |
+-------------+-----------------------------------+
| model pred  | in new york it 's not that person |
+---------

------------------------------------------------------------
predicting word : 0
in         = 0.477
eh         = 0.330
uh         = 0.047
oh         = 0.015
em         = 0.010
------------------------------------------------------------
predicting word : 1
new        = 0.219
north      = 0.125
in         = 0.060
no         = 0.040
orlando    = 0.022
------------------------------------------------------------
predicting word : 2
york       = 0.394
orleans    = 0.064
to         = 0.063
school     = 0.063
or         = 0.019
------------------------------------------------------------
predicting word : 3
it         = 0.094
that       = 0.088
you        = 0.056
with       = 0.045
_EOS       = 0.041
------------------------------------------------------------
predicting word : 4
's         = 0.384
was        = 0.151
does       = 0.151
is         = 0.045
can        = 0.022
------------------------------------------------------------
predicting word : 5
not        = 0.495
the        = 0.061
a

------------------------------------------------------------
predicting word : 0
that       = 0.840
this       = 0.081
those      = 0.021
so         = 0.019
it         = 0.006
------------------------------------------------------------
predicting word : 1
's         = 0.495
that       = 0.302
was        = 0.049
is         = 0.042
this       = 0.039
------------------------------------------------------------
predicting word : 2
what       = 0.382
that       = 0.202
why        = 0.089
it         = 0.084
not        = 0.043
------------------------------------------------------------
predicting word : 3
i          = 0.586
they       = 0.163
she        = 0.032
it         = 0.025
he         = 0.023
------------------------------------------------------------
predicting word : 4
did        = 0.179
never      = 0.093
had        = 0.072
'm         = 0.068
've        = 0.063
------------------------------------------------------------
predicting word : 5
n't        = 0.951
never      = 0.019
n

------------------------------------------------------------
predicting word : 0
christians = 0.226
christian  = 0.186
the        = 0.028
of         = 0.023
that       = 0.011
------------------------------------------------------------
predicting word : 1
that       = 0.440
who        = 0.044
i          = 0.037
than       = 0.030
is         = 0.015
------------------------------------------------------------
predicting word : 2
really     = 0.067
general    = 0.059
gives      = 0.049
they       = 0.042
would      = 0.041
------------------------------------------------------------
predicting word : 3
_EOS       = 0.355
and        = 0.219
is         = 0.039
right      = 0.027
strange    = 0.017
--------------------------------------------------------------------------------
23
--------------------------------------------------------------------------------
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051026_211309_346_fsp-B-32_probs.json
Utterance:

------------------------------------------------------------
predicting word : 0
so         = 0.640
then       = 0.308
that       = 0.008
and        = 0.006
this       = 0.006
------------------------------------------------------------
predicting word : 1
_EOS       = 0.770
then       = 0.056
much       = 0.045
i          = 0.015
that       = 0.013
--------------------------------------------------------------------------------
24
--------------------------------------------------------------------------------
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051024_180453_327_fsp-A-27_probs.json
Utterance: 20051024_180453_327_fsp-A-27
+-------------+-------------------+
| es ref      | odale ¿en serio   |
+-------------+-------------------+
| en ref      | wow really        |
+-------------+-------------------+
| model pred  | now the seriously |
+-------------+-------------------+
| model bleu  | 0.16              |
+-------------+-------------------

------------------------------------------------------------
predicting word : 0
now        = 0.191
no         = 0.115
hello      = 0.080
oh         = 0.079
wow        = 0.072
------------------------------------------------------------
predicting word : 1
the        = 0.243
in         = 0.229
really     = 0.178
seriously  = 0.077
on         = 0.034
------------------------------------------------------------
predicting word : 2
seriously  = 0.115
cold       = 0.081
serious    = 0.059
really     = 0.029
difference = 0.028
------------------------------------------------------------
predicting word : 3
_EOS       = 0.731
seriously  = 0.022
of         = 0.019
in         = 0.017
would      = 0.013
--------------------------------------------------------------------------------
25
--------------------------------------------------------------------------------
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051016_210626_267_fsp-B-20_probs.json
Utterance:

------------------------------------------------------------
predicting word : 0
oh         = 0.801
ah         = 0.074
or         = 0.020
wow        = 0.017
uh         = 0.010
------------------------------------------------------------
predicting word : 1
oh         = 0.210
wow        = 0.116
boy        = 0.090
poor       = 0.067
_EOS       = 0.061
------------------------------------------------------------
predicting word : 2
oh         = 0.158
_EOS       = 0.141
boy        = 0.101
wow        = 0.089
my         = 0.065
------------------------------------------------------------
predicting word : 3
_EOS       = 0.286
oh         = 0.171
boy        = 0.069
wow        = 0.067
my         = 0.055
--------------------------------------------------------------------------------
26
--------------------------------------------------------------------------------
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051016_210626_267_fsp-A-70_probs.json
Utterance:

------------------------------------------------------------
predicting word : 0
and        = 0.764
what       = 0.052
¿and       = 0.045
if         = 0.026
so         = 0.019
------------------------------------------------------------
predicting word : 1
what       = 0.514
who        = 0.054
you        = 0.048
want       = 0.045
that       = 0.043
------------------------------------------------------------
predicting word : 2
do         = 0.267
's         = 0.107
are        = 0.083
have       = 0.075
a          = 0.074
------------------------------------------------------------
predicting word : 3
you        = 0.828
they       = 0.101
he         = 0.024
i          = 0.023
she        = 0.005
------------------------------------------------------------
predicting word : 4
want       = 0.475
have       = 0.461
do         = 0.021
like       = 0.006
come       = 0.003
------------------------------------------------------------
predicting word : 5
to         = 0.923
work       = 0.013
a

------------------------------------------------------------
predicting word : 0
for        = 0.240
to         = 0.138
so         = 0.086
that       = 0.073
it         = 0.047
------------------------------------------------------------
predicting word : 1
tired      = 0.355
being      = 0.029
thinking   = 0.027
you        = 0.026
so         = 0.025
------------------------------------------------------------
predicting word : 2
_EOS       = 0.692
of         = 0.222
about      = 0.025
you        = 0.006
for        = 0.004
--------------------------------------------------------------------------------
28
--------------------------------------------------------------------------------
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051026_211309_346_fsp-B-40_probs.json
Utterance: 20051026_211309_346_fsp-B-40
+-------------+---------+
| es ref      | mm mhm  |
+-------------+---------+
| en ref      | mm hmm  |
+-------------+---------+
| model pred  | 

------------------------------------------------------------
predicting word : 0
hmm        = 0.241
mhm        = 0.238
mm         = 0.194
uh         = 0.110
mmm        = 0.063
------------------------------------------------------------
predicting word : 1
hmm        = 0.589
_EOS       = 0.186
mm         = 0.070
mhm        = 0.043
uh         = 0.031
------------------------------------------------------------
predicting word : 2
_EOS       = 0.948
but        = 0.002
way        = 0.002
thing      = 0.002
one        = 0.002
--------------------------------------------------------------------------------
29
--------------------------------------------------------------------------------
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051022_180817_311_fsp-B-70_probs.json
Utterance: 20051022_180817_311_fsp-B-70
+-------------+-------------------+
| es ref      | sí que bueno      |
+-------------+-------------------+
| en ref      | yeah that 's good |
+-

------------------------------------------------------------
predicting word : 0
yes        = 0.898
yeah       = 0.074
so         = 0.005
yea        = 0.005
and        = 0.004
------------------------------------------------------------
predicting word : 1
how        = 0.441
that       = 0.198
well       = 0.139
what       = 0.042
of         = 0.031
------------------------------------------------------------
predicting word : 2
nice       = 0.562
good       = 0.363
well       = 0.019
how        = 0.008
_EOS       = 0.006
------------------------------------------------------------
predicting word : 3
_EOS       = 0.984
to         = 0.001
and        = 0.001
i          = 0.001
are        = 0.001
--------------------------------------------------------------------------------
30
--------------------------------------------------------------------------------
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051017_180712_270_fsp-B-31_probs.json
Utterance:

------------------------------------------------------------
predicting word : 0
mhm        = 0.280
hmm        = 0.223
mm         = 0.177
hm         = 0.094
uh         = 0.076
------------------------------------------------------------
predicting word : 1
mhm        = 0.863
_EOS       = 0.067
hmm        = 0.019
mm         = 0.016
hm         = 0.006
------------------------------------------------------------
predicting word : 2
_EOS       = 0.979
to         = 0.005
and        = 0.001
''         = 0.001
that       = 0.001
--------------------------------------------------------------------------------
31
--------------------------------------------------------------------------------
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051017_234550_276_fsp-A-124_probs.json
Utterance: 20051017_234550_276_fsp-A-124
+-------------+---------------------------------------------+
| es ref      | no me gusta de todo ahora                   |
+-------------+-----

------------------------------------------------------------
predicting word : 0
no         = 0.452
i          = 0.427
they       = 0.025
do         = 0.024
not        = 0.018
------------------------------------------------------------
predicting word : 1
i          = 0.935
like       = 0.016
they       = 0.006
he         = 0.005
do         = 0.004
------------------------------------------------------------
predicting word : 2
do         = 0.684
like       = 0.273
really     = 0.011
did        = 0.005
liked      = 0.004
------------------------------------------------------------
predicting word : 3
n't        = 0.989
like       = 0.008
not        = 0.003
really     = 0.000
i          = 0.000
------------------------------------------------------------
predicting word : 4
like       = 0.991
really     = 0.003
do         = 0.001
i          = 0.001
know       = 0.000
------------------------------------------------------------
predicting word : 5
everything = 0.731
all        = 0.109
i

------------------------------------------------------------
predicting word : 0
no         = 0.243
it         = 0.084
that       = 0.061
you        = 0.058
but        = 0.052
------------------------------------------------------------
predicting word : 1
the        = 0.151
it         = 0.129
no         = 0.094
but        = 0.063
number     = 0.043
------------------------------------------------------------
predicting word : 2
truth      = 0.162
first      = 0.133
number     = 0.124
same       = 0.057
worst      = 0.025
------------------------------------------------------------
predicting word : 3
is         = 0.592
yes        = 0.143
like       = 0.078
no         = 0.014
_EOS       = 0.012
------------------------------------------------------------
predicting word : 4
that       = 0.414
like       = 0.097
right      = 0.057
yes        = 0.056
_EOS       = 0.041
------------------------------------------------------------
predicting word : 5
_EOS       = 0.613
's         = 0.103
y

------------------------------------------------------------
predicting word : 0
was        = 0.588
he         = 0.101
it         = 0.095
i          = 0.041
were       = 0.032
------------------------------------------------------------
predicting word : 1
still      = 0.123
living     = 0.057
all        = 0.041
watching   = 0.030
there      = 0.028
------------------------------------------------------------
predicting word : 2
the        = 0.124
my         = 0.059
friends    = 0.052
sunday     = 0.031
two        = 0.023
------------------------------------------------------------
predicting word : 3
same       = 0.624
sunday     = 0.062
friends    = 0.049
doctor     = 0.032
friend     = 0.024
------------------------------------------------------------
predicting word : 4
_EOS       = 0.809
thing      = 0.036
one        = 0.035
time       = 0.012
for        = 0.010
--------------------------------------------------------------------------------
34
------------------------------------

------------------------------------------------------------
predicting word : 0
yes        = 0.158
so         = 0.139
i          = 0.099
that       = 0.073
in         = 0.052
------------------------------------------------------------
predicting word : 1
that       = 0.487
how        = 0.300
well       = 0.052
good       = 0.039
nice       = 0.031
------------------------------------------------------------
predicting word : 2
's         = 0.967
good       = 0.009
well       = 0.006
is         = 0.006
nice       = 0.004
------------------------------------------------------------
predicting word : 3
good       = 0.866
nice       = 0.111
great      = 0.009
how        = 0.004
fine       = 0.001
------------------------------------------------------------
predicting word : 4
_EOS       = 0.974
that       = 0.004
to         = 0.003
for        = 0.002
and        = 0.001
--------------------------------------------------------------------------------
35
------------------------------------

------------------------------------------------------------
predicting word : 0
it         = 0.121
super      = 0.085
yes        = 0.071
a          = 0.055
if         = 0.044
------------------------------------------------------------
predicting word : 1
's         = 0.518
was        = 0.271
is         = 0.030
does       = 0.015
looks      = 0.015
------------------------------------------------------------
predicting word : 2
a          = 0.199
an         = 0.064
hot        = 0.059
good       = 0.058
super      = 0.045
------------------------------------------------------------
predicting word : 3
good       = 0.103
lot        = 0.060
very       = 0.053
little     = 0.045
hot        = 0.037
------------------------------------------------------------
predicting word : 4
experience = 0.103
area       = 0.071
part       = 0.070
company    = 0.028
hot        = 0.027
------------------------------------------------------------
predicting word : 5
but        = 0.962
_EOS       = 0.004
o

------------------------------------------------------------
predicting word : 0
ah         = 0.717
oh         = 0.174
okay       = 0.049
ok         = 0.032
uh         = 0.006
------------------------------------------------------------
predicting word : 1
okay       = 0.640
ok         = 0.344
ah         = 0.004
yes        = 0.002
oh         = 0.002
------------------------------------------------------------
predicting word : 2
_EOS       = 0.976
okay       = 0.008
ok         = 0.001
yes        = 0.001
and        = 0.001
--------------------------------------------------------------------------------
37
--------------------------------------------------------------------------------
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051023_232057_325_fsp-A-83_probs.json
Utterance: 20051023_232057_325_fsp-A-83
+-------------+---------------------------------------------+
| es ref      | pero el tráfico es horrible déjame decirte  |
+-------------+-------

------------------------------------------------------------
predicting word : 0
but        = 0.883
the        = 0.012
it         = 0.008
right      = 0.005
so         = 0.004
------------------------------------------------------------
predicting word : 1
traffic    = 0.435
the        = 0.128
of         = 0.040
in         = 0.026
mexico     = 0.025
------------------------------------------------------------
predicting word : 2
that       = 0.253
those      = 0.150
is         = 0.100
this       = 0.053
and        = 0.048
------------------------------------------------------------
predicting word : 3
's         = 0.879
is         = 0.073
are        = 0.012
comes      = 0.002
those      = 0.002
------------------------------------------------------------
predicting word : 4
horrible   = 0.254
nice       = 0.066
the        = 0.056
terrible   = 0.043
really     = 0.038
------------------------------------------------------------
predicting word : 5
_EOS       = 0.554
it         = 0.097
i

------------------------------------------------------------
predicting word : 0
my         = 0.326
thirty     = 0.038
twenty     = 0.027
the        = 0.024
thousand   = 0.021
------------------------------------------------------------
predicting word : 1
own        = 0.041
thirty     = 0.034
credit     = 0.034
interest   = 0.027
train      = 0.025
------------------------------------------------------------
predicting word : 2
interest   = 0.077
_EOS       = 0.061
mind       = 0.047
one        = 0.038
baby       = 0.036
------------------------------------------------------------
predicting word : 3
_EOS       = 0.846
''         = 0.059
's         = 0.007
and        = 0.007
for        = 0.006
--------------------------------------------------------------------------------
39
--------------------------------------------------------------------------------
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051019_190221_288_fsp-A-76_probs.json
Utterance:

------------------------------------------------------------
predicting word : 0
i          = 0.890
and        = 0.045
god        = 0.016
they       = 0.015
you        = 0.003
------------------------------------------------------------
predicting word : 1
got        = 0.055
'm         = 0.054
should     = 0.051
get        = 0.036
thought    = 0.035
------------------------------------------------------------
predicting word : 2
like       = 0.148
married    = 0.048
already    = 0.039
it         = 0.036
back       = 0.034
------------------------------------------------------------
predicting word : 3
that       = 0.698
this       = 0.089
i          = 0.047
it         = 0.031
so         = 0.017
------------------------------------------------------------
predicting word : 4
so         = 0.754
i          = 0.078
since      = 0.016
_EOS       = 0.016
that       = 0.012
------------------------------------------------------------
predicting word : 5
i          = 0.339
_EOS       = 0.135
t

------------------------------------------------------------
predicting word : 0
pardon     = 0.786
hello      = 0.106
sorry      = 0.057
¿i         = 0.010
¿hello     = 0.010
------------------------------------------------------------
predicting word : 1
_EOS       = 0.870
me         = 0.056
that       = 0.012
''         = 0.005
you        = 0.004
--------------------------------------------------------------------------------
41
--------------------------------------------------------------------------------
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051017_234550_276_fsp-A-113_probs.json
Utterance: 20051017_234550_276_fsp-A-113
+-------------+--------------------------------------+
| es ref      | problemas programas de conversación  |
+-------------+--------------------------------------+
| en ref      | problems conversation programs       |
+-------------+--------------------------------------+
| model pred  | all the problems of with comm

------------------------------------------------------------
predicting word : 0
all        = 0.470
we         = 0.111
the        = 0.108
problems   = 0.102
every      = 0.046
------------------------------------------------------------
predicting word : 1
the        = 0.679
problems   = 0.123
of         = 0.048
all        = 0.040
that       = 0.008
------------------------------------------------------------
predicting word : 2
problems   = 0.678
problem    = 0.106
whole      = 0.060
programs   = 0.034
rest       = 0.016
------------------------------------------------------------
predicting word : 3
of         = 0.139
programs   = 0.082
eh         = 0.074
uh         = 0.073
ah         = 0.048
------------------------------------------------------------
predicting word : 4
with       = 0.310
commercials = 0.139
conversation = 0.063
of         = 0.049
commercial = 0.033
------------------------------------------------------------
predicting word : 5
commercials = 0.135
conversation = 0

------------------------------------------------------------
predicting word : 0
so         = 0.436
that       = 0.149
yes        = 0.078
okay       = 0.051
ah         = 0.046
------------------------------------------------------------
predicting word : 1
_EOS       = 0.776
i          = 0.041
much       = 0.036
then       = 0.014
that       = 0.012
--------------------------------------------------------------------------------
43
--------------------------------------------------------------------------------
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051022_180817_311_fsp-B-84_probs.json
Utterance: 20051022_180817_311_fsp-B-84
+-------------+----------------------------------------------------+
| es ref      | siempre es bueno tocar música yo creo              |
+-------------+----------------------------------------------------+
| en ref      | its always good to play music i think              |
+-------------+-------------------------------

------------------------------------------------------------
predicting word : 0
they       = 0.207
there      = 0.207
i          = 0.107
it         = 0.074
always     = 0.061
------------------------------------------------------------
predicting word : 1
're        = 0.381
are        = 0.293
always     = 0.193
have       = 0.041
've        = 0.028
------------------------------------------------------------
predicting word : 2
always     = 0.903
the        = 0.019
still      = 0.004
going      = 0.004
there      = 0.003
------------------------------------------------------------
predicting word : 3
good       = 0.619
the        = 0.078
well       = 0.067
peruvian   = 0.011
some       = 0.008
------------------------------------------------------------
predicting word : 4
looking    = 0.078
they       = 0.068
to         = 0.053
that       = 0.037
than       = 0.035
------------------------------------------------------------
predicting word : 5
for        = 0.410
in         = 0.073
a

------------------------------------------------------------
predicting word : 0
putting    = 0.059
i          = 0.054
well       = 0.026
good       = 0.018
fine       = 0.018
------------------------------------------------------------
predicting word : 1
music      = 0.801
the        = 0.023
me         = 0.015
a          = 0.012
to         = 0.007
------------------------------------------------------------
predicting word : 2
_EOS       = 0.897
on         = 0.009
old        = 0.009
from       = 0.006
at         = 0.006
--------------------------------------------------------------------------------
45
--------------------------------------------------------------------------------
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051009_182032_217_fsp-A-25_probs.json
Utterance: 20051009_182032_217_fsp-A-25
+-------------+----------------------------------+
| es ref      | fui a a otra universidad         |
+-------------+-----------------------------

------------------------------------------------------------
predicting word : 0
i          = 0.155
there      = 0.050
ah         = 0.047
went       = 0.039
she        = 0.037
------------------------------------------------------------
predicting word : 1
'm         = 0.344
went       = 0.251
could      = 0.105
am         = 0.059
was        = 0.047
------------------------------------------------------------
predicting word : 2
ah         = 0.121
going      = 0.112
already    = 0.074
a          = 0.054
there      = 0.036
------------------------------------------------------------
predicting word : 3
ah         = 0.370
in         = 0.163
oh         = 0.083
audrey     = 0.040
to         = 0.030
------------------------------------------------------------
predicting word : 4
in         = 0.235
ah         = 0.104
another    = 0.082
audrey     = 0.038
from       = 0.030
------------------------------------------------------------
predicting word : 5
another    = 0.384
other      = 0.160
t

------------------------------------------------------------
predicting word : 0
ah         = 0.706
oh         = 0.266
there      = 0.004
uh         = 0.003
aye        = 0.003
------------------------------------------------------------
predicting word : 1
yes        = 0.857
yeah       = 0.044
really     = 0.030
ah         = 0.014
oh         = 0.010
------------------------------------------------------------
predicting word : 2
yes        = 0.190
they       = 0.183
you        = 0.165
if         = 0.134
do         = 0.073
------------------------------------------------------------
predicting word : 3
they       = 0.306
you        = 0.180
it         = 0.066
yes        = 0.064
if         = 0.053
------------------------------------------------------------
predicting word : 4
have       = 0.506
're        = 0.132
are        = 0.115
do         = 0.042
call       = 0.027
------------------------------------------------------------
predicting word : 5
a          = 0.595
an         = 0.165
o

------------------------------------------------------------
predicting word : 0
in         = 0.617
those      = 0.045
from       = 0.038
that       = 0.038
of         = 0.028
------------------------------------------------------------
predicting word : 1
spain      = 0.645
los        = 0.059
these      = 0.040
those      = 0.027
the        = 0.025
------------------------------------------------------------
predicting word : 2
_EOS       = 0.984
right      = 0.001
and        = 0.001
or         = 0.001
for        = 0.001
--------------------------------------------------------------------------------
48
--------------------------------------------------------------------------------
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051017_180712_270_fsp-B-81_probs.json
Utterance: 20051017_180712_270_fsp-B-81
+-------------+-------+
| es ref      | hm mm |
+-------------+-------+
| en ref      | hm mm |
+-------------+-------+
| model pred  | mm    |
+-

------------------------------------------------------------
predicting word : 0
mm         = 0.296
hmm        = 0.188
hm         = 0.125
mhm        = 0.118
um         = 0.083
------------------------------------------------------------
predicting word : 1
_EOS       = 0.952
and        = 0.003
no         = 0.003
yes        = 0.002
but        = 0.002
--------------------------------------------------------------------------------
49
--------------------------------------------------------------------------------
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051016_180547_265_fsp-A-67_probs.json
Utterance: 20051016_180547_265_fsp-A-67
+-------------+--------------------------------+
| es ref      | dice esto es para proteger     |
+-------------+--------------------------------+
| en ref      | says this is to protect        |
+-------------+--------------------------------+
| model pred  | and then to get to respect     |
+-------------+-------------

------------------------------------------------------------
predicting word : 0
and        = 0.304
yes        = 0.187
so         = 0.082
if         = 0.058
that       = 0.053
------------------------------------------------------------
predicting word : 1
then       = 0.242
that       = 0.140
this       = 0.090
if         = 0.072
so         = 0.061
------------------------------------------------------------
predicting word : 2
to         = 0.218
it         = 0.206
for        = 0.172
that       = 0.053
she        = 0.035
------------------------------------------------------------
predicting word : 3
get        = 0.094
be         = 0.086
protect    = 0.071
respect    = 0.058
grow       = 0.035
------------------------------------------------------------
predicting word : 4
to         = 0.265
perfect    = 0.058
back       = 0.044
three      = 0.027
another    = 0.016
------------------------------------------------------------
predicting word : 5
respect    = 0.066
be         = 0.057
p

------------------------------------------------------------
predicting word : 0
yes        = 0.946
yeah       = 0.037
yea        = 0.010
really     = 0.001
¿yes       = 0.000
------------------------------------------------------------
predicting word : 1
yes        = 0.838
it         = 0.035
that       = 0.025
i          = 0.020
yeah       = 0.016
------------------------------------------------------------
predicting word : 2
yes        = 0.269
it         = 0.197
i          = 0.168
that       = 0.109
she        = 0.033
------------------------------------------------------------
predicting word : 3
it         = 0.230
yes        = 0.181
i          = 0.168
that       = 0.157
is         = 0.031
------------------------------------------------------------
predicting word : 4
's         = 0.711
is         = 0.117
was        = 0.030
helps      = 0.011
already    = 0.009
------------------------------------------------------------
predicting word : 5
true       = 0.738
already    = 0.021
r

------------------------------------------------------------
predicting word : 0
oh         = 0.894
ah         = 0.065
wow        = 0.004
hey        = 0.004
okay       = 0.004
------------------------------------------------------------
predicting word : 1
yeah       = 0.375
i          = 0.183
yes        = 0.158
okay       = 0.104
yea        = 0.043
------------------------------------------------------------
predicting word : 2
_EOS       = 0.472
oh         = 0.137
yeah       = 0.087
okay       = 0.022
uh         = 0.020


In [51]:
vocab_dict['en_w']['i2w'][494]

b'yeah'

In [72]:
eg_utt = "20051009_210519_219_fsp-A-16"
generate_translate_probs(eg_utt)

decoding with word: _GO
decoding with word: yes
decoding with word: the
decoding with word: majority
decoding with word: is
decoding with word: like
decoding with word: politics
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051009_210519_219_fsp-A-16_probs.json
Utterance: 20051009_210519_219_fsp-A-16
+-------------+-------------------------------------+
| es ref      | yea la religión es como la política |
+-------------+-------------------------------------+
| en ref      | yeah religion is like politics      |
+-------------+-------------------------------------+
| model pred  | yes the majority is like politics   |
+-------------+-------------------------------------+
| model bleu  | 0.55                                |
+-------------+-------------------------------------+
| google pred | yeah religion is like politics      |
+-------------+-------------------------------------+
| google bleu | 1.00                                |
+------------

------------------------------------------------------------
predicting word : 0
yes        = 0.423
yeah       = 0.277
i          = 0.059
and        = 0.041
yea        = 0.029
------------------------------------------------------------
predicting word : 1
the        = 0.307
thank      = 0.124
religion   = 0.099
i          = 0.048
god        = 0.028
------------------------------------------------------------
predicting word : 2
majority   = 0.197
truth      = 0.121
religion   = 0.039
reality    = 0.035
radio      = 0.033
------------------------------------------------------------
predicting word : 3
is         = 0.575
how        = 0.098
of         = 0.083
like       = 0.073
are        = 0.046
------------------------------------------------------------
predicting word : 4
like       = 0.773
how        = 0.054
about      = 0.015
the        = 0.014
is         = 0.007
------------------------------------------------------------
predicting word : 5
politics   = 0.616
a          = 0.264
t

In [92]:
print("model beats google by factor of 2")

count = 0
# print("-"*80)
# print("{0:>5s} ||| {1:30s} ||| {2:>15s} || {3:>15s}".format("sn", "utt", "google utt bleu", "model utt bleu"))
# print("-"*80)
for utt in dev_utts:
    if len(model_s2t_refs[utt]) > 3 and len(model_s2t_refs[utt]) < 20:
        google_utt_bleu = sentence_bleu([google_dev_ref_0[utt]], google_hyp_r0[utt], smoothing_function=smooth_fun.method2)
        model_utt_bleu = sentence_bleu([model_s2t_refs[utt]], model_s2t_hyps[utt], smoothing_function=smooth_fun.method2)
        if model_utt_bleu >= (1.5 * google_utt_bleu) and model_utt_bleu >= 0.5:
            count += 1
            #print("{0:5d} ||| {1:30s} ||| {2:15.2f} || {3:15.2f}".format(count, utt, google_utt_bleu, model_utt_bleu))
            print("-"*80)
            print(count)
            print("-"*80)
    #         display_pp = PrettyTable(["cat","sent"], hrules=True)
    #         display_pp.align = "l"
    #         display_pp.header = False
    #         display_pp.add_row(["en ref", textwrap.fill(" ".join(model_s2t_refs[utt]),50)])
    #         display_pp.add_row(["model pred", textwrap.fill(" ".join(model_s2t_hyps[utt]),50)])
    #         display_pp.add_row(["model utt bleu", "{0:.2f}".format(model_utt_bleu)])
    #         display_pp.add_row(["google pred", textwrap.fill(" ".join(google_hyp_r0[utt]),50)])
    #         display_pp.add_row(["google utt bleu", "{0:.2f}".format(google_utt_bleu)])
    #         print(display_pp)
    #         play_utt(utt, map_dict['fisher_dev'])
            generate_translate_probs(utt)
        if count > 50:
            break


model beats google by factor of 2
--------------------------------------------------------------------------------
1
--------------------------------------------------------------------------------
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051018_210744_280_fsp-A-34_probs.json
Utterance: 20051018_210744_280_fsp-A-34
+-------------+--------------------------------------------+
| es ref      | oh boy ¿y tu vives con ah otro persona     |
+-------------+--------------------------------------------+
| en ref      | oh boy and you live with ah another person |
+-------------+--------------------------------------------+
| model pred  | oh oh and you live with ah another person  |
+-------------+--------------------------------------------+
| model bleu  | 0.90                                       |
+-------------+--------------------------------------------+
| google pred | oh boy and do you live with another person |
+-------------+----------------

------------------------------------------------------------
predicting word : 0
oh         = 0.584
ah         = 0.332
wow        = 0.020
um         = 0.007
uh         = 0.006
------------------------------------------------------------
predicting word : 1
oh         = 0.245
wow        = 0.094
okay       = 0.072
ok         = 0.069
yeah       = 0.038
------------------------------------------------------------
predicting word : 2
and        = 0.402
oh         = 0.092
you        = 0.058
ok         = 0.035
okay       = 0.032
------------------------------------------------------------
predicting word : 3
you        = 0.600
do         = 0.144
your       = 0.068
did        = 0.045
and        = 0.034
------------------------------------------------------------
predicting word : 4
live       = 0.913
lived      = 0.021
lives      = 0.009
you        = 0.007
come       = 0.005
------------------------------------------------------------
predicting word : 5
with       = 0.801
in         = 0.112
t

------------------------------------------------------------
predicting word : 0
you        = 0.196
i          = 0.151
like       = 0.110
when       = 0.080
with       = 0.069
------------------------------------------------------------
predicting word : 1
have       = 0.886
do         = 0.037
are        = 0.013
're        = 0.012
want       = 0.006
------------------------------------------------------------
predicting word : 2
to         = 0.969
that       = 0.008
the        = 0.004
a          = 0.003
one        = 0.001
------------------------------------------------------------
predicting word : 3
see        = 0.177
be         = 0.133
watch      = 0.091
have       = 0.082
go         = 0.049
------------------------------------------------------------
predicting word : 4
the        = 0.234
that       = 0.154
them       = 0.058
it         = 0.055
what       = 0.053
------------------------------------------------------------
predicting word : 5
cell       = 0.031
baby       = 0.026
p

------------------------------------------------------------
predicting word : 0
where      = 0.846
¿where     = 0.114
from       = 0.012
and        = 0.005
then       = 0.003
------------------------------------------------------------
predicting word : 1
are        = 0.314
you        = 0.215
was        = 0.185
were       = 0.106
it         = 0.048
------------------------------------------------------------
predicting word : 2
you        = 0.997
they       = 0.001
i          = 0.000
your       = 0.000
it         = 0.000
------------------------------------------------------------
predicting word : 3
from       = 0.273
going      = 0.048
now        = 0.044
right      = 0.034
lucky      = 0.029
------------------------------------------------------------
predicting word : 4
_EOS       = 0.201
now        = 0.146
toronto    = 0.048
right      = 0.043
you        = 0.033
--------------------------------------------------------------------------------
4
-------------------------------------

------------------------------------------------------------
predicting word : 0
mhm        = 0.309
hmm        = 0.250
mm         = 0.200
uh         = 0.059
hm         = 0.049
------------------------------------------------------------
predicting word : 1
mhm        = 0.885
_EOS       = 0.031
mm         = 0.020
hmm        = 0.015
aha        = 0.009
------------------------------------------------------------
predicting word : 2
_EOS       = 0.907
and        = 0.063
no         = 0.004
so         = 0.003
right      = 0.003
--------------------------------------------------------------------------------
5
--------------------------------------------------------------------------------
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051009_210519_219_fsp-B-37_probs.json
Utterance: 20051009_210519_219_fsp-B-37
+-------------+--------------------------+
| es ref      | sí sí definitivamente yo |
+-------------+--------------------------+
| en ref      | ye

------------------------------------------------------------
predicting word : 0
yes        = 0.968
yeah       = 0.017
yea        = 0.007
ye         = 0.001
really     = 0.001
------------------------------------------------------------
predicting word : 1
yes        = 0.962
and        = 0.010
yeah       = 0.004
yea        = 0.004
_EOS       = 0.003
------------------------------------------------------------
predicting word : 2
definitely = 0.713
yes        = 0.054
it         = 0.026
simply     = 0.018
absolutely = 0.014
------------------------------------------------------------
predicting word : 3
i          = 0.740
and        = 0.091
_EOS       = 0.039
that       = 0.019
me         = 0.019
------------------------------------------------------------
predicting word : 4
_EOS       = 0.438
i          = 0.165
have       = 0.036
and        = 0.031
do         = 0.030
--------------------------------------------------------------------------------
6
-------------------------------------

------------------------------------------------------------
predicting word : 0
they       = 0.127
oh         = 0.103
everything = 0.082
all        = 0.073
are        = 0.037
------------------------------------------------------------
predicting word : 1
're        = 0.498
are        = 0.406
all        = 0.019
have       = 0.009
take       = 0.006
------------------------------------------------------------
predicting word : 2
more       = 0.259
romantic   = 0.029
bigger     = 0.029
very       = 0.028
white      = 0.028
------------------------------------------------------------
predicting word : 3
romantic   = 0.249
white      = 0.149
rich       = 0.016
twenty     = 0.013
whites     = 0.012
------------------------------------------------------------
predicting word : 4
_EOS       = 0.951
for        = 0.006
than       = 0.005
right      = 0.003
ones       = 0.003
--------------------------------------------------------------------------------
7
-------------------------------------

------------------------------------------------------------
predicting word : 0
ten        = 0.383
and        = 0.150
years      = 0.073
james      = 0.024
you        = 0.017
------------------------------------------------------------
predicting word : 1
years      = 0.972
year       = 0.005
and        = 0.003
ten        = 0.002
angeles    = 0.001
------------------------------------------------------------
predicting word : 2
where      = 0.803
and        = 0.054
¿where     = 0.030
old        = 0.027
before     = 0.014
------------------------------------------------------------
predicting word : 3
are        = 0.915
where      = 0.032
you        = 0.026
did        = 0.005
¿where     = 0.005
------------------------------------------------------------
predicting word : 4
you        = 0.994
where      = 0.002
they       = 0.001
are        = 0.001
your       = 0.000
------------------------------------------------------------
predicting word : 5
from       = 0.936
where      = 0.023
_

------------------------------------------------------------
predicting word : 0
yeah       = 0.488
i          = 0.230
yes        = 0.123
okay       = 0.103
yea        = 0.013
------------------------------------------------------------
predicting word : 1
yeah       = 0.877
i          = 0.031
yes        = 0.028
okay       = 0.027
yea        = 0.008
------------------------------------------------------------
predicting word : 2
yeah       = 0.769
i          = 0.032
okay       = 0.023
yes        = 0.022
_EOS       = 0.018
------------------------------------------------------------
predicting word : 3
yeah       = 0.166
um         = 0.130
_EOS       = 0.130
mm         = 0.104
uh         = 0.099
------------------------------------------------------------
predicting word : 4
_EOS       = 0.337
um         = 0.122
uh         = 0.104
yeah       = 0.080
mm         = 0.074
--------------------------------------------------------------------------------
9
-------------------------------------

------------------------------------------------------------
predicting word : 0
ah         = 0.628
oh         = 0.279
to         = 0.041
uh         = 0.005
aye        = 0.005
------------------------------------------------------------
predicting word : 1
the        = 0.161
i          = 0.052
snow       = 0.041
to         = 0.040
mine       = 0.022
------------------------------------------------------------
predicting word : 2
snow       = 0.296
fear       = 0.029
baby       = 0.020
owner      = 0.018
middle     = 0.012
------------------------------------------------------------
predicting word : 3
and        = 0.332
is         = 0.046
gave       = 0.031
the        = 0.027
_EOS       = 0.016
------------------------------------------------------------
predicting word : 4
all        = 0.342
the        = 0.064
everything = 0.060
that       = 0.038
it         = 0.026
------------------------------------------------------------
predicting word : 5
that       = 0.655
ah         = 0.146
o

------------------------------------------------------------
predicting word : 0
aha        = 0.719
right      = 0.112
uh         = 0.074
uhhuh      = 0.032
yes        = 0.017
------------------------------------------------------------
predicting word : 1
oh         = 0.940
ah         = 0.010
okay       = 0.009
_EOS       = 0.006
o          = 0.005
------------------------------------------------------------
predicting word : 2
that       = 0.338
how        = 0.332
what       = 0.069
okay       = 0.039
oh         = 0.034
------------------------------------------------------------
predicting word : 3
's         = 0.977
_EOS       = 0.004
well       = 0.003
good       = 0.001
is         = 0.001
------------------------------------------------------------
predicting word : 4
good       = 0.798
nice       = 0.141
great      = 0.022
cool       = 0.009
why        = 0.004
------------------------------------------------------------
predicting word : 5
_EOS       = 0.965
to         = 0.005
f

------------------------------------------------------------
predicting word : 0
oh         = 0.985
ah         = 0.005
or         = 0.001
o          = 0.001
ok         = 0.001
------------------------------------------------------------
predicting word : 1
that       = 0.573
how        = 0.270
nice       = 0.037
good       = 0.025
it         = 0.015
------------------------------------------------------------
predicting word : 2
's         = 0.983
is         = 0.005
good       = 0.003
well       = 0.002
nice       = 0.002
------------------------------------------------------------
predicting word : 3
good       = 0.910
nice       = 0.058
great      = 0.015
true       = 0.004
fine       = 0.003
------------------------------------------------------------
predicting word : 4
_EOS       = 0.989
right      = 0.003
no         = 0.001
¿right     = 0.001
that       = 0.001
--------------------------------------------------------------------------------
12
------------------------------------

------------------------------------------------------------
predicting word : 0
yes        = 0.837
yeah       = 0.088
yea        = 0.007
if         = 0.006
but        = 0.004
------------------------------------------------------------
predicting word : 1
or         = 0.931
oh         = 0.035
i          = 0.006
like       = 0.004
hello      = 0.002
------------------------------------------------------------
predicting word : 2
maybe      = 0.534
perhaps    = 0.134
fifteen    = 0.057
what       = 0.037
more       = 0.012
------------------------------------------------------------
predicting word : 3
what       = 0.434
how        = 0.257
like       = 0.163
more       = 0.014
¿what      = 0.014
------------------------------------------------------------
predicting word : 4
in         = 0.363
's         = 0.226
is         = 0.074
a          = 0.059
do         = 0.049
------------------------------------------------------------
predicting word : 5
spain      = 0.960
spanish    = 0.012
i

------------------------------------------------------------
predicting word : 0
yes        = 0.887
really     = 0.044
yeah       = 0.036
yea        = 0.010
¿yes       = 0.006
------------------------------------------------------------
predicting word : 1
wow        = 0.931
ah         = 0.032
oh         = 0.019
well       = 0.003
so         = 0.001
------------------------------------------------------------
predicting word : 2
and        = 0.365
_EOS       = 0.275
you        = 0.035
i          = 0.029
oh         = 0.026
------------------------------------------------------------
predicting word : 3
_EOS       = 0.564
you        = 0.163
then       = 0.029
i          = 0.019
so         = 0.019
--------------------------------------------------------------------------------
14
--------------------------------------------------------------------------------
saved probs in : ./sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3/probs/20051018_210744_280_fsp-B-21_probs.json
Utterance:

------------------------------------------------------------
predicting word : 0
and        = 0.935
¿and       = 0.030
yes        = 0.007
if         = 0.004
you        = 0.004
------------------------------------------------------------
predicting word : 1
you        = 0.608
do         = 0.140
they       = 0.063
he         = 0.054
have       = 0.033
------------------------------------------------------------
predicting word : 2
have       = 0.656
had        = 0.111
're        = 0.047
do         = 0.040
are        = 0.030
------------------------------------------------------------
predicting word : 3
some       = 0.740
one        = 0.129
a          = 0.051
something  = 0.019
two        = 0.015
------------------------------------------------------------
predicting word : 4
kids       = 0.625
children   = 0.330
boys       = 0.010
kid        = 0.004
girls      = 0.003
------------------------------------------------------------
predicting word : 5
_EOS       = 0.936
children   = 0.018
k

------------------------------------------------------------
predicting word : 0
yes        = 0.664
yeah       = 0.185
eh         = 0.031
uh         = 0.023
oh         = 0.016
------------------------------------------------------------
predicting word : 1
i          = 0.963
it         = 0.007
we         = 0.007
no         = 0.005
well       = 0.002
------------------------------------------------------------
predicting word : 2
do         = 0.980
don´t      = 0.012
ca         = 0.002
know       = 0.001
'm         = 0.001
------------------------------------------------------------
predicting word : 3
n't        = 0.998
not        = 0.001
i          = 0.000
know       = 0.000
like       = 0.000
------------------------------------------------------------
predicting word : 4
know       = 0.993
do         = 0.001
think      = 0.001
i          = 0.001
see        = 0.001
------------------------------------------------------------
predicting word : 5
why        = 0.787
because    = 0.109
a

------------------------------------------------------------
predicting word : 0
mm         = 0.281
hmm        = 0.150
yes        = 0.131
um         = 0.083
ah         = 0.034
------------------------------------------------------------
predicting word : 1
yes        = 0.485
no         = 0.323
yeah       = 0.065
if         = 0.011
but        = 0.010
------------------------------------------------------------
predicting word : 2
no         = 0.917
not        = 0.016
i          = 0.011
right      = 0.008
yes        = 0.008
------------------------------------------------------------
predicting word : 3
that       = 0.313
i          = 0.237
no         = 0.148
yes        = 0.073
this       = 0.052
------------------------------------------------------------
predicting word : 4
's         = 0.856
is         = 0.106
does       = 0.005
yes        = 0.004
has        = 0.004
------------------------------------------------------------
predicting word : 5
true       = 0.897
not        = 0.044
r

------------------------------------------------------------
predicting word : 0
that       = 0.168
you        = 0.081
how        = 0.074
exactly    = 0.069
who        = 0.066
------------------------------------------------------------
predicting word : 1
's         = 0.339
is         = 0.127
they       = 0.100
are        = 0.062
you        = 0.058
------------------------------------------------------------
predicting word : 2
right      = 0.110
in         = 0.065
a          = 0.040
it         = 0.036
how        = 0.033
------------------------------------------------------------
predicting word : 3
in         = 0.401
between    = 0.130
and        = 0.086
the        = 0.041
at         = 0.036
------------------------------------------------------------
predicting word : 4
the        = 0.430
toronto    = 0.092
in         = 0.084
between    = 0.076
front      = 0.029
------------------------------------------------------------
predicting word : 5
center     = 0.649
internet   = 0.059
s

------------------------------------------------------------
predicting word : 0
and        = 0.583
they       = 0.096
it         = 0.047
she        = 0.044
you        = 0.037
------------------------------------------------------------
predicting word : 1
to         = 0.226
it         = 0.103
she        = 0.094
ah         = 0.079
they       = 0.079
------------------------------------------------------------
predicting word : 2
meet       = 0.386
know       = 0.281
to         = 0.092
get        = 0.051
do         = 0.019
------------------------------------------------------------
predicting word : 3
what       = 0.951
him        = 0.012
whatever   = 0.010
it         = 0.006
the        = 0.003
------------------------------------------------------------
predicting word : 4
it         = 0.221
he         = 0.106
is         = 0.094
to         = 0.077
_EOS       = 0.073
------------------------------------------------------------
predicting word : 5
's         = 0.466
is         = 0.372
w

------------------------------------------------------------
predicting word : 0
yes        = 0.931
yeah       = 0.054
yea        = 0.007
if         = 0.002
and        = 0.001
------------------------------------------------------------
predicting word : 1
is         = 0.560
it         = 0.376
that       = 0.048
very       = 0.003
this       = 0.002
------------------------------------------------------------
predicting word : 2
very       = 0.468
is         = 0.180
good       = 0.135
really     = 0.044
it         = 0.034
------------------------------------------------------------
predicting word : 3
good       = 0.910
nice       = 0.043
well       = 0.019
great      = 0.004
very       = 0.003
------------------------------------------------------------
predicting word : 4
music      = 0.415
the        = 0.322
in         = 0.038
to         = 0.021
_EOS       = 0.019
------------------------------------------------------------
predicting word : 5
right      = 0.522
no         = 0.177
_

In [20]:
len(vocab_dict['es_w']['w2i']), len(vocab_dict['en_w']['w2i'])

(32864, 17834)

In [21]:
es_words = set(vocab_dict['es_w']['w2i'].keys())
en_words = set(vocab_dict['en_w']['w2i'].keys())

In [24]:
len(es_words), len(en_words)

(32864, 17834)

In [25]:
common_words = es_words & en_words

In [26]:
len(common_words)

5185

In [46]:
freq_common_es = {w: vocab_dict['es_w']['freq'][w] for w in common_words}
freq_common_en = {w: vocab_dict['en_w']['freq'][w] for w in common_words}
freq_common_both = {w: (vocab_dict['en_w']['freq'][w], vocab_dict['es_w']['freq'][w]) for w in common_words}


In [63]:
len(freq_common_es), len(freq_common_en)

(5185, 5185)

In [38]:
sum(freq_common_es.values()), sum(freq_common_en.values())

(844202, 1282482)

In [65]:
len(vocab_dict['es_w']['freq']), sum(vocab_dict['es_w']['freq'].values()), len(vocab_dict['en_w']['freq']), sum(vocab_dict['en_w']['freq'].values())

(32864, 1496796, 17834, 1497356)

In [40]:
844202 / 1496796, 1282482 / 1497356

(0.5640060502566816, 0.8564977199810867)

In [45]:
freq_common_en[b'que']

4

In [43]:
sorted(freq_common_es.items(), reverse=True, key= lambda t: t[1])

[(b'que', 66461),
 (b'y', 46805),
 (b'no', 46353),
 (b'de', 44681),
 (b'la', 31692),
 (b'es', 30622),
 (b'a', 29964),
 (b'en', 29434),
 (b'yo', 25011),
 (b'el', 22036),
 (b'pero', 17450),
 (b'me', 15909),
 (b'lo', 15830),
 (b'un', 14416),
 (b'los', 13587),
 (b'como', 13082),
 (b'se', 13045),
 (b'ah', 12325),
 (b'o', 12283),
 (b'por', 12150),
 (b'eh', 11447),
 (b'una', 11000),
 (b'ya', 9461),
 (b'si', 9423),
 (b'con', 9422),
 (b'm\xc3\xa1s', 9335),
 (b'mi', 8658),
 (b'hay', 7869),
 (b'bueno', 7838),
 (b'tu', 7768),
 (b'las', 7241),
 (b'te', 7052),
 (b'muy', 6319),
 (b'le', 5858),
 (b'uno', 5578),
 (b'todo', 5527),
 (b'mhm', 5367),
 (b'sea', 4945),
 (b'son', 4784),
 (b'gente', 4642),
 (b'mm', 4556),
 (b'oh', 4502),
 (b'mucho', 4498),
 (b'este', 4272),
 (b'del', 3891),
 (b'al', 3630),
 (b'ellos', 3384),
 (b'ahora', 3194),
 (b'cosas', 3116),
 (b'\xc2\xbfno', 3043),
 (b'era', 2817),
 (b'va', 2742),
 (b'algo', 2736),
 (b'dos', 2543),
 (b'hace', 2522),
 (b'he', 2470),
 (b'su', 2405),
 (b'uh',

In [44]:
sorted(freq_common_en.items(), reverse=True, key= lambda t: t[1])

[(b'i', 59981),
 (b'the', 55077),
 (b'and', 45356),
 (b'that', 43290),
 (b'yes', 35054),
 (b'you', 34194),
 (b'it', 33448),
 (b'to', 32575),
 (b'a', 28402),
 (b'in', 24685),
 (b"'s", 24162),
 (b'is', 22852),
 (b'they', 21268),
 (b"n't", 19184),
 (b'of', 19117),
 (b'do', 18452),
 (b'but', 17179),
 (b'are', 15324),
 (b'have', 15316),
 (b'like', 14334),
 (b'no', 13330),
 (b'there', 13007),
 (b'well', 12354),
 (b'because', 10834),
 (b'know', 10619),
 (b'so', 10415),
 (b'for', 10381),
 (b'was', 10053),
 (b'ah', 9590),
 (b'what', 9472),
 (b'oh', 9183),
 (b'not', 8879),
 (b'with', 8158),
 (b'or', 7956),
 (b'my', 7548),
 (b'we', 7520),
 (b'from', 7481),
 (b'if', 7166),
 (b'eh', 7164),
 (b'me', 7121),
 (b'here', 7046),
 (b'this', 7010),
 (b'he', 6770),
 (b'think', 6677),
 (b'people', 6560),
 (b'right', 6505),
 (b'one', 6340),
 (b'aha', 6291),
 (b'how', 5978),
 (b'uh', 5595),
 (b"'m", 5546),
 (b'very', 5535),
 (b'all', 5215),
 (b'be', 5059),
 (b'more', 5015),
 (b'sure', 4842),
 (b'then', 4733),


In [47]:
sorted(freq_common_both.items(), reverse=True, key= lambda t: t[1])

[(b'i', (59981, 282)),
 (b'the', (55077, 93)),
 (b'and', (45356, 61)),
 (b'that', (43290, 32)),
 (b'yes', (35054, 46)),
 (b'you', (34194, 550)),
 (b'it', (33448, 26)),
 (b'to', (32575, 143)),
 (b'a', (28402, 29964)),
 (b'in', (24685, 62)),
 (b"'s", (24162, 65)),
 (b'is', (22852, 21)),
 (b'they', (21268, 21)),
 (b"n't", (19184, 17)),
 (b'of', (19117, 39)),
 (b'do', (18452, 80)),
 (b'but', (17179, 2)),
 (b'are', (15324, 19)),
 (b'have', (15316, 8)),
 (b'like', (14334, 34)),
 (b'no', (13330, 46353)),
 (b'there', (13007, 8)),
 (b'well', (12354, 40)),
 (b'because', (10834, 43)),
 (b'know', (10619, 548)),
 (b'so', (10415, 818)),
 (b'for', (10381, 18)),
 (b'was', (10053, 7)),
 (b'ah', (9590, 12325)),
 (b'what', (9472, 25)),
 (b'oh', (9183, 4502)),
 (b'not', (8879, 23)),
 (b'with', (8158, 2)),
 (b'or', (7956, 44)),
 (b'my', (7548, 124)),
 (b'we', (7520, 9)),
 (b'from', (7481, 1)),
 (b'if', (7166, 1)),
 (b'eh', (7164, 11447)),
 (b'me', (7121, 15909)),
 (b'here', (7046, 6)),
 (b'this', (7010, 8)

In [66]:
C = 20
common_in_both = [w for w, (c1, c2) in freq_common_both.items() if c1 >= C and c2 >= C]

In [67]:
len(common_in_both)

415

In [68]:
common_in_both

[b'bill',
 b'regular',
 b'virginia',
 b'com',
 b'jorge',
 b'gloria',
 b'un',
 b'cultural',
 b'alicia',
 b'propaganda',
 b'mercedes',
 b'base',
 b'cable',
 b'city',
 b'plata',
 b'horrible',
 b'once',
 b'decide',
 b'hmm',
 b'karma',
 b'sale',
 b'y',
 b'salvador',
 b'buenos',
 b'francisco',
 b'norma',
 b'rico',
 b'gringos',
 b'bolivia',
 b'he',
 b'latino',
 b'jersey',
 b'arturo',
 b'orleans',
 b'know',
 b'personal',
 b'as',
 b'diego',
 b'bla',
 b'el',
 b'because',
 b'tropical',
 b'florida',
 b'santo',
 b'paulo',
 b'lima',
 b'guatemala',
 b'mean',
 b'bronx',
 b'us',
 b'no',
 b'uh',
 b'doctor',
 b'ernestina',
 b'ebay',
 b'philly',
 b'paulina',
 b'it',
 b'that',
 b'kansas',
 b'tv',
 b'club',
 b'hotmail',
 b'plan',
 b'denver',
 b'colorado',
 b'racial',
 b'universal',
 b'total',
 b'venezuela',
 b'pennsylvania',
 b'chihuahua',
 b'websites',
 b'on',
 b'houston',
 b'idaho',
 b'yo',
 b'video',
 b'cumbia',
 b'las',
 b'medicaid',
 b'lorena',
 b'honduras',
 b'david',
 b'um',
 b'mail',
 b'ma',
 b'nico