In [1]:
%load_ext autoreload
%autoreload 1

%aimport basics
%aimport nn_config
%aimport enc_dec


from basics import *
from nn_config import *
from enc_dec import *
%matplotlib inline

callhome es-en word level configuration
translating es to en
vocab size, en=51, fr=51


In [2]:
xp = cuda.cupy if gpuid >= 0 else np

In [3]:
text_data = pickle.load(open(text_data_dict, "rb"))

In [4]:
model = SpeechEncoderDecoder(SPEECH_DIM, vocab_size_en, num_layers_enc, num_layers_dec,
                               hidden_units, gpuid, attn=use_attn)

In [5]:
log_train_fil_name, text_fname, dev_fname, test_fname

('es_speech_to_en_char_model/train_17394sen_5-2layers_128units_es_en_speech2text_callhome_es_en_1.log',
 {'en': '../../corpora/callhome/uttr_fa_vad_wavs/train.en',
  'fr': '../../corpora/callhome/uttr_fa_vad_wavs/speech_train.es'},
 {'en': '../../corpora/callhome/uttr_fa_vad_wavs/dev.en',
  'fr': '../../corpora/callhome/uttr_fa_vad_wavs/speech_dev.es'},
 {'en': '../../corpora/callhome/uttr_fa_vad_wavs/test.en',
  'fr': '../../corpora/callhome/uttr_fa_vad_wavs/speech_test.es'})

In [6]:
def predict_sentence(line_num, line_fr, line_en=None, display=True, plot_name=None, p_filt=0, r_filt=0):
    if not CHAR_LEVEL:
        fr_sent = line_fr.strip().split()
    else:
        fr_sent = [c.encode() for c in list(line_fr.strip().decode())]
    fr_ids = [w2i["fr"].get(w, UNK_ID) for w in fr_sent]

    # english reference is optional. If provided, compute precision/recall
    if line_en:
        en_sent = line_en.strip().split()
        en_ids = [w2i["en"].get(w, UNK_ID) for w in en_sent]

    pred_ids, alpha_arr = model.encode_decode_predict(fr_ids)
    pred_words = [i2w["en"][w].decode() if w != EOS_ID else " _EOS" for w in pred_ids]
    # print(pred_ids)
    # print(pred_words)

    prec = 0
    rec = 0
    filter_match = False

    matches = count_match(en_ids, pred_ids)
    if EOS_ID in pred_ids:
        pred_len = len(pred_ids)-1
    else:
        pred_len = len(pred_ids)
    # subtract 1 from length for EOS id
    prec = (matches/pred_len) if pred_len > 0 else 0
    rec = matches/len(en_ids)

    if display and (prec >= p_filt and rec >= r_filt):
        filter_match = True
        # convert raw binary into string
        # fr_words = [w.decode() for w in fr_sent]

        print("{0:s}".format("-"*50))
        print("sentence: {0:d}".format(line_num))
        print("{0:s} | {1:80s}".format("Src", line_fr.strip().decode()))
        print("{0:s} | {1:80s}".format("Ref", line_en.strip().decode()))
        
        if not CHAR_LEVEL:
            print("{0:s} | {1:80s}".format("Hyp", " ".join(pred_words)))
        else:
            print("{0:s} | {1:80s}".format("Hyp", "".join(pred_words)))

        print("{0:s}".format("-"*50))

        print("{0:s} | {1:0.4f}".format("precision", prec))
        print("{0:s} | {1:0.4f}".format("recall", rec))

        # if plot_name and use_attn:
        #     plot_attention(alpha_arr, fr_words, pred_words, plot_name)

    return matches, len(pred_ids), len(en_ids), filter_match

# In[ ]:


def predict(s=NUM_TRAINING_SENTENCES, num=NUM_DEV_SENTENCES, display=True, plot=False, p_filt=0, r_filt=0, fil_name=text_fname):
    print("English predictions, s={0:d}, num={1:d}:".format(s, num))

    metrics = {"cp":[], "tp":[], "t":[]}

    filter_count = 0

    with open(fil_name["fr"], "rb") as fr_file, open(fil_name["en"], "rb") as en_file:
        for i, (line_fr, line_en) in enumerate(zip(fr_file, en_file), start=0):
            if i >= s and i < (s+num):
                if plot:
                    plot_name = os.path.join(model_dir, "sample_{0:d}_plot.png".format(i+1))
                else:
                    plot_name=None

                # make prediction
                cp, tp, t, f = predict_sentence(i, line_fr,
                                             line_en,
                                             display=display,
                                             plot_name=plot_name,
                                             p_filt=p_filt, r_filt=r_filt)
                metrics["cp"].append(cp)
                metrics["tp"].append(tp)
                metrics["t"].append(t)
                filter_count += (1 if f else 0)

    print("sentences matching filter = {0:d}".format(filter_count))
    return metrics

def count_match(list1, list2):
    # each list can have repeated elements. The count should account for this.
    count1 = Counter(list1)
    count2 = Counter(list2)
    count2_keys = count2.keys()-set([UNK_ID, EOS_ID])
    common_w = set(count1.keys()) & set(count2_keys)
    #all_w = set(count1.keys()) + set(count2.keys())
    matches = sum([min(count1[w], count2[w]) for w in common_w])
    #matches = sum([max(0, count2[v]-count1[v]) for v in (count2-count1).values()])
    #matches = sum([max(0, count2[v]-count1[v]) for v in common_w])
    return matches


In [7]:
def compute_pplx(src_fname, tar_fname, num_sent):
    loss = 0
    num_words = 0
    # with open(test_fname["fr"], "rb") as fr_file, open(test_fname["en"], "rb") as en_file:
    with open(src_fname, "rb") as fr_file, open(tar_fname, "rb") as en_file:
        with tqdm(total=num_sent) as pbar:
            sys.stderr.flush()
            out_str = "loss={0:.6f}".format(0)
            pbar.set_description(out_str)
            for i, (line_fr, line_en) in enumerate(zip(fr_file, en_file), start=1):

                if i > num_sent:
                    break

                if not CHAR_LEVEL:
                    fr_sent = line_fr.strip().split()
                    en_sent = line_en.strip().split()
                else:
                    fr_sent = [c.encode() for c in list(line_fr.strip().decode())]
                    en_sent = [c.encode() for c in list(line_en.strip().decode())]

                fr_ids = [w2i["fr"].get(w, UNK_ID) for w in fr_sent]
                en_ids = [w2i["en"].get(w, UNK_ID) for w in en_sent]

                if len(fr_ids) > 0 and len(en_ids) > 0:
                    # compute loss
                    curr_loss = float(model.encode_decode_train(fr_ids, en_ids, train=False).data)
                    loss += curr_loss
                    num_words += len(en_ids)

                    out_str = "loss={0:.6f}".format(curr_loss)
                    pbar.set_description(out_str)
                pbar.update(1)

            # end of for
        # end of pbar
    # end of with open file
    loss_per_word = loss / num_words
    pplx = 2 ** loss_per_word
    random_pplx = vocab_size_en

    print("{0:s}".format("-"*50))
    print("{0:s} | {1:0.6f}".format("dev perplexity", pplx))
    print("{0:s}".format("-"*50))

    return pplx

In [8]:
def get_ids(align_list, char_level=CHAR_LEVEL):
    words = [a.word for a in align_list]
    text_line = " ".join(words)
    
    if not char_level:
        symbols = [w.encode() for w in text_line.strip()]
    else:
        symbols = [c.encode() for c in list(text_line.strip())]
    
    return symbols

In [9]:
print(b" ".join(get_ids(text_data["train"]["041.004"]["en"])))

b'T O   S E E   H O W   T H E   D O C U M E N T S   A R E   A N D   E L S E   W H A T   H A P P E N S   H A P P E N S   I S   A S   I   H A V E   C H A N G E D   H O U S E   D O N   D O N'


In [10]:
def train_loop(num_training, num_epochs, log_mode="a"):
    # Set up log file for loss
    log_dev_fil = open(log_dev_fil_name, mode=log_mode)
    log_dev_csv = csv.writer(log_dev_fil, lineterminator="\n")

    # initialize perplexity on dev set
    # save model when new epoch value is lower than previous
    pplx = float("inf")

    sys.stderr.flush()

    for epoch in range(num_epochs):
        for i, sp_fil in enumerate(sorted(list(text_data["train"].keys()))[:num_training]):
            with tqdm(total=num_training) as pbar:
                sys.stderr.flush()
                print(sp_fil)
                loss_per_epoch = 0
                out_str = "epoch={0:d}, loss={1:.6f}, mean loss={2:.6f}".format(epoch+1, 0, 0)
                pbar.set_description(out_str)
                
                # get the word/character ids
                fr_sent = get_ids(text_data["train"][sp_fil]["es"])
                en_sent = get_ids(text_data["train"][sp_fil]["en"])

                fr_ids = [w2i["fr"].get(w, UNK_ID) for w in fr_sent]
                en_ids = [w2i["en"].get(w, UNK_ID) for w in en_sent]
                
                speech_feat = xp.load(os.path.join(speech_dir, sp_fil+speech_extn))
                print(speech_feat.shape)

                it = (epoch * num_training) + i

                # compute loss
                loss = model.encode_decode_train(speech_feat, en_ids)

                # set up for backprop
                model.cleargrads()
                loss.backward()
                # update parameters
                optimizer.update()
                # store loss value for display
                loss_val = float(loss.data)
                loss_per_epoch += loss_val

                out_str = "epoch={0:d}, loss={1:.6f}, mean loss={2:.6f}".format(
                           epoch+1, it, loss_val, (loss_per_epoch / i))
                pbar.set_description(out_str)
                pbar.update(1)
            # end with pbar
        # end for num_training

        print("finished training on {0:d} sentences".format(num_training))
        print("{0:s}".format("-"*50))
        print("computing perplexity")
        pplx_new = compute_pplx(dev_fname["fr"], dev_fname["en"], NUM_MINI_DEV_SENTENCES)

        if pplx_new > pplx:
            print("perplexity went up during training, breaking out of loop")
            break
        
        pplx = pplx_new
        print(log_dev_fil_name)
        print(model_fil.replace(".model", "_{0:d}.model".format(epoch+1)))

        if (epoch+1) % ITERS_TO_SAVE == 0:
            bleu_score = compute_bleu(dev_fname["fr"], dev_fname["en"], NUM_MINI_DEV_SENTENCES)
            print("Saving model")
            serializers.save_npz(model_fil.replace(".model", "_{0:d}.model".format(last_epoch_id+epoch+1)), 
                                 model)
            print("Finished saving model")

        # log pplx and bleu score
        log_dev_csv.writerow([(last_epoch_id+epoch+1), pplx_new, bleu_score])
        log_dev_fil.flush()
    
    print("Simple predictions (╯°□°）╯︵ ┻━┻")
    print("training set predictions")
    _ = predict(s=0, num=2, plot=False)
    print("Simple predictions (╯°□°）╯︵ ┻━┻")
    print("dev set predictions")
    _ = predict(s=NUM_TRAINING_SENTENCES, num=3, plot=False)
    # print("{0:s}".format("-"*50))
    # compute_bleu(dev_fname["fr"], dev_fname["en"], NUM_MINI_DEV_SENTENCES)
    # print("{0:s}".format("-"*50))

    print("Final saving model")
    serializers.save_npz(model_fil, model)
    print("Finished saving model")

    # close log file
    log_train_fil.close()
    log_dev_fil.close()
    print(log_train_fil_name)
    print(log_dev_fil_name)
    print(model_fil)

In [11]:
# forward_states = model[model.lstm_enc[-1]].h
# backward_states = model[model.lstm_rev_enc[-1]].h

In [12]:
# model.enc_states = F.concat((forward_states, backward_states), axis=1)

In [13]:
train_loop(num_training=1, num_epochs=1)

  0%|          | 0/1 [00:00<?, ?it/s]

041.001
(1168, 120)
speech (1168, 1, 120)
L0_enc before (1168, 1, 120)
L0_enc out (584, 1, 256)
L1_enc before (584, 1, 256)
L1_enc out (292, 1, 256)
L2_enc before (292, 1, 256)
L2_enc out (146, 1, 256)
L3_enc before (146, 1, 256)
L3_enc out (73, 1, 256)
L4_enc before (73, 1, 256)
L4_enc out (73, 1, 128)
speech (1168, 1, 120)
L0_rev_enc before (1168, 1, 120)
L0_rev_enc out (584, 1, 256)
L1_rev_enc before (584, 1, 256)
L1_rev_enc out (292, 1, 256)
L2_rev_enc before (292, 1, 256)
L2_rev_enc out (146, 1, 256)
L3_rev_enc before (146, 1, 256)
L3_rev_enc out (73, 1, 256)
L4_rev_enc before (73, 1, 256)
L4_rev_enc out (73, 1, 128)





NameError: name 'optimizer' is not defined