In [1]:
from __future__ import print_function
from __future__ import division
import os
import pickle
import json
import subprocess
from IPython.display import display
from IPython.display import Audio
from IPython.display import Image
import bisect
from collections import namedtuple
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import textwrap
import random
import math
import scipy as sp
import scipy.io.wavfile as wav
from tqdm import tqdm

from matplotlib.ticker import MultipleLocator, \
     FormatStrFormatter, AutoMinorLocator
%matplotlib inline

In [2]:
from nmt_run import *

In [3]:
SOX = "sox"
SPH2PIPE = "../installs/wav/sph2pipe_v2.5/sph2pipe"
FEACALC = "../installs/wav/icsi-scenic-tools-20120105/feacalc-0.92/feacalc"

STANDFEAT = "../installs/origZRTools/plebdisc/standfeat"

In [4]:
mboshi_path = "/afs/inf.ed.ac.uk/group/project/ast/work/corpora/mboshi-french-parallel-corpus/"

In [5]:
train_wavs_path = os.path.join(mboshi_path, "train_wav")
test_wavs_path = os.path.join(mboshi_path, "dev_wav")

In [6]:
train_dev_ids = [w.replace(".wav", "") for w in os.listdir(train_wavs_path)]

In [7]:
test_ids = [w.replace(".wav", "") for w in os.listdir(test_wavs_path)]

In [8]:
len(train_dev_ids), len(set(train_dev_ids))

(4616, 4616)

In [9]:
# Seed both the stdlib and NumPy generators before drawing the dev split.
random.seed("haha")
np.random.seed(10)
# Hold out 200 distinct utterance ids from the train pool as the dev set.
# NOTE(review): train_dev_ids is built from os.listdir(), whose ordering is
# arbitrary, so the same seed can pick different ids on another machine or
# filesystem. Sorting train_dev_ids before sampling would make the split
# reproducible, but any files already derived from the current split would
# then need regenerating.
dev_ids = sorted(list(set(np.random.choice(train_dev_ids, 200, replace=False))))

In [10]:
len(dev_ids), len(set(dev_ids))

(200, 200)

In [11]:
train_ids = sorted(list(set(train_dev_ids) - set(dev_ids)))

In [12]:
len(train_ids), len(dev_ids), len(test_ids)

(4416, 200, 514)

In [13]:
def clean_out_str(out_str):
    """Normalise one line of text for vocabulary building.

    Punctuation is either blanked out (replaced by a space) or deleted,
    BPE continuation markers ("@@ " and "@@") are removed so sub-word
    pieces are glued back together, and the result is stripped and
    lower-cased.

    Args:
        out_str: the raw text.

    Returns:
        The cleaned, lower-cased string. Runs of spaces produced by the
        substitutions are NOT collapsed.
    """
    # Applied strictly in this order: later rules can match text produced by
    # earlier ones (e.g. removing "`" may leave "''", "-" may create "@@ ").
    substitutions = (
        ("=", " "),
        (",", " "),
        (".", " "),
        ("`", ""),
        ('"', ''),
        ('¿', ''),
        ("''", ""),
        (":", ""),
        ("!", " "),
        ("|", ""),
        (";", " "),
        ("‐", " "),
        ("-", " "),
        # for BPE
        ("@@ ", ""),
        ("@@", ""),
    )

    cleaned = out_str
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)

    return cleaned.strip().lower()

In [14]:
def read_text(text_path):
    """Read an "<utt-id> <text>" file and gather cleaned text and counts.

    Every non-blank line is split at its first space into an utterance id
    and its text; the text is normalised with clean_out_str.

    Args:
        text_path: path to a UTF-8 text file, one utterance per line.

    Returns:
        A tuple (utt2words, word_counts, char_counts) where
        utt2words maps utterance id -> cleaned text string,
        word_counts is a Counter over whitespace-separated words, and
        char_counts is a Counter over every character of the cleaned text
        (spaces between words included).
    """
    utt2words = {}
    word_counts = Counter()
    char_counts = Counter()
    with open(text_path, "r", encoding="utf-8") as in_f:
        for line in in_f:
            line = line.rstrip("\n")
            if not line.strip():
                # Blank lines carry no utterance; skip them instead of
                # failing to unpack the split.
                continue
            # partition() never raises: a line holding only an id yields an
            # empty text rather than a ValueError.
            u, _, t = line.partition(" ")
            t = clean_out_str(t)
            utt2words[u] = t
            # Counting incrementally keeps the same first-seen key order as
            # counting one big list, without materialising that list.
            word_counts.update(t.split())
            char_counts.update(t)
    return utt2words, word_counts, char_counts
    

In [15]:
mboshi_train_dev_text, mboshi_train_dev_words, mboshi_train_dev_chars = read_text(os.path.join(mboshi_path, 
                                                                                               "train.fr"))

In [16]:
sum(mboshi_train_dev_words.values()), len(mboshi_train_dev_words)

(36030, 5116)

In [17]:
sum(mboshi_train_dev_chars.values()), len(mboshi_train_dev_chars)

(189396, 45)

In [18]:
mboshi_test_text, mboshi_test_words, mboshi_test_chars = read_text(os.path.join(mboshi_path, "dev.fr"))

In [19]:
sum(mboshi_test_words.values()), len(mboshi_test_words)

(3954, 1200)

In [None]:
len(set(mboshi_train_dev_text.keys()) - set(train_ids))

In [None]:
def write_id_file(ids, out_fname):
    """Write the given utterance ids to out_fname, one per line (UTF-8).

    An existing file at out_fname is overwritten.
    """
    with open(out_fname, "w", encoding="utf-8") as id_file:
        id_file.writelines("{0:s}\n".format(utt_id) for utt_id in ids)

In [None]:
def write_text_file(ids, text_dict, out_fname):
    """Write text_dict[i] for each id in ids to out_fname, one per line (UTF-8).

    Lines are emitted in the order of ids, so the output stays aligned with
    an id file written from the same list. A missing id raises KeyError.
    """
    with open(out_fname, "w", encoding="utf-8") as text_file:
        text_file.writelines("{0:s}\n".format(text_dict[utt_id]) for utt_id in ids)

In [None]:
id_sets = {"mboshi_train": train_ids, "mboshi_dev": dev_ids, "mboshi_test": test_ids}
text_sets = {"mboshi_train": mboshi_train_dev_text, 
             "mboshi_dev": mboshi_train_dev_text, 
             "mboshi_test": mboshi_test_text}
set_paths = {k: os.path.join("./mboshi", "{0:s}.fr".format(k)) for k in id_sets}
set_id_paths = {k: os.path.join("./mboshi", "{0:s}.ids".format(k)) for k in id_sets}

In [None]:
set_paths, set_id_paths

In [None]:
for i in id_sets:
    write_id_file(id_sets[i], set_id_paths[i])
    write_text_file(id_sets[i], text_sets[i], set_paths[i])

In [None]:
def read_bpe_text(text_path, ids):
    """Pair each line of a BPE-tokenised file with the id at the same position.

    Args:
        text_path: path to a UTF-8 file with one tokenised utterance per line.
        ids: utterance ids, in the same order as the lines of the file.

    Returns:
        A tuple (utt2words, token_counts): utt2words maps id -> list of
        tokens on that line; token_counts is a plain dict token -> count.
    """
    token_counts = Counter()
    utt2words = {}
    with open(text_path, "r", encoding="utf-8") as bpe_file:
        # zip() stops at the shorter of ids / file lines, so a length
        # mismatch is not reported here.
        for utt_id, bpe_line in tqdm(zip(ids, bpe_file)):
            tokens = bpe_line.split()
            utt2words[utt_id] = tokens
            token_counts.update(tokens)
    return utt2words, dict(token_counts)

In [None]:
mboshi_train_bpe_text, mboshi_train_bpe_words = read_bpe_text(os.path.join("mboshi", 
                                                              "mboshi_train.BPE_1000.fr"),
                                                              train_ids)

In [None]:
mboshi_dev_bpe_text, mboshi_dev_bpe_words = read_bpe_text(os.path.join("mboshi", 
                                                          "mboshi_dev.BPE_1000.fr"),
                                                           dev_ids)

In [None]:
train_ids[4]

In [None]:
mboshi_train_bpe_text[train_ids[4]], mboshi_train_dev_text[train_ids[4]]

In [None]:
len(mboshi_train_bpe_words), len(mboshi_dev_bpe_words)

In [None]:
bpe_text_sets = {"mboshi_train": mboshi_train_bpe_text, 
             "mboshi_dev": mboshi_dev_bpe_text}

In [None]:
for c in mboshi_train_dev_chars:
    print(c, end=", ")

In [None]:
def create_new_vocab(words):
    """Build index/frequency tables for a vocabulary.

    Args:
        words: mapping token (str) -> count.

    Returns:
        A dict with three tables:
        "w2i"  token -> integer index,
        "i2w"  integer index -> token (inverse of "w2i"),
        "freq" token -> count.
        The reserved symbols PAD, GO, EOS, UNK take the first indices with
        a nominal frequency of 1; the remaining tokens follow in order of
        decreasing count and are stored as encoded bytes.
    """
    w2i = {}
    freq = {}

    # Reserved symbols first so their indices are fixed.
    for symbol in [PAD, GO, EOS, UNK]:
        w2i[symbol] = len(w2i)
        freq[symbol] = 1

    # Most frequent tokens get the smallest indices.
    by_count = sorted(words.items(), reverse=True, key=lambda t: t[1])
    for token, count in by_count:
        key = token.encode()
        w2i[key] = len(w2i)
        freq[key] = count

    i2w = {idx: key for key, idx in w2i.items()}
    return {"w2i": w2i, "i2w": i2w, "freq": freq}

In [None]:
fr_vocab = {}

In [None]:
fr_vocab["en_c"] = create_new_vocab(mboshi_train_dev_chars)

In [None]:
fr_vocab["en_w"] = create_new_vocab(mboshi_train_dev_words)

In [None]:
fr_vocab["bpe_w"] = create_new_vocab(mboshi_train_bpe_words)

In [None]:
fr_vocab.keys()

In [None]:
fr_vocab["en_c"].keys()

In [None]:
pickle.dump(fr_vocab, open("./mfcc_13dim/mboshi_train_vocab.dict", "wb"))

In [None]:
def create_mboshi_map():
    """Build per-split, per-utterance token sequences and collect BPE OOVs.

    Reads the notebook-level id_sets, text_sets, bpe_text_sets and fr_vocab.

    Returns:
        A tuple (mboshi_map, oov):
        mboshi_map[split][utt_id] holds "en_w" (encoded words), "en_c"
        (encoded characters of the text) and "bpe_w" (encoded BPE tokens,
        empty when the split has no BPE text);
        oov[split] lists every BPE token occurrence that is missing from
        fr_vocab["bpe_w"]["w2i"].
    """
    mboshi_map = {}
    oov = {}

    for set_name in id_sets:
        print(set_name)
        set_entries = mboshi_map[set_name] = {}
        set_oov = oov[set_name] = []

        for utt_id in id_sets[set_name]:
            text = text_sets[set_name][utt_id]

            if set_name in bpe_text_sets:
                bpe_tokens = [tok.encode() for tok in bpe_text_sets[set_name][utt_id]]
            else:
                bpe_tokens = []

            set_entries[utt_id] = {
                "en_w": [w.encode() for w in text.split()],
                "en_c": [c.encode() for c in text],
                "bpe_w": bpe_tokens,
            }

            set_oov.extend(tok for tok in bpe_tokens
                           if tok not in fr_vocab["bpe_w"]["w2i"])

    return mboshi_map, oov

In [None]:
mboshi_map, oov = create_mboshi_map()

In [None]:
# mboshi_map["mboshi_dev"]

In [None]:
pickle.dump(mboshi_map, open("./mfcc_13dim/mboshi_map.dict", "wb"))

In [None]:
oov

In [None]:
# NOTE(review): ainu_path is not assigned in any cell visible here; it must be
# defined before this cell runs.
# Directory of utterance audio files.
wav_path = os.path.join(ainu_path, "wav")
# Directory of per-utterance "<id>.en.cl" text files.
en_path = os.path.join(ainu_path, "encl")

In [None]:
ids = [w.replace(".wav", "") for w in os.listdir(wav_path)]

In [None]:
'0.000'.split(".",1)

In [None]:
# The part of an utterance id before the first "." is used as its
# speaker/narrative key (ids presumably look like "0.000" — verify).
utt2spk = {k: k.split(".", 1)[0] for k in ids}

# Invert the mapping: speaker key -> list of its utterance ids, in id order.
spk2utt = {}
for k in ids:
    # setdefault creates the list on first sight AND lets us append that
    # first utterance; previously the first utterance of each speaker was
    # dropped because the new list was left empty.
    spk2utt.setdefault(utt2spk[k], []).append(k)

print("# of utts = {0:d}".format(len(ids)))
print("# of narratives = {0:d}".format(len(spk2utt)))
for s in spk2utt:
    print(s, len(spk2utt[s]))

In [None]:
out_path = os.path.join(ainu_path, "text")

In [None]:
ids_file = os.path.join(out_path, "ainu10.ids")
text_file = os.path.join(out_path, "ainu10.clean.en")
bpe_file = os.path.join(out_path, "ainu10.BPE_1000.en")

In [None]:
# Write the id list and the cleaned text side by side so that line k of
# ids_file always corresponds to line k of text_file.
with open(ids_file, "w", encoding="utf-8") as ids_f, open(text_file, "w", encoding="utf-8") as text_f:
    for i in ids:
        ids_f.write("{0:s}\n".format(i))
        # Read explicitly as UTF-8, matching how the outputs are written.
        with open(os.path.join(en_path, "{0:s}.en.cl".format(i)), "r", encoding="utf-8") as en_f:
            lines = en_f.readlines()
        if len(lines) != 1:
            # Report files that are empty or hold more than one line; only
            # the first line (if any) is used.
            print(i, len(lines))
        # An empty file yields an empty text line instead of an IndexError,
        # which keeps the two output files aligned.
        text = clean_out_str(lines[0].strip()) if lines else ""
        text_f.write("{0:s}\n".format(text))
    

### Create MFCCs, and normalize them

In [None]:
wav_path = '/afs/inf.ed.ac.uk/group/project/ast/work/corpora/mboshi-french-parallel-corpus/wavs'
mfcc_path = '/afs/inf.ed.ac.uk/group/project/ast/work/corpora/mboshi-french-parallel-corpus/mfcc_raw'
mfcc_std_path = '/afs/inf.ed.ac.uk/group/project/ast/work/corpora/mboshi-french-parallel-corpus/mfcc_std'
mfcc_final_path = '/afs/inf.ed.ac.uk/group/project/ast/work/corpora/mboshi-french-parallel-corpus/mboshi_mfccs'

In [None]:
# For every utterance of every split: run feacalc on the wav, standardise the
# result with standfeat, then re-save the standardised features as a .npy array.
# NOTE(review): this rebinds the notebook-level name `ids`, which an earlier
# cell also uses for the ainu utterance list.
for id_set in id_sets:
    ids = id_sets[id_set]
    for i in tqdm(ids, ncols=80):
        # Input wav and the three per-utterance outputs (raw, standardised, final).
        wav_fname = os.path.join(wav_path, "{0:s}.wav".format(i))
        mfcc_fname = os.path.join(mfcc_path, "{0:s}.mfcc".format(i))
        mfcc_std_fname = os.path.join(mfcc_std_path, "{0:s}.std.mfcc".format(i))
        mfcc_final_fname = os.path.join(mfcc_final_path, "{0:s}".format(i))

        # Shell out to feacalc; the flags passed ask for 13 cepstra (-cep 13)
        # with no delta features (-deltaorder 0), written to mfcc_fname.
        # Exact flag semantics/units: see the feacalc documentation.
        !$FEACALC -plp no -cep 13 -dom cep -deltaorder 0 -dither -frqaxis bark \
            -samplerate 16000 -win 25 -step 10 -ip MSWAVE -rasta false -compress true \
            -op swappedraw -o $mfcc_fname $wav_fname

        # Shell out to standfeat on the 13-dimensional features (-D 13).
        !$STANDFEAT -D 13 -infile $mfcc_fname -outfile $mfcc_std_fname

        # Read the standfeat output as a flat float32 buffer and view it as
        # rows of 13 values.
        # NOTE(review): assumes the file is headerless native-endian float32 — TODO confirm.
        out_mfcc = np.fromfile(mfcc_std_fname, dtype=np.float32)
        out_mfcc = out_mfcc.reshape((-1,13))
        print(out_mfcc.shape)
        # mfcc_final_fname has no extension, so np.save appends ".npy".
        np.save(mfcc_final_fname, out_mfcc)
        
    

In [None]:
ha = np.load(os.path.join(mfcc_final_path, "abiayi_2015-09-08-11-18-39_samsung-SM-T530_mdw_elicit_Dico18_1.npy"))

In [None]:
ha.shape

In [None]:
ha[:1]

In [None]:
haha = ha.reshape((-1,13))

In [None]:
ha.shape, haha.shape

In [None]:
# for c in set(swbd1_ids) - {"swbd1_train_nodev"}:
info = {}
for c in id_sets:
    print(c)
    info[c] = {}
    for x in tqdm(id_sets[c], ncols=80):
        info[c][x] = {}
        t_data = np.load("./mfcc_13dim/mboshi_mfccs/{0:s}.npy".format(x))
        info[c][x]["sp"] = t_data.shape[0]
        info[c][x]["es_w"] = 0
        info[c][x]["es_c"] = 0
        info[c][x]["en_w"] = len(mboshi_map[c][x]["en_w"])
        info[c][x]["en_c"] = len(mboshi_map[c][x]["en_c"])

In [None]:
pickle.dump(info, open("./mfcc_13dim/info_mboshi.dict", "wb"))

In [None]:
durs = {}
for c in id_sets:
    print(c)
    durs[c] = []
    for x in tqdm(id_sets[c], ncols=80):
        t_data = np.load("./mfcc_13dim/mboshi_mfccs/{0:s}.npy".format(x))
        durs[c].append(t_data.shape[0])
        

In [None]:
for c in durs:
    print(c)
    print("total hrs = {0:.3f}".format(sum(durs[c]) / 100. / 3600))
    print("min = {0:.2f}, max = {1:.2f}, mean = {2:.2f}".format(np.min(durs[c])/100, 
                                                                np.max(durs[c])/100, np.mean(durs[c])/100))

In [20]:
# Text files whose word statistics are compared below.
train_text = "mboshi/mboshi_train.fr"
# NOTE(review): despite its name, dev_text points at the *test* split file
# (mboshi_test.fr), not mboshi_dev.fr — confirm this is intended.
dev_text = "mboshi/mboshi_test.fr"

In [21]:
def get_words(text_fname):
    """Count whitespace-separated tokens over all lines of a UTF-8 text file.

    Returns:
        A Counter mapping token -> number of occurrences.
    """
    counts = Counter()
    with open(text_fname, "r", encoding="utf-8") as in_f:
        for line in in_f:
            counts.update(line.split())
    return counts

In [22]:
train_counter = get_words(train_text)
dev_counter = get_words(dev_text)

In [23]:
train_counter.most_common(10)

[('de', 1369),
 ('la', 1320),
 ('le', 1179),
 ('est', 937),
 ('a', 923),
 ('il', 858),
 ('les', 710),
 ('à', 509),
 ('dans', 445),
 ('un', 424)]

In [24]:
def get_lengths(text_fname):
    """Return the number of whitespace-separated tokens on each line.

    Returns:
        A NumPy array with one entry per line of the file, in file order
        (blank lines count as 0).
    """
    with open(text_fname, "r", encoding="utf-8") as in_f:
        return np.array([len(line.split()) for line in in_f])

In [25]:
train_lengths = get_lengths(train_text)
dev_lengths = get_lengths(dev_text)

In [26]:
np.mean(train_lengths), np.min(train_lengths), np.max(train_lengths)

(7.798460144927536, 1, 27)

In [27]:
np.mean(dev_lengths), np.min(dev_lengths), np.max(dev_lengths)

(7.692607003891051, 2, 21)

In [31]:
K = 8
N = len(dev_lengths)
print(N)

514


In [32]:
def write_k_most_common(out_fname, K, N):
    """Write a dummy file whose N lines are all the K most frequent train words.

    The words come from the notebook-level train_counter and are joined with
    single spaces, most frequent first.

    Args:
        out_fname: destination path (UTF-8, overwritten).
        K: how many of the most common words to put on each line.
        N: how many identical lines to write.
    """
    top_words = [word for word, _count in train_counter.most_common(K)]
    out_line = "{0:s}\n".format(" ".join(top_words))
    with open(out_fname, "w", encoding="utf-8") as out_f:
        out_f.writelines(out_line for _ in range(N))

In [33]:
write_k_most_common("./mboshi/mboshi_test_dummy_top-{0:d}_words.fr".format(K), K=K, N=N)

### Most common sentences in Mboshi Train
Command used: `sort mboshi/mboshi_train.fr | uniq -c | sort -rn | head -n 12`

```
6 les pêcheurs ont rapporté beaucoup de poisson
6 les bananes sont pleines
6 il est très bavard
6 attends moi  j'arrive
5 il n'aime pas être battu au jeu
5 celui ci est mon champ  celui là est à mon oncle
4 tu peux partir devant  je t'atteindrai en route
4 si tu attends encore un peu  il va venir
4 ses cheveux sont brillants
4 on brûle les herbes
4 on a augmenté la paie des trvailleurs
4 l'éléphant barrit
```

### Most common sentences in Fisher Train
Command used: `sort ../subword-nmt/fisher_train.en | uniq -c | sort -rn | head -n 20`

```
   6804 yes
   3106 aha
   1975 mm
   1237 hmm
   1217 sure
   1150 oh
   1054 ah
    930 mhm
    790 yeah
    726 yes yes
    708 right
    632 uh huh
    434 hello
    429 exactly
    424 no
    409 okay
    403 uh uh
    362 hm mm
    358 oh yes
    346 um
```

### Prepare seq-to-seq dictionaries

In [None]:
cfg_path = "./mfcc_13dim/"
map_dict = pickle.load(open("../speech2text/mfcc_13dim/bpe_map.dict", "rb"))
vocab_dict = pickle.load(open("../speech2text/mfcc_13dim/bpe_train_vocab.dict", "rb"))

In [None]:
map_dict.keys()

In [None]:
spk2utt.keys()

In [None]:
def create_ainu_map(train_spkrs, dev_spkrs, test_spkrs):
    """Split the ainu10 utterances by speaker and build their token sequences.

    Reads ../subword-nmt/ainu10.ids, ainu10.BPE_1000.en and ainu10.clean.en
    in lock-step (line k of each file belongs to the same utterance) and
    uses the notebook-level utt2spk and vocab_dict.

    Args:
        train_spkrs: speaker keys that go to the train split.
        dev_spkrs: speaker keys that go to the dev split.
        test_spkrs: speaker keys that go to the test split.

    Returns:
        A tuple (ainu_map, ainu_map_fname, oov):
        ainu_map[split][utt_id] holds "bpe_w" and "en_w" (lists of bytes
        tokens) and "en_c" (list of encoded single characters);
        ainu_map_fname is the file name encoding the speaker split;
        oov[split] lists each BPE token occurrence that is missing from
        vocab_dict["bpe_w"]["w2i"].
    """
    oov = {}
    ainu_map = {}
    train_ids_str = "_".join(map(str, train_spkrs))
    dev_ids_str = "_".join(map(str, dev_spkrs))
    test_ids_str = "_".join(map(str, test_spkrs))

    ainu_map_fname = "ainu_train-{0:s}-dev-{1:s}-test-{2:s}_map.dict".format(train_ids_str,
                                                                             dev_ids_str,
                                                                             test_ids_str)

    # The two text files are read as bytes on purpose: tokens are stored as
    # bytes. The id file is text and is read as UTF-8.
    with open(os.path.join("../subword-nmt/", "ainu10.BPE_1000.en"), "rb") as text_f, \
         open(os.path.join("../subword-nmt/", "ainu10.ids"), "r", encoding="utf-8") as id_f, \
         open(os.path.join("../subword-nmt/", "ainu10.clean.en"), "rb") as words_f:
        for i, t, e in zip(id_f, text_f, words_f):
            utt_id = i.strip()
            curr_spkr = utt2spk[utt_id]
            if curr_spkr in train_spkrs:
                c = "ainu_train-{0:s}".format(train_ids_str)
            elif curr_spkr in dev_spkrs:
                c = "ainu_dev-{0:s}".format(dev_ids_str)
            elif curr_spkr in test_spkrs:
                c = "ainu_test-{0:s}".format(test_ids_str)
            else:
                # Speaker belongs to no split: report it and skip, rather
                # than filing the utterance under whatever split the
                # previous line used (or hitting an unbound `c`).
                print(i, curr_spkr)
                print("Achtung!!")
                continue
            if c not in ainu_map:
                ainu_map[c] = {}
                oov[c] = []

            bpe_tokens = t.strip().split()
            ainu_map[c][utt_id] = {
                "bpe_w": bpe_tokens,
                "en_w": e.strip().split(),
                "en_c": [tt.encode() for tt in e.strip().decode()],
            }

            # Record each OOV occurrence once (this scan used to run twice,
            # doubling every entry).
            oov[c].extend(w for w in bpe_tokens
                          if w not in vocab_dict["bpe_w"]["w2i"])

    print(ainu_map_fname)
    return ainu_map, ainu_map_fname, oov

In [None]:
train_spkrs=[str(i) for i in range(2,10)]
dev_spkrs=['0']
test_spkrs=['1']

In [None]:
ainu_map, ainu_map_fname, oov = create_ainu_map(train_spkrs=train_spkrs, 
                                                dev_spkrs=dev_spkrs, test_spkrs=test_spkrs)

In [None]:
print([w.decode() for w in oov['ainu_train-2_3_4_5_6_7_8_9']])
print([w.decode() for w in oov['ainu_dev-0']])
print([w.decode() for w in oov['ainu_test-1']])

In [None]:
ainu_map['ainu_train-2_3_4_5_6_7_8_9']['2.000']

In [None]:
def create_ainu_ref(train_spkrs, dev_spkrs, test_spkrs):
    """Split the ainu10 ids and reference text by speaker.

    Reads ../subword-nmt/ainu10.ids and ainu10.clean.en in lock-step and
    uses the notebook-level utt2spk to decide which split a line belongs to.

    Args:
        train_spkrs: speaker keys that go to the train split.
        dev_spkrs: speaker keys that go to the dev split.
        test_spkrs: speaker keys that go to the test split.

    Returns:
        A tuple (ainu_ids, ainu_text): both map split name -> list, with
        ainu_ids[split][k] being the id of the text in ainu_text[split][k].
    """
    train_ids_str = "_".join(map(str, train_spkrs))
    dev_ids_str = "_".join(map(str, dev_spkrs))
    test_ids_str = "_".join(map(str, test_spkrs))

    ainu_ids = {}
    ainu_text = {}
    # Both files are read explicitly as UTF-8 rather than with the
    # platform-default encoding.
    with open(os.path.join("../subword-nmt/", "ainu10.ids"), "r", encoding="utf-8") as id_f, \
         open(os.path.join("../subword-nmt/", "ainu10.clean.en"), "r", encoding="utf-8") as words_f:
        for i, e in zip(id_f, words_f):
            utt_id = i.strip()
            curr_spkr = utt2spk[utt_id]
            if curr_spkr in train_spkrs:
                c = "ainu_train-{0:s}".format(train_ids_str)
            elif curr_spkr in dev_spkrs:
                c = "ainu_dev-{0:s}".format(dev_ids_str)
            elif curr_spkr in test_spkrs:
                c = "ainu_test-{0:s}".format(test_ids_str)
            else:
                # Speaker belongs to no split: report it and skip, rather
                # than reusing the previous line's split (or an unbound `c`).
                print(i, curr_spkr)
                print("Achtung!!")
                continue
            if c not in ainu_ids:
                ainu_ids[c] = []
                ainu_text[c] = []

            ainu_ids[c].append(utt_id)
            ainu_text[c].append(e.strip())

    print(list(ainu_ids.keys()))
    return ainu_ids, ainu_text

In [None]:
ainu_ids, ainu_text = create_ainu_ref(train_spkrs=train_spkrs, dev_spkrs=dev_spkrs, test_spkrs=test_spkrs)

In [None]:
for c in ainu_text:
    with open("./mfcc_13dim/ainu/{0:s}.en".format(c), "w", encoding="utf-8") as out_f:
        for i in ainu_text[c]:
            out_f.write("{0:s}\n".format(i))
        # end for
    # end with
# end for

In [None]:
print("writing ids to: {0:s}".format(ainu_map_fname.replace("map", "ids")))
pickle.dump(ainu_ids, open("./mfcc_13dim/ainu/{0:s}".format(ainu_map_fname.replace("map", "ids")), "wb"))

In [None]:
print("writing map to: {0:s}".format(ainu_map_fname))
pickle.dump(ainu_map, open(os.path.join(cfg_path, ainu_map_fname), "wb"))
print("writing vocab to: {0:s}".format(ainu_map_fname.replace("_map", "_train_vocab")))
pickle.dump(vocab_dict, open(os.path.join(cfg_path, ainu_map_fname.replace("_map", "_train_vocab")), "wb"))

In [None]:
info = pickle.load(open("../speech2text/mfcc_13dim/info.dict", "rb"))

In [None]:
ainu_map.keys()

In [None]:
# for c in set(swbd1_ids) - {"swbd1_train_nodev"}:
info = {}
for c in set(ainu_map.keys()):    
    info[c] = {}
    for x in tqdm(ainu_ids[c], ncols=80):
        info[c][x] = {}
        t_data = np.load("./mfcc_13dim/ainu_mfccs/{0:s}.npy".format(x))
        info[c][x]["sp"] = t_data.shape[0]
        info[c][x]["es_w"] = 0
        info[c][x]["es_c"] = 0
        info[c][x]["en_w"] = len(ainu_map[c][x]["en_w"])
        info[c][x]["en_c"] = len(" ".join([w.decode() for w in ainu_map[c][x]["en_w"]]))
    # end for        

In [None]:
info.keys()

In [None]:
print("writing info to: {0:s}".format("./mfcc_13dim/{0:s}".format(ainu_map_fname.replace("map", "info"))))
pickle.dump(info, open("./mfcc_13dim/{0:s}".format(ainu_map_fname.replace("map", "info")), "wb"))

In [None]:
np.max(np.array([len(ainu_map["ainu_train-2_3_4_5_6_7_8_9"][x]["bpe_w"]) for x in info["ainu_train-2_3_4_5_6_7_8_9"]]))

In [None]:
plt.hist(np.array([info["ainu_train-2_3_4_5_6_7_8_9"][x]["en_w"] for x in info["ainu_train-2_3_4_5_6_7_8_9"]]))

## Create MFCCs

In [None]:
# logfbank and mfcc are called by the feature-extraction cells below; import
# them explicitly rather than relying on them leaking in from another module.
from python_speech_features import delta, logfbank, mfcc

In [None]:
def create_mfb(wav_fname, mfb_fname, mfb_std_fname, nfilt=40):
    """Compute standardised log filterbank features (+ deltas) for one wav.

    The static features are extended with first- and second-order deltas,
    each column is normalised to zero mean / unit standard deviation over
    the utterance, and the result is saved as float32.

    Args:
        wav_fname: input wav file.
        mfb_fname: path for the un-normalised features (currently not written).
        mfb_std_fname: path the standardised features are saved to, exactly
            as given (no extension is appended).
        nfilt: number of filters passed to logfbank.
    """
    rate, sig = wav.read(wav_fname)
    mfb_feat = logfbank(sig, rate, nfilt=nfilt)
    d_mfb_feat = delta(mfb_feat, 2)
    dd_mfb_feat = delta(d_mfb_feat, 2)
    mfb_feat = np.concatenate((mfb_feat, d_mfb_feat, dd_mfb_feat), axis=1)

    mfb_feat_std = (mfb_feat - np.mean(mfb_feat, axis=0)) / np.std(mfb_feat, axis=0)
    # NumPy division does not raise on a zero standard deviation; it yields
    # inf/nan. Check the result explicitly and report the offending file
    # (the old bare `except` never fired and would have led to a NameError).
    if not np.isfinite(mfb_feat_std).all():
        print(wav_fname)

    # Save through a context manager so the handle is closed.
    with open(mfb_std_fname, "wb") as out_f:
        np.save(out_f, mfb_feat_std.astype(np.float32))

In [None]:
def create_mfcc(wav_fname, mfcc_fname, mfcc_std_fname):
    """Compute standardised MFCC features (+ deltas) for one wav file.

    The static MFCCs are extended with first- and second-order deltas, each
    column is normalised to zero mean / unit standard deviation over the
    utterance, and the result is saved as float32.

    Args:
        wav_fname: input wav file.
        mfcc_fname: path for the un-normalised features (currently not written).
        mfcc_std_fname: path the standardised features are saved to, exactly
            as given (no extension is appended).
    """
    rate, sig = wav.read(wav_fname)
    mfcc_feat = mfcc(sig, rate)
    d_mfcc_feat = delta(mfcc_feat, 2)
    dd_mfcc_feat = delta(d_mfcc_feat, 2)
    mfcc_feat = np.concatenate((mfcc_feat, d_mfcc_feat, dd_mfcc_feat), axis=1)

    std_vals = np.std(mfcc_feat, axis=0)
    mfcc_feat_std = (mfcc_feat - np.mean(mfcc_feat, axis=0)) / std_vals
    # Check both the statistics and the normalised output: a zero standard
    # deviation is itself finite but makes the division produce inf/nan,
    # which the previous (duplicated) std-only test could not detect.
    if not np.isfinite(std_vals).all() or not np.isfinite(mfcc_feat_std).all():
        print("HAAAALP", wav_fname)

    # Save through a context manager so the handle is closed.
    with open(mfcc_std_fname, "wb") as out_f:
        np.save(out_f, mfcc_feat_std.astype(np.float32))

In [None]:
def create_all_speech_features(mffc1mfb0=True):
    """Extract features for every utterance listed in align_dict.

    Walks the notebook-level align_dict (file key -> utterance keys) in
    sorted order and calls create_mfcc or create_mfb on each utterance's
    "<uttr>_fa_vad.wav"; the progress bar advances once per file key.

    Args:
        mffc1mfb0: True to create MFCC features, False to create
            filterbank features.
    """
    with tqdm(total=len(align_dict)) as progress:
        for wav_key in sorted(align_dict.keys()):
            for utt in sorted(align_dict[wav_key].keys()):
                wav_fname = os.path.join(fa_vad_wavs_path, "{0:s}_fa_vad.wav".format(utt))

                if not mffc1mfb0:
                    # Filterbank branch.
                    raw_fname = os.path.join(fa_vad_mfb_path, "{0:s}_fa_vad.mfb".format(utt))
                    std_fname = os.path.join(fa_vad_std_mfb_path, "{0:s}_fa_vad.std.mfb".format(utt))
                    create_mfb(wav_fname, raw_fname, std_fname)
                    continue

                # MFCC branch.
                raw_fname = os.path.join(fa_vad_mfcc_path, "{0:s}_fa_vad.mfcc".format(utt))
                std_fname = os.path.join(fa_vad_std_mfcc_path, "{0:s}_fa_vad.std.mfcc".format(utt))
                create_mfcc(wav_fname, raw_fname, std_fname)

            # One tick per file key, after all of its utterances are done.
            progress.update(1)

    print("Completed!")

In [None]:
create_all_speech_features(mffc1mfb0=False)

In [None]:
for j, uttr in enumerate(sorted(align_dict['038'].keys())[20:]):
    print(uttr, end=', ')
    wav_fname = os.path.join(fa_vad_wavs_path, "{0:s}_fa_vad.wav".format(uttr))
    mfcc_fname = os.path.join(fa_vad_mfcc_path, "{0:s}_fa_vad.mfcc".format(uttr))
    mfcc_std_fname = os.path.join(fa_vad_std_mfcc_path, "{0:s}_fa_vad.std.mfcc".format(uttr))
    create_mfcc(wav_fname, mfcc_fname, mfcc_std_fname)
    

In [None]:
!ls ../uttr_fa_vad_wavs/mfcc/ | wc

In [None]:
haha = np.load("../uttr_fa_vad_wavs/mfcc/001.002_fa_vad.mfcc")

In [None]:
haha_mfb = np.load("../uttr_fa_vad_wavs/mfb/001.002_fa_vad.mfb")

In [None]:
haha.shape, haha_mfb.shape

In [None]:
haha_mfb[0,:5]

In [None]:
np.expand_dims(haha_mfb, 0)[0,0,-5:]

In [None]:
np.flipud(haha_mfb)[-1,:5]

### Miscellaneous scratch code

In [None]:
wavfile = os.path.join(uttr_wavs_path, "001.002.wav")
(rate,sig) = wav.read(wavfile)
mfcc_feat = mfcc(sig,rate)
mfb = logfbank(sig, rate, nfilt=80)

In [None]:
mfb.shape, mfcc_feat.shape

In [None]:
(rate,sig) = wav.read(wavfile)
mfcc_feat = mfcc(sig,rate)
d_mfcc_feat = delta(mfcc_feat, 2)
dd_mfcc_feat = delta(d_mfcc_feat, 2)
mfcc_feat = np.concatenate((mfcc_feat, d_mfcc_feat), axis=1)
mfcc_feat = np.concatenate((mfcc_feat, dd_mfcc_feat), axis=1)
mfcc_feat_std = (mfcc_feat - np.mean(mfcc_feat, axis=0)) / np.std(mfcc_feat, axis=0)
mfcc_feat.shape

In [None]:
mfcc_feat.shape

In [None]:
# NOTE(review): create_mfcc is defined in this notebook with three required
# parameters (wav_fname, mfcc_fname, mfcc_std_fname); this two-argument call
# raises TypeError as written.
create_mfcc(wavfile, fa_vad_mfcc_path)

In [None]:
!ls ../uttr_fa_vad_wavs/mfcc/

In [None]:
os.path.basename(wavfile)

In [None]:
mfcc_feat.shape, d_mfcc_feat.shape, dd_mfcc_feat.shape

In [None]:
mfcc_feat_base[0,:], mfcc_feat[0, :13]

In [None]:
d_mfcc_feat[0,:], mfcc_feat[0, 13:26]

In [None]:
np.mean(mfcc_feat, axis=0).shape, np.std(mfcc_feat, axis=0).shape

In [None]:
mfcc_feat_std = (mfcc_feat - np.mean(mfcc_feat, axis=0)) / np.std(mfcc_feat, axis=0)

In [None]:
mfcc_feat_std[0, 13:26]

In [None]:
Audio(os.path.join(uttr_wavs_path, "001.002.wav"))

In [None]:
Audio(os.path.join(fa_vad_wavs_path, "001.002_fa_vad.wav"))

In [None]:
print(" ".join([w.word for w in align_dict["110"]["110.005"]["es"]]))

In [None]:
!soxi ../uttr_fa_vad_wavs/fa_vad_wavs/110.005_fa_vad.wav
!soxi ../uttr_fa_vad_wavs/uttr_wavs/110.005.wav

### Use MFCCs and Log Mel Filterbanks generated using Kaldi

In [None]:
kaldi_out_path = "../uttr_fa_vad_wavs/kaldi/"

In [None]:
haha = np.load("../uttr_fa_vad_wavs/mfcc_std/001.002_fa_vad.std.mfcc")

In [None]:
haha.shape, haha[0,:5]

In [None]:
!ls $kaldi_out_path/mfcc_cmvn_dd_vad

In [None]:
kaldi_test = np.load("../uttr_fa_vad_wavs/kaldi/mfcc_cmvn_dd_vad/test.npz")

In [None]:
# NOTE(review): kaldi_dev is not assigned in any cell visible here (only
# kaldi_test is loaded above) — confirm which archive was intended.
kaldi_dev['001.002'].shape, kaldi_dev['001.002'][0,:5]