In [43]:
from __future__ import print_function
from __future__ import division

import bisect
import json
import math
import os
import pickle
import subprocess
import textwrap
from collections import Counter
from collections import namedtuple

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import scipy as sp
import scipy.io.wavfile as wav
import seaborn as sns
from IPython.display import Audio
from IPython.display import display
from matplotlib.ticker import MultipleLocator, \
     FormatStrFormatter, AutoMinorLocator
from nltk.corpus import stopwords
from python_speech_features import delta
from python_speech_features import mfcc
from tqdm import tqdm

%matplotlib inline

In [44]:
# Load the notebook-wide settings. Later cells read config['es'][...]
# (alignment pickle name, dataset split limits, text split pickle name)
# and config['base']['feacalc'] (path of the feature extraction binary).
with open("config.json") as json_data_file:
    config = json.load(json_data_file)

In [45]:
# Lightweight record types.
# Align: one word with its start/end offsets. Presumably the type of the
# entries stored in align_dict (cells below read .word, .start and .end
# from them) - TODO confirm against the code that built the pickle.
Align = namedtuple('Align', ['word', 'start', 'end'])
# Node and Eval are not used in the visible part of this notebook.
# NOTE(review): their field meanings can only be guessed from the names.
Node = namedtuple('Node', ['file', 'seg', 'start', 'end', 'es', 'es_cnt'])
Eval = namedtuple('Eval', ['n1', 'n2', 'dtw', 'es_sim', 'es_cnt_sim', 'en_j_sim'])

In [46]:
# Alignment data, indexed below as align_dict[file_id][utterance_id][field]
# with fields 'es', 'es_cnt', 'en' and 'en_cnt'.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by this project.
# The context manager closes the file handle as soon as loading is done.
with open(config['es']['align_dict_fname'], "rb") as align_dict_file:
    align_dict = pickle.load(align_dict_file)

In [47]:
def read_segments_file(seg_fname):
    """Parse a whitespace-delimited segments file into per-file time spans.

    The first line is skipped (treated as a header). On every other line,
    field 0 is the segment key, field 1 the file id, and fields 6 and 7 are
    the segment start and end, parsed as floats.

    Args:
        seg_fname: path of the segments file.

    Returns:
        dict mapping file id -> {segment key: (start, end)}.

    Lines with too few fields (including blank lines) or with non-numeric
    times are reported on stdout and skipped. A skipped line never leaves a
    partial entry in the result.
    """
    segment_map = {}
    with open(seg_fname, "r") as seg_f:
        for i, line in enumerate(seg_f):
            if i == 0:
                continue
            line_items = line.strip().split()
            try:
                seg_key = line_items[0]
                file_id = line_items[1]
                seg_start = float(line_items[6])
                seg_end = float(line_items[7])
            except (IndexError, ValueError):
                # IndexError: fewer than 8 fields; ValueError: bad number.
                print("Incorrect line format at line: %d" % i)
                continue
            # Only register the file once the whole line parsed cleanly.
            segment_map.setdefault(file_id, {})[seg_key] = (seg_start, seg_end)
    return segment_map
        

In [48]:
# Utterance boundaries: segment_map[file_id][segment_key] -> (start, end).
segment_map = read_segments_file('../segments.txt')

### Create folders to store utterance-level and VAD wavs

In [121]:
# Source wavs: create_all_wavs reads "<file id>.wav" from this directory.
merged_wavs_path = "../mergeWavs/"

# Root of all generated artefacts, with one sub-directory per artefact type.
uttr_vad_wavs_path = '../uttr_fa_vad_wavs'
uttr_wavs_path = os.path.join(uttr_vad_wavs_path, "uttr_wavs")
fa_vad_wavs_path = os.path.join(uttr_vad_wavs_path, "fa_vad_wavs")
fa_vad_plp_path = os.path.join(uttr_vad_wavs_path, "plp")
fa_vad_mfcc_path = os.path.join(uttr_vad_wavs_path, "mfcc")
fa_vad_std_mfcc_path = os.path.join(uttr_vad_wavs_path, "mfcc_std")

# Create every output directory that does not exist yet (root first).
for _out_dir in (uttr_vad_wavs_path,
                 uttr_wavs_path,
                 fa_vad_wavs_path,
                 fa_vad_plp_path,
                 fa_vad_mfcc_path,
                 fa_vad_std_mfcc_path):
    if not os.path.exists(_out_dir):
        os.makedirs(_out_dir)

### Create utterance-level wavs

In [24]:
def create_audio_wav(source_file, target_file, intervals):
    """Cut one or more time intervals out of source_file into target_file with sox.

    Args:
        source_file: path of the wav to read.
        target_file: path of the wav to write.
        intervals: flat list of position strings
            [start1, end1, start2, end2, ...]. The first is handed to the
            sox ``trim`` effect unchanged; every later one is prefixed with
            "=" (per the sox manual, a position measured from the start of
            the audio rather than from the previous position).

    Returns:
        None. An empty ``intervals`` list is reported and skipped instead of
        failing on the missing first position, and a non-zero sox exit
        status is reported on stdout instead of being ignored.
    """
    if not intervals:
        print("No intervals to extract, skipping: %s" % target_file)
        return
    trim_args = intervals[:1] + ["=%s" % interval for interval in intervals[1:]]
    status = subprocess.call(["sox", source_file, target_file, "trim"] + trim_args)
    if status != 0:
        print("sox exited with status %d for: %s" % (status, target_file))
    

In [25]:
def create_all_wavs():
    """Write an utterance wav and a VAD wav for every utterance in align_dict.

    For each file id in align_dict the source is
    merged_wavs_path/<file id>.wav. For each utterance:
      * uttr_wavs_path/<uttr>.wav is cut at the (start, end) pair stored in
        segment_map[file id][uttr];
      * fa_vad_wavs_path/<uttr>_fa_vad.wav keeps only the spans of the
        entries in align_dict[file id][uttr]['es'].

    Entry offsets are divided by 100 and added to the utterance start, so
    they are presumably stored in 1/100 s units relative to the utterance -
    TODO confirm. The division relies on true division (the notebook imports
    ``division`` from ``__future__``).
    """
    for wav_id in sorted(align_dict.keys()):
        print("processing wav: %s" % wav_id)
        source_wav = os.path.join(merged_wavs_path, "{0:s}.wav".format(wav_id))
        uttr_ids = sorted(align_dict[wav_id].keys())
        for position, uttr_id in enumerate(uttr_ids):
            if position % 50 == 0:
                print('processing uttr: %s' % uttr_id)

            # Whole utterance: a single (start, end) pair.
            uttr_target = os.path.join(uttr_wavs_path, "{0:s}.wav".format(uttr_id))
            uttr_span = [str(t) for t in segment_map[wav_id][uttr_id]]
            create_audio_wav(source_wav, uttr_target, uttr_span)

            # VAD version: one (start, end) pair per aligned entry.
            vad_target = os.path.join(fa_vad_wavs_path, "{0:s}_fa_vad.wav".format(uttr_id))
            offset = segment_map[wav_id][uttr_id][0]
            vad_spans = []
            for entry in align_dict[wav_id][uttr_id]['es']:
                vad_spans.append("{0:.2f}".format(offset + (entry.start/100)))
                vad_spans.append("{0:.2f}".format(offset + (entry.end/100)))
            create_audio_wav(source_wav, vad_target, vad_spans)
    

In [26]:
#----------------------------------------------------------
# Uncomment to create uttr and vad level wavs
#----------------------------------------------------------
# create_all_wavs()

### Create PLPs

In [27]:
def create_plp(wav_fname, plp_fname):
    """Run the configured feacalc binary on one wav, writing to plp_fname.

    The binary path comes from config['base']['feacalc']. The option list is
    fixed. wav_fname is passed as the final positional argument and
    plp_fname as the value of ``-o``. The exit status of the command is not
    checked.
    """
    feacalc_bin = config['base']["feacalc"]
    options = [
        "-plp", "12",
        "-cep", "13",
        "-dom", "cep",
        "-deltaorder", "2",
        "-dither",
        "-frqaxis", "bark",
        "-samplerate", "8000",
        "-win", "25",
        "-step", "10",
        "-ip", "MSWAVE",
        "-rasta", "false",
        "-compress", "true",
        "-op", "swappedraw",
        "-o", plp_fname,
    ]
    subprocess.call([feacalc_bin] + options + [wav_fname])

In [28]:
def create_all_plps():
    """Create a PLP feature file for the VAD wav of every utterance in align_dict.

    Reads fa_vad_wavs_path/<uttr>_fa_vad.wav and writes
    fa_vad_plp_path/<uttr>_fa_vad.plp, printing a progress line for every
    50th utterance of each file.
    """
    for wav_id in sorted(align_dict.keys()):
        print("processing wav: %s" % wav_id)
        uttr_ids = sorted(align_dict[wav_id].keys())
        for position, uttr_id in enumerate(uttr_ids):
            if position % 50 == 0:
                print('processing plp for uttr: %s' % uttr_id)
            create_plp(
                os.path.join(fa_vad_wavs_path, "{0:s}_fa_vad.wav".format(uttr_id)),
                os.path.join(fa_vad_plp_path, "{0:s}_fa_vad.plp".format(uttr_id)))

    print("Completed!")

In [29]:
#----------------------------------------------------------
# Uncomment to create plps for vad wavs
#----------------------------------------------------------
# create_all_plps()

## Create train/dev/test splits

In [50]:
def create_text_dict(align_dict):
    """Split the alignment data into train/dev/test by file id and pickle it.

    Each file id is converted to int and assigned to the first category in
    config['es']['dataset_split'] whose inclusive [low, high] limits contain
    it. All utterances of that file are copied into that category. A file id
    that matches no category is left out.

    Args:
        align_dict: mapping file id -> {utterance id: utterance data}.

    Returns:
        dict with keys "train", "dev" and "test", each mapping
        utterance id -> utterance data.

    Side effects:
        Prints, per category, the number of distinct utterance-id prefixes
        (the part before the first '.'), then pickles the result to
        config['es']['text_split_fname'].
    """
    text_dict = {"train": {}, "dev": {}, "test": {}}
    with tqdm(total=len(align_dict)) as pbar:
        for fid in align_dict:
            fid_val = int(fid)
            for category, limits in config['es']['dataset_split'].items():
                if limits[0] <= fid_val <= limits[1]:
                    text_dict[category].update(align_dict[fid])
                    break
            pbar.update(1)

    print("file count")
    for key in text_dict:
        print("{0:10s} | {1:d}".format(key, len({sid.split('.')[0] for sid in text_dict[key]})))

    print("saving text dict as {0:s}".format(config['es']['text_split_fname']))
    # Context manager so the pickle is flushed and closed before returning.
    with open(config['es']['text_split_fname'], "wb") as out_f:
        pickle.dump(text_dict, out_f)

    return text_dict

In [51]:
# Build the train/dev/test split; this also writes it to
# config['es']['text_split_fname'] as a pickle.
text_dict = create_text_dict(align_dict)

100%|██████████| 104/104 [00:00<00:00, 7049.25it/s]


file count
train      | 71
dev        | 13
test       | 20
saving text dict as ../text_split.dict


## Create MT files

In [180]:
" ".join([a.word for a in text_dict['train']['041.001']['en']])

'NO BECAUSE I AM NOT TAKING CLASSES NOW THAT IS WHAT I WAS WAITING FOR AT FIRST WAS TAKING CLASSES IN SUMMER BUT MY ECONOMIC ECONOMIC AID AID IN SUMMER WAS NOT VERY GOOD THEN HAD TO LEAVE THE CLASSES BECAUSE IF DIDN'

In [191]:
def create_text_file(text_dict, cat, lang, vocab_fname):
    """Write one line of space-joined words per utterance of a split.

    Args:
        text_dict: split dictionary; text_dict[cat][utterance id][lang] is a
            sequence of entries with a ``word`` attribute.
        cat: split name to export (key of text_dict).
        lang: field of each utterance to export.
        vocab_fname: path of the text file to write. Despite the parameter
            name, this is the utterance text file, not a vocabulary.

    Utterances are written in sorted utterance-id order.
    """
    utterances = text_dict[cat]
    with tqdm(total=len(utterances)) as progress, open(vocab_fname, "w") as sink:
        for utt_id in sorted(utterances.keys()):
            tokens = [entry.word for entry in utterances[utt_id][lang]]
            sink.write("{0:s}\n".format(" ".join(tokens)))
            progress.update(1)

In [192]:
def create_vocab_file(text_dict, lang, vocab_fname):
    """Write the set of distinct words of the training split, one per line.

    Args:
        text_dict: split dictionary; text_dict["train"][utterance id][lang]
            is a sequence of entries with a ``word`` attribute.
        lang: field of each utterance to collect words from.
        vocab_fname: path of the vocabulary file to write.

    Only the "train" split contributes. Words are written in sorted order so
    the file is identical from run to run. The vocabulary size is printed
    when done.
    """
    cat = "train"
    vocab = set()
    with tqdm(total=len(text_dict[cat])) as pbar, open(vocab_fname, "w") as out_f:
        for sid in sorted(text_dict[cat].keys()):
            vocab.update(a.word for a in text_dict[cat][sid][lang])
            pbar.update(1)
        # Iterating the raw set gives an arbitrary order that can change
        # between runs; sorting keeps line numbers stable.
        for w in sorted(vocab):
            out_f.write("{0:s}\n".format(w))
    print("vocab size = {0:d}".format(len(vocab)))

In [188]:
# Output files, all under uttr_vad_wavs_path: one vocabulary per language
# and one text file per split (train/dev/test) and language (en/es).
en_vocab_fname = os.path.join(uttr_vad_wavs_path, "vocab.en")
es_vocab_fname = os.path.join(uttr_vad_wavs_path, "vocab.es")

en_train_fname = os.path.join(uttr_vad_wavs_path, "train.en")
es_train_fname = os.path.join(uttr_vad_wavs_path, "train.es")

en_dev_fname = os.path.join(uttr_vad_wavs_path, "dev.en")
es_dev_fname = os.path.join(uttr_vad_wavs_path, "dev.es")

en_test_fname = os.path.join(uttr_vad_wavs_path, "test.en")
es_test_fname = os.path.join(uttr_vad_wavs_path, "test.es")

In [189]:
# Write the "es" and then the "en" text file for each split, in the order
# train, dev, test.
for _split, _es_fname, _en_fname in [
        ('train', es_train_fname, en_train_fname),
        ('dev', es_dev_fname, en_dev_fname),
        ('test', es_test_fname, en_test_fname)]:
    create_text_file(text_dict, _split, "es", _es_fname)
    create_text_file(text_dict, _split, "en", _en_fname)

100%|██████████| 13137/13137 [00:00<00:00, 82504.91it/s]
100%|██████████| 13137/13137 [00:00<00:00, 118886.00it/s]
100%|██████████| 2476/2476 [00:00<00:00, 94722.55it/s]
100%|██████████| 2476/2476 [00:00<00:00, 119336.01it/s]
100%|██████████| 1781/1781 [00:00<00:00, 93608.54it/s]
100%|██████████| 1781/1781 [00:00<00:00, 111383.64it/s]


In [194]:
# Vocabularies come from the training split only (create_vocab_file
# hard-codes cat = "train").
create_vocab_file(text_dict, "es", es_vocab_fname)
create_vocab_file(text_dict, "en", en_vocab_fname)

100%|██████████| 13137/13137 [00:00<00:00, 90013.17it/s]
100%|██████████| 13137/13137 [00:00<00:00, 140410.09it/s]

vocab size = 8900
vocab size = 5687





In [190]:
!head $es_vocab_fname; head $en_vocab_fname

NO PORQUE YA NO ESTOY TOMANDO CLASES AHORA ES ESO LO QUE ESTABA ESPERANDO ESTABA PRIMERO TOMANDO CLASES EN EL VERANO PERO MI AYUDA ECONóMICA PARA EL VERANO NO FUE MUY BUENA ENTONCES LAS TUVE QUE DEJAR LAS CLASES PORQUE SI NO NO VOY A TENER DINERO SUFICIENTE PARA PAGAR GRECIA
AJá
PARA VER COMO ESTáN LOS DOCUMENTOS Y DEMáS LO QUE PASA ES QUE COMO ME HE CAMBIADO DE CASA NO NO ME LLEGA NADA Y NO SE NADA
MMM
ENTONCES VOY A IR A VER COMO ESTá TODO EL ASUNTO YA LO úNICO QUE FALTARíA SERíA MI EXAMEN MéDICO Y QUE MANDEN LAS FOTOS DE MI PASAPORTE PARA MI PASAPORTE
BUENO ENTONCES HACES ALLí UNA UNA UNA ESTE UN SEMESTRE Y EL SIGUIENTE SEMESTRE EN ESTADOS UNIDOS
Sí
COMúN Y CORRIENTE
Sí
OYE Y VENDRáS
NO BECAUSE I AM NOT TAKING CLASSES NOW THAT IS WHAT I WAS WAITING FOR AT FIRST WAS TAKING CLASSES IN SUMMER BUT MY ECONOMIC ECONOMIC AID AID IN SUMMER WAS NOT VERY GOOD THEN HAD TO LEAVE THE CLASSES BECAUSE IF DIDN
AHA
TO SEE HOW THE DOCUMENTS ARE AND ELSE WHAT HAPPENS HAPPENS IS AS I HAVE C

### Check transcripts and translations

In [55]:
# Flatten the training split into one word list per utterance field, for the
# frequency checks in the following cells.
# NOTE(review): what distinguishes the '*_cnt' fields from the plain ones is
# not visible in this notebook - confirm against the code that built align_dict.
es_words = [a.word for sid in text_dict['train'] for a in text_dict['train'][sid]['es']]
es_cnt_words = [a.word for sid in text_dict['train'] for a in text_dict['train'][sid]['es_cnt']]
en_words = [a.word for sid in text_dict['train'] for a in text_dict['train'][sid]['en']]
en_cnt_words = [a.word for sid in text_dict['train'] for a in text_dict['train'][sid]['en_cnt']]

In [57]:
from collections import Counter

In [58]:
# Word -> occurrence count for each of the four training word lists.
es_words_freq = Counter(es_words)
es_cnt_words_freq = Counter(es_cnt_words)
en_words_freq = Counter(en_words)
en_cnt_words_freq = Counter(en_cnt_words)

In [59]:
print(sorted(en_words_freq.items(), reverse=True, key=lambda t:t[1])[:10])

[('THE', 3781), ('AND', 3536), ('THAT', 3102), ('I', 2878), ('YES', 2578), ('TO', 2528), ('YOU', 1680), ("'T", 1666), ('NO', 1571), ("'S", 1559)]


In [60]:
print(sorted(es_words_freq.items(), reverse=True, key=lambda t:t[1])[:10])

[('QUE', 5215), ('NO', 4508), ('Y', 3827), ('A', 3243), ('DE', 2989), ('Sí', 2850), ('LA', 2444), ('YA', 2057), ('ES', 2021), ('EL', 1998)]


In [181]:
print([(w,f) for w, f in en_words_freq.items() if "'" in w])

[("'M", 490), ("'S", 1559), ("'T", 1666), ("'LL", 394), ("'VE", 146), ("'D", 23), ("'RE", 198), ("'AM", 8), ("'", 33), ("'CLOCK", 1), ("O'CLOCK", 1), ("'OEUVRES", 1), ("'R", 1), ("'TS", 1)]


In [62]:
print([(w,f) for w, f in es_words_freq.items() if "<" in w])

[('<NOISE>', 387), ('<BACKGROUND>', 83), ('<LAUGH>', 758), ('<COUGH>', 9), ('<SNEEZE>', 4), ('<BREATH>', 15)]


## Create MFCCs

In [149]:
def create_mfcc(wav_fname, mfcc_fname, mfcc_std_fname):
    """Compute MFCC, delta and delta-delta features for one wav and save them.

    Args:
        wav_fname: path of the wav file to read.
        mfcc_fname: output path for the raw feature matrix.
        mfcc_std_fname: output path for the per-column standardised matrix
            (each column has its mean subtracted and is divided by its
            standard deviation, both taken over all frames of this file).

    The saved matrices hold the MFCCs, their deltas and the deltas of those
    deltas concatenated along axis 1. Both files are written in NumPy's
    ``np.save`` format under exactly the given names.
    """
    rate, sig = wav.read(wav_fname)
    mfcc_feat = mfcc(sig, rate)
    d_mfcc_feat = delta(mfcc_feat, 2)
    dd_mfcc_feat = delta(d_mfcc_feat, 2)
    mfcc_feat = np.concatenate((mfcc_feat, d_mfcc_feat, dd_mfcc_feat), axis=1)

    feat_std = np.std(mfcc_feat, axis=0)
    # A constant column has zero deviation; dividing by it would yield nan.
    # Dividing by 1 instead maps such a column to all zeros.
    feat_std[feat_std == 0] = 1.0
    mfcc_feat_std = (mfcc_feat - np.mean(mfcc_feat, axis=0)) / feat_std

    # np.save is given open file objects rather than names so that no ".npy"
    # suffix is appended; the context managers close the handles promptly.
    with open(mfcc_fname, "wb") as mfcc_f:
        np.save(mfcc_f, mfcc_feat)
    with open(mfcc_std_fname, "wb") as mfcc_std_f:
        np.save(mfcc_std_f, mfcc_feat_std)

In [150]:
def create_all_mfccs():
    """Create raw and standardised MFCC files for every utterance's VAD wav.

    For each utterance in align_dict, reads
    fa_vad_wavs_path/<uttr>_fa_vad.wav and writes
    fa_vad_mfcc_path/<uttr>_fa_vad.mfcc and
    fa_vad_std_mfcc_path/<uttr>_fa_vad.std.mfcc. The progress bar advances
    once per file id, not once per utterance.
    """
    with tqdm(total=len(align_dict)) as progress:
        for wav_id in sorted(align_dict.keys()):
            for uttr_id in sorted(align_dict[wav_id].keys()):
                create_mfcc(
                    os.path.join(fa_vad_wavs_path, "{0:s}_fa_vad.wav".format(uttr_id)),
                    os.path.join(fa_vad_mfcc_path, "{0:s}_fa_vad.mfcc".format(uttr_id)),
                    os.path.join(fa_vad_std_mfcc_path, "{0:s}_fa_vad.std.mfcc".format(uttr_id)))
            progress.update(1)

    print("Completed!")

In [151]:
create_all_mfccs()

100%|██████████| 104/104 [05:30<00:00,  3.72s/it]

Completed!





In [152]:
!ls ../uttr_fa_vad_wavs/mfcc/ | wc

  17394   17394  347880


In [153]:
haha = np.load("../uttr_fa_vad_wavs/mfcc/001.002_fa_vad.mfcc")

In [154]:
haha.shape

(139, 39)

### Miscellaneous scratch code

In [81]:
from python_speech_features import delta

In [107]:
wavfile = os.path.join(uttr_wavs_path, "001.002.wav")
(rate,sig) = wav.read(wavfile)
# Keep the static coefficients under their own name: a later cell compares
# mfcc_feat_base[0, :] with mfcc_feat[0, :13].
mfcc_feat_base = mfcc(sig,rate)
d_mfcc_feat = delta(mfcc_feat_base, 2)
dd_mfcc_feat = delta(d_mfcc_feat, 2)
# np.concatenate takes the arrays as ONE sequence argument; passing them as
# separate positional arguments raises TypeError.
mfcc_feat = np.concatenate((mfcc_feat_base, d_mfcc_feat, dd_mfcc_feat), axis=1)

In [148]:
(rate,sig) = wav.read(wavfile)
mfcc_feat = mfcc(sig,rate)
d_mfcc_feat = delta(mfcc_feat, 2)
dd_mfcc_feat = delta(d_mfcc_feat, 2)
mfcc_feat = np.concatenate((mfcc_feat, d_mfcc_feat), axis=1)
mfcc_feat = np.concatenate((mfcc_feat, dd_mfcc_feat), axis=1)
mfcc_feat_std = (mfcc_feat - np.mean(mfcc_feat, axis=0)) / np.std(mfcc_feat, axis=0)
mfcc_feat.shape

(225, 39)

In [145]:
mfcc_feat.shape

(225, 52)

In [123]:
# NOTE(review): create_mfcc is defined in this notebook with three parameters
# (wav_fname, mfcc_fname, mfcc_std_fname), so this two-argument call raises
# TypeError. It looks like a leftover from an earlier signature - update or
# remove it.
create_mfcc(wavfile, fa_vad_mfcc_path)

In [126]:
!ls ../uttr_fa_vad_wavs/mfcc/

In [117]:
os.path.basename(wavfile)

'001.002.wav'

In [108]:
mfcc_feat.shape, d_mfcc_feat.shape, dd_mfcc_feat.shape

((225, 39), (225, 13), (225, 13))

In [109]:
mfcc_feat_base[0,:], mfcc_feat[0, :13]

(array([  8.54303756, -25.03482944,  -3.42300258,  -7.95375789,
         -3.84443312,  -6.1952858 , -11.45219107,   1.87897649,
         -7.05916616, -16.38897308,   1.40664859,   0.43634576,  -0.24589086]),
 array([  8.54303756, -25.03482944,  -3.42300258,  -7.95375789,
         -3.84443312,  -6.1952858 , -11.45219107,   1.87897649,
         -7.05916616, -16.38897308,   1.40664859,   0.43634576,  -0.24589086]))

In [110]:
d_mfcc_feat[0,:], mfcc_feat[0, 13:26]

(array([  1.12967002e-03,  -6.35149630e-01,   1.46086731e+00,
          2.33510882e+00,   4.74836837e-01,  -1.09234739e+00,
         -1.47314262e+00,   7.71047838e-01,   1.65089182e+00,
          3.18544759e+00,  -1.53773528e+00,  -3.45056747e-01,
         -1.56087548e+00]),
 array([  1.12967002e-03,  -6.35149630e-01,   1.46086731e+00,
          2.33510882e+00,   4.74836837e-01,  -1.09234739e+00,
         -1.47314262e+00,   7.71047838e-01,   1.65089182e+00,
          3.18544759e+00,  -1.53773528e+00,  -3.45056747e-01,
         -1.56087548e+00]))

In [111]:
np.mean(mfcc_feat, axis=0).shape, np.std(mfcc_feat, axis=0).shape

((39,), (39,))

In [112]:
mfcc_feat_std = (mfcc_feat - np.mean(mfcc_feat, axis=0)) / np.std(mfcc_feat, axis=0)

In [113]:
mfcc_feat_std[0, 13:26]

array([-0.00437915, -0.30292499,  0.55785951,  1.32699758,  0.20380063,
       -0.34257392, -0.48811877,  0.28812347,  0.51641007,  1.11521123,
       -0.50372484, -0.11665855, -0.53459715])

In [18]:
Audio(os.path.join(uttr_wavs_path, "001.002.wav"))

In [19]:
Audio(os.path.join(fa_vad_wavs_path, "001.002_fa_vad.wav"))

In [15]:
print(" ".join([w.word for w in align_dict["110"]["110.005"]["es"]]))

AY QUé LINDA


In [73]:
!soxi ../uttr_fa_vad_wavs/fa_vad_wavs/110.005_fa_vad.wav
!soxi ../uttr_fa_vad_wavs/uttr_wavs/110.005.wav


Input File     : '../uttr_fa_vad_wavs/fa_vad_wavs/110.005_fa_vad.wav'
Channels       : 1
Sample Rate    : 8000
Precision      : 16-bit
Duration       : 00:00:01.36 = 10880 samples ~ 102 CDDA sectors
File Size      : 21.8k
Bit Rate       : 128k
Sample Encoding: 16-bit Signed Integer PCM


Input File     : '../uttr_fa_vad_wavs/uttr_wavs/110.005.wav'
Channels       : 1
Sample Rate    : 8000
Precision      : 16-bit
Duration       : 00:00:02.05 = 16400 samples ~ 153.75 CDDA sectors
File Size      : 32.8k
Bit Rate       : 128k
Sample Encoding: 16-bit Signed Integer PCM

