In [1]:
from __future__ import print_function
from __future__ import division
import os
import cPickle as pickle
import json
import subprocess
from IPython.display import display
from IPython.display import Audio
import bisect
from collections import namedtuple
import numpy as np
import pandas as pd
from collections import Counter
from prettytable import PrettyTable
import matplotlib.pyplot as plt
import seaborn as sns
import textwrap
import nltk
import math
import scipy as sp
from nltk.corpus import stopwords

from matplotlib.ticker import MultipleLocator, \
     FormatStrFormatter, AutoMinorLocator
%matplotlib inline

In [2]:
# Load the notebook settings from config.json in the current working directory.
# Later cells read config['es']['align_dict_fname'] (alignment pickle path)
# and config['base']['feacalc'] (feature-extraction executable).
with open("config.json") as json_data_file:
    config = json.load(json_data_file)

In [3]:
# Lightweight record types used throughout the notebook.
#
# Align: one aligned word. `start` / `end` are offsets that later cells divide
#        by 100 and add to the utterance start time.
# Node / Eval: not used in the cells shown here -- presumably consumed by
#        later evaluation code (TODO confirm field semantics with their users).
#
# The type names must stay as-is: the pickled alignment data is loaded against
# these definitions.
Align = namedtuple('Align', 'word start end')
Node = namedtuple('Node', 'file seg start end es es_cnt')
Eval = namedtuple('Eval', 'n1 n2 dtw es_sim es_cnt_sim en_j_sim')

In [4]:
# Load the word alignments: align_dict[file_id][uttr_id]['es'] is a list of
# Align tuples (see the sample cells further down).
# NOTE(security): unpickling can execute arbitrary code -- only load alignment
# files that come from a trusted source.
# The file is opened in a `with` block so the handle is closed right after
# loading instead of being left to the garbage collector.
with open(config['es']['align_dict_fname'], "rb") as align_dict_file:
    align_dict = pickle.load(align_dict_file)

In [6]:
def read_segments_file(seg_fname):
    """Parse a whitespace-delimited segments file into a nested lookup table.

    The first line of the file is always skipped (it is treated as a header).
    For every other line, column 0 is the segment key, column 1 the file id,
    and columns 6 and 7 the segment start and end (parsed as floats;
    presumably seconds -- they are later passed to sox as trim positions).

    Blank lines are ignored. Lines with too few columns or non-numeric
    start/end values are reported on stdout and skipped; they never abort the
    parse and never leave a partially-filled entry behind.

    Args:
        seg_fname: path of the segments file to read.

    Returns:
        dict mapping file_id -> {seg_key: (seg_start, seg_end)}.
    """
    segment_map = {}
    with open(seg_fname, "r") as seg_f:
        for i, line in enumerate(seg_f):
            if i == 0:
                continue
            line_items = line.strip().split()
            if not line_items:
                # Empty line: nothing to parse and nothing worth reporting.
                continue
            try:
                seg_key = line_items[0]
                file_id = line_items[1]
                seg_start = float(line_items[6])
                seg_end = float(line_items[7])
            except (ValueError, IndexError):
                # IndexError covers rows with fewer than 8 columns, which
                # previously crashed the whole parse.
                print("Incorrect line format at line: %d" % i)
                continue
            # Only touch the map once the whole row parsed cleanly, so a bad
            # row cannot create an empty per-file dict.
            segment_map.setdefault(file_id, {})[seg_key] = (seg_start, seg_end)
    return segment_map
        

In [7]:
# Utterance boundaries: segment_map[file_id][seg_key] == (start, end),
# parsed from the segments file one directory above the notebook.
segment_map = read_segments_file('../segments.txt')

### Create folders to store utterance-level and VAD wavs

In [22]:
# Input: directory holding the merged source recordings.
merged_wavs_path = "../mergeWavs/"

# Outputs: one root folder with a sub-folder per artefact type.
uttr_vad_wavs_path = '../uttr_fa_vad_wavs'
uttr_wavs_path = os.path.join(uttr_vad_wavs_path, "uttr_wavs")
fa_vad_wavs_path = os.path.join(uttr_vad_wavs_path, "fa_vad_wavs")
fa_vad_plp_path = os.path.join(uttr_vad_wavs_path, "plp")

# Create whichever output folders are missing (root first, then children).
for output_dir in (uttr_vad_wavs_path, uttr_wavs_path,
                   fa_vad_wavs_path, fa_vad_plp_path):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

### Create utterance-level wavs

In [25]:
def create_audio_wav(source_file, target_file, intervals):
    """Cut one or more time spans out of a wav file using sox's `trim` effect.

    `intervals` is a flat list of position strings
    [start1, end1, start2, end2, ...]. The first position is passed to sox
    unchanged; every following one is prefixed with "=" so that sox treats it
    as an absolute position in the source file rather than a length.

    Args:
        source_file: path of the wav to read.
        target_file: path of the wav to write.
        intervals: list of position strings as described above.

    Returns:
        The sox exit status (0 on success), or None when `intervals` is empty
        and sox was therefore not invoked.
    """
    if not intervals:
        # An utterance without any aligned words would otherwise raise an
        # IndexError and abort a whole batch run; report it and move on.
        print("No intervals given for %s; skipping" % target_file)
        return None
    positions = [intervals[0]] + ["=%s" % interval for interval in intervals[1:]]
    return subprocess.call(["sox", source_file, target_file, "trim"] + positions)
    

In [10]:
def create_all_wavs():
    """Write an utterance wav and a speech-only wav for every aligned utterance.

    Files and utterances are visited in sorted key order. For each utterance:

    * the span stored in `segment_map` is cut from the merged source wav into
      `uttr_wavs_path/<uttr>.wav`;
    * the aligned word spans (each `Align.start` / `Align.end` divided by 100
      and added to the utterance start, formatted to two decimals) are cut
      into `fa_vad_wavs_path/<uttr>_fa_vad.wav`.

    Progress is printed per source wav and for every 50th utterance.
    The `/100` relies on true division, which the notebook enables via
    `from __future__ import division`.
    """
    for wav_id in sorted(align_dict.keys()):
        print("processing wav: %s" % wav_id)
        source_wav = os.path.join(merged_wavs_path, "{0:s}.wav".format(wav_id))
        uttr_ids = sorted(align_dict[wav_id].keys())

        for idx, uttr_id in enumerate(uttr_ids):
            if not idx % 50:
                print('processing uttr: %s' % uttr_id)

            seg_bounds = segment_map[wav_id][uttr_id]

            # Full utterance: a single (start, end) span.
            uttr_wav = os.path.join(uttr_wavs_path, "{0:s}.wav".format(uttr_id))
            create_audio_wav(source_wav, uttr_wav,
                             [str(bound) for bound in seg_bounds])

            # Speech-only version: one (start, end) pair per aligned word,
            # shifted from utterance-relative to file-relative positions.
            vad_wav = os.path.join(fa_vad_wavs_path,
                                   "{0:s}_fa_vad.wav".format(uttr_id))
            offset = seg_bounds[0]
            word_spans = []
            for aligned in align_dict[wav_id][uttr_id]['es']:
                word_spans.append("{0:.2f}".format(offset + (aligned.start/100)))
                word_spans.append("{0:.2f}".format(offset + (aligned.end/100)))
            create_audio_wav(source_wav, vad_wav, word_spans)
    

In [27]:
#----------------------------------------------------------
# Uncomment to create uttr and vad level wavs
#----------------------------------------------------------
# create_all_wavs()

### Create PLPs

In [21]:
def create_plp(wav_fname, plp_fname):
    """Run the configured feacalc executable on one wav file.

    The executable path comes from config['base']['feacalc']. The option list
    is fixed; see the feacalc documentation for the exact meaning of each
    flag. The exit status of the process is not inspected.

    Args:
        wav_fname: path of the input wav (passed as the final argument).
        plp_fname: path of the feature file to write (passed via "-o").
    """
    feacalc_bin = config['base']["feacalc"]
    feature_opts = [
        "-plp", "12",
        "-cep", "13",
        "-dom", "cep",
        "-deltaorder", "2",
        "-dither",
        "-frqaxis", "bark",
        "-samplerate", "8000",
        "-win", "25",
        "-step", "10",
        "-ip", "MSWAVE",
        "-rasta", "false",
        "-compress", "true",
        "-op", "swappedraw",
    ]
    command = [feacalc_bin] + feature_opts + ["-o", plp_fname, wav_fname]
    subprocess.call(command)

In [23]:
def create_all_plps():
    """Compute a feature file for every speech-only utterance wav.

    Walks `align_dict` in sorted order and, for each utterance, feeds
    `fa_vad_wavs_path/<uttr>_fa_vad.wav` to `create_plp`, writing
    `fa_vad_plp_path/<uttr>_fa_vad.plp`. Progress is printed per source wav
    and for every 50th utterance, followed by a final completion message.
    """
    for wav_id in sorted(align_dict.keys()):
        print("processing wav: %s" % wav_id)
        uttr_ids = sorted(align_dict[wav_id].keys())

        for idx, uttr_id in enumerate(uttr_ids):
            if not idx % 50:
                print('processing plp for uttr: %s' % uttr_id)

            vad_wav = os.path.join(fa_vad_wavs_path,
                                   "{0:s}_fa_vad.wav".format(uttr_id))
            plp_out = os.path.join(fa_vad_plp_path,
                                   "{0:s}_fa_vad.plp".format(uttr_id))
            create_plp(vad_wav, plp_out)

    print("Completed!")

In [28]:
#----------------------------------------------------------
# Uncomment to create plps for vad wavs
#----------------------------------------------------------
# create_all_plps()

### Sample code

In [12]:
# Word alignment entries (Align tuples) for utterance 001.002 of file 001.
align_dict['001']['001.002']['es']

[Align(word='LAS', start=25, end=48),
 Align(word='HA', start=48, end=56),
 Align(word='MANDADO', start=56, end=113),
 Align(word='AL', start=113, end=135),
 Align(word='AH', start=181, end=211)]

In [13]:
# (start, end) of the same utterance as read from the segments file.
segment_map['001']['001.002']

(2.58, 4.84)

In [16]:
# Play the utterance-level wav for utterance 110.005
# (the file must already have been written by create_all_wavs).
Audio(os.path.join(uttr_wavs_path, "110.005.wav"))

In [17]:
# Play the speech-only wav for the same utterance, for comparison.
Audio(os.path.join(fa_vad_wavs_path, "110.005_fa_vad.wav"))

In [18]:
# Show the aligned words of utterance 110.005; the words are stored as UTF-8
# byte strings, so they are decoded before printing.
print(" ".join([w.word.decode("utf-8") for w in align_dict["110"]["110.005"]["es"]]))

AY QUé LINDA


In [19]:
# Raw alignment entries for utterance 110.005 (words shown as undecoded bytes).
align_dict["110"]["110.005"]["es"]

[Align(word='AY', start=17, end=42),
 Align(word='QU\xc3\xa9', start=42, end=67),
 Align(word='LINDA', start=67, end=153)]

In [20]:
!soxi ../uttr_fa_vad_wavs/fa_vad_wavs/110.005_fa_vad.wav


Input File     : '../uttr_fa_vad_wavs/fa_vad_wavs/110.005_fa_vad.wav'
Channels       : 1
Sample Rate    : 8000
Precision      : 16-bit
Duration       : 00:00:01.36 = 10880 samples ~ 102 CDDA sectors
File Size      : 21.8k
Bit Rate       : 128k
Sample Encoding: 16-bit Signed Integer PCM

