In [1]:
from __future__ import print_function
from __future__ import division
import os
import cPickle as pickle
import json
import subprocess
from IPython.display import display
from IPython.display import Audio
from collections import namedtuple
from itertools import izip
import sys
import nltk
from nltk.corpus import stopwords
import numpy as np
from tqdm import tqdm

In [2]:
with open("config.json") as json_data_file:
    config = json.load(json_data_file)

# Preprocessing CALLHOME for ZRTools


- Created: 26-Oct-2016


## Create mapping for start time for each segment

Format: Dictionary  
key: {key: value}  
*file: {file.seg.wav: start time}*  
Name: segment_start.dict, segment_start.txt  

In [3]:
def read_segments_file(seg_fname):
    segment_map = {}
    with open(seg_fname, "r") as seg_f:
        for i, line in enumerate(seg_f):
            if i == 0:
                continue
            try:
                line_items = line.strip().split()
                seg_key = line_items[0]
                file_id = line_items[1]
                if file_id not in segment_map:
                    segment_map[file_id] = {}
                seg_start = int(float(line_items[6])*100)
                segment_map[file_id][seg_key] = seg_start
            except ValueError:
                print("Incorrect line format at line: %d" % i)
    return segment_map
        

### Read segment map

In [4]:
def read_segment_map():
    segment_map = read_segments_file('../segments.txt')
    pickle.dump(segment_map, open(config['es']['segment_dict_fname'], "wb"))

## Create VAD files for merged wavs

In [5]:
def create_merged_vad_from_ed(vad_file_id, segment_map, seg_vad_path, merged_vad_path):
    total_dur_10ms = 0
    total_dur_10ms_ge500ms = 0
    with open(os.path.join(merged_vad_path, vad_file_id+".vad"), "w") as vad_f:
        print("creating vad %s ..." % vad_file_id)
        for i, (seg_id, seg_start) in enumerate(sorted(segment_map[vad_file_id].items(), key=lambda t:t[0])):
            with open(os.path.join(seg_vad_path, seg_id+".vad"), "r") as seg_vad_f:
                for line in seg_vad_f:
                    line_items = map(int, line.strip().split())
                    start = seg_start+line_items[0]
                    end = seg_start+line_items[1]
                    total_dur_10ms += (end-start)
                    total_dur_10ms_ge500ms += ((end - start) if (end-start) >= 50 else 0)
                    out_line = ("%d %d\n" %(start, end))
                    vad_f.write(out_line)
                # end for
            # end reading seg file
        # end looping over all segments
    # end writing vad file
    return total_dur_10ms, total_dur_10ms_ge500ms

### Create new directory for merged vads

In [6]:
# merged_ed_vads_path = "../mergedVads"
# seg_vad_path = "../vad"
# if not os.path.exists(merged_ed_vads_path):
#     os.makedirs(merged_ed_vads_path)

### Create merged vad for each file

In [7]:
# total_dur_10ms, total_dur_10ms_ge500ms = 0, 0
# for vad_file_id in segment_map:
#     t1, t2 = create_merged_vad(vad_file_id, segment_map, seg_vad_path, merged_vads_path)
#     total_dur_10ms += t1
#     total_dur_10ms_ge500ms += t2

In [8]:
# print(total_dur_10ms, total_dur_10ms_ge500ms)
# print(map(lambda t: "{0:.3f}".format((t / 100.0 / 3600)), [total_dur_10ms, total_dur_10ms_ge500ms]))

## Create PLP features

In [9]:
merged_wavs_path = "../mergeWavs"
plp_path = "../plp"
plp_norm_path = "../std_plp"
if not os.path.exists(plp_path):
    os.makedirs(plp_path)
if not os.path.exists(plp_norm_path):
    os.makedirs(plp_norm_path)

In [10]:
def create_file_lst(file_lst_fname):
    prefix = "../corpora/callhome/mergeWavs"
    wav_file_list = [os.path.join(prefix, wav_file) for \
                     wav_file in os.listdir(merged_wavs_path) if wav_file.endswith(".wav")]
    wav_file_list_string = "\n".join(wav_file_list)
    with open(file_lst_fname, "w") as out_f:
        out_f.write(wav_file_list_string)
    print("Finished writing files.lst")

In [11]:
# create_file_lst(config["es"]["lst_file"])

In [12]:
def create_plp(wav_fname, plp_fname):
    FEACALC = config['base']["feacalc"]
    subprocess.call([FEACALC,"-plp", \
                    "12", "-cep", "13", "-dom", "cep", "-deltaorder", \
                    "2", "-dither", "-frqaxis", "bark", "-samplerate", \
                    "8000", "-win", "25", "-step", "10", "-ip", \
                    "MSWAVE", "-rasta", "false", "-compress", \
                    "true", "-op", "swappedraw", "-o", plp_fname, wav_fname])

    
def normalize_plp(plp_fname, vad_fname, plp_norm_fname):
    STANDFEAT = config['base']["standfeat"]
    # Standardize binary file, for VAD regions only
    subprocess.call([STANDFEAT, "-D", "39", "-infile", \
                    plp_fname, "-outfile", plp_norm_fname, \
                    "-vadfile", vad_fname])

In [13]:
def create_and_normalize_plps():
    for i, file_id in enumerate(segment_map):
        wav_fname = os.path.join(merged_wavs_path, file_id+".wav")
        vad_fname = os.path.join(merged_fa_vads_path, file_id+".vad")
        plp_fname = os.path.join(plp_path, file_id+".binary")
        plp_norm_fname = os.path.join(plp_norm_path, file_id+".std.binary")

        #print(file_id, wav_fname, vad_fname, plp_fname, plp_norm_fname)

        # create PLP
        if i % 20 == 0:
            print("plp for file %s " % file_id)

        #if not os.path.exists(plp_fname):
        create_plp(wav_fname, plp_fname)

        if i % 20 == 0:
            print("normalizing plp %s" % file_id)

        #if not os.path.exists(plp_norm_fname):
        normalize_plp(plp_fname, vad_fname, plp_norm_fname)
    print("Completed!")

## Create LSH files

In [14]:
lsh_path = "../lsh"
if not os.path.exists(lsh_path):
    os.makedirs(lsh_path)
lsh_proj_fname = os.path.join(lsh_path, "proj_S64xD39_seed1")

In [15]:
def create_lsh_proj_file(lsh_proj_fname):
    subprocess.call([config['base']["lsh_genproj"], \
                     "-D","39","-S","64","-seed", \
                     "1","-projfile", lsh_proj_fname])

def create_lsh_file(plp_norm_fname, vad_fname, lsh_proj_fname, lsh_fname):
    LSH = config['base']["lsh"]
    subprocess.call([LSH, "-D", "39", "-S", "64", \
                    "-projfile", lsh_proj_fname, \
                    "-featfile", plp_norm_fname, "-sigfile", \
                    lsh_fname, "-vadfile", vad_fname])

In [16]:
if not os.path.exists(lsh_proj_fname):
    create_lsh_proj_file(lsh_proj_fname)

In [17]:
def create_lsh_files():
    for i, file_id in enumerate(segment_map):
        wav_fname = os.path.join(merged_wavs_path, file_id+".wav")
        vad_fname = os.path.join(merged_fa_vads_path, file_id+".vad")
        plp_norm_fname = os.path.join(plp_norm_path, file_id+".std.binary")
        lsh_fname = os.path.join(lsh_path, file_id+".std.lsh64")

        #print(file_id, wav_fname, vad_fname, plp_fname, plp_norm_fname)

        # create LSH
        if i % 20 == 0:
            print("lsh for file %s " % file_id)

        #if not os.path.exists(lsh_fname):
        create_lsh_file(plp_norm_fname, vad_fname, lsh_proj_fname, lsh_fname)

    print("Completed!")

## Create ZRTools discovery command files

In [18]:
exp_path = '../exp'
if not os.path.exists(exp_path):
    os.makedirs(exp_path)

# List of wav files
segment_map = pickle.load(open(config['es']['segment_dict_fname'], "rb"))
wav_file_list = sorted(segment_map.keys())
exp_name = 'callhome'

In [19]:
def create_files_base():
    with open(os.path.join(exp_path, 'files.base'), "w") as out_f:
        for wav_file in wav_file_list:
            out_f.write(wav_file+'\n')
    print("Generated files.base")

In [20]:
def create_discovery_cmd_scripts(exp_path, wav_file_list, exp_name, num_splits=1):
    disc_file_split_base = "disc_{0:d}.cmd"
    disc_file_split = os.path.join(exp_path, disc_file_split_base)
    disc_split_file = os.path.join(exp_path, "disc_split.txt")
    num_files = len(wav_file_list)
    exp_local_path = os.path.join("exp", exp_name)
    cmd_string = "scripts/plebdisc_filepair \"{0:s}\" \"{1:s}\" {2:s} 39\n"

    total_lines = num_files * num_files
    lines_per_file = total_lines // num_splits
    smallfile = None
    curr_line = 0
    curr_file_num = 0

    for i in xrange(num_files) :
        if i % 20 == 0:
            print("Progress: {0:d} out of: {1:d}".format(curr_line+1, total_lines))
        for j in xrange(num_files):
            out_line = cmd_string.format(wav_file_list[i], \
                                              wav_file_list[j], \
                                              exp_local_path)
            if curr_line % lines_per_file == 0:
                if smallfile:
                    smallfile.close()
                small_filename = disc_file_split.format(curr_file_num)
                smallfile = open(small_filename, "w")
                curr_file_num += 1
            smallfile.write(out_line)
            curr_line += 1
    if smallfile:
        smallfile.close()

    # Making a list of commands to execute the split disc list
    full_split_cmd_string = "nice sh {0:s} 1> {1:s} 2>{2:s} &\n"
    split_cmd = os.path.join(exp_local_path, "matches","{0:s}.{1:d}")
    with open(disc_split_file, "w") as out_f:
        for i in xrange(curr_file_num):
            curr_split_file = os.path.join(exp_local_path, disc_file_split_base.format(i))
            split_cmd_out = split_cmd.format("out", i)
            #split_cmd_err = split_cmd.format("err", i)
            split_cmd_err = "/dev/null"

            out_line = "nice sh "
            out_f.write(full_split_cmd_string.format(curr_split_file, \
                                                    split_cmd_out, \
                                                    split_cmd_err))

    print("Completed - disc.cmd")

In [21]:
# create_discovery_cmd_scripts(exp_path=exp_path, wav_file_list=wav_file_list, exp_name=exp_name, num_splits=25)

# Read transcripts, and translations into a dictionary

In [22]:
Align = namedtuple('Align', ['word', 'start', 'end'])

In [23]:
def read_alignment_file(align_fname, stopwords_corpus=None):
    align_list = []
    with open(align_fname, "r") as align_f:
        for line in align_f:
            line_items = line.strip().split()
            if len(line_items) != 3:
                raise ValueError
            start, end = map(lambda v: int(float(v)*100), line_items[1:3])
            if (not stopwords_corpus) or \
            (stopwords_corpus and line_items[0].lower().decode("utf-8") not in stopwords_corpus):
                align_list.append(Align(*[line_items[0], start, end]))
    if sorted(align_list, key=lambda t: t.start) != align_list and not align_fname.endswith("en"):
        raise IOError    
            
    return align_list

In [24]:
es_words_path = '../wav2es-words/'
en_words_path = '../wav2eng-words/'
align_dict_fname = config['es']['align_dict_fname']

In [25]:
# # test code
# stopwords_es = set(stopwords.words('spanish'))
# stopwords_en = set(stopwords.words('english'))
# display(read_alignment_file('../wav2es-words/001.001.es'))
# display(read_alignment_file('../wav2es-words/001.001.es', stopwords_corpus=stopwords_es))
# display(read_alignment_file('../wav2eng-words/001.001.en'))
# display(read_alignment_file('../wav2eng-words/001.001.en', stopwords_corpus=stopwords_en))

In [26]:
def get_file_list(file_path, file_ext):
    return [os.path.splitext(f)[0] for f in os.listdir(file_path) if f.endswith(file_ext)]

In [27]:
es_file_list = get_file_list(es_words_path, 'es')
en_file_list = get_file_list(en_words_path, 'en')

print(sorted(es_file_list) == sorted(en_file_list))
print(set(es_file_list)-set(en_file_list))

False
set(['009.025'])


In [28]:
def create_alignment_dict():
    align_dict = {}
    stopwords_es = set(stopwords.words('spanish'))
    stopwords_en = set(stopwords.words('english'))
    for file_id in segment_map:
        #print("Processing file: %s" % file_id)
        align_dict[file_id] = {}
        for seg_id in segment_map[file_id]:
            align_dict[file_id][seg_id] = {}
            es_fname = os.path.join(es_words_path, seg_id+".es")
            en_fname = os.path.join(en_words_path, seg_id+".en")
            align_dict[file_id][seg_id]["es"] = read_alignment_file(es_fname)
            align_dict[file_id][seg_id]["en"] = read_alignment_file(en_fname)
            align_dict[file_id][seg_id]["es_cnt"] = read_alignment_file(es_fname, stopwords_corpus=stopwords_es)
            align_dict[file_id][seg_id]["en_cnt"] = read_alignment_file(en_fname, stopwords_corpus=stopwords_en)
    return align_dict
        

In [29]:
# align_dict = create_alignment_dict()
# pickle.dump(align_dict, open(align_dict_fname, "wb"))

## Create VAD from alignments

In [30]:
align_dict = pickle.load(open(align_dict_fname, "rb"))
segment_map = pickle.load(open(config['es']['segment_dict_fname'], "rb"))
# has_500ms_fa_vad_dict_fname = config['es']['has_500ms_fa_vad_dict']

In [31]:
def create_uttr_vad_from_alignment(align_dict, vad_path):
    has_500ms_dur = {}
    total_dur_10ms = 0
    total_dur_10ms_ge500ms = 0
    for i, vad_file_id in enumerate(align_dict):
        if i % 20 == 0:
            print("Created vad for %d files id" % i)
        for seg_id in align_dict[vad_file_id]:
            with open(os.path.join(vad_path, seg_id+".vad"), "w") as vad_f:
                dur_10ms = 0
                dur_10ms_ge500ms = 0
                vad_list = []
                # start index
                s = 0
                # create a local list of alignment values
                align_list = align_dict[vad_file_id][seg_id]['es']
                for j in xrange(len(align_list)):
                    # if 1st or last element, add to vad_list
                    if ((j+1) == len(align_list)) or (align_list[j].end != align_list[j+1].start):
                        vad_list.append(((align_list[s].start), (align_list[j].end)))
                        s=j+1
                # write vad list to file        
                for vad_tup in vad_list:
                    start = vad_tup[0]
                    end = vad_tup[1]
                    dur_10ms += (end-start)
                    dur_10ms_ge500ms += ((end - start) if (end-start) >= 50 else 0)
                    out_line = ("%d %d\n" %(start, end))
                    vad_f.write(out_line)
                
                # set whether atleast one vad region of 500 ms
                has_500ms_dur[seg_id] = (dur_10ms_ge500ms > 0)
                # compute total durations
                total_dur_10ms += dur_10ms
                total_dur_10ms_ge500ms += dur_10ms_ge500ms 
                    
            # end for
        # end looping over all segments
    # end writing vad file
    return total_dur_10ms, total_dur_10ms_ge500ms, has_500ms_dur
    

In [32]:
def create_merged_vad_from_alignment(vad_file_id, align_dict, segment_map, vad_path):
    total_dur_10ms = 0
    total_dur_10ms_ge500ms = 0
    with open(os.path.join(vad_path, vad_file_id+".vad"), "w") as vad_f:
        print("creating vad %s ..." % vad_file_id)
        for i, (seg_id, seg_start) in enumerate(sorted(segment_map[vad_file_id].items(), key=lambda t:t[0])):
            vad_list = []
            # start index
            s = 0
            # create a local list of alignment values
            align_list = align_dict[vad_file_id][seg_id]['es']
            for j in xrange(len(align_list)):
                # if 1st or last element, add to vad_list
                if ((j+1) == len(align_list)) or (align_list[j].end != align_list[j+1].start):
                    vad_list.append(((seg_start+align_list[s].start), (seg_start+align_list[j].end)))
                    s=j+1
            # write vad list to file        
            for vad_tup in vad_list:
                start = vad_tup[0]
                end = vad_tup[1]
                total_dur_10ms += (end-start)
                total_dur_10ms_ge500ms += ((end - start) if (end-start) >= 50 else 0)
                out_line = ("%d %d\n" %(start, end))
                vad_f.write(out_line)
            # end for
        # end looping over all segments
    # end writing vad file
    return total_dur_10ms, total_dur_10ms_ge500ms
    

### Create new directory for merged vads

In [33]:
uttr_fa_vads_path = config['es']['es_uttr_fa_vad']
if not os.path.exists(uttr_fa_vads_path):
    os.makedirs(uttr_fa_vads_path)

merged_fa_vads_path = config['es']['es_merge_fa_vad']
if not os.path.exists(merged_fa_vads_path):
    os.makedirs(merged_fa_vads_path)


In [34]:
# t1, t2, has_500ms_dur = create_uttr_vad_from_alignment(align_dict, uttr_fa_vads_path)
# # print(t1, t2)
# print(map(lambda t: "{0:.3f} hrs".format((t / 100.0 / 3600)), [t1, t2]))
# print("saving dict: %s" % has_500ms_fa_vad_dict_fname)
# pickle.dump(has_500ms_dur, open(has_500ms_fa_vad_dict_fname, "wb"))

In [35]:
# # check how many utterances have atleast 500ms VAD
# print("total utterances: %d" % len(has_500ms_dur))
# uttrs_with_500ms = {k:v for k, v in has_500ms_dur.items() if v}
# print("with 500ms: %d" % len(uttrs_with_500ms))

In [36]:
# save dev file with only 500ms segments

In [37]:
# dev_500ms_fname = config['es']['mt_dev_500ms_files']
# dev_fname = config['es']['mt_dev_test_files']
# with open(dev_fname, "r") as dev_f, open(dev_500ms_fname, "w") as dev_500ms_f:
#     for line in dev_f:
#         if line.strip() in has_500ms_dur and has_500ms_dur[line.strip()]:
#             dev_500ms_f.write(line)

In [38]:
# !wc $dev_500ms_fname

### Create merged vad for each file

In [39]:
def create_merged_vads():
    total_dur_10ms, total_dur_10ms_ge500ms = 0, 0
    for vad_file_id in segment_map:
        t1, t2 = create_vad_from_alignment(vad_file_id, align_dict, segment_map, merged_fa_vads_path)
        total_dur_10ms += t1
        total_dur_10ms_ge500ms += t2

In [40]:
# print(total_dur_10ms, total_dur_10ms_ge500ms)
# print(map(lambda t: "{0:.3f}".format((t / 100.0 / 3600)), [total_dur_10ms, total_dur_10ms_ge500ms]))

## Create features

In [41]:
def create_gold_feats(align_dict, gold_feats_dict_fname, es_key="es"):
    gold_feats_dict = {}
    for fid in align_dict:
        for sid in align_dict[fid]:
            gold_feats_dict[sid] = {}
            if align_dict[fid][sid][es_key] == []:
                # Only es_cnt can be empty, in which case include stop words
                gold_feats_dict[sid] = [w.word for w in align_dict[fid][sid]['es']]
            else:
                gold_feats_dict[sid] = [w.word for w in align_dict[fid][sid][es_key]]
    print("Saving gold features using key: %s" % es_key)
    pickle.dump(gold_feats_dict, open(gold_feats_dict_fname, "wb"))
    print("finished ...")
    return gold_feats_dict
        

In [42]:
def gold_feats():
    align_dict = pickle.load(open(align_dict_fname, "rb"))
    gold_feats_dict_fname = config['es']['gold_feats']
    gold_feats_dict = create_gold_feats(align_dict, gold_feats_dict_fname, es_key="es_cnt")

## Check English translations

In [43]:
es_words = [a.word for fid in align_dict for sid in align_dict[fid] for a in align_dict[fid][sid]['es']]
es_cnt_words = [a.word for fid in align_dict for sid in align_dict[fid] for a in align_dict[fid][sid]['es_cnt']]
en_words = [a.word for fid in align_dict for sid in align_dict[fid] for a in align_dict[fid][sid]['en']]
en_cnt_words = [a.word for fid in align_dict for sid in align_dict[fid] for a in align_dict[fid][sid]['en_cnt']]

In [44]:
from collections import Counter

In [45]:
es_words_freq = Counter(es_words)
es_cnt_words_freq = Counter(es_cnt_words)
en_words_freq = Counter(en_words)
en_cnt_words_freq = Counter(en_cnt_words)

In [46]:
print(sorted(en_words_freq.items(), reverse=True, key=lambda t:t[1])[:10])

[('THE', 5178), ('AND', 4629), ('THAT', 4080), ('I', 3849), ('TO', 3359), ('YES', 3345), ("'T", 2265), ('YOU', 2256), ('NO', 2135), ("'S", 2030)]


In [47]:
print(sorted(en_cnt_words_freq.items(), reverse=True, key=lambda t:t[1])[:10])

[('YES', 3345), ("'T", 2265), ("'S", 2030), ('WELL', 1829), ('AH', 1349), ('KNOW', 1100), ('OH', 1066), ('SEE', 934), ('YEAH', 904), ('LIKE', 889)]


In [48]:
print(sorted(es_words_freq.items(), reverse=True, key=lambda t:t[1])[:10])

[('QUE', 7089), ('NO', 6110), ('Y', 5037), ('A', 4310), ('DE', 4009), ('S\xc3\xad', 3667), ('LA', 3425), ('YA', 2782), ('EL', 2680), ('ES', 2587)]


In [49]:
print(sorted(es_cnt_words_freq.items(), reverse=True, key=lambda t:t[1])[:10])

[('AH', 1831), ('PUES', 1236), ('BUENO', 1186), ('BIEN', 1183), ('SI', 1045), ('<LAUGH>', 987), ('MMM', 976), ('AS\xc3\xad', 781), ('ENTONCES', 775), ('CLARO', 729)]


In [50]:
print([(w,f) for w, f in en_cnt_words_freq.items() if "'" in w])

[("'T", 2265), ("'S", 2030), ("'R", 1), ("'D", 34), ("'M", 627), ("'TS", 1), ("'VE", 184), ("'", 40), ("'OEUVRES", 1), ("'RE", 269), ("'CLOCK", 4), ("'LL", 540), ("O'CLOCK", 1), ("'AM", 8)]


In [51]:
print([(w,f) for w, f in es_words_freq.items() if "<" in w])

[('<SNEEZE>', 4), ('<COUGH>', 11), ('<LAUGH>', 987), ('<BREATH>', 16), ('<NOISE>', 450), ('<BACKGROUND>', 107)]


In [52]:
print([(w,f) for w, f in es_cnt_words_freq.items() if "<" in w])

[('<SNEEZE>', 4), ('<COUGH>', 11), ('<LAUGH>', 987), ('<BREATH>', 16), ('<NOISE>', 450), ('<BACKGROUND>', 107)]


## Key-word spotting, prepare data

In [44]:
def read_proto_list():
    protos = []
    with open(config['proto']['protos_list'], "r") as f:
        for line in f:
            protos.append(line.strip())
    return protos

In [45]:
def create_vad_norm_plp_for_protos(protos):
    if not os.path.exists(config['proto']['vad_path']):
        os.makedirs(config['proto']['vad_path'])
    if not os.path.exists(config['proto']['norm_plp_path']):
        os.makedirs(config['proto']['norm_plp_path'])
    if not os.path.exists(config['proto']['proto_float32_path']):
        os.makedirs(config['proto']['proto_float32_path'])
    if not os.path.exists(config['proto']['lsh_path']):
        os.makedirs(config['proto']['lsh_path'])
        
    sys.stderr.flush()
    with tqdm(total=len(protos)) as pbar:
        for i, pid in enumerate(protos, start=1):
            try:
                pid_base = os.path.splitext(pid)[0]
                plp_fname = os.path.join(config['proto']['proto_path'], pid)
                plp_f32_fname = os.path.join(config['proto']['proto_float32_path'], pid)
                plp_norm_fname = (os.path.join(config['proto']['norm_plp_path'], 
                                          "{0:s}.std.binary".format(pid_base)))
                vad_fname = (os.path.join(config['proto']['vad_path'], "{0:s}.vad".format(pid_base)))
                lsh_fname = (os.path.join(config['proto']['lsh_path'], 
                                          "{0:s}.std.lsh64".format(pid_base)))
                # read npy file to get shape
                x = np.load(plp_fname)
                # ZRTools use float32
                y = x.astype(np.float32)
                y.tofile(plp_f32_fname)

                # create vad file
                with open(vad_fname, "w") as f:
                    f.write("0\t{0:d}\n".format(y.shape[0]))

                # normalize plp
                normalize_plp(plp_f32_fname, vad_fname, plp_norm_fname)
                #print(os.path.getsize(plp_fname))

                # create lsh
                create_lsh_file(plp_norm_fname, vad_fname, lsh_proj_fname, lsh_fname)
                
                # update progress
                pbar.set_description("processing proto {0:s}\n".format(pid_base))
#                 pbar.update(i)
                print(i)
            except:
                print(pid)
    print("completed")
        

In [46]:
protos = read_proto_list()
protos[:5]

['10TH.npy', '13.npy', '155.npy', '20TH.npy', '5TH.npy']

In [102]:
create_vad_norm_plp_for_protos(protos)

processing proto 155
processing proto ABEL
processing proto ABSOLUTELY
processing proto ABUSED
processing proto ACCELERATING
processing proto ACCESSIBLE
processing proto ACCORDING
processing proto ACCUSING
processing proto ACTED
processing proto ACTRESS
processing proto ADDRESSES
processing proto ADOBE
processing proto ADRIAN
processing proto ADVANTAGEMM
processing proto ADVISER
processing proto AFFECTED
processing proto AFTERNOONS
processing proto AGARRAPO
processing proto AGENT
processing proto AGREEMENT
processing proto AHA
processing proto AHH
processing proto AIRES
processing proto ALARM
processing proto ALEXICO
processing proto ALIKE
processing proto ALLERGY
processing proto AL
processing proto ALSO
processing proto ALWAYS
processing proto AMAZES
processing proto AMENDED
processing proto AMINTITA
processing proto AMUSES
processing proto ANDALUCIA
processing proto ANGELES
processing proto ANISSA
processing proto ANNULLEDH
processing proto A
processing proto ANTENOR
processing prot

CASTAñEDA.npy


processing proto CATS
processing proto CECI
processing proto CELEBRATION
processing proto CENTIMETER
processing proto CEREAL
processing proto CESAREAN
processing proto CHAMA
processing proto CHANGING
processing proto CHARGED
processing proto CHAT
processing proto CHEAPER
processing proto CHECHO
processing proto CHEER
processing proto CHERLA
processing proto CHICKPEAS
processing proto CHILEANS
processing proto CHIN
processing proto CHIVETO
processing proto CHOITE
processing proto CHOSEN
processing proto CHUCHA
processing proto CINCINATTI
processing proto CIVIL
processing proto CLASS
processing proto CLEARER
processing proto CLIMBED
processing proto CLOCK
processing proto CLOTHING
processing proto COBS
processing proto CODE
processing proto COINCIDENTALLY
processing proto COLLEGE
processing proto COLONY
processing proto COME
processing proto COMMENT
processing proto COMMITTEE
processing proto COMMUNICATIVE
processing proto COMPATIBLE
processing proto COMPLEX
processing proto COMPUTER
pro

DOñIHUE.npy
DOñINHUE.npy


processing proto DOZEN
processing proto DRAMATIC
processing proto DREAMING
processing proto DRESS
processing proto DRIVERS
processing proto DROPPED
processing proto DRYING
processing proto DUMB
processing proto DUST
processing proto EARNING
processing proto EARTH
processing proto EASY
processing proto ECHO
processing proto ECUADOR
processing proto EEEEE
processing proto EGGS
processing proto EITHER
processing proto ELECTRIC
processing proto ELENITA
processing proto ELISA
processing proto EMBARRASSING
processing proto EMPLOYMENT
processing proto ENDED
processing proto ENERGY
processing proto ENIA
processing proto E
processing proto ENTER
processing proto ENVELOP
processing proto EQUIPPING
processing proto ES
processing proto ESTABLISHMENT
processing proto ETCETERA
processing proto EVEN
processing proto EVERYONE
processing proto EXACT
processing proto EXCEED
processing proto EXCUSE
processing proto EXODUS
processing proto EXPERIENCE
processing proto EXPLAINING
processing proto EXPRESIDEN

FERNáNDEZ.npy
FERNáN.npy


processing proto FIFTH
processing proto FIGURES
processing proto FINALLY
processing proto FINCA
processing proto FINISHES
processing proto FIRM
processing proto FITS
processing proto FLAT
processing proto FLOOR
processing proto FLUENT
processing proto FOCUSED
processing proto FOOD
processing proto FORCES
processing proto FORGET
processing proto FORMED
processing proto FORTUNE
processing proto FOUR
processing proto FRANCINA
processing proto FREAK
processing proto FREEWAY
processing proto FRIACHO
processing proto FRIENDS
processing proto FROM
processing proto FUCKING
processing proto FULFILLED
processing proto FUNDAMENTAL
processing proto FURROW
processing proto GABRIEL
processing proto GAME
processing proto GAS
processing proto GAY
processing proto GENERATED
processing proto GEORGE
processing proto GERMANY
processing proto GIFTS
processing proto GIRLS
processing proto GLASSES
processing proto GLUTTON
processing proto GOES
processing proto GONG
processing proto GORDA
processing proto GOT

HERNáNDEZ.npy


processing proto HIPNOPEDIA
processing proto HIS
processing proto HMM
processing proto HOLIDAY
processing proto HONE
processing proto HON
processing proto HOPEFULLY
processing proto HOSPITAL
processing proto HOT
processing proto HOUSEWIFE
processing proto HT
processing proto HUGE
processing proto HUMANS
: 3066323it [00:41, 148482.72it/s]

HUARáS.npy


processing proto HURST
processing proto HYPNOPAEDIA
processing proto IDEAL
processing proto IDIOTS
processing proto IGUEY
processing proto IL
: 3171436it [00:42, 152179.82it/s]   

IGUAZú.npy


processing proto IMMEDIATE
processing proto IMPORTANT
processing proto IMPROVE
processing proto INCAE
processing proto INCREASED
processing proto INDEPENDENT
processing proto INEVITABLE
processing proto INFORMALLY
processing proto INGRATE
processing proto INLAWS
processing proto INSIDE
processing proto INSTALL
processing proto INSURANCE
processing proto INTERCHANGE
processing proto INTERNAL
processing proto INTERPRETATION
processing proto INT
processing proto INVENTED
processing proto INVITATION
processing proto INVOLVES
processing proto IRIS
processing proto ISLAND
processing proto ITALIA
processing proto ITS
processing proto JACKASS
processing proto JAIL
: 3612367it [00:45, 151230.85it/s]   

IVáN.npy


processing proto JAPAN
processing proto JEDY
processing proto JESSICA
processing proto J
processing proto JOIE
processing proto JO
processing proto JOYA
processing proto JUDGE
: 3756590it [00:46, 162249.07it/s]

JOSé.npy


processing proto JUMPED
processing proto JURISPRUDENCE
processing proto KAREN
processing proto KENNEDY
processing proto KICKBACK
processing proto KIDNAP
processing proto KILLING
processing proto KINDA
processing proto KISS
processing proto KNEW
processing proto KNOWS
processing proto LABORATORY
processing proto LAID
processing proto LAND
processing proto LAS
processing proto LATINA
: 4053557it [00:48, 173792.83it/s]

LARRAñAGA.npy


processing proto LAUNDRY
processing proto LAYING
processing proto LEAFLETS
processing proto LEASE
processing proto LEGALLY
processing proto LENCHA
processing proto LESSEN
processing proto LETTERS
processing proto LIAR
processing proto LIED
processing proto LIGHTNING
processing proto LIKEWISE
processing proto LINEL
processing proto LION
processing proto LISTENING
processing proto LITTLE
processing proto LLA
processing proto LOAN
processing proto LOGICAL
processing proto LOOKED
processing proto LOOSES
processing proto LOSS
processing proto LOVELY
processing proto LOWER
processing proto LUCKILY
processing proto LULITO
processing proto MACAMAN
processing proto MADHOUSE
processing proto MAGAZINE
processing proto MAIDS
processing proto MAINTAINS
processing proto MAKING
processing proto MAMMY
processing proto MANDATORY
processing proto MANOLITO
processing proto MANU
processing proto MARCH
processing proto MARIBEL
: 4848510it [00:52, 189504.07it/s]

MARAñON.npy
MARíA.npy


processing proto MARIO
processing proto MARKS
processing proto MARQUI
processing proto MARTINA
processing proto MASTERS
processing proto MATES
processing proto MATTER
processing proto MAURO
processing proto MAZATLAN
processing proto MEANWHILE
processing proto MEDELLIN
processing proto MEETING
processing proto MEMORIZED
processing proto ME
: 5159426it [00:54, 194190.16it/s]       

MéLIDA.npy


processing proto MERENGUE
processing proto MESSAGES
processing proto METEOROLOGY
processing proto MEXICAN
processing proto MIAS
processing proto MICROSURGERY
processing proto MIGUEL
processing proto MILKMAN
processing proto MINA
processing proto MINISERIES
processing proto MINUTES
processing proto MISSES
processing proto MISTRESS
: 5436961it [00:55, 203092.59it/s]

MISIóN.npy


processing proto M
processing proto MODEST
processing proto MOMENT
processing proto MONDAYS
processing proto MONTECINOS
processing proto MOOD
: 5573462it [00:56, 203674.05it/s]      

MóNICA.npy


processing proto MORMON
processing proto MOTA
processing proto MOVED
processing proto MOVING
processing proto MUNDIAL
processing proto MURDERER
processing proto MY
processing proto NAMED
processing proto NANNY
processing proto NATALIA
processing proto NATUSHA
processing proto NEED
processing proto NEGRURA
processing proto NENA
processing proto NEUROLOGIST
processing proto NEWS
processing proto NIECE
processing proto NINA
: 6000095it [00:58, 214340.98it/s] 

NICOLáS.npy


processing proto NINTITA
processing proto NONA
processing proto NORA
processing proto NOSAS
processing proto NOTES
processing proto NOTROPIL
processing proto NOYES
processing proto NUM
: 6196374it [00:59, 218710.33it/s]  

´.npy
¨.npy
¡.npy
​.npy


processing proto NYLON
processing proto OBLIGATORY
processing proto OBVIOUS
processing proto OCULAR
: 6295794it [00:59, 221957.74it/s] 

OCAñA.npy


processing proto OFFENDED
processing proto OFFICES
processing proto OHHH
processing proto OLDER
processing proto OMARIA
processing proto ONES
processing proto OPELUCE
processing proto OPERABLE
processing proto OPINIONS
processing proto OPTIONS
processing proto ORGANIZATION
processing proto ORLANDO
processing proto OSO
processing proto OTTAWA
processing proto OUTFIT
processing proto OVERDID
processing proto OVERWHELMS
processing proto OYE
processing proto PACK
processing proto PAINS
processing proto PALACE
processing proto PAN
processing proto PAPER
processing proto PARAGRAPHS
processing proto PARISH
processing proto PARTICULAR
processing proto PASADENA
processing proto PASSPORT
processing proto PATO
processing proto PAULA
processing proto PAYMENT
processing proto PEACE
processing proto PEDESTRIAN
processing proto PEGUELE
processing proto PENDING
processing proto PEPE
processing proto PERFUME
processing proto PERMIT
processing proto PERSON
processing proto PERVERT
processing proto PHARM

protos.list


processing proto PSYCHOSOMATIC
processing proto PUCHAS
processing proto PULL
processing proto PURCHASES
processing proto PUSHY
processing proto QUALITY
processing proto QUESTION
processing proto QUIETER
processing proto QUIT
processing proto RADIO
processing proto RAISE
processing proto RAMON
processing proto RARELY
processing proto RAULITO
processing proto READING
: 8744826it [01:10, 262481.84it/s]

RAúL.npy


processing proto REALLY
processing proto RECEIVE
processing proto RECIPE
processing proto RECOMMENDATIONS
processing proto RECORDING
processing proto RECYCLE
processing proto REFERENCE
processing proto REFORMIST
processing proto REGARDS
processing proto REGULAR
processing proto RELATIONSHIP
processing proto RELIABLE
processing proto REMAIN
processing proto REMINDED
processing proto RENDERING
processing proto RENTING
processing proto REPEATED
processing proto REPLACE
processing proto REPRESENTATIVE
processing proto REQUEST
processing proto RESENTMENT
processing proto RESI
processing proto RESPONSE
processing proto RESTS
processing proto RET
processing proto REUNION
processing proto REVOLUTION
processing proto RICARDO
processing proto RIDDEN
processing proto RINGING
processing proto R
processing proto ROBERTO
processing proto RODEO
processing proto ROLLS
processing proto ROOMMATE
processing proto ROSARY
processing proto ROUND
processing proto RUINED
processing proto RUSSIAN
processing pr

completed





In [47]:
def create_keyword_spotting_cmd_scripts(exp_path, wav_file_list, protos, exp_name, num_splits=1):
    disc_file_split_base = "keyword_spot_{0:d}.cmd"
    disc_file_split = os.path.join(exp_path, disc_file_split_base)
    disc_split_file = os.path.join(exp_path, "keyword_spot_split.txt")
    
    num_files = len(wav_file_list)
    num_protos = len(protos)
    
    exp_local_path = os.path.join("exp", exp_name)
    cmd_string = "scripts/plebdisc_filepair_keyword_spotting \"{0:s}\" \"{1:s}\" {2:s} 39\n"

    total_lines = num_files * num_protos
    lines_per_file = total_lines // num_splits
    smallfile = None
    curr_line = 0
    curr_file_num = 0

    for i in xrange(num_protos):
        pid_base = os.path.splitext(protos[i])[0]
        if i % 20 == 0:
            print("Progress: {0:d} out of: {1:d}".format(curr_line+1, total_lines))
        for j in xrange(num_files):
            out_line = cmd_string.format(pid_base, wav_file_list[j], exp_local_path)
            if curr_line % lines_per_file == 0:
                if smallfile:
                    smallfile.close()
                small_filename = disc_file_split.format(curr_file_num)
                smallfile = open(small_filename, "w")
                curr_file_num += 1
            smallfile.write(out_line)
            curr_line += 1
    if smallfile:
        smallfile.close()

    # Making a list of commands to execute the split disc list
    full_split_cmd_string = "nice sh {0:s} 1> {1:s} 2>{2:s} &\n"
    split_cmd = os.path.join(exp_local_path, "matches","{0:s}.{1:d}")
    with open(disc_split_file, "w") as out_f:
        for i in xrange(curr_file_num):
            curr_split_file = os.path.join(exp_local_path, disc_file_split_base.format(i))
            split_cmd_out = split_cmd.format("keyword_spot_out", i)
            #split_cmd_err = split_cmd.format("err", i)
            split_cmd_err = "/dev/null"

            out_line = "nice sh "
            out_f.write(full_split_cmd_string.format(curr_split_file, \
                                                    split_cmd_out, \
                                                    split_cmd_err))

    print("Completed - keyword_spot cmd script")

In [51]:
create_keyword_spotting_cmd_scripts(exp_path=exp_path, wav_file_list=wav_file_list[:33], protos=protos,
                                    exp_name=exp_name, num_splits=25)

Progress: 1 out of: 188199
Progress: 661 out of: 188199
Progress: 1321 out of: 188199
Progress: 1981 out of: 188199
Progress: 2641 out of: 188199
Progress: 3301 out of: 188199
Progress: 3961 out of: 188199
Progress: 4621 out of: 188199
Progress: 5281 out of: 188199
Progress: 5941 out of: 188199
Progress: 6601 out of: 188199
Progress: 7261 out of: 188199
Progress: 7921 out of: 188199
Progress: 8581 out of: 188199
Progress: 9241 out of: 188199
Progress: 9901 out of: 188199
Progress: 10561 out of: 188199
Progress: 11221 out of: 188199
Progress: 11881 out of: 188199
Progress: 12541 out of: 188199
Progress: 13201 out of: 188199
Progress: 13861 out of: 188199
Progress: 14521 out of: 188199
Progress: 15181 out of: 188199
Progress: 15841 out of: 188199
Progress: 16501 out of: 188199
Progress: 17161 out of: 188199
Progress: 17821 out of: 188199
Progress: 18481 out of: 188199
Progress: 19141 out of: 188199
Progress: 19801 out of: 188199
Progress: 20461 out of: 188199
Progress: 21121 out of: 1881

In [50]:
wav_file_list[:33]

['001',
 '002',
 '005',
 '006',
 '007',
 '009',
 '010',
 '011',
 '012',
 '013',
 '014',
 '015',
 '018',
 '021',
 '022',
 '023',
 '024',
 '025',
 '026',
 '027',
 '028',
 '029',
 '030',
 '031',
 '032',
 '033',
 '034',
 '035',
 '036',
 '037',
 '038',
 '039',
 '040']

In [106]:
len(protos)

5703