In [107]:
from __future__ import print_function
from __future__ import division
import os
import cPickle as pickle
import json
import subprocess
from IPython.display import display
from IPython.display import Audio
from collections import namedtuple
from itertools import izip
import sys
import nltk
from nltk.corpus import stopwords
import numpy as np
from tqdm import tqdm

In [108]:
with open("config.json") as json_data_file:
    config = json.load(json_data_file)

# Preprocessing CALLHOME for ZRTools


- Created: 26-Oct-2016


## Create mapping for start time for each segment

Format: Dictionary  
key: {key: value}  
*file: {file.seg.wav: start time}*  
Name: segment_start.dict, segment_start.txt  

In [5]:
def read_segments_file(seg_fname):
    segment_map = {}
    with open(seg_fname, "r") as seg_f:
        for i, line in enumerate(seg_f):
            if i == 0:
                continue
            try:
                line_items = line.strip().split()
                seg_key = line_items[0]
                file_id = line_items[1]
                if file_id not in segment_map:
                    segment_map[file_id] = {}
                seg_start = int(float(line_items[6])*100)
                segment_map[file_id][seg_key] = seg_start
            except ValueError:
                print("Incorrect line format at line: %d" % i)
    return segment_map
        

### Read segment map

In [6]:
def read_segment_map():
    segment_map = read_segments_file('../segments.txt')
    pickle.dump(segment_map, open(config['es']['segment_dict_fname'], "wb"))

## Create VAD files for merged wavs

In [7]:
def create_merged_vad_from_ed(vad_file_id, segment_map, seg_vad_path, merged_vad_path):
    total_dur_10ms = 0
    total_dur_10ms_ge500ms = 0
    with open(os.path.join(merged_vad_path, vad_file_id+".vad"), "w") as vad_f:
        print("creating vad %s ..." % vad_file_id)
        for i, (seg_id, seg_start) in enumerate(sorted(segment_map[vad_file_id].items(), key=lambda t:t[0])):
            with open(os.path.join(seg_vad_path, seg_id+".vad"), "r") as seg_vad_f:
                for line in seg_vad_f:
                    line_items = map(int, line.strip().split())
                    start = seg_start+line_items[0]
                    end = seg_start+line_items[1]
                    total_dur_10ms += (end-start)
                    total_dur_10ms_ge500ms += ((end - start) if (end-start) >= 50 else 0)
                    out_line = ("%d %d\n" %(start, end))
                    vad_f.write(out_line)
                # end for
            # end reading seg file
        # end looping over all segments
    # end writing vad file
    return total_dur_10ms, total_dur_10ms_ge500ms

### Create new directory for merged vads

In [8]:
# merged_ed_vads_path = "../mergedVads"
# seg_vad_path = "../vad"
# if not os.path.exists(merged_ed_vads_path):
#     os.makedirs(merged_ed_vads_path)

### Create merged vad for each file

In [9]:
# total_dur_10ms, total_dur_10ms_ge500ms = 0, 0
# for vad_file_id in segment_map:
#     t1, t2 = create_merged_vad(vad_file_id, segment_map, seg_vad_path, merged_vads_path)
#     total_dur_10ms += t1
#     total_dur_10ms_ge500ms += t2

In [10]:
# print(total_dur_10ms, total_dur_10ms_ge500ms)
# print(map(lambda t: "{0:.3f}".format((t / 100.0 / 3600)), [total_dur_10ms, total_dur_10ms_ge500ms]))

## Create PLP features

In [11]:
merged_wavs_path = "../mergeWavs"
plp_path = "../plp"
plp_norm_path = "../std_plp"
if not os.path.exists(plp_path):
    os.makedirs(plp_path)
if not os.path.exists(plp_norm_path):
    os.makedirs(plp_norm_path)

In [12]:
def create_file_lst(file_lst_fname):
    prefix = "../corpora/callhome/mergeWavs"
    wav_file_list = [os.path.join(prefix, wav_file) for \
                     wav_file in os.listdir(merged_wavs_path) if wav_file.endswith(".wav")]
    wav_file_list_string = "\n".join(wav_file_list)
    with open(file_lst_fname, "w") as out_f:
        out_f.write(wav_file_list_string)
    print("Finished writing files.lst")

In [13]:
# create_file_lst(config["es"]["lst_file"])

In [14]:
def create_plp(wav_fname, plp_fname):
    FEACALC = config['base']["feacalc"]
    subprocess.call([FEACALC,"-plp", \
                    "12", "-cep", "13", "-dom", "cep", "-deltaorder", \
                    "2", "-dither", "-frqaxis", "bark", "-samplerate", \
                    "8000", "-win", "25", "-step", "10", "-ip", \
                    "MSWAVE", "-rasta", "false", "-compress", \
                    "true", "-op", "swappedraw", "-o", plp_fname, wav_fname])

    
def normalize_plp(plp_fname, vad_fname, plp_norm_fname):
    STANDFEAT = config['base']["standfeat"]
    # Standardize binary file, for VAD regions only
    subprocess.call([STANDFEAT, "-D", "39", "-infile", \
                    plp_fname, "-outfile", plp_norm_fname, \
                    "-vadfile", vad_fname])

In [15]:
def create_and_normalize_plps():
    for i, file_id in enumerate(segment_map):
        wav_fname = os.path.join(merged_wavs_path, file_id+".wav")
        vad_fname = os.path.join(merged_fa_vads_path, file_id+".vad")
        plp_fname = os.path.join(plp_path, file_id+".binary")
        plp_norm_fname = os.path.join(plp_norm_path, file_id+".std.binary")

        #print(file_id, wav_fname, vad_fname, plp_fname, plp_norm_fname)

        # create PLP
        if i % 20 == 0:
            print("plp for file %s " % file_id)

        #if not os.path.exists(plp_fname):
        create_plp(wav_fname, plp_fname)

        if i % 20 == 0:
            print("normalizing plp %s" % file_id)

        #if not os.path.exists(plp_norm_fname):
        normalize_plp(plp_fname, vad_fname, plp_norm_fname)
    print("Completed!")

## Create LSH files

In [16]:
lsh_path = "../lsh"
if not os.path.exists(lsh_path):
    os.makedirs(lsh_path)
lsh_proj_fname = os.path.join(lsh_path, "proj_S64xD39_seed1")

In [17]:
def create_lsh_proj_file(lsh_proj_fname):
    subprocess.call([config['base']["lsh_genproj"], \
                     "-D","39","-S","64","-seed", \
                     "1","-projfile", lsh_proj_fname])

def create_lsh_file(plp_norm_fname, vad_fname, lsh_proj_fname, lsh_fname):
    LSH = config['base']["lsh"]
    subprocess.call([LSH, "-D", "39", "-S", "64", \
                    "-projfile", lsh_proj_fname, \
                    "-featfile", plp_norm_fname, "-sigfile", \
                    lsh_fname, "-vadfile", vad_fname])

In [18]:
if not os.path.exists(lsh_proj_fname):
    create_lsh_proj_file(lsh_proj_fname)

In [19]:
def create_lsh_files():
    for i, file_id in enumerate(segment_map):
        wav_fname = os.path.join(merged_wavs_path, file_id+".wav")
        vad_fname = os.path.join(merged_fa_vads_path, file_id+".vad")
        plp_norm_fname = os.path.join(plp_norm_path, file_id+".std.binary")
        lsh_fname = os.path.join(lsh_path, file_id+".std.lsh64")

        #print(file_id, wav_fname, vad_fname, plp_fname, plp_norm_fname)

        # create LSH
        if i % 20 == 0:
            print("lsh for file %s " % file_id)

        #if not os.path.exists(lsh_fname):
        create_lsh_file(plp_norm_fname, vad_fname, lsh_proj_fname, lsh_fname)

    print("Completed!")

## Create ZRTools discovery command files

In [20]:
exp_path = '../exp'
if not os.path.exists(exp_path):
    os.makedirs(exp_path)

# List of wav files
segment_map = pickle.load(open(config['es']['segment_dict_fname'], "rb"))
wav_file_list = sorted(segment_map.keys())
exp_name = 'callhome'

In [21]:
def create_files_base():
    with open(os.path.join(exp_path, 'files.base'), "w") as out_f:
        for wav_file in wav_file_list:
            out_f.write(wav_file+'\n')
    print("Generated files.base")

In [22]:
def create_discovery_cmd_scripts(exp_path, wav_file_list, exp_name, num_splits=1):
    disc_file_split_base = "disc_{0:d}.cmd"
    disc_file_split = os.path.join(exp_path, disc_file_split_base)
    disc_split_file = os.path.join(exp_path, "disc_split.txt")
    num_files = len(wav_file_list)
    exp_local_path = os.path.join("exp", exp_name)
    cmd_string = "scripts/plebdisc_filepair \"{0:s}\" \"{1:s}\" {2:s} 39\n"

    total_lines = num_files * num_files
    lines_per_file = total_lines // num_splits
    smallfile = None
    curr_line = 0
    curr_file_num = 0

    for i in xrange(num_files) :
        if i % 20 == 0:
            print("Progress: {0:d} out of: {1:d}".format(curr_line+1, total_lines))
        for j in xrange(num_files):
            out_line = cmd_string.format(wav_file_list[i], \
                                              wav_file_list[j], \
                                              exp_local_path)
            if curr_line % lines_per_file == 0:
                if smallfile:
                    smallfile.close()
                small_filename = disc_file_split.format(curr_file_num)
                smallfile = open(small_filename, "w")
                curr_file_num += 1
            smallfile.write(out_line)
            curr_line += 1
    if smallfile:
        smallfile.close()

    # Making a list of commands to execute the split disc list
    full_split_cmd_string = "nice sh {0:s} 1> {1:s} 2>{2:s} &\n"
    split_cmd = os.path.join(exp_local_path, "matches","{0:s}.{1:d}")
    with open(disc_split_file, "w") as out_f:
        for i in xrange(curr_file_num):
            curr_split_file = os.path.join(exp_local_path, disc_file_split_base.format(i))
            split_cmd_out = split_cmd.format("out", i)
            #split_cmd_err = split_cmd.format("err", i)
            split_cmd_err = "/dev/null"

            out_line = "nice sh "
            out_f.write(full_split_cmd_string.format(curr_split_file, \
                                                    split_cmd_out, \
                                                    split_cmd_err))

    print("Completed - disc.cmd")

In [23]:
# create_discovery_cmd_scripts(exp_path=exp_path, wav_file_list=wav_file_list, exp_name=exp_name, num_splits=25)

# Read transcripts, and translations into a dictionary

In [24]:
Align = namedtuple('Align', ['word', 'start', 'end'])

In [25]:
def read_alignment_file(align_fname, stopwords_corpus=None):
    align_list = []
    with open(align_fname, "r") as align_f:
        for line in align_f:
            line_items = line.strip().split()
            if len(line_items) != 3:
                raise ValueError
            start, end = map(lambda v: int(float(v)*100), line_items[1:3])
            if (not stopwords_corpus) or \
            (stopwords_corpus and line_items[0].lower().decode("utf-8") not in stopwords_corpus):
                align_list.append(Align(*[line_items[0], start, end]))
    if sorted(align_list, key=lambda t: t.start) != align_list and not align_fname.endswith("en"):
        raise IOError    
            
    return align_list

In [26]:
es_words_path = '../wav2es-words/'
en_words_path = '../wav2eng-words/'
align_dict_fname = config['es']['align_dict_fname']

In [27]:
# # test code
# stopwords_es = set(stopwords.words('spanish'))
# stopwords_en = set(stopwords.words('english'))
# display(read_alignment_file('../wav2es-words/001.001.es'))
# display(read_alignment_file('../wav2es-words/001.001.es', stopwords_corpus=stopwords_es))
# display(read_alignment_file('../wav2eng-words/001.001.en'))
# display(read_alignment_file('../wav2eng-words/001.001.en', stopwords_corpus=stopwords_en))

In [28]:
def get_file_list(file_path, file_ext):
    return [os.path.splitext(f)[0] for f in os.listdir(file_path) if f.endswith(file_ext)]

In [29]:
es_file_list = get_file_list(es_words_path, 'es')
en_file_list = get_file_list(en_words_path, 'en')

print(sorted(es_file_list) == sorted(en_file_list))
print(set(es_file_list)-set(en_file_list))

False
set(['009.025'])


In [30]:
def create_alignment_dict():
    align_dict = {}
    stopwords_es = set(stopwords.words('spanish'))
    stopwords_en = set(stopwords.words('english'))
    for file_id in segment_map:
        #print("Processing file: %s" % file_id)
        align_dict[file_id] = {}
        for seg_id in segment_map[file_id]:
            align_dict[file_id][seg_id] = {}
            es_fname = os.path.join(es_words_path, seg_id+".es")
            en_fname = os.path.join(en_words_path, seg_id+".en")
            align_dict[file_id][seg_id]["es"] = read_alignment_file(es_fname)
            align_dict[file_id][seg_id]["en"] = read_alignment_file(en_fname)
            align_dict[file_id][seg_id]["es_cnt"] = read_alignment_file(es_fname, stopwords_corpus=stopwords_es)
            align_dict[file_id][seg_id]["en_cnt"] = read_alignment_file(en_fname, stopwords_corpus=stopwords_en)
    return align_dict
        

In [31]:
# align_dict = create_alignment_dict()
# pickle.dump(align_dict, open(align_dict_fname, "wb"))

## Create VAD from alignments

In [32]:
align_dict = pickle.load(open(align_dict_fname, "rb"))
segment_map = pickle.load(open(config['es']['segment_dict_fname'], "rb"))
# has_500ms_fa_vad_dict_fname = config['es']['has_500ms_fa_vad_dict']

In [33]:
def create_uttr_vad_from_alignment(align_dict, vad_path):
    has_500ms_dur = {}
    total_dur_10ms = 0
    total_dur_10ms_ge500ms = 0
    for i, vad_file_id in enumerate(align_dict):
        if i % 20 == 0:
            print("Created vad for %d files id" % i)
        for seg_id in align_dict[vad_file_id]:
            with open(os.path.join(vad_path, seg_id+".vad"), "w") as vad_f:
                dur_10ms = 0
                dur_10ms_ge500ms = 0
                vad_list = []
                # start index
                s = 0
                # create a local list of alignment values
                align_list = align_dict[vad_file_id][seg_id]['es']
                for j in xrange(len(align_list)):
                    # if 1st or last element, add to vad_list
                    if ((j+1) == len(align_list)) or (align_list[j].end != align_list[j+1].start):
                        vad_list.append(((align_list[s].start), (align_list[j].end)))
                        s=j+1
                # write vad list to file        
                for vad_tup in vad_list:
                    start = vad_tup[0]
                    end = vad_tup[1]
                    dur_10ms += (end-start)
                    dur_10ms_ge500ms += ((end - start) if (end-start) >= 50 else 0)
                    out_line = ("%d %d\n" %(start, end))
                    vad_f.write(out_line)
                
                # set whether atleast one vad region of 500 ms
                has_500ms_dur[seg_id] = (dur_10ms_ge500ms > 0)
                # compute total durations
                total_dur_10ms += dur_10ms
                total_dur_10ms_ge500ms += dur_10ms_ge500ms 
                    
            # end for
        # end looping over all segments
    # end writing vad file
    return total_dur_10ms, total_dur_10ms_ge500ms, has_500ms_dur
    

In [34]:
def create_merged_vad_from_alignment(vad_file_id, align_dict, segment_map, vad_path):
    total_dur_10ms = 0
    total_dur_10ms_ge500ms = 0
    with open(os.path.join(vad_path, vad_file_id+".vad"), "w") as vad_f:
        print("creating vad %s ..." % vad_file_id)
        for i, (seg_id, seg_start) in enumerate(sorted(segment_map[vad_file_id].items(), key=lambda t:t[0])):
            vad_list = []
            # start index
            s = 0
            # create a local list of alignment values
            align_list = align_dict[vad_file_id][seg_id]['es']
            for j in xrange(len(align_list)):
                # if 1st or last element, add to vad_list
                if ((j+1) == len(align_list)) or (align_list[j].end != align_list[j+1].start):
                    vad_list.append(((seg_start+align_list[s].start), (seg_start+align_list[j].end)))
                    s=j+1
            # write vad list to file        
            for vad_tup in vad_list:
                start = vad_tup[0]
                end = vad_tup[1]
                total_dur_10ms += (end-start)
                total_dur_10ms_ge500ms += ((end - start) if (end-start) >= 50 else 0)
                out_line = ("%d %d\n" %(start, end))
                vad_f.write(out_line)
            # end for
        # end looping over all segments
    # end writing vad file
    return total_dur_10ms, total_dur_10ms_ge500ms
    

### Create new directory for merged vads

In [35]:
uttr_fa_vads_path = config['es']['es_uttr_fa_vad']
if not os.path.exists(uttr_fa_vads_path):
    os.makedirs(uttr_fa_vads_path)

merged_fa_vads_path = config['es']['es_merge_fa_vad']
if not os.path.exists(merged_fa_vads_path):
    os.makedirs(merged_fa_vads_path)


In [36]:
# t1, t2, has_500ms_dur = create_uttr_vad_from_alignment(align_dict, uttr_fa_vads_path)
# # print(t1, t2)
# print(map(lambda t: "{0:.3f} hrs".format((t / 100.0 / 3600)), [t1, t2]))
# print("saving dict: %s" % has_500ms_fa_vad_dict_fname)
# pickle.dump(has_500ms_dur, open(has_500ms_fa_vad_dict_fname, "wb"))

In [37]:
# # check how many utterances have atleast 500ms VAD
# print("total utterances: %d" % len(has_500ms_dur))
# uttrs_with_500ms = {k:v for k, v in has_500ms_dur.items() if v}
# print("with 500ms: %d" % len(uttrs_with_500ms))

In [38]:
# save dev file with only 500ms segments

In [39]:
# dev_500ms_fname = config['es']['mt_dev_500ms_files']
# dev_fname = config['es']['mt_dev_test_files']
# with open(dev_fname, "r") as dev_f, open(dev_500ms_fname, "w") as dev_500ms_f:
#     for line in dev_f:
#         if line.strip() in has_500ms_dur and has_500ms_dur[line.strip()]:
#             dev_500ms_f.write(line)

In [40]:
# !wc $dev_500ms_fname

### Create merged vad for each file

In [41]:
def create_merged_vads():
    total_dur_10ms, total_dur_10ms_ge500ms = 0, 0
    for vad_file_id in segment_map:
        t1, t2 = create_vad_from_alignment(vad_file_id, align_dict, segment_map, merged_fa_vads_path)
        total_dur_10ms += t1
        total_dur_10ms_ge500ms += t2

In [42]:
# print(total_dur_10ms, total_dur_10ms_ge500ms)
# print(map(lambda t: "{0:.3f}".format((t / 100.0 / 3600)), [total_dur_10ms, total_dur_10ms_ge500ms]))

## Create features

In [43]:
def create_gold_feats(align_dict, gold_feats_dict_fname, es_key="es"):
    gold_feats_dict = {}
    for fid in align_dict:
        for sid in align_dict[fid]:
            gold_feats_dict[sid] = {}
            if align_dict[fid][sid][es_key] == []:
                # Only es_cnt can be empty, in which case include stop words
                gold_feats_dict[sid] = [w.word for w in align_dict[fid][sid]['es']]
            else:
                gold_feats_dict[sid] = [w.word for w in align_dict[fid][sid][es_key]]
    print("Saving gold features using key: %s" % es_key)
    pickle.dump(gold_feats_dict, open(gold_feats_dict_fname, "wb"))
    print("finished ...")
    return gold_feats_dict
        

In [44]:
def gold_feats():
    align_dict = pickle.load(open(align_dict_fname, "rb"))
    gold_feats_dict_fname = config['es']['gold_feats']
    gold_feats_dict = create_gold_feats(align_dict, gold_feats_dict_fname, es_key="es_cnt")

## Check English translations

In [45]:
es_words = [a.word for fid in align_dict for sid in align_dict[fid] for a in align_dict[fid][sid]['es']]
es_cnt_words = [a.word for fid in align_dict for sid in align_dict[fid] for a in align_dict[fid][sid]['es_cnt']]
en_words = [a.word for fid in align_dict for sid in align_dict[fid] for a in align_dict[fid][sid]['en']]
en_cnt_words = [a.word for fid in align_dict for sid in align_dict[fid] for a in align_dict[fid][sid]['en_cnt']]

In [46]:
from collections import Counter

In [47]:
es_words_freq = Counter(es_words)
es_cnt_words_freq = Counter(es_cnt_words)
en_words_freq = Counter(en_words)
en_cnt_words_freq = Counter(en_cnt_words)

In [48]:
print(sorted(en_words_freq.items(), reverse=True, key=lambda t:t[1])[:10])

[('THE', 5178), ('AND', 4629), ('THAT', 4080), ('I', 3849), ('TO', 3359), ('YES', 3345), ("'T", 2265), ('YOU', 2256), ('NO', 2135), ("'S", 2030)]


In [49]:
print(sorted(en_cnt_words_freq.items(), reverse=True, key=lambda t:t[1])[:10])

[('YES', 3345), ("'T", 2265), ("'S", 2030), ('WELL', 1829), ('AH', 1349), ('KNOW', 1100), ('OH', 1066), ('SEE', 934), ('YEAH', 904), ('LIKE', 889)]


In [50]:
print(sorted(es_words_freq.items(), reverse=True, key=lambda t:t[1])[:10])

[('QUE', 7089), ('NO', 6110), ('Y', 5037), ('A', 4310), ('DE', 4009), ('S\xc3\xad', 3667), ('LA', 3425), ('YA', 2782), ('EL', 2680), ('ES', 2587)]


In [51]:
print(sorted(es_cnt_words_freq.items(), reverse=True, key=lambda t:t[1])[:10])

[('AH', 1831), ('PUES', 1236), ('BUENO', 1186), ('BIEN', 1183), ('SI', 1045), ('<LAUGH>', 987), ('MMM', 976), ('AS\xc3\xad', 781), ('ENTONCES', 775), ('CLARO', 729)]


In [52]:
print([(w,f) for w, f in en_cnt_words_freq.items() if "'" in w])

[("'T", 2265), ("'S", 2030), ("'R", 1), ("'D", 34), ("'M", 627), ("'TS", 1), ("'VE", 184), ("'", 40), ("'OEUVRES", 1), ("'RE", 269), ("'CLOCK", 4), ("'LL", 540), ("O'CLOCK", 1), ("'AM", 8)]


In [53]:
print([(w,f) for w, f in es_words_freq.items() if "<" in w])

[('<SNEEZE>', 4), ('<COUGH>', 11), ('<LAUGH>', 987), ('<BREATH>', 16), ('<NOISE>', 450), ('<BACKGROUND>', 107)]


In [54]:
print([(w,f) for w, f in es_cnt_words_freq.items() if "<" in w])

[('<SNEEZE>', 4), ('<COUGH>', 11), ('<LAUGH>', 987), ('<BREATH>', 16), ('<NOISE>', 450), ('<BACKGROUND>', 107)]


## Key-word spotting, prepare data

In [109]:
def read_proto_list():
    protos = []
    with open(config['proto']['protos_list'], "r") as f:
        for line in f:
            protos.append(line.strip())
    return protos

In [132]:
def create_vad_norm_plp_for_protos(protos):
    if not os.path.exists(config['proto']['vad_path']):
        os.makedirs(config['proto']['vad_path'])
    if not os.path.exists(config['proto']['norm_plp_path']):
        os.makedirs(config['proto']['norm_plp_path'])
    if not os.path.exists(config['proto']['proto_float32_path']):
        os.makedirs(config['proto']['proto_float32_path'])
    if not os.path.exists(config['proto']['lsh_path']):
        os.makedirs(config['proto']['lsh_path'])
        
    sys.stderr.flush()
    with tqdm(total=len(protos)) as pbar:
        for i, pid in enumerate(protos, start=1):
            try:
                pid_base = os.path.splitext(pid)[0]
                plp_fname = os.path.join(config['proto']['proto_path'], pid)
                plp_f32_fname = os.path.join(config['proto']['proto_float32_path'], pid)
                plp_norm_fname = (os.path.join(config['proto']['norm_plp_path'], 
                                          "{0:s}.std.binary".format(pid_base)))
                vad_fname = (os.path.join(config['proto']['vad_path'], "{0:s}.vad".format(pid_base)))
                lsh_fname = (os.path.join(config['proto']['lsh_path'], 
                                          "{0:s}.std.lsh64".format(pid_base)))
                # read npy file to get shape
                x = np.load(plp_fname)
                # ZRTools use float32
                y = x.astype(np.float32)
                y.tofile(plp_f32_fname)

                # create vad file
                with open(vad_fname, "w") as f:
                    f.write("0\t{0:d}\n".format(y.shape[0]))

                # normalize plp
                normalize_plp(plp_f32_fname, vad_fname, plp_norm_fname)
                #print(os.path.getsize(plp_fname))

                # create lsh
                create_lsh_file(plp_norm_fname, vad_fname, lsh_proj_fname, lsh_fname)
                
                # update progress
                pbar.set_description("processing proto {0:s}".format(pid_base))
                pbar.update(1)
            except:
                print(" problem in file: {0:s}".format(pid), end=",")
    print("completed")
        

In [175]:
def create_lsh_for_call_norm_plps():
    if not os.path.exists(config['proto']['call_vad_path']):
        print("VAD files not found in folder: {0:s}".format(config['proto']['call_vad_path']))
        return
    if not os.path.exists(config['proto']['call_plp_path']):
        print("PLP files not found in folder: {0:s}".format(config['proto']['call_plp_path']))
        return
    if not os.path.exists(config['proto']['call_lsh_path']):
        os.makedirs(config['proto']['call_lsh_path'])
    
    sys.stderr.flush()
    calls = [f for f in os.listdir(config['proto']['call_plp_path']) if f.endswith(".std.binary")]
    with tqdm(total=len(calls)) as pbar:
        for i, call in enumerate(calls, start=1):
            try:
                call_base = call.replace(".std.binary", "")
                plp_norm_fname = (os.path.join(config['proto']['call_plp_path'], call))
                vad_fname = (os.path.join(config['proto']['call_vad_path'], 
                                          "{0:s}.vad".format(call_base)))
                lsh_fname = (os.path.join(config['proto']['call_lsh_path'], 
                                          "{0:s}.std.lsh64".format(call_base)))

                # read npy file - normalized plp
                x = np.fromfile(plp_norm_fname)
                # ZRTools use float32
                y = x.astype(np.float32)
                y.tofile(plp_norm_fname)

                # create lsh
                create_lsh_file(plp_norm_fname, vad_fname, lsh_proj_fname, lsh_fname)
                
                # update progress
                pbar.set_description("processing proto {0:s}".format(call_base))
                pbar.update(1)
                #print(i)
            except:
                print("problem in file: {0:s}".format(call), end=", ")
    print("completed")
        

In [176]:
protos = read_proto_list()
protos[:5]

['10TH.npy', '13.npy', '155.npy', '20TH.npy', '5TH.npy']

In [179]:
create_vad_norm_plp_for_protos(protos)

processing proto CATALOGS:  14%|█▍        | 805/5703 [00:20<02:09, 37.88it/s]

 problem in file: CASTAñEDA.npy,

processing proto DOOR:  27%|██▋       | 1549/5703 [00:40<01:45, 39.24it/s]   

 problem in file: DOñIHUE.npy, problem in file: DOñINHUE.npy,

processing proto FETCH:  34%|███▎      | 1924/5703 [00:49<01:35, 39.78it/s]    

 problem in file: FERNáNDEZ.npy, problem in file: FERNáN.npy,

processing proto HIGHLY:  42%|████▏     | 2388/5703 [01:01<01:28, 37.65it/s] 

 problem in file: HERNáNDEZ.npy,

processing proto HUGS:  43%|████▎     | 2470/5703 [01:03<01:21, 39.91it/s]   

 problem in file: HUARáS.npy,

processing proto ILLUSION:  44%|████▍     | 2515/5703 [01:04<01:21, 39.17it/s]

 problem in file: IGUAZú.npy,

processing proto JACKASS:  47%|████▋     | 2679/5703 [01:08<01:14, 40.68it/s]

 problem in file: IVáN.npy,

processing proto JUANITA:  48%|████▊     | 2735/5703 [01:10<01:11, 41.59it/s]

 problem in file: JOSé.npy,

processing proto LATELY:  50%|████▉     | 2841/5703 [01:13<01:11, 40.09it/s]

 problem in file: LARRAñAGA.npy,

processing proto MARIANELA:  54%|█████▍    | 3107/5703 [01:19<01:03, 40.90it/s]

 problem in file: MARAñON.npy,

processing proto MARIBI:  55%|█████▍    | 3112/5703 [01:20<01:03, 40.74it/s]   

 problem in file: MARíA.npy,

processing proto MENA:  56%|█████▌    | 3204/5703 [01:22<01:02, 39.69it/s]   

 problem in file: MéLIDA.npy,

processing proto MISSIONARY:  58%|█████▊    | 3288/5703 [01:24<00:59, 40.30it/s] 

 problem in file: MISIóN.npy,

processing proto MONTERREY:  58%|█████▊    | 3328/5703 [01:25<01:00, 39.17it/s]

 problem in file: MóNICA.npy,

processing proto NIKI:  61%|██████    | 3458/5703 [01:28<00:56, 39.68it/s]    

 problem in file: NICOLáS.npy,

processing proto NU:  62%|██████▏   | 3516/5703 [01:30<00:52, 41.67it/s]      

 problem in file: ´.npy, problem in file: ¨.npy, problem in file: ¡.npy, problem in file: ​.npy,

processing proto OCTAVIO:  62%|██████▏   | 3541/5703 [01:30<00:54, 39.97it/s]

 problem in file: OCAñA.npy,

processing proto PRY:  71%|███████▏  | 4067/5703 [01:44<00:39, 40.93it/s]    

 problem in file: protos.list,

processing proto REACTION:  73%|███████▎  | 4173/5703 [01:47<00:40, 38.10it/s]

 problem in file: RAúL.npy,

processing proto ZULEMA: 100%|█████████▉| 5679/5703 [02:26<00:00, 38.85it/s]

completed





In [178]:
create_lsh_for_call_norm_plps()

processing proto 048: 100%|██████████| 104/104 [00:05<00:00, 19.83it/s]

completed





In [183]:
def create_keyword_spotting_cmd_scripts(exp_path, wav_file_list, protos, exp_name, num_splits=1):
    disc_file_split_base = "keyword_spot_{0:d}.cmd"
    disc_file_split = os.path.join(exp_path, disc_file_split_base)
    disc_split_file = os.path.join(exp_path, "keyword_spot_split.txt")
    
    num_files = len(wav_file_list)
    num_protos = len(protos)
    
    exp_local_path = os.path.join("exp", exp_name)
    cmd_string = "scripts/plebdisc_filepair_keyword_spotting \"{0:s}\" \"{1:s}\" {2:s} 39\n"

    total_lines = num_files * num_protos
    lines_per_file = total_lines // num_splits
    smallfile = None
    curr_line = 0
    curr_file_num = 0
    
    sys.stderr.flush()
    with tqdm(total=num_protos) as pbar:
        for i in xrange(num_protos):
            pid_base = os.path.splitext(protos[i])[0]
            pbar.update(1)
#             if i % 20 == 0:
#                 print("Progress: {0:d} out of: {1:d}".format(curr_line+1, total_lines))
            for j in xrange(num_files):
                out_line = cmd_string.format(pid_base, wav_file_list[j], exp_local_path)
                if curr_line % lines_per_file == 0:
                    if smallfile:
                        smallfile.close()
                    small_filename = disc_file_split.format(curr_file_num)
                    smallfile = open(small_filename, "w")
                    curr_file_num += 1
                smallfile.write(out_line)
                curr_line += 1
    if smallfile:
        smallfile.close()

    # Making a list of commands to execute the split disc list
    full_split_cmd_string = "nice sh {0:s} 1> {1:s} 2>{2:s} &\n"
    split_cmd = os.path.join(exp_local_path, "matches","{0:s}.{1:d}")
    with open(disc_split_file, "w") as out_f:
        for i in xrange(curr_file_num):
            curr_split_file = os.path.join(exp_local_path, disc_file_split_base.format(i))
            split_cmd_out = split_cmd.format("keyword_spot_out", i)
            #split_cmd_err = split_cmd.format("err", i)
            split_cmd_err = "/dev/null"

            out_line = "nice sh "
            out_f.write(full_split_cmd_string.format(curr_split_file, \
                                                    split_cmd_out, \
                                                    split_cmd_err))

    print("Completed - keyword_spot cmd script")

In [184]:
create_keyword_spotting_cmd_scripts(exp_path=exp_path, wav_file_list=wav_file_list[:33], protos=protos,
                                    exp_name=exp_name, num_splits=25)

100%|██████████| 5703/5703 [00:00<00:00, 18350.43it/s]

Completed - keyword_spot cmd script





In [50]:
wav_file_list[:33]

['001',
 '002',
 '005',
 '006',
 '007',
 '009',
 '010',
 '011',
 '012',
 '013',
 '014',
 '015',
 '018',
 '021',
 '022',
 '023',
 '024',
 '025',
 '026',
 '027',
 '028',
 '029',
 '030',
 '031',
 '032',
 '033',
 '034',
 '035',
 '036',
 '037',
 '038',
 '039',
 '040']

In [106]:
len(protos)

5703

In [69]:
haha = range(1000000)
with tqdm(total=len(haha)) as pbar:
    sys.stderr.flush()
    for i in haha:
        pbar.set_description("processing proto {0:d}".format(i))
        pbar.update(1)

processing proto 999999: 100%|██████████| 1000000/1000000 [00:02<00:00, 463305.73it/s]
