In [3]:
from __future__ import print_function
from __future__ import division
import os
import pickle
import json
import subprocess
from IPython.display import display
from IPython.display import Audio
from collections import namedtuple
import sys
import nltk
from nltk.corpus import stopwords
import numpy as np
from tqdm import tqdm

In [4]:
with open("config.json") as json_data_file:
    config = json.load(json_data_file)

# Preprocessing CALLHOME for ZRTools


- Created: 26-Oct-2016


## Create mapping for start time for each segment

Format: Dictionary  
key: {key: value}  
*file: {file.seg.wav: start time}*  
Name: segment_start.dict, segment_start.txt  

In [5]:
def read_segments_file(seg_fname):
    segment_map = {}
    with open(seg_fname, "r") as seg_f:
        for i, line in enumerate(seg_f):
            if i == 0:
                continue
            try:
                line_items = line.strip().split()
                seg_key = line_items[0]
                file_id = line_items[1]
                if file_id not in segment_map:
                    segment_map[file_id] = {}
                seg_start = int(float(line_items[6])*100)
                segment_map[file_id][seg_key] = seg_start
            except ValueError:
                print("Incorrect line format at line: %d" % i)
    return segment_map
        

### Read segment map

In [6]:
def read_segment_map():
    segment_map = read_segments_file('../segments.txt')
    pickle.dump(segment_map, open(config['es']['segment_dict_fname'], "wb"))

## Create VAD files for merged wavs

In [5]:
def create_merged_vad_from_ed(vad_file_id, segment_map, seg_vad_path, merged_vad_path):
    total_dur_10ms = 0
    total_dur_10ms_ge500ms = 0
    with open(os.path.join(merged_vad_path, vad_file_id+".vad"), "w") as vad_f:
        print("creating vad %s ..." % vad_file_id)
        for i, (seg_id, seg_start) in enumerate(sorted(segment_map[vad_file_id].items(), key=lambda t:t[0])):
            with open(os.path.join(seg_vad_path, seg_id+".vad"), "r") as seg_vad_f:
                for line in seg_vad_f:
                    line_items = map(int, line.strip().split())
                    start = seg_start+line_items[0]
                    end = seg_start+line_items[1]
                    total_dur_10ms += (end-start)
                    total_dur_10ms_ge500ms += ((end - start) if (end-start) >= 50 else 0)
                    out_line = ("%d %d\n" %(start, end))
                    vad_f.write(out_line)
                # end for
            # end reading seg file
        # end looping over all segments
    # end writing vad file
    return total_dur_10ms, total_dur_10ms_ge500ms

### Create new directory for merged vads

In [6]:
# merged_ed_vads_path = "../mergedVads"
# seg_vad_path = "../vad"
# if not os.path.exists(merged_ed_vads_path):
#     os.makedirs(merged_ed_vads_path)

### Create merged vad for each file

In [7]:
# total_dur_10ms, total_dur_10ms_ge500ms = 0, 0
# for vad_file_id in segment_map:
#     t1, t2 = create_merged_vad(vad_file_id, segment_map, seg_vad_path, merged_vads_path)
#     total_dur_10ms += t1
#     total_dur_10ms_ge500ms += t2

In [8]:
# print(total_dur_10ms, total_dur_10ms_ge500ms)
# print(map(lambda t: "{0:.3f}".format((t / 100.0 / 3600)), [total_dur_10ms, total_dur_10ms_ge500ms]))

## Create PLP features

In [9]:
merged_wavs_path = "../mergeWavs"
plp_path = "../plp"
plp_norm_path = "../std_plp"
if not os.path.exists(plp_path):
    os.makedirs(plp_path)
if not os.path.exists(plp_norm_path):
    os.makedirs(plp_norm_path)

In [10]:
def create_file_lst(file_lst_fname):
    prefix = "../corpora/callhome/mergeWavs"
    wav_file_list = [os.path.join(prefix, wav_file) for \
                     wav_file in os.listdir(merged_wavs_path) if wav_file.endswith(".wav")]
    wav_file_list_string = "\n".join(wav_file_list)
    with open(file_lst_fname, "w") as out_f:
        out_f.write(wav_file_list_string)
    print("Finished writing files.lst")

In [11]:
# create_file_lst(config["es"]["lst_file"])

In [12]:
def create_plp(wav_fname, plp_fname):
    FEACALC = config['base']["feacalc"]
    subprocess.call([FEACALC,"-plp", \
                    "12", "-cep", "13", "-dom", "cep", "-deltaorder", \
                    "2", "-dither", "-frqaxis", "bark", "-samplerate", \
                    "8000", "-win", "25", "-step", "10", "-ip", \
                    "MSWAVE", "-rasta", "false", "-compress", \
                    "true", "-op", "swappedraw", "-o", plp_fname, wav_fname])

    
def normalize_plp(plp_fname, vad_fname, plp_norm_fname):
    STANDFEAT = config['base']["standfeat"]
    # Standardize binary file, for VAD regions only
    subprocess.call([STANDFEAT, "-D", "39", "-infile", \
                    plp_fname, "-outfile", plp_norm_fname, \
                    "-vadfile", vad_fname])

In [13]:
def create_and_normalize_plps():
    for i, file_id in enumerate(segment_map):
        wav_fname = os.path.join(merged_wavs_path, file_id+".wav")
        vad_fname = os.path.join(merged_fa_vads_path, file_id+".vad")
        plp_fname = os.path.join(plp_path, file_id+".binary")
        plp_norm_fname = os.path.join(plp_norm_path, file_id+".std.binary")

        #print(file_id, wav_fname, vad_fname, plp_fname, plp_norm_fname)

        # create PLP
        if i % 20 == 0:
            print("plp for file %s " % file_id)

        #if not os.path.exists(plp_fname):
        create_plp(wav_fname, plp_fname)

        if i % 20 == 0:
            print("normalizing plp %s" % file_id)

        #if not os.path.exists(plp_norm_fname):
        normalize_plp(plp_fname, vad_fname, plp_norm_fname)
    print("Completed!")

## Create LSH files

In [14]:
lsh_path = "../lsh"
if not os.path.exists(lsh_path):
    os.makedirs(lsh_path)
lsh_proj_fname = os.path.join(lsh_path, "proj_S64xD39_seed1")

In [15]:
def create_lsh_proj_file(lsh_proj_fname):
    subprocess.call([config['base']["lsh_genproj"], \
                     "-D","39","-S","64","-seed", \
                     "1","-projfile", lsh_proj_fname])

def create_lsh_file(plp_norm_fname, vad_fname, lsh_proj_fname, lsh_fname):
    LSH = config['base']["lsh"]
    subprocess.call([LSH, "-D", "39", "-S", "64", \
                    "-projfile", lsh_proj_fname, \
                    "-featfile", plp_norm_fname, "-sigfile", \
                    lsh_fname, "-vadfile", vad_fname])

In [16]:
if not os.path.exists(lsh_proj_fname):
    create_lsh_proj_file(lsh_proj_fname)

In [17]:
def create_lsh_files():
    for i, file_id in enumerate(segment_map):
        wav_fname = os.path.join(merged_wavs_path, file_id+".wav")
        vad_fname = os.path.join(merged_fa_vads_path, file_id+".vad")
        plp_norm_fname = os.path.join(plp_norm_path, file_id+".std.binary")
        lsh_fname = os.path.join(lsh_path, file_id+".std.lsh64")

        #print(file_id, wav_fname, vad_fname, plp_fname, plp_norm_fname)

        # create LSH
        if i % 20 == 0:
            print("lsh for file %s " % file_id)

        #if not os.path.exists(lsh_fname):
        create_lsh_file(plp_norm_fname, vad_fname, lsh_proj_fname, lsh_fname)

    print("Completed!")

## Create ZRTools discovery command files

In [7]:
exp_path = '../exp'
if not os.path.exists(exp_path):
    os.makedirs(exp_path)

# List of wav files
segment_map = pickle.load(open(config['es']['segment_dict_fname'], "rb"))
wav_file_list = sorted(segment_map.keys())
exp_name = 'callhome'

In [8]:
def create_files_base():
    with open(os.path.join(exp_path, 'files.base'), "w") as out_f:
        for wav_file in wav_file_list:
            out_f.write(wav_file+'\n')
    print("Generated files.base")

In [9]:
def create_discovery_cmd_scripts(exp_path, wav_file_list, exp_name, num_splits=1):
    disc_file_split_base = "disc_{0:d}.cmd"
    disc_file_split = os.path.join(exp_path, disc_file_split_base)
    disc_split_file = os.path.join(exp_path, "disc_split.txt")
    num_files = len(wav_file_list)
    exp_local_path = os.path.join("exp", exp_name)
    cmd_string = "scripts/plebdisc_filepair \"{0:s}\" \"{1:s}\" {2:s} 39\n"

    total_lines = num_files * num_files
    lines_per_file = total_lines // num_splits
    smallfile = None
    curr_line = 0
    curr_file_num = 0

    for i in xrange(num_files) :
        if i % 20 == 0:
            print("Progress: {0:d} out of: {1:d}".format(curr_line+1, total_lines))
        for j in xrange(num_files):
            out_line = cmd_string.format(wav_file_list[i], \
                                              wav_file_list[j], \
                                              exp_local_path)
            if curr_line % lines_per_file == 0:
                if smallfile:
                    smallfile.close()
                small_filename = disc_file_split.format(curr_file_num)
                smallfile = open(small_filename, "w")
                curr_file_num += 1
            smallfile.write(out_line)
            curr_line += 1
    if smallfile:
        smallfile.close()

    # Making a list of commands to execute the split disc list
    full_split_cmd_string = "nice sh {0:s} 1> {1:s} 2>{2:s} &\n"
    split_cmd = os.path.join(exp_local_path, "matches","{0:s}.{1:d}")
    with open(disc_split_file, "w") as out_f:
        for i in xrange(curr_file_num):
            curr_split_file = os.path.join(exp_local_path, disc_file_split_base.format(i))
            split_cmd_out = split_cmd.format("out", i)
            #split_cmd_err = split_cmd.format("err", i)
            split_cmd_err = "/dev/null"

            out_line = "nice sh "
            out_f.write(full_split_cmd_string.format(curr_split_file, \
                                                    split_cmd_out, \
                                                    split_cmd_err))

    print("Completed - disc.cmd")

In [21]:
# create_discovery_cmd_scripts(exp_path=exp_path, wav_file_list=wav_file_list, exp_name=exp_name, num_splits=25)

# Read transcripts, and translations into a dictionary

In [17]:
Align = namedtuple('Align', ['word', 'start', 'end'])

In [18]:
def read_alignment_file(align_fname, stopwords_corpus=None):
    align_list = []
    with open(align_fname, "r") as align_f:
        for line in align_f:
            line_items = line.strip().split()
            if len(line_items) != 3:
                raise ValueError
            start, end = map(lambda v: int(float(v)*100), line_items[1:3])
            if (not stopwords_corpus) or \
            (stopwords_corpus and line_items[0].lower() not in stopwords_corpus):
                align_list.append(Align(*[line_items[0], start, end]))
    if sorted(align_list, key=lambda t: t.start) != align_list and not align_fname.endswith("en"):
        raise IOError    
            
    return align_list

In [19]:
es_words_path = '../wav2es-words/'
en_words_path = '../wav2eng-words/'
align_dict_fname = config['es']['align_dict_fname']

In [20]:
# # test code
# stopwords_es = set(stopwords.words('spanish'))
# stopwords_en = set(stopwords.words('english'))
# display(read_alignment_file('../wav2es-words/001.001.es'))
# display(read_alignment_file('../wav2es-words/001.001.es', stopwords_corpus=stopwords_es))
# display(read_alignment_file('../wav2eng-words/001.001.en'))
# display(read_alignment_file('../wav2eng-words/001.001.en', stopwords_corpus=stopwords_en))

In [21]:
def get_file_list(file_path, file_ext):
    return [os.path.splitext(f)[0] for f in os.listdir(file_path) if f.endswith(file_ext)]

In [22]:
es_file_list = get_file_list(es_words_path, 'es')
en_file_list = get_file_list(en_words_path, 'en')

print(sorted(es_file_list) == sorted(en_file_list))
print(set(es_file_list)-set(en_file_list))

False
{'009.025'}


In [25]:
def create_alignment_dict():
    align_dict = {}
    stopwords_es = set(stopwords.words('spanish'))
    stopwords_en = set(stopwords.words('english'))
    for file_id in sorted(segment_map):
        print("Processing file: %s" % file_id)
        align_dict[file_id] = {}
        for seg_id in segment_map[file_id]:
            align_dict[file_id][seg_id] = {}
            es_fname = os.path.join(es_words_path, seg_id+".es")
            en_fname = os.path.join(en_words_path, seg_id+".en")
            align_dict[file_id][seg_id]["es"] = read_alignment_file(es_fname)
            align_dict[file_id][seg_id]["en"] = read_alignment_file(en_fname)
            align_dict[file_id][seg_id]["es_cnt"] = read_alignment_file(es_fname, stopwords_corpus=stopwords_es)
            align_dict[file_id][seg_id]["en_cnt"] = read_alignment_file(en_fname, stopwords_corpus=stopwords_en)
    return align_dict
        

In [26]:
align_dict = create_alignment_dict()
pickle.dump(align_dict, open(align_dict_fname, "wb"))

Processing file: 090
Processing file: 091
Processing file: 092
Processing file: 093
Processing file: 094
Processing file: 095
Processing file: 096
Processing file: 097
Processing file: 010
Processing file: 011
Processing file: 012
Processing file: 013
Processing file: 014
Processing file: 015
Processing file: 018
Processing file: 025
Processing file: 024
Processing file: 027
Processing file: 026
Processing file: 021
Processing file: 023
Processing file: 022
Processing file: 029
Processing file: 028
Processing file: 115
Processing file: 114
Processing file: 038
Processing file: 039
Processing file: 111
Processing file: 110
Processing file: 113
Processing file: 112
Processing file: 032
Processing file: 033
Processing file: 030
Processing file: 031
Processing file: 036
Processing file: 037
Processing file: 034
Processing file: 035
Processing file: 051
Processing file: 108
Processing file: 049
Processing file: 048
Processing file: 047
Processing file: 046
Processing file: 045
Processing fi

## Create VAD from alignments

In [27]:
align_dict = pickle.load(open(align_dict_fname, "rb"))
segment_map = pickle.load(open(config['es']['segment_dict_fname'], "rb"))
# has_500ms_fa_vad_dict_fname = config['es']['has_500ms_fa_vad_dict']

In [31]:
def create_uttr_vad_from_alignment(align_dict, vad_path):
    has_500ms_dur = {}
    total_dur_10ms = 0
    total_dur_10ms_ge500ms = 0
    for i, vad_file_id in enumerate(align_dict):
        if i % 20 == 0:
            print("Created vad for %d files id" % i)
        for seg_id in align_dict[vad_file_id]:
            with open(os.path.join(vad_path, seg_id+".vad"), "w") as vad_f:
                dur_10ms = 0
                dur_10ms_ge500ms = 0
                vad_list = []
                # start index
                s = 0
                # create a local list of alignment values
                align_list = align_dict[vad_file_id][seg_id]['es']
                for j in xrange(len(align_list)):
                    # if 1st or last element, add to vad_list
                    if ((j+1) == len(align_list)) or (align_list[j].end != align_list[j+1].start):
                        vad_list.append(((align_list[s].start), (align_list[j].end)))
                        s=j+1
                # write vad list to file        
                for vad_tup in vad_list:
                    start = vad_tup[0]
                    end = vad_tup[1]
                    dur_10ms += (end-start)
                    dur_10ms_ge500ms += ((end - start) if (end-start) >= 50 else 0)
                    out_line = ("%d %d\n" %(start, end))
                    vad_f.write(out_line)
                
                # set whether atleast one vad region of 500 ms
                has_500ms_dur[seg_id] = (dur_10ms_ge500ms > 0)
                # compute total durations
                total_dur_10ms += dur_10ms
                total_dur_10ms_ge500ms += dur_10ms_ge500ms 
                    
            # end for
        # end looping over all segments
    # end writing vad file
    return total_dur_10ms, total_dur_10ms_ge500ms, has_500ms_dur
    

In [32]:
def create_merged_vad_from_alignment(vad_file_id, align_dict, segment_map, vad_path):
    total_dur_10ms = 0
    total_dur_10ms_ge500ms = 0
    with open(os.path.join(vad_path, vad_file_id+".vad"), "w") as vad_f:
        print("creating vad %s ..." % vad_file_id)
        for i, (seg_id, seg_start) in enumerate(sorted(segment_map[vad_file_id].items(), key=lambda t:t[0])):
            vad_list = []
            # start index
            s = 0
            # create a local list of alignment values
            align_list = align_dict[vad_file_id][seg_id]['es']
            for j in xrange(len(align_list)):
                # if 1st or last element, add to vad_list
                if ((j+1) == len(align_list)) or (align_list[j].end != align_list[j+1].start):
                    vad_list.append(((seg_start+align_list[s].start), (seg_start+align_list[j].end)))
                    s=j+1
            # write vad list to file        
            for vad_tup in vad_list:
                start = vad_tup[0]
                end = vad_tup[1]
                total_dur_10ms += (end-start)
                total_dur_10ms_ge500ms += ((end - start) if (end-start) >= 50 else 0)
                out_line = ("%d %d\n" %(start, end))
                vad_f.write(out_line)
            # end for
        # end looping over all segments
    # end writing vad file
    return total_dur_10ms, total_dur_10ms_ge500ms
    

### Create new directory for merged vads

In [33]:
uttr_fa_vads_path = config['es']['es_uttr_fa_vad']
if not os.path.exists(uttr_fa_vads_path):
    os.makedirs(uttr_fa_vads_path)

merged_fa_vads_path = config['es']['es_merge_fa_vad']
if not os.path.exists(merged_fa_vads_path):
    os.makedirs(merged_fa_vads_path)


In [34]:
# t1, t2, has_500ms_dur = create_uttr_vad_from_alignment(align_dict, uttr_fa_vads_path)
# # print(t1, t2)
# print(map(lambda t: "{0:.3f} hrs".format((t / 100.0 / 3600)), [t1, t2]))
# print("saving dict: %s" % has_500ms_fa_vad_dict_fname)
# pickle.dump(has_500ms_dur, open(has_500ms_fa_vad_dict_fname, "wb"))

In [35]:
# # check how many utterances have atleast 500ms VAD
# print("total utterances: %d" % len(has_500ms_dur))
# uttrs_with_500ms = {k:v for k, v in has_500ms_dur.items() if v}
# print("with 500ms: %d" % len(uttrs_with_500ms))

In [36]:
# save dev file with only 500ms segments

In [37]:
# dev_500ms_fname = config['es']['mt_dev_500ms_files']
# dev_fname = config['es']['mt_dev_test_files']
# with open(dev_fname, "r") as dev_f, open(dev_500ms_fname, "w") as dev_500ms_f:
#     for line in dev_f:
#         if line.strip() in has_500ms_dur and has_500ms_dur[line.strip()]:
#             dev_500ms_f.write(line)

In [38]:
# !wc $dev_500ms_fname

### Create merged vad for each file

In [39]:
def create_merged_vads():
    total_dur_10ms, total_dur_10ms_ge500ms = 0, 0
    for vad_file_id in segment_map:
        t1, t2 = create_vad_from_alignment(vad_file_id, align_dict, segment_map, merged_fa_vads_path)
        total_dur_10ms += t1
        total_dur_10ms_ge500ms += t2

In [40]:
# print(total_dur_10ms, total_dur_10ms_ge500ms)
# print(map(lambda t: "{0:.3f}".format((t / 100.0 / 3600)), [total_dur_10ms, total_dur_10ms_ge500ms]))

## Create features

In [41]:
def create_gold_feats(align_dict, gold_feats_dict_fname, es_key="es"):
    gold_feats_dict = {}
    for fid in align_dict:
        for sid in align_dict[fid]:
            gold_feats_dict[sid] = {}
            if align_dict[fid][sid][es_key] == []:
                # Only es_cnt can be empty, in which case include stop words
                gold_feats_dict[sid] = [w.word for w in align_dict[fid][sid]['es']]
            else:
                gold_feats_dict[sid] = [w.word for w in align_dict[fid][sid][es_key]]
    print("Saving gold features using key: %s" % es_key)
    pickle.dump(gold_feats_dict, open(gold_feats_dict_fname, "wb"))
    print("finished ...")
    return gold_feats_dict
        

In [42]:
def gold_feats():
    align_dict = pickle.load(open(align_dict_fname, "rb"))
    gold_feats_dict_fname = config['es']['gold_feats']
    gold_feats_dict = create_gold_feats(align_dict, gold_feats_dict_fname, es_key="es_cnt")

## Check English translations

In [28]:
es_words = [a.word for fid in align_dict for sid in align_dict[fid] for a in align_dict[fid][sid]['es']]
es_cnt_words = [a.word for fid in align_dict for sid in align_dict[fid] for a in align_dict[fid][sid]['es_cnt']]
en_words = [a.word for fid in align_dict for sid in align_dict[fid] for a in align_dict[fid][sid]['en']]
en_cnt_words = [a.word for fid in align_dict for sid in align_dict[fid] for a in align_dict[fid][sid]['en_cnt']]

In [29]:
from collections import Counter

In [30]:
es_words_freq = Counter(es_words)
es_cnt_words_freq = Counter(es_cnt_words)
en_words_freq = Counter(en_words)
en_cnt_words_freq = Counter(en_cnt_words)

In [31]:
print(sorted(en_words_freq.items(), reverse=True, key=lambda t:t[1])[:10])

[('THE', 5178), ('AND', 4629), ('THAT', 4080), ('I', 3849), ('TO', 3359), ('YES', 3345), ("'T", 2265), ('YOU', 2256), ('NO', 2135), ("'S", 2030)]


In [32]:
print(sorted(en_cnt_words_freq.items(), reverse=True, key=lambda t:t[1])[:10])

[('YES', 3345), ("'T", 2265), ("'S", 2030), ('WELL', 1829), ('AH', 1349), ('KNOW', 1100), ('OH', 1066), ('SEE', 934), ('YEAH', 904), ('GOING', 889)]


In [33]:
print(sorted(es_words_freq.items(), reverse=True, key=lambda t:t[1])[:10])

[('QUE', 7089), ('NO', 6110), ('Y', 5037), ('A', 4310), ('DE', 4009), ('Sí', 3667), ('LA', 3425), ('YA', 2782), ('EL', 2680), ('ES', 2587)]


In [34]:
print(sorted(es_cnt_words_freq.items(), reverse=True, key=lambda t:t[1])[:10])

[('AH', 1831), ('PUES', 1236), ('BUENO', 1186), ('BIEN', 1183), ('SI', 1045), ('<LAUGH>', 987), ('MMM', 976), ('ASí', 781), ('ENTONCES', 775), ('CLARO', 729)]


In [35]:
print([(w,f) for w, f in en_cnt_words_freq.items() if "'" in w])

[("'M", 627), ("'S", 2030), ("'T", 2265), ("'LL", 540), ("'VE", 184), ("'D", 34), ("'RE", 269), ("'AM", 8), ("'", 40), ("'CLOCK", 4), ("O'CLOCK", 1), ("'OEUVRES", 1), ("'R", 1), ("'TS", 1)]


In [36]:
print([(w,f) for w, f in es_words_freq.items() if "<" in w])

[('<NOISE>', 450), ('<BACKGROUND>', 107), ('<LAUGH>', 987), ('<COUGH>', 11), ('<BREATH>', 16), ('<SNEEZE>', 4)]


In [37]:
print([(w,f) for w, f in es_cnt_words_freq.items() if "<" in w])

[('<NOISE>', 450), ('<BACKGROUND>', 107), ('<LAUGH>', 987), ('<COUGH>', 11), ('<BREATH>', 16), ('<SNEEZE>', 4)]


## Key-word spotting, prepare data

In [53]:
def read_proto_list():
    protos = []
    with open(config['proto']['protos_list'], "r") as f:
        for line in f:
            protos.append(line.strip())
    return protos

In [54]:
def create_vad_norm_plp_for_protos(protos):
    if not os.path.exists(config['proto']['vad_path']):
        os.makedirs(config['proto']['vad_path'])
    if not os.path.exists(config['proto']['proto_plp_binary']):
        os.makedirs(config['proto']['proto_plp_binary'])
    if not os.path.exists(config['proto']['lsh_path']):
        os.makedirs(config['proto']['lsh_path'])
        
    sys.stderr.flush()
    with tqdm(total=len(protos)) as pbar:
        for i, pid in enumerate(protos, start=1):
            try:
                pid_base = os.path.splitext(pid)[0]
                plp_fname = os.path.join(config['proto']['proto_path'], pid)
                plp_binary_fname = (os.path.join(config['proto']['proto_plp_binary'], 
                                          "{0:s}.std.binary".format(pid_base)))
                vad_fname = (os.path.join(config['proto']['vad_path'], "{0:s}.vad".format(pid_base)))
                lsh_fname = (os.path.join(config['proto']['lsh_path'], 
                                          "{0:s}.std.lsh64".format(pid_base)))
                # read npy file to get shape
                x = np.load(plp_fname)
                # ZRTools use float32, save as binary file
                y = x.ravel()
                y.tofile(plp_binary_fname)
                #np.save(open(plp_binary_fname, "wb"), y, allow_pickle=False)

                # create vad file
                with open(vad_fname, "w") as f:
                    f.write("0\t{0:d}\n".format(y.shape[0]))

                # create lsh
                create_lsh_file(plp_binary_fname, vad_fname, lsh_proj_fname, lsh_fname)
                
                # update progress
                pbar.set_description("processing proto {0:s}".format(pid_base))
                pbar.update(1)
            except:
                print(" problem in file: {0:s}".format(pid), end=",")
    print("completed")
        

In [55]:
def create_lsh_for_call_norm_plps():
    if not os.path.exists(config['proto']['call_vad_path']):
        print("VAD files not found in folder: {0:s}".format(config['proto']['call_vad_path']))
        return
    if not os.path.exists(config['proto']['call_plp_path']):
        print("PLP files not found in folder: {0:s}".format(config['proto']['call_plp_path']))
        return
    if not os.path.exists(config['proto']['call_lsh_path']):
        os.makedirs(config['proto']['call_lsh_path'])
    if not os.path.exists(config['proto']['call_plp_binary']):
        os.makedirs(config['proto']['call_plp_binary'])
    
    sys.stderr.flush()
    calls = [f for f in os.listdir(config['proto']['call_plp_path']) if f.endswith(".npy")]
    with tqdm(total=len(calls)) as pbar:
        for i, call in enumerate(calls, start=1):
            try:
                call_base = call.replace(".plp.npy", "")
                plp_fname = (os.path.join(config['proto']['call_plp_path'], call))
                plp_binary_fname = (os.path.join(config['proto']['call_plp_binary'], 
                                          "{0:s}.std.binary".format(call_base)))
                vad_fname = (os.path.join(config['proto']['call_vad_path'], 
                                          "{0:s}.vad".format(call_base)))
                lsh_fname = (os.path.join(config['proto']['call_lsh_path'], 
                                          "{0:s}.std.lsh64".format(call_base)))

                # read npy file - normalized plp
                x = np.load(plp_fname)
                # ZRTools use float32
                y = x.ravel()
                y.tofile(plp_binary_fname)
                #np.savetxt(open(plp_binary_fname, "w"), y, allow_pickle=False)

                # create lsh
                create_lsh_file(plp_binary_fname, vad_fname, lsh_proj_fname, lsh_fname)
                
                # update progress
                pbar.set_description("processing proto {0:s}".format(call_base))
                pbar.update(1)
                #print(i)
            except:
                print("problem in file: {0:s}".format(call), end=", ")
    print("completed")
        

In [78]:
def create_lsh_for_protos():
    if not os.path.exists(config['proto']['lsh_path']):
        os.makedirs(config['proto']['lsh_path'])
        
    protos = [f.replace(".std.binary", "") for f in os.listdir(config['proto']['proto_plp_binary']) 
              if f.endswith(".std.binary")]
    sys.stderr.flush()
    with tqdm(total=len(protos)) as pbar:
        for proto in protos:
            try:
                plp_fname = os.path.join(config['proto']['proto_plp_binary'], 
                                         "{0:s}.std.binary".format(proto))
                vad_fname = os.path.join(config['proto']['vad_path'], 
                                         "{0:s}.vad".format(proto))
                lsh_fname = os.path.join(config['proto']['lsh_path'], 
                                         "{0:s}.lsh".format(proto))
            
                # print(plp_fname, vad_fname, lsh_fname)
                # create lsh
                create_lsh_file(plp_fname, vad_fname, lsh_proj_fname, lsh_fname)

                # update progress
                pbar.set_description("processing proto {0:s}".format(proto))
                pbar.update(1)
            except:
                print("problem in file:", proto)
    print("Finished lsh for protos")
    

In [79]:
create_lsh_for_protos()

processing proto GOSSIP:  19%|█▉        | 1053/5458 [00:11<00:46, 94.61it/s]

problem in file: LARRAñAGA


processing proto FOLLOW:  23%|██▎       | 1247/5458 [00:13<00:42, 99.03it/s]  

problem in file: FERNáN


processing proto ROBBERY:  24%|██▍       | 1337/5458 [00:14<00:43, 95.69it/s]

problem in file: OCAñA


processing proto GEORGINA:  27%|██▋       | 1458/5458 [00:15<00:39, 102.20it/s] 

problem in file: HERNáNDEZ


processing proto FURROW:  30%|██▉       | 1621/5458 [00:17<00:38, 99.67it/s]

problem in file: DOñINHUE


processing proto DAUGHTERS:  33%|███▎      | 1790/5458 [00:18<00:36, 99.15it/s]

problem in file: MARAñON


processing proto BIKE:  42%|████▏     | 2284/5458 [00:24<00:35, 88.53it/s]

problem in file: ¡


processing proto NURSERY:  46%|████▌     | 2500/5458 [00:26<00:29, 99.01it/s]

problem in file: ​


processing proto ISN:  48%|████▊     | 2604/5458 [00:27<00:29, 95.95it/s]

problem in file: IGUAZú


processing proto PREGNANT:  53%|█████▎    | 2893/5458 [00:30<00:26, 97.34it/s]

problem in file: FERNáNDEZ


processing proto COMMENT:  57%|█████▋    | 3134/5458 [00:33<00:22, 101.07it/s]

problem in file: ´


processing proto LETTING:  59%|█████▊    | 3197/5458 [00:34<00:24, 93.84it/s]  

problem in file: DOñIHUE


processing proto KUNG:  67%|██████▋   | 3680/5458 [00:39<00:21, 81.53it/s]   

problem in file: NICOLáS


processing proto SUSPENSION:  72%|███████▏  | 3920/5458 [00:41<00:16, 96.12it/s]

problem in file: JOSé


processing proto GIRLFRINED:  75%|███████▌  | 4118/5458 [00:44<00:14, 91.63it/s]

problem in file: CASTAñEDA


processing proto VARIANT:  78%|███████▊  | 4281/5458 [00:45<00:11, 102.49it/s]

problem in file: MARíA


processing proto ASIAN:  81%|████████  | 4415/5458 [00:47<00:10, 94.98it/s]

problem in file: IVáN


processing proto MIAMI:  82%|████████▏ | 4495/5458 [00:48<00:10, 91.95it/s]    

problem in file: RAúL


processing proto DISRESPECTFUL:  89%|████████▉ | 4884/5458 [00:52<00:05, 101.05it/s]

problem in file: ¨


processing proto EXCHANGE:  92%|█████████▏| 5023/5458 [00:53<00:04, 99.14it/s]

problem in file: MóNICA


processing proto RELAX:  94%|█████████▍| 5123/5458 [00:54<00:03, 85.31it/s]  

problem in file: MISIóN


processing proto OBVIOSLY:  99%|█████████▉| 5397/5458 [00:57<00:00, 95.55it/s]

problem in file: MéLIDA
problem in file: HUARáS


processing proto LOVE: 100%|█████████▉| 5435/5458 [00:57<00:00, 93.77it/s]   

Finished lsh for protos





In [102]:
def create_lsh_for_calls():
    if not os.path.exists(config['proto']['call_lsh_path']):
        os.makedirs(config['proto']['call_lsh_path'])
        
    calls = [f.replace(".std.binary", "") for f in os.listdir(config['proto']['call_plp_binary']) 
              if f.endswith(".std.binary")]
    sys.stderr.flush()
    with tqdm(total=len(calls)) as pbar:
        for call in calls:
            try:
                plp_fname = os.path.join(config['proto']['call_plp_binary'], 
                                         "{0:s}.std.binary".format(call))
                vad_fname = os.path.join(config['proto']['call_vad_path'], 
                                         "{0:s}.vad".format(call))
                lsh_fname = os.path.join(config['proto']['call_lsh_path'], 
                                         "{0:s}.lsh".format(call))
            
                #print(plp_fname, vad_fname, lsh_fname)
                # create lsh
                create_lsh_file(plp_fname, vad_fname, lsh_proj_fname, lsh_fname)

                # update progress
                pbar.set_description("processing proto {0:s}".format(call))
                pbar.update(1)
            except:
                print("problem in file: {0:s}", call)
    print("Finished lsh for protos")
    

In [103]:
create_lsh_for_calls()

processing proto 110.175: 100%|██████████| 1314/1314 [00:14<00:00, 88.72it/s]

Finished lsh for protos





In [86]:
# protos = read_proto_list()
# protos[:5]

In [87]:
# create_vad_norm_plp_for_protos(protos)

In [88]:
# create_lsh_for_call_norm_plps()

In [104]:
protos = [f.replace(".lsh", "") for f in os.listdir(config['proto']['lsh_path']) 
              if f.endswith(".lsh")]
calls = [f.replace(".lsh", "") for f in os.listdir(config['proto']['call_lsh_path']) 
              if f.endswith(".lsh")]

In [109]:
def create_keyword_spotting_cmd_scripts(exp_path, wav_file_list, protos, exp_name, num_splits=1):
    disc_file_split_base = "keyword_spot_{0:d}.cmd"
    disc_file_split = os.path.join(exp_path, disc_file_split_base)
    disc_split_file = os.path.join(exp_path, "keyword_spot_split.txt")
    
    num_files = len(wav_file_list)
    num_protos = len(protos)
    
    exp_local_path = os.path.join("exp", exp_name)
    cmd_string = "scripts/plebdisc_filepair_keyword_spotting \"{0:s}\" \"{1:s}\" {2:s} 39\n"

    total_lines = num_files * num_protos
    lines_per_file = total_lines // num_splits
    smallfile = None
    curr_line = 0
    curr_file_num = 0
    
    sys.stderr.flush()
    with tqdm(total=num_protos) as pbar:
        for i in xrange(num_protos):
            pid_base = os.path.splitext(protos[i])[0]
            pbar.update(1)
#             if i % 20 == 0:
#                 print("Progress: {0:d} out of: {1:d}".format(curr_line+1, total_lines))
            for j in xrange(num_files):
                out_line = cmd_string.format(pid_base, wav_file_list[j], exp_local_path)
                if curr_line % lines_per_file == 0:
                    if smallfile:
                        smallfile.close()
                    small_filename = disc_file_split.format(curr_file_num)
                    smallfile = open(small_filename, "w")
                    curr_file_num += 1
                smallfile.write(out_line)
                curr_line += 1
    if smallfile:
        smallfile.close()

    # Making a list of commands to execute the split disc list
    full_split_cmd_string = "nice sh {0:s} 1> {1:s} 2>{2:s} &\n"
    split_cmd = os.path.join(exp_local_path, "keyword_matches","{0:s}.{1:d}")
    with open(disc_split_file, "w") as out_f:
        for i in xrange(curr_file_num):
            curr_split_file = os.path.join(exp_local_path, "keyword", disc_file_split_base.format(i))
            split_cmd_out = split_cmd.format("keyword_spot_out", i)
            #split_cmd_err = split_cmd.format("err", i)
            split_cmd_err = "/dev/null"

            out_line = "nice sh "
            out_f.write(full_split_cmd_string.format(curr_split_file, \
                                                    split_cmd_out, \
                                                    split_cmd_err))

    print("Completed - keyword_spot cmd script")

In [111]:
create_keyword_spotting_cmd_scripts(exp_path=exp_path, wav_file_list=calls, protos=protos,
                                    exp_name=exp_name, num_splits=25)

100%|██████████| 5434/5434 [00:14<00:00, 378.23it/s]

Completed - keyword_spot cmd script





In [99]:
config['proto']['call_lsh_path']

u'../antonissameer/week2/data/subset-test/lsh'

In [100]:
calls[:10]

[u'052.069',
 u'101.068',
 u'046.207',
 u'118.202',
 u'059.121',
 u'077.142',
 u'116.020',
 u'118.020',
 u'046.226',
 u'077.068']

In [101]:
len(protos)

5434

In [None]:
haha = range(1000000)
with tqdm(total=len(haha)) as pbar:
    sys.stderr.flush()
    for i in haha:
        pbar.set_description("processing proto {0:d}".format(i))
        pbar.update(1)