In [94]:
from __future__ import print_function
from __future__ import division
import os
import pickle
import json
import subprocess
from IPython.display import display
from IPython.display import Audio
import bisect
from collections import namedtuple

In [95]:
!jupyter nbconvert --to script preprocess.ipynb

[NbConvertApp] Converting notebook preprocess.ipynb to script
[NbConvertApp] Writing 14144 bytes to preprocess.py


In [96]:
from preprocess import read_file_list

# Map ZRTools output to transcripts

- Create modified .nodes file
- Create mapping between es words, and nodes

In [97]:
with open("config.json") as json_data_file:
    config = json.load(json_data_file)

In [98]:
# nodes_fname = config["es"]['nodes_fname']
seg_nodes_fname = config["es"]['seg_nodes_fname']
nodes_dict_fname = config["es"]['nodes_dict_fname']

edges_utd_fname = config["es"]['edges_utd_fname']
edges_olap_fname = config["es"]['edges_olap_fname']
edges_all_fname = config["es"]['edges_all_fname']
edges_score_fname = config["es"]['edges_score_fname']

clusters_utd_fname = config['es']['clusters_utd_fname']
clusters_fname = config['es']['clusters_fname']

feats_fname = config['es']['feats_fname']
feats_dict_fname = config['es']['feats_dict_fname']

In [99]:
Node = namedtuple('Node', ['seg', 'start', 'end'])

## Nodes - find transcript words corresponding to node start and end times

Create node dictionary:
    - node id
    - seg id
    - start time
    - end time

In [100]:
def map_nodes_align(seg_nodes_fname):
    nodes_dict = {}
    with open(seg_nodes_fname, "r") as seg_nodes_f:
        for nid, line in enumerate(seg_nodes_f, start=1):
            line_items = line.strip().split()
            seg_id = line_items[0]
            start_t, end_t = map(int, line_items[1:3])
            
            nodes_dict[nid] = Node(seg_id, start_t, end_t)
            if nid % 1000000 == 0:
                print('reading node %d' % nid)
    print("finished reading %d nodes" % nid)
    return nodes_dict
    pass

# Edges - create a valid edges file

In [101]:
# def read_edges(nodes_dict):
#     olap_dict = {}
#     pairs_list = []
#     # process clusters file
#     with open(config['es']['edges_olap_fname'], "r") as in_f:
#         for i, line in enumerate(in_f):
#             line_items = list(map(int, line.strip().split()))
#             olap_dict[line_items[0]] = line_items[0]
#             if len(line_items) > 1:
#                 for j in line_items[1:]:
#                     olap_dict[j] = line_items[0]

#     # Read edges dict
#     with open(config['es']['edges_utd_fname'], "r") as in_f:
#         for i, line in enumerate(in_f):
#             if i % 1000000 == 0:
#                 print("Processing line: %d" % (i+1))
#             line_items = line.strip().split()
#             node_1 = int(line_items[0])
#             node_2 = int(line_items[1])
#             if node_1 not in olap_dict:
#                 olap_dict[node_1] = node_1
#             if node_2 not in olap_dict:
#                 olap_dict[node_2] = node_2
#             dtw_val = float(line_items[2]) / 1000.0

#             node_1 = olap_dict[node_1]
#             node_2 = olap_dict[node_2]

#             # Add to pairs list as a tuple
#             pairs_list.append((min(node_1, node_2), max(node_1, node_2), dtw_val))


#     print("Finished - reading edges ...")
#     print("Removing duplicates in pairs list")
#     set_pairs = list(set(pairs_list))
#     print("Set length: %d and List length: %d" %(len(set_pairs), len(pairs_list)))
#     pairs_list = sorted(list(set_pairs))
# #     with open(config['es']['edges_list_fname'], "w") as out_f:
# #         for (n1, n2, dtw) in set_pairs:
# #             out_line = "%d\t%d\t%.3f\n" % (n1, n2, dtw)
# #             out_f.write(out_line)
#     pickle.dump(set_pairs, open(config['es']['edges_list_fname'], "wb"))
#     print("finished writing edges")
    
#     # validity check for duplicates
#     set_nodes_only = [(n1,n2) for n1, n2, dtw in set_pairs]
#     if len(set_pairs) != len(set_nodes_only):
#         raise IOError
#     return set_pairs   
    

In [102]:
def read_edges(nodes_dict):
    pairs_list = []

    # Read edges dict
    with open(config['es']['edges_all_fname'], "r") as in_f:
        for i, line in enumerate(in_f):
            if i % 1000000 == 0:
                print("Processing line: %d" % (i+1))
            line_items = line.strip().split()
            node_1 = int(line_items[0])
            node_2 = int(line_items[1])
            seg_id_1 = nodes_dict[node_1].seg
            seg_id_2 = nodes_dict[node_2].seg
            dtw_val = float(line_items[2]) / 1000.0

            # Add to pairs list as a tuple
            pairs_list.append((min(seg_id_1, seg_id_2), max(seg_id_1, seg_id_2), dtw_val))


    print("Finished - reading edges ...")
    print("Removing duplicates in pairs list")
    set_pairs = list(set(pairs_list))
    print("Set length: %d and List length: %d" %(len(set_pairs), len(pairs_list)))
#     pairs_list = sorted(list(set_pairs))
    pairs_list = sorted(pairs_list)
#     with open(config['es']['edges_list_fname'], "w") as out_f:
#         for (n1, n2, dtw) in set_pairs:
#             out_line = "%d\t%d\t%.3f\n" % (n1, n2, dtw)
#             out_f.write(out_line)
    pickle.dump(pairs_list, open(config['es']['edges_list_fname'], "wb"))
    print("finished writing edges")
    
    # validity check for duplicates
    set_nodes_only = [(n1,n2) for n1, n2, dtw in set_pairs]
    if len(set_pairs) != len(set_nodes_only):
        raise IOError
    return pairs_list   
    

# Clusters

- Save list of clusters
- Generate bag of cluster ids for each segment
    - use nodes per segment as replace with cluster id
    - if no node found, use cluster id -1

In [103]:
def load_clusters(clusters_utd_fname):
    clusters = []
    with open(clusters_utd_fname, "r") as in_f:
        for line in in_f:
            try:
                nodes = map(int, line.strip().split())
                clusters.append(nodes)
            except:
                print(line)                    
    return clusters

In [104]:
def generate_pseudowords_for_segments(nodes_dict, clusters, feats_fname, feats_dict_fname):
    feats_dict = {}
    total_errors = 0
    display_den = len(clusters) // 10
    
    for clid, nodes in enumerate(clusters):
        if clid % display_den == 0:
            print('processing cluster %d out of %d' % (clid, len(clusters)))
        for nid in nodes:
            node = nodes_dict[nid]
            if node.seg not in feats_dict:
                feats_dict[node.seg] = []
            feats_dict[node.seg].append(str(clid))
    
    print("total clusters: %d" % clid)
    # Get complete list of segment ids
    file_list = read_file_list()
    segids = file_list["train"] + file_list["test"]
    
    with open(feats_fname, "w") as out_f:
        for seg_id in sorted(segids):
            # adding -1 for missing pseudotext
            if seg_id not in feats_dict:
                #outline = "-1\n"
                total_errors += 1
                feats_dict[seg_id] = ['-1']
            
            outline = " ".join(map(str,sorted(feats_dict[seg_id])))
            outline = outline.strip() + "\n"
            out_f.write(outline)
            
    print("Finished writing features file: %s" % os.path.basename(feats_fname))
    print("Writing to file: %s" % os.path.basename(feats_dict_fname))
    pickle.dump(feats_dict, open(feats_dict_fname, "wb"))
    print("Pseudowords not found for: %d segments, out of total: %d segments" % (total_errors, len(segids)))
    

In [105]:
def check_dur_pseudotext(segment_map, nodes_dict, clusters, feats_fname, feats_dict_fname):
    dur_dict = {}
    total_errors = 0
    total_dur = 0
    display_den = len(clusters) // 10
    
    for clid, nodes in enumerate(clusters):
        if clid % display_den == 0:
            print('processing cluster %d out of %d' % (clid, len(clusters)))
        for nid in nodes:
            node = nodes_dict[nid]
            if node.seg not in dur_dict:
                dur_dict[node.seg] = set()
            dur_dict[node.seg] |= set(range(node.start, node.end+1))
            
    # Get complete list of segment ids
    file_list = read_file_list()
    segids = file_list["train"] + file_list["test"]
    
    print("Total segments with pseudotext: %d" % len(dur_dict))
    print("Total calls with pseudotext: %d" % len(set([sid.split(".")[0] for sid in dur_dict])))
    print("Total segments: %d" % len(segids))
    total_dur = sum([len(dur_list) for dur_list in dur_dict.values()])
    print("Total duration: %d secs, %.2f hours" % (total_dur, total_dur / 100 / 3600))    

### Main

In [106]:
file_list = read_file_list()
segids = file_list["train"] + file_list["test"]

In [107]:
len(segids)

259

In [108]:
nodes_dict = map_nodes_align(seg_nodes_fname)

finished reading 1938 nodes


In [109]:
pickle.dump(nodes_dict, open(nodes_dict_fname, "wb"))

In [110]:
nodes_dict = pickle.load(open(nodes_dict_fname, "rb"))

In [111]:
edges_list = read_edges(nodes_dict)

Processing line: 1
Finished - reading edges ...
Removing duplicates in pairs list
Set length: 853 and List length: 3756
finished writing edges


In [112]:
edges_seg_ids = ["{0:s}_{1:s}".format(s1, s2) for s1, s2, dtw in edges_list]

In [113]:
len(edges_list)

3756

In [114]:
from collections import Counter

In [115]:
edges_seg_ids = Counter(["{0:s}_{1:s}".format(s1, s2) for s1, s2, dtw in edges_list])

In [116]:
edges_seg_ids.most_common(60)

[('ghost_ghost', 779),
 ('fooling_fooling', 611),
 ('strongbear_strongbear', 351),
 ('owlman_owlman', 332),
 ('telescope_telescope', 284),
 ('fooling_ghost', 187),
 ('oldcouple_oldcouple', 116),
 ('047_047', 83),
 ('227_227', 61),
 ('050_050', 52),
 ('133_133', 52),
 ('136_136', 51),
 ('fooling_telescope', 49),
 ('ghost_telescope', 46),
 ('ghost_owlman', 43),
 ('oldcouple_owlman', 43),
 ('ghost_strongbear', 38),
 ('fooling_owlman', 35),
 ('fooling_strongbear', 32),
 ('ghost_oldcouple', 24),
 ('owlman_strongbear', 23),
 ('owlman_telescope', 21),
 ('201_201', 19),
 ('240_240', 18),
 ('041_041', 17),
 ('215_215', 17),
 ('180_180', 15),
 ('fooling_oldcouple', 15),
 ('055_055', 12),
 ('250_250', 9),
 ('250_ghost', 9),
 ('168_168', 8),
 ('oldcouple_strongbear', 8),
 ('201_ghost', 7),
 ('030_030', 6),
 ('191_191', 6),
 ('197_197', 6),
 ('226_226', 6),
 ('strongbear_telescope', 6),
 ('050_ghost', 5),
 ('227_fooling', 5),
 ('stonewoman_stonewoman', 5),
 ('030_owlman', 4),
 ('041_owlman', 4),
 (

In [117]:
test_set = file_list["test"]

In [118]:
edges_test_seg_ids = Counter(["{0:s}_{1:s}".format(s1, s2) 
                              for s1, s2, dtw in edges_list 
                              if s1 in test_set and s2 in test_set])

In [119]:
len(edges_test_seg_ids)

22

In [120]:
sum([c for i, c in edges_test_seg_ids.items()])

3052

In [121]:
clusters = load_clusters(clusters_utd_fname)

In [122]:
pickle.dump(clusters, open(clusters_fname, "wb"))

In [123]:
generate_pseudowords_for_segments(nodes_dict, clusters, feats_fname, feats_dict_fname)

processing cluster 0 out of 206
processing cluster 20 out of 206
processing cluster 40 out of 206
processing cluster 60 out of 206
processing cluster 80 out of 206
processing cluster 100 out of 206
processing cluster 120 out of 206
processing cluster 140 out of 206
processing cluster 160 out of 206
processing cluster 180 out of 206
processing cluster 200 out of 206
total clusters: 205
Finished writing features file: pseudowords.feats
Writing to file: pseudowords.dict
Pseudowords not found for: 199 segments, out of total: 259 segments


In [124]:
# valid_pairs = read_edges()

In [125]:
def main():
    nodes_dict = map_nodes_align(seg_nodes_fname)
    pickle.dump(nodes_dict, open(nodes_dict_fname, "wb"))
    valid_pairs = read_edges(nodes_dict)

    clusters = load_clusters(clusters_utd_fname)
    pickle.dump(clusters, open(clusters_fname, "wb"))

    generate_pseudowords_for_segments(nodes_dict, clusters, feats_fname, feats_dict_fname)

    #check_dur_pseudotext(segment_map, nodes_dict, clusters, feats_fname, feats_dict_fname)
    print("maining")

if __name__ == "__main__":
    main()

finished reading 1938 nodes
Processing line: 1
Finished - reading edges ...
Removing duplicates in pairs list
Set length: 853 and List length: 3756
finished writing edges
processing cluster 0 out of 206
processing cluster 20 out of 206
processing cluster 40 out of 206
processing cluster 60 out of 206
processing cluster 80 out of 206
processing cluster 100 out of 206
processing cluster 120 out of 206
processing cluster 140 out of 206
processing cluster 160 out of 206
processing cluster 180 out of 206
processing cluster 200 out of 206
total clusters: 205
Finished writing features file: pseudowords.feats
Writing to file: pseudowords.dict
Pseudowords not found for: 199 segments, out of total: 259 segments
maining


In [57]:
def get_output_files():
    out_files = sorted([os.path.join(config["es"]["matches"], f) 
                        for f in os.listdir(config["es"]["matches"]) if f.startswith("out.")], 
                       key=lambda s: int(s.rsplit(".", 1)[1]))
    return out_files

In [None]:
for wav_file in [f for f in os.listdir("../arapaho/") if f.endswith("ED.wav")]:
    mono_wav_fname = wav_file.replace(".mp3.SOX-CONVERTED.wav", ".wav")
    mono_wav_fname = os.path.join("../arapaho-test-wavs", mono_wav_fname)
    subprocess.call([config['base']["sox"], os.path.join("../arapaho-mp3s/", wav_file), 
                     mono_wav_fname, "remix", "1-2"])

In [134]:
train_test_str = "test"
dual_wav_path = "../ainu/ainu-{0:s}-wavs/".format(train_test_str)
mono_wav_path = "../ainu/ainu-{0:s}-mono-wavs/".format(train_test_str)
mono_8k_wav_path = "../ainu/{0:s}-wavs/".format(train_test_str)

In [135]:
for wav_file in [f for f in os.listdir(dual_wav_path) if f.endswith(".wav")]:
    #mono_wav_fname = wav_file.replace(".mp3.SOX-CONVERTED.wav", ".wav")
    mono_wav_fname = os.path.join(mono_wav_path, wav_file)
    subprocess.call([config['base']["sox"], os.path.join(dual_wav_path, wav_file), 
                     mono_wav_fname, "remix", "1-2"])

In [136]:
for wav_file in [f for f in os.listdir(mono_wav_path) if f.endswith(".wav")]:
    in_wav_fname = os.path.join(mono_wav_path, wav_file)
    mono_8k_wav_fname = os.path.join(mono_8k_wav_path, wav_file)
    
    subprocess.call([config['base']["sox"], "-t", "wav", 
                     in_wav_fname, "-t", "wav", "-e", "signed-integer", 
                      "-b", "16", "-c", "1", "-r", "8000", "--no-dither", 
                     mono_8k_wav_fname])

In [97]:
# $SOXBIN -t wav ${SRCFILE} -t wav -e signed-integer -b 16 -c 1 -r 8000 --no-dither ${WAV_FILE}

CoffeeScript	   JavaScript		    preprocess.ipynb
Colors		   keyword_spot_post.ipynb  preprocess.py
config_file.ipynb  Load			    __pycache__
config.json	   map_utd_output.ipynb     Shell
Editing		   map_utd_output.py	    Spell
eval_utd.ipynb	   misc.ipynb		    Text,
General		   Moving		    visualize_utd.ipynb
Helper		   mt_exp.ipynb


In [10]:
for wav_file in [f for f in os.listdir("../arapaho/arapaho-train-mono-wavs/") if f.endswith(".wav")]:
    vad_fname = os.path.join("../arapaho/arapaho-train-evads", wav_file.replace(".wav", ".vad"))
    mono_wav_fname = os.path.join("../arapaho/arapaho-train-mono-wavs/", wav_file)
    print(" ".join(map(str, [config['base']["energy"], "-i", 
                     mono_wav_fname, "-o", vad_fname, "-s", "0.4", "-e", "0"])))
    subprocess.call(["python", config['base']["energy"], "-i", 
                     mono_wav_fname, "-o", vad_fname, "-s", "0.4", "-e", "0"])


../../../ZRTools/scripts/mark_energy.py -i ../arapaho/arapaho-train-mono-wavs/111.wav -o ../arapaho/arapaho-train-evads/111.vad -s 0.4 -e 0
../../../ZRTools/scripts/mark_energy.py -i ../arapaho/arapaho-train-mono-wavs/250.wav -o ../arapaho/arapaho-train-evads/250.vad -s 0.4 -e 0
../../../ZRTools/scripts/mark_energy.py -i ../arapaho/arapaho-train-mono-wavs/025.wav -o ../arapaho/arapaho-train-evads/025.vad -s 0.4 -e 0
../../../ZRTools/scripts/mark_energy.py -i ../arapaho/arapaho-train-mono-wavs/029.wav -o ../arapaho/arapaho-train-evads/029.vad -s 0.4 -e 0
../../../ZRTools/scripts/mark_energy.py -i ../arapaho/arapaho-train-mono-wavs/045.wav -o ../arapaho/arapaho-train-evads/045.vad -s 0.4 -e 0
../../../ZRTools/scripts/mark_energy.py -i ../arapaho/arapaho-train-mono-wavs/122.wav -o ../arapaho/arapaho-train-evads/122.vad -s 0.4 -e 0
../../../ZRTools/scripts/mark_energy.py -i ../arapaho/arapaho-train-mono-wavs/150.wav -o ../arapaho/arapaho-train-evads/150.vad -s 0.4 -e 0
../../../ZRTools/scr

In [90]:
# python scripts/mark_energy.py -i $WAV_FILE -o $VAD_FILE.tmp -s 0.4 -e 0
# cat $VAD_FILE.tmp | awk '{print int($1*100),int($2*100);}' > $VAD_FILE
# rm $VAD_FILE.tmp