In [42]:
from __future__ import print_function
from __future__ import division
import os
import cPickle as pickle
import json
import subprocess
from IPython.display import display
from IPython.display import Audio
import bisect
from collections import namedtuple

# Map ZRTools output to transcripts

- Create modified .nodes file
- Create a mapping between es words and nodes

In [43]:
# Load the JSON configuration that the cells below read their file names from.
with open("config.json") as config_file:
    config = json.loads(config_file.read())

In [45]:
# Node files: the input node list, the segment-relative rewrite of it, and the
# pickled node dictionary.
nodes_fname = config["es"]["nodes_fname"]
seg_nodes_fname = config["es"]["seg_nodes_fname"]
nodes_dict_fname = config["es"]["nodes_dict_fname"]

# Edge files.
edges_utd_fname = config["es"]["edges_utd_fname"]
edges_olap_fname = config["es"]["edges_olap_fname"]
edges_all_fname = config["es"]["edges_all_fname"]
edges_score_fname = config["es"]["edges_score_fname"]

# Cluster files.
clusters_utd_fname = config["es"]["clusters_utd_fname"]
clusters_fname = config["es"]["clusters_fname"]
clusters_stats_fname = config["es"]["clusters_stats_fname"]

# Scored / evaluation pair files.
pairs_fname = config["es"]["score_pairs_fname"]
eval_fname = config["es"]["eval_pairs_fname"]

# Per-segment pseudoword feature files.
feats_fname = config["es"]["feats_fname"]
feats_dict_fname = config["es"]["feats_dict_fname"]

In [46]:
# Lightweight record types.
#   Align: a word together with its start and end time.
#   Node:  a node placed in a segment, with the transcript words ("es") and the
#          "es_cnt" words that fall inside its time span.
Align = namedtuple("Align", "word start end")
Node = namedtuple("Node", "file seg start end es es_cnt")

In [47]:
# Load the segment start times ({file_id: {seg_id: start}}) and the per-segment
# word alignments. The files are opened in `with` blocks so the handles are
# closed as soon as the data has been read.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(config['es']['segment_dict_fname'], "rb") as segment_dict_file:
    segment_map = pickle.load(segment_dict_file)
with open(config['es']['align_dict_fname'], "rb") as align_dict_file:
    align_dict = pickle.load(align_dict_file)

## Nodes - identify the segment to which the node belongs

Look out for:
1. Patterns that go across segment boundaries
2. ...

In [48]:
def search_segid(node_start, node_end, file_id, segment_map):
    """Map a node given in file-level time onto a single segment of that file.

    Args:
        node_start: node start time on the file's timeline.
        node_end: node end time on the same timeline.
        file_id: key into ``segment_map``.
        segment_map: ``{file_id: {seg_id: segment start time}}``.

    Returns:
        ``(seg_id, start, end, crossed)``. ``start`` and ``end`` are offsets
        relative to the start of the chosen segment. ``crossed`` is 0 when the
        node starts and ends in the same segment and 1 when it crosses a
        segment boundary (in which case the node is shifted into whichever of
        the two segments holds the larger part of it).

    Raises:
        ValueError: if the node starts (or ends) before the first segment of
            the file, or if the file has no segments at all.
    """
    seg_starts = segment_map[file_id]
    # Segments are ordered by id; the start times are assumed to increase with
    # the id so that the resulting start-time tuple is sorted for bisect.
    seg_id_list, start_time_list = zip(*sorted(seg_starts.items(), key=lambda t: t[0]))

    # bisect returns the insertion point that keeps the order, so the segment
    # containing a time is the one just before that insertion point.
    seg_id_start = bisect.bisect(start_time_list, node_start) - 1
    seg_id_end = bisect.bisect(start_time_list, node_end) - 1

    # An index of -1 means the time lies before the first segment. Indexing
    # with it would silently select the *last* segment, so fail loudly instead.
    if seg_id_start < 0 or seg_id_end < 0:
        raise ValueError(
            "node (%s, %s) of file %s lies before the first segment"
            % (node_start, node_end, file_id))

    s1 = seg_id_list[seg_id_start]
    s2 = seg_id_list[seg_id_end]
    s1_start = seg_starts[s1]
    s2_start = seg_starts[s2]

    if seg_id_start == seg_id_end:
        return s1, node_start - s1_start, node_end - s1_start, 0

    # The node crosses a boundary: keep it in the segment that overlaps more.
    # NOTE(review): only the start of s2 is used as the boundary, so a node
    # spanning more than two segments is treated as if s1 ran up to s2.
    if (s2_start - node_start) >= (node_end - s2_start):
        # Larger part in s1: shift the node left so that it ends at the boundary.
        # NOTE(review): start becomes negative when the overflow into s2 is
        # larger than the node's offset inside s1 - confirm this is acceptable.
        shift_value = node_end - s2_start
        start = node_start - s1_start - shift_value
        end = s2_start - s1_start
        print("More in s1", start, end, shift_value)
        return s1, start, end, 1

    # Larger part in s2: shift the node right so that it starts at the boundary.
    shift_value = s2_start - node_start
    start = 0
    end = node_end - s2_start + shift_value
    print("More in s2", start, end, shift_value)
    return s2, start, end, 1

In [49]:
# Test code
print(search_segid(20509, 20641, "042", segment_map))
print(search_segid(0, 51, "038", segment_map))

More in s1 429 561 8
('042.079', 429, 561, 1)
('038.001', 0, 51, 0)


## Nodes - create a master_graph.segnodes file replacing nodes with their segment ids

In [51]:
def create_segmented_nodes(nodes_fname, segment_map, seg_nodes_fname):
    """Rewrite a nodes file so that every node is expressed relative to a segment.

    Each input line is ``file_id start end rest...``. The output line is
    ``seg_id<TAB>start<TAB>end<TAB>rest`` where the segment id and offsets come
    from ``search_segid``. Lines that cannot be parsed or placed are reported
    and skipped.

    Args:
        nodes_fname: path of the nodes file to read.
        segment_map: ``{file_id: {seg_id: segment start time}}``.
        seg_nodes_fname: path of the segment-relative nodes file to write.
    """
    total_nodes = 0   # stays 0 for an empty input file
    total_errors = 0  # number of nodes that crossed a segment boundary
    with open(nodes_fname, "r") as nodes_f, open(seg_nodes_fname, "w") as segnodes_f:
        for i, line in enumerate(nodes_f):
            total_nodes = i + 1
            try:
                # Parsing is inside the try so that a malformed line (too few
                # fields, non-integer times) is reported instead of aborting.
                line_items = line.strip().split(None, 3)
                file_id = line_items[0]
                node_start, node_end = map(int, line_items[1:3])
                remainder = line_items[3]
                seg_id, seg_node_start, seg_node_end, e = search_segid(
                    node_start, node_end, file_id, segment_map)
            except (ValueError, IndexError):
                print("Incorrect line format at line: %d\n%s" % (i, line))
                continue
            total_errors += e
            segnodes_f.write("%s\t%d\t%d\t%s\n" % (seg_id, seg_node_start, seg_node_end, remainder))

    print("Total nodes: %d" % total_nodes)
    print("Total errors: %d" % total_errors)
    print("completed")
            

In [53]:
# Write the segment-relative nodes file.
create_segmented_nodes(nodes_fname=nodes_fname, segment_map=segment_map, seg_nodes_fname=seg_nodes_fname)

More in s2 0 52 4
More in s2 0 121 9
More in s2 0 57 1
More in s2 0 56 9
More in s1 221 273 5
More in s2 0 55 3
More in s2 0 62 3
More in s2 0 59 3
More in s2 0 61 4
More in s2 0 63 1
More in s1 254 321 15
More in s1 265 321 19
More in s2 0 56 1
More in s1 258 321 6
More in s1 264 321 9
More in s1 231 292 0
More in s2 0 55 9
More in s1 264 321 1
More in s2 0 53 2
More in s1 260 321 4
More in s1 253 321 12
More in s1 348 401 19
More in s1 92 148 2
More in s2 0 55 1
More in s2 0 61 6
More in s1 3 59 1
More in s1 290 346 1
More in s2 0 52 1
More in s2 0 58 2
More in s2 0 52 1
More in s1 153 211 2
More in s2 0 59 1
More in s2 0 65 4
More in s2 0 65 3
More in s1 262 355 0
More in s2 0 82 2
More in s2 0 106 5
More in s1 468 519 2
More in s1 449 609 9
More in s1 601 751 4
More in s2 0 59 4
More in s1 375 451 5
More in s1 32 88 2
More in s1 392 448 0
More in s1 466 517 1
More in s1 249 303 2
More in s1 73 124 1
More in s2 0 51 1
More in s2 0 52 1
More in s1 245 296 1
More in s1 250 303 1
More 

## Nodes - find transcript words corresponding to node start and end times

Create node dictionary:
    - node id
    - file id
    - seg id
    - start time
    - end time
    - es words    

In [54]:
def find_align_words_for_node(align_words_list, start, end):
    """Return the aligned words that overlap the time span ``[start, end]``.

    Args:
        align_words_list: sequence of ``(word, start_time, end_time)`` entries,
            assumed to be ordered by time (required by the binary search).
        start: span start time.
        end: span end time.

    Returns:
        Tuple of words. A word is dropped when it ends at or before ``start``;
        it is kept when it starts at or before ``end``. An empty alignment list
        yields an empty tuple.
    """
    # zip(*[]) cannot be unpacked into three names, so handle "no words" first.
    if not align_words_list:
        return tuple()
    words, start_times, end_times = zip(*align_words_list)
    # First word whose end time is strictly after the span start.
    first_i = bisect.bisect(end_times, start)
    # One past the last word whose start time is not after the span end.
    stop_i = bisect.bisect(start_times, end)
    return words[first_i:stop_i]

In [55]:
# Spot checks of the word lookup on two known segments.
display(find_align_words_for_node(align_words_list=align_dict["001"]["001.224"]["es"], start=191, end=246))
display(find_align_words_for_node(align_words_list=align_dict["001"]["001.274"]["es"], start=45, end=100))

('VAMOS', 'A', 'VER')

('ESTA', 'MECHITA')

In [56]:
def map_nodes_align(seg_nodes_fname, segment_map, align_dict):
    """Build ``{node id: Node}`` from a segment-relative nodes file.

    Node ids are the 1-based line numbers of the file. For every node the words
    of its segment that fall inside the node's time span are looked up in both
    the ``'es'`` and the ``'es_cnt'`` alignment lists.

    Args:
        seg_nodes_fname: path of the segment-relative nodes file; each line
            starts with ``seg_id start end``.
        segment_map: not used by this function; kept for interface
            compatibility.
        align_dict: ``{file_id: {seg_id: {'es': [...], 'es_cnt': [...]}}}``
            where each list holds ``(word, start, end)`` entries.

    Returns:
        Dict mapping node id to ``Node``.
    """
    total_errors = 0
    nodes_dict = {}
    nid = 0  # keeps the summary below valid when the file is empty
    with open(seg_nodes_fname, "r") as seg_nodes_f:
        for nid, line in enumerate(seg_nodes_f, start=1):
            line_items = line.strip().split()
            seg_id = line_items[0]
            # The file id is the part of the segment id before the first '.'.
            file_id = seg_id.split('.')[0]
            start_t, end_t = map(int, line_items[1:3])
            seg_align = align_dict[file_id][seg_id]

            # An empty alignment list cannot be searched; treat it as "no words".
            if len(seg_align['es']) > 0:
                es_w = find_align_words_for_node(seg_align['es'], start_t, end_t)
            else:
                es_w = tuple()
            if len(seg_align['es_cnt']) > 0:
                es_cnt_w = find_align_words_for_node(seg_align['es_cnt'], start_t, end_t)
            else:
                es_cnt_w = tuple()
                total_errors += 1

            nodes_dict[nid] = Node(file_id, seg_id, start_t, end_t, es_w, es_cnt_w)
            if nid % 100000 == 0:
                print('reading node %d' % nid)
    print("finished reading %d nodes" % nid)
    print('No content words found for %d nodes' % total_errors)
    return nodes_dict

In [57]:
nodes_dict = map_nodes_align(seg_nodes_fname, segment_map, align_dict)
# Use a `with` block so the pickle file is flushed and closed right away.
with open(nodes_dict_fname, "wb") as nodes_dict_file:
    pickle.dump(nodes_dict, nodes_dict_file)

finished reading 29694 nodes
No content words found for 156 nodes


# Edges - create a valid edges file

In [58]:
def read_edges():
    """Read the UTD edges, collapse overlapping nodes and write the unique edges.

    Reads, from ``config['es']``:
      * ``edges_olap_fname``: each line is a whitespace-separated list of node
        ids; every id on the line is mapped to the first id of that line.
      * ``edges_utd_fname``: each line is ``node_1 node_2 score``; the score is
        divided by 1000.

    Writes:
      * ``edges_score_fname``: ``n1<TAB>n2<TAB>score`` per unique edge, sorted.
      * ``score_pairs_fname``: pickle of the same sorted list of tuples.

    Returns:
        Sorted list of unique ``(n1, n2, score)`` tuples with ``n1 <= n2``.

    Raises:
        IOError: if the same node pair occurs with more than one score. The
            check runs after the output files have been written.
    """
    es_config = config['es']
    olap_dict = {}
    pairs_list = []

    # Overlap file: map every node of a line to the line's first node.
    with open(es_config['edges_olap_fname'], "r") as in_f:
        for line in in_f:
            line_items = [int(item) for item in line.split()]
            if not line_items:
                continue  # tolerate blank lines
            representative = line_items[0]
            for node in line_items:
                olap_dict[node] = representative

    # Edges file.
    with open(es_config['edges_utd_fname'], "r") as in_f:
        for i, line in enumerate(in_f):
            if i % 1000 == 0:
                print("Processing line: %d" % (i+1))
            line_items = line.strip().split()
            if not line_items:
                continue  # tolerate blank lines
            raw_1 = int(line_items[0])
            raw_2 = int(line_items[1])
            dtw_val = float(line_items[2]) / 1000.0

            # Nodes that never appeared in the overlap file represent themselves.
            node_1 = olap_dict.setdefault(raw_1, raw_1)
            node_2 = olap_dict.setdefault(raw_2, raw_2)

            # Store each edge with the smaller node id first.
            pairs_list.append((min(node_1, node_2), max(node_1, node_2), dtw_val))

    print("Finished - reading edges ...")
    print("Removing duplicates in pairs list")
    # Sort once and use the same ordered list for the file, the pickle and the
    # return value, so that the outputs are deterministic and agree.
    unique_pairs = sorted(set(pairs_list))
    print("Set length: %d and List length: %d" %(len(unique_pairs), len(pairs_list)))
    with open(es_config['edges_score_fname'], "w") as out_f:
        for (n1, n2, dtw) in unique_pairs:
            out_f.write("%d\t%d\t%.3f\n" % (n1, n2, dtw))
    with open(es_config['score_pairs_fname'], "wb") as pairs_f:
        pickle.dump(unique_pairs, pairs_f)
    print("finished writing edges")

    # Validity check for duplicates: the node pairs must be collected in a set,
    # otherwise the two lengths are always equal and the check can never fail.
    node_pairs = set((n1, n2) for n1, n2, dtw in unique_pairs)
    if len(unique_pairs) != len(node_pairs):
        raise IOError("%d node pairs occur with more than one score"
                      % (len(unique_pairs) - len(node_pairs)))
    return unique_pairs
    

In [59]:
# Build the sorted list of unique (n1, n2, score) edges; as a side effect this
# also writes the edge score file and the pickled pairs file named in config.
valid_pairs = read_edges()

Processing line: 1
Processing line: 1001
Processing line: 2001
Processing line: 3001
Processing line: 4001
Processing line: 5001
Processing line: 6001
Processing line: 7001
Processing line: 8001
Processing line: 9001
Processing line: 10001
Processing line: 11001
Processing line: 12001
Processing line: 13001
Processing line: 14001
Finished - reading edges ...
Removing duplicates in pairs list
Set length: 13370 and List length: 14847
finished writing edges


# Clusters

- Save list of clusters
- Generate bag of cluster ids for each segment
    - for each segment, replace its nodes with their cluster ids
    - if no node found, use cluster id -1

In [79]:
# Collect the segment ids of every file and report how many there are.
segids = []
for fid in segment_map:
    segids += list(segment_map[fid])
print(len(segids))

17394


In [60]:
def load_clusters(clusters_fname):
    """Read a clusters file into a list of node-id lists.

    Args:
        clusters_fname: path of a text file with one cluster per line, each
            line being whitespace-separated integer node ids.

    Returns:
        List with one entry per line, in file order; each entry is a list of
        ints. A blank line yields an empty list so that the position of every
        cluster in the result still matches its line in the file.
    """
    with open(clusters_fname, "r") as in_f:
        # Build real lists rather than storing map() results: under Python 3
        # those are one-shot iterators that can be neither pickled nor reused.
        return [[int(node) for node in line.split()] for line in in_f]

In [62]:
clusters = load_clusters(clusters_utd_fname)
# Use a `with` block so the pickle file is flushed and closed right away.
with open(clusters_fname, "wb") as clusters_file:
    pickle.dump(clusters, clusters_file)

In [84]:
def generate_pseudowords_for_segments(segment_map, nodes_dict, clusters, feats_fname, feats_dict_fname):
    """Write, for every segment, the ids of the clusters that have a node in it.

    Cluster ids are the 1-based positions of the clusters in ``clusters``.

    Args:
        segment_map: ``{file_id: {seg_id: ...}}``; only the segment ids are used.
        nodes_dict: ``{node id: node}`` where each node has a ``seg`` attribute.
        clusters: list of clusters, each an iterable of node ids.
        feats_fname: text file to write; one line per segment (segments in
            sorted id order) holding the sorted, space-separated cluster ids,
            or ``-1`` when the segment has no node in any cluster.
        feats_dict_fname: pickle file to write with ``{seg_id: [cluster ids]}``.
    """
    feats_dict = {}
    total_errors = 0
    # Report progress roughly every 10%. The step must be at least 1, otherwise
    # fewer than 10 clusters would lead to a modulo by zero below.
    display_den = max(1, len(clusters) // 10)

    for clid, nodes in enumerate(clusters, start=1):
        if clid % display_den == 0:
            print('processing cluster %d out of %d' % (clid, len(clusters)))
        for nid in nodes:
            feats_dict.setdefault(nodes_dict[nid].seg, []).append(clid)

    with open(feats_dict_fname, "wb") as feats_dict_f:
        pickle.dump(feats_dict, feats_dict_f)
    print("Generation complete, now writing to file: %s" % os.path.basename(feats_dict_fname))

    # Get complete list of segment ids
    segids = []
    for fid in segment_map:
        segids.extend(segment_map[fid].keys())

    with open(feats_fname, "w") as out_f:
        for seg_id in sorted(segids):
            if seg_id not in feats_dict:
                # No node of any cluster falls into this segment.
                outline = "-1\n"
                total_errors += 1
            else:
                outline = " ".join(map(str, sorted(feats_dict[seg_id])))
                outline = outline.strip() + "\n"
            out_f.write(outline)
    print("Finished writing features file: %s" % os.path.basename(feats_fname))
    print("Psuedowords not found for: %d segments, out of total: %d segments" % (total_errors, len(segids)))
    

In [85]:
# Write the per-segment pseudoword features and their pickled dictionary.
generate_pseudowords_for_segments(
    segment_map=segment_map,
    nodes_dict=nodes_dict,
    clusters=clusters,
    feats_fname=feats_fname,
    feats_dict_fname=feats_dict_fname,
)

processing cluster 914 out of 9141
processing cluster 1828 out of 9141
processing cluster 2742 out of 9141
processing cluster 3656 out of 9141
processing cluster 4570 out of 9141
processing cluster 5484 out of 9141
processing cluster 6398 out of 9141
processing cluster 7312 out of 9141
processing cluster 8226 out of 9141
processing cluster 9140 out of 9141
Generation complete, now writing to file: pseudowords.dict
Finished writing features file: pseudowords.feats
Psuedowords not found for: 10273 segments, out of total: 17394 segments
