In [69]:
from __future__ import print_function
from __future__ import division
import os
import cPickle as pickle
import json
import subprocess
from IPython.display import display
from IPython.display import Audio
import bisect
from collections import namedtuple

# Map ZRTools output to transcripts

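- Create a modified `.nodes` file (`master_graph.segnodes`) in which each node's times are relative to the segment that contains it
- Create a mapping between es (Spanish) words and nodes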

In [70]:
# Load the experiment configuration from config.json in the working directory.
# Later cells read config["es"]["zrt_out_path"] from it.
with open("config.json") as json_data_file:
    config = json.load(json_data_file)

In [71]:
# Every master_graph.* file sits in the same output directory named by the config,
# so read that directory once and derive the four paths from it.
zrt_out_dir = config["es"]["zrt_out_path"]

nodes_fname = os.path.join(zrt_out_dir, "master_graph.nodes")
edges_fname = os.path.join(zrt_out_dir, "master_graph.zedges")
clusters_fname = os.path.join(zrt_out_dir, "master_graph.dedups")
# Output of this notebook: the .nodes file rewritten with segment-relative times.
seg_nodes_fname = os.path.join(zrt_out_dir, "master_graph.segnodes")

In [72]:
# One aligned word: the word itself plus its start/end offsets.
# NOTE(review): align_dict (loaded from a pickle in a later cell) holds Align
# instances, so presumably this definition has to exist before that load — confirm.
Align = namedtuple("Align", "es_word start end")

In [73]:
# Load the precomputed lookup tables.
#   segment_map[file_id][seg_id]     -> start time of that segment within the file
#   align_dict[file_id][seg_id]["es"] -> list of Align records for the segment
# SECURITY: unpickling can execute arbitrary code; only load files you produced yourself.
# The files are opened with `with` so the handles are closed as soon as loading finishes.
with open("../segments.dict", "rb") as segments_file:
    segment_map = pickle.load(segments_file)
with open("../align_dict.p", "rb") as align_file:
    align_dict = pickle.load(align_file)

## Nodes - identify the segment to which the node belongs

Look out for:
1. Patterns that go across segment boundaries
2. ...

In [19]:
def grade(score, breakpoints=(60, 70, 80, 90), grades='FDCBA'):
    """Toy example showing how bisect maps a value onto a sorted list of boundaries.

    score       -- value to classify
    breakpoints -- ascending sequence of lower bounds (a tuple by default, so the
                   default value cannot be mutated between calls)
    grades      -- one label per interval; needs len(breakpoints) + 1 entries

    Returns (label, index), where index is the number of breakpoints <= score.
    """
    i = bisect.bisect(breakpoints, score)
    return grades[i], i


# Demo: a score equal to a breakpoint falls into the upper interval (60 -> 'D', 90 -> 'A').
[grade(score) for score in [33, 60, 77, 70, 89, 90, 100]]

[('F', 0), ('D', 1), ('C', 2), ('C', 2), ('B', 3), ('A', 4), ('A', 4)]

In [60]:
def search_segid(node_start, node_end, file_id, segment_map):
    """Locate the segment a node belongs to and convert its times to segment-relative.

    node_start, node_end -- node boundaries on the same time axis as the values
                            stored in segment_map[file_id]
    file_id              -- key into segment_map
    segment_map          -- segment_map[file_id] maps segment id -> segment start time

    Returns (seg_id, start, end, crossed):
      * crossed == 0: start and end fall in the same segment; start/end are the
        node times minus that segment's start time.
      * crossed == 1: start and end fall in different segments. The node is
        assigned to whichever side of the end-segment's start holds more of it
        (ties go to the start segment) and is slid, keeping its duration, so it
        no longer crosses that boundary:
          - assigned to the start segment: it now ends exactly at the start of
            the end segment;
          - assigned to the end segment: it now starts at 0.
        A "More in s1"/"More in s2" line is printed for each such node.

    Raises KeyError if file_id is not in segment_map, and ValueError if the file
    has no segments or the node lies before the first segment start.
    """
    seg_starts = segment_map[file_id]
    if not seg_starts:
        raise ValueError("no segments recorded for file %r" % (file_id,))

    # bisect needs the start times in ascending order, so sort on the start time
    # itself (segment id only breaks ties) rather than relying on id ordering.
    ordered = sorted(seg_starts.items(), key=lambda item: (item[1], item[0]))
    seg_id_list = [seg_id for seg_id, _ in ordered]
    start_time_list = [seg_start for _, seg_start in ordered]

    # bisect returns the insertion point to the right of equal values, so the
    # segment containing time t is the one just before that insertion point.
    seg_id_start = bisect.bisect(start_time_list, node_start) - 1
    seg_id_end = bisect.bisect(start_time_list, node_end) - 1
    if seg_id_start < 0 or seg_id_end < 0:
        # An index of -1 would silently wrap around to the last segment.
        raise ValueError(
            "node (%s, %s) in file %r lies before the first segment start (%s)"
            % (node_start, node_end, file_id, start_time_list[0]))

    s1 = seg_id_list[seg_id_start]
    s2 = seg_id_list[seg_id_end]
    s1_start = seg_starts[s1]
    s2_start = seg_starts[s2]

    if seg_id_start == seg_id_end:
        return s1, node_start - s1_start, node_end - s1_start, 0

    # The node crosses into another segment: compare how much of it lies before
    # and after the start of the segment it ends in.
    before_boundary = s2_start - node_start
    after_boundary = node_end - s2_start
    if before_boundary >= after_boundary:
        # Slide the node left by the overflow so it ends where s2 begins.
        shift_value = after_boundary
        start = node_start - s1_start - shift_value
        end = s2_start - s1_start
        print("More in s1", start, end, shift_value)
        return s1, start, end, 1

    # Slide the node right so it begins at the start of s2.
    shift_value = before_boundary
    start = 0
    end = node_end - s2_start + shift_value
    print("More in s2", start, end, shift_value)
    return s2, start, end, 1

In [59]:
# Test code
# Smoke test on two nodes. With the data behind the recorded output, the first
# node crosses a segment boundary (last tuple element 1) and the second lies
# inside a single segment (last tuple element 0).
print(search_segid(20509, 20641, "042", segment_map))
print(search_segid(0, 51, "038", segment_map))

More in s1 429 561 8
('042.079', 429, 561, 1)
('038.001', 0, 51, 0)


In [56]:
def create_segmented_nodes(nodes_fname, segment_map, seg_nodes_fname):
    """Rewrite a nodes file so every node is expressed relative to its segment.

    nodes_fname     -- input file; each line is whitespace separated:
                       file_id, start, end, then the remaining columns
    segment_map     -- passed through to search_segid for the segment lookup
    seg_nodes_fname -- output file; each line is tab separated:
                       seg_id, start, end, then the remaining input columns verbatim

    Lines that cannot be parsed (blank, fewer than four fields, non-integer
    times) or that search_segid rejects with ValueError are reported and
    skipped. Prints the number of input lines and the number of nodes that
    crossed a segment boundary. Returns None.
    """
    total_lines = 0
    total_errors = 0
    with open(nodes_fname, "r") as nodes_f, open(seg_nodes_fname, "w") as segnodes_f:
        for i, line in enumerate(nodes_f):
            # Track the count here so an empty input file still reports 0.
            total_lines = i + 1
            line_items = line.strip().split(None, 3)
            try:
                if len(line_items) < 4:
                    raise ValueError("expected 4 fields, got %d" % len(line_items))
                file_id = line_items[0]
                node_start, node_end = map(int, line_items[1:3])
                seg_id, seg_node_start, seg_node_end, e = search_segid(
                    node_start, node_end, file_id, segment_map)
            except ValueError:
                print("Incorrect line format at line: %d\n%s" % (i, line))
                continue
            # e is 1 when the node crossed a segment boundary, 0 otherwise.
            total_errors += e
            outline = "%s\t%d\t%d\t%s\n" % (seg_id, seg_node_start, seg_node_end, line_items[3])
            segnodes_f.write(outline)

    print("Total nodes: %d" % total_lines)
    print("Total errors: %d" % total_errors)
    print("completed")
            

In [74]:
# Display the input (.nodes) and output (.segnodes) paths as a sanity check.
nodes_fname, seg_nodes_fname

(u'../../../ZRTools/exp/callhome/matches/config0.80-0.90-0.80-50/master_graph.nodes',
 u'../../../ZRTools/exp/callhome/matches/config0.80-0.90-0.80-50/master_graph.segnodes')

In [57]:
# Convert every node in nodes_fname to segment-relative times and write the result
# to seg_nodes_fname; one "More in s1/s2" line is printed per boundary-crossing node.
create_segmented_nodes(nodes_fname, segment_map, seg_nodes_fname)

More in s2 0 52 4
More in s2 0 121 9
More in s2 0 57 1
More in s2 0 56 9
More in s1 221 273 5
More in s2 0 55 3
More in s2 0 62 3
More in s2 0 59 3
More in s2 0 61 4
More in s2 0 63 1
More in s1 254 321 15
More in s1 265 321 19
More in s2 0 56 1
More in s1 258 321 6
More in s1 264 321 9
More in s1 231 292 0
More in s2 0 55 9
More in s1 264 321 1
More in s2 0 53 2
More in s1 260 321 4
More in s1 253 321 12
More in s1 348 401 19
More in s1 92 148 2
More in s2 0 55 1
More in s2 0 61 6
More in s1 3 59 1
More in s1 290 346 1
More in s2 0 52 1
More in s2 0 58 2
More in s2 0 52 1
More in s1 153 211 2
More in s2 0 59 1
More in s2 0 65 4
More in s2 0 65 3
More in s1 262 355 0
More in s2 0 82 2
More in s2 0 106 5
More in s1 468 519 2
More in s1 449 609 9
More in s1 601 751 4
More in s2 0 59 4
More in s1 375 451 5
More in s1 32 88 2
More in s1 392 448 0
More in s1 466 517 1
More in s1 249 303 2
More in s1 73 124 1
More in s2 0 51 1
More in s2 0 52 1
More in s1 245 296 1
More in s1 250 303 1
More 

In [89]:
def find_align_words_for_node(align_words_list, start, end):
    """Debug helper (work in progress): show alignment words next to a node span.

    Displays the three arguments, then prints the node's start and end shifted by
    the start time of one fixed segment and divided by 100. Returns None; no
    word lookup is performed yet.
    """
    display(align_words_list, start, end)
    # NOTE(review): the segment is hard-coded to file "001", segment "001.224",
    # and segment_map is read from the notebook namespace — parameterize both
    # once the real lookup is implemented.
    seg_offset = segment_map["001"]["001.224"]
    abs_start = (seg_offset + start) / 100.0
    abs_end = (seg_offset + end) / 100.0
    print(abs_start, "=", abs_end)

In [90]:
def map_nodes_align(seg_nodes_fname, segment_map, align_dict):
    """Placeholder for mapping segmented nodes onto aligned words.

    Not implemented yet: the arguments are ignored and None is returned.
    """
    # Error counter kept for the future implementation; nothing reads it yet.
    total_errors = 0
    return None

In [91]:
# Inspect the "es" alignment of segment 001.224 next to a node spanning 191-246.
find_align_words_for_node(align_dict["001"]["001.224"]["es"], 191, 246)

[Align(es_word='DECIDIDOS', start=10, end=72),
 Align(es_word='EN', start=72, end=84),
 Align(es_word='VOLVER', start=84, end=121),
 Align(es_word='VAMOS', start=193, end=229),
 Align(es_word='A', start=229, end=235),
 Align(es_word='VER', start=235, end=261),
 Align(es_word='VAMOS', start=301, end=341),
 Align(es_word='A', start=341, end=347),
 Align(es_word='VER', start=347, end=369)]

191

246

480.24 = 480.79
