In [173]:
from __future__ import print_function
from __future__ import division
import os
import cPickle as pickle
import json
import subprocess
from IPython.display import display
from IPython.display import Audio
import bisect
from collections import namedtuple
import numpy as np
import pandas as pd
from collections import Counter

In [174]:
# Load the experiment configuration from config.json in the working directory.
# The file names used in the following cells are read from its "es" section.
with open("config.json") as json_data_file:
    config = json.load(json_data_file)

In [175]:
# Every file name below is stored under the "es" section of the config.
es_paths = config["es"]

# Node files.
nodes_fname = es_paths["nodes_fname"]
seg_nodes_fname = es_paths["seg_nodes_fname"]
nodes_dict_fname = es_paths["nodes_dict_fname"]

# Edge files.
edges_utd_fname = es_paths["edges_utd_fname"]
edges_olap_fname = es_paths["edges_olap_fname"]
edges_all_fname = es_paths["edges_all_fname"]
edges_score_fname = es_paths["edges_score_fname"]

# Cluster files.
clusters_utd_fname = es_paths["clusters_utd_fname"]
clusters_fname = es_paths["clusters_fname"]
clusters_stats_fname = es_paths["clusters_stats_fname"]

# Scored pairs (input) and their evaluation (output).
pairs_fname = es_paths["score_pairs_fname"]
eval_fname = es_paths["eval_pairs_fname"]

# Feature files.
feats_fname = es_paths["feats_fname"]
feats_dict_fname = es_paths["feats_dict_fname"]

In [176]:
# Align: one aligned word with its start/end position.
Align = namedtuple('Align', 'word start end')
# Node: one discovered span (file, segment, start/end) plus its es / es_cnt word lists.
Node = namedtuple('Node', 'file seg start end es es_cnt')

In [177]:
# Eval: the scores recorded for one evaluated pair of nodes (n1, n2).
Eval = namedtuple('Eval', 'n1 n2 dtw es_sim es_cnt_sim en_j_sim')

In [178]:
def _load_pickle(fname):
    """Unpickle the object stored in ``fname`` and close the file afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load files produced by
    # this project's own pipeline.
    with open(fname, "rb") as pickle_file:
        return pickle.load(pickle_file)


# Inputs for the evaluation below. Each file is opened through _load_pickle
# so no file handle is left open in the kernel.
segment_map = _load_pickle(config['es']['segment_dict_fname'])
align_dict = _load_pickle(config['es']['align_dict_fname'])
nodes_dict = _load_pickle(nodes_dict_fname)
pairs_list = _load_pickle(pairs_fname)
clusters = _load_pickle(clusters_fname)

# Evaluate all valid pairs discovered

- Calculate es sim, es content word sim, en content word sim
- Calculate num correct pairs, num total, and num content correct at D=80 and D=87

In [179]:
def similarity_jaccard(w_list1, w_list2):
    """Return the Jaccard similarity of two word lists.

    Both lists are treated as sets; the result is the number of shared
    words divided by the number of distinct words overall. When both
    lists are empty the result is 0.
    """
    first_words = set(w_list1)
    second_words = set(w_list2)
    union_size = len(first_words | second_words)
    if union_size == 0:
        return 0
    return len(first_words & second_words) / union_size

In [180]:
def similarity_match_any(w_list1, w_list2):
    """Return 1 if the two word lists share at least one word, otherwise 0."""
    shared_words = set(w_list1) & set(w_list2)
    if shared_words:
        return 1
    return 0

In [181]:
def eval_utd_pairs(pairs_list, nodes_dict, segment_map, eval_fname):
    """Score every pair in ``pairs_list`` and pickle the scores to ``eval_fname``.

    For each pair, three similarities are computed between its two nodes:
    a 0/1 "any word in common" score on the nodes' ``es`` words, the same
    score on their ``es_cnt`` words, and the Jaccard similarity of the
    ``en_cnt`` words of the segments the nodes belong to.

    Args:
        pairs_list: sequence of pairs; ``pair[0]`` and ``pair[1]`` are node ids
            (keys of ``nodes_dict``) and ``pair[2]`` is the pair's DTW score.
        nodes_dict: mapping from node id to a record with ``file``, ``seg``,
            ``es`` and ``es_cnt`` attributes.
        segment_map: not used by this function; kept so existing calls keep
            working.
        eval_fname: path the resulting list is pickled to.

    Returns:
        A list with one ``Eval`` tuple per pair, in input order.

    Note:
        The ``en_cnt`` words are looked up in the module-level ``align_dict``
        (indexed as ``align_dict[file][seg]['en_cnt']``), which must be loaded
        before this function is called.
    """
    eval_list = []

    # Report progress about every 10% of the pairs. The step is clamped to 1
    # so that fewer than ten pairs does not lead to a modulo by zero.
    display_den = max(1, len(pairs_list) // 10)

    for pair_num, pair in enumerate(pairs_list, start=1):

        if pair_num % display_den == 0:
            print("Evluating pair number: %d" % pair_num)

        n1_id = pair[0]
        n2_id = pair[1]
        n1 = nodes_dict[n1_id]
        n2 = nodes_dict[n2_id]
        dtw = pair[2]

        es_sim = similarity_match_any(n1.es, n2.es)
        es_cnt_sim = similarity_match_any(n1.es_cnt, n2.es_cnt)

        en_n1 = [a.word for a in align_dict[n1.file][n1.seg]['en_cnt']]
        en_n2 = [a.word for a in align_dict[n2.file][n2.seg]['en_cnt']]
        en_j_sim = similarity_jaccard(en_n1, en_n2)

        eval_list.append(Eval(n1_id, n2_id, dtw, es_sim, es_cnt_sim, en_j_sim))

    # Saving eval; the with-block closes the file even if pickling fails.
    with open(eval_fname, "wb") as eval_file:
        pickle.dump(eval_list, eval_file)

    # len(eval_list) rather than the loop variable, which is unbound when
    # pairs_list is empty.
    print("Finished evaluating %d pairs" % len(eval_list))
    return eval_list

In [182]:
# Score all loaded pairs; the result is also pickled to eval_fname by the call.
eval_list = eval_utd_pairs(pairs_list, nodes_dict, segment_map, eval_fname)

Evluating pair number: 1337
Evluating pair number: 2674
Evluating pair number: 4011
Evluating pair number: 5348
Evluating pair number: 6685
Evluating pair number: 8022
Evluating pair number: 9359
Evluating pair number: 10696
Evluating pair number: 12033
Evluating pair number: 13370
Finished evaluating 13370 pairs


In [183]:
# One row per evaluated pair, one column per Eval field.
# Built from eval_list (the value returned by eval_utd_pairs); no variable
# named eval_dict is assigned in the cells above, so referencing it fails on
# a fresh kernel.
eval_df = pd.DataFrame(eval_list, columns=Eval._fields)

In [184]:
# All matches
print('Total pairs evaluated: %d' % len(eval_list))

es_match = [i for i, e in enumerate(eval_list) if e.es_sim == 1]
es_cnt_match = [i for i, e in enumerate(eval_list) if e.es_cnt_sim == 1]
print('pairs with es word match: %d' % len(es_match))
print('pairs with es content word match: %d' % len(es_cnt_match))

# Matches among pairs whose DTW score reaches the cutoff.
# The cutoff lives in one constant so the filter and the printed label cannot
# disagree.
# NOTE(review): the variable names and the earlier label say 0.87, but the
# filter has always compared against 0.88. The value is kept so the counts do
# not change -- confirm which cutoff is intended.
DTW_THRESHOLD = 0.88
es_87_pairs = [i for i, e in enumerate(eval_list) if e.dtw >= DTW_THRESHOLD]
es_87_match = [i for i, e in enumerate(eval_list) if e.es_sim == 1 and e.dtw >= DTW_THRESHOLD]
es_87_cnt_match = [i for i, e in enumerate(eval_list) if e.es_cnt_sim == 1 and e.dtw >= DTW_THRESHOLD]
print('pairs with D >= %.2f: %d' % (DTW_THRESHOLD, len(es_87_pairs)))
print('pairs with es word match: %d' % len(es_87_match))
print('pairs with es content word match: %d' % len(es_87_cnt_match))

Total pairs evaluated: 13370
pairs with es word match: 4000
pairs with es content word match: 2559
pairs with D >= 0.87: 10164
pairs with es word match: 3393
pairs with es content word match: 2304


# Clusters

- calculate cluster purity, and most common word
- generate features, parallel corpora

In [190]:
def calc_cluster_stats(clusters, nodes_dict, stats_fname=None):
    """Compute word-purity statistics for each cluster and pickle them.

    For every cluster, the ``es_cnt`` words of all its nodes are pooled and
    lower-cased. The cluster's purity is the share of that pool taken by its
    most frequent word. The average purity is the sum of the most-frequent
    counts divided by the sum of pool sizes, where an empty pool counts as
    size 1.

    Args:
        clusters: iterable of clusters, each an iterable of node ids that are
            keys of ``nodes_dict``.
        nodes_dict: mapping from node id to a record with an ``es_cnt``
            iterable of words (UTF-8 byte strings or text strings).
        stats_fname: path the statistics are pickled to. When omitted, the
            module-level ``clusters_stats_fname`` is used.

    Returns:
        A dict with keys ``'words'`` (pooled words per cluster), ``'purity'``
        (per-cluster purity), ``'most_common'`` (per-cluster most frequent
        word, or the placeholder ``': ('`` for a cluster without words) and
        ``'avg_purity'`` (0.0 when there are no clusters).
    """
    if stats_fname is None:
        stats_fname = clusters_stats_fname

    cluster_stats = {'words': [], 'purity': [], 'most_common': [], 'avg_purity': 0.0}
    avg_purity_num = 0
    avg_purity_den = 0
    for nodes in clusters:
        cnt_words = []
        for node in nodes:
            for word in nodes_dict[node].es_cnt:
                # Decode before lower-casing: lower() on UTF-8 bytes only
                # folds ASCII letters, so accented capitals would otherwise be
                # counted as different words from their lower-case forms.
                if isinstance(word, bytes):
                    word = word.decode("utf-8")
                cnt_words.append(word.lower())
        cluster_stats['words'].append(cnt_words)

        if cnt_words:
            most_common_word = Counter(cnt_words).most_common(1)[0]
        else:
            # Placeholder (word, count) for a cluster without any words.
            most_common_word = (': (', 0)
        cluster_stats['most_common'].append(most_common_word[0])

        # An empty pool is given size 1 so its purity is 0 rather than 0/0.
        temp_len = len(cnt_words) or 1
        cluster_stats['purity'].append(most_common_word[1] / float(temp_len))
        avg_purity_num += most_common_word[1]
        avg_purity_den += temp_len

    # With no clusters at all the denominator is 0; keep the 0.0 default.
    if avg_purity_den > 0:
        cluster_stats['avg_purity'] = avg_purity_num / float(avg_purity_den)

    print('Finished calculation cluster stats')
    print('Average cluster purity: %0.3f' % cluster_stats['avg_purity'])
    with open(stats_fname, "wb") as stats_file:
        pickle.dump(cluster_stats, stats_file)
    return cluster_stats

In [191]:
# Compute purity statistics for the loaded clusters; the call also pickles
# them to clusters_stats_fname.
clusters_stats = calc_cluster_stats(clusters, nodes_dict)

Finished calculation cluster stats
Average cluster purity: 0.373


# Bag of pseudowords

- For each segment, generate a bag of cluster ids based on the nodes discovered
- default cluster id: -1 when no nodes found
