In [3]:
# -*- coding: utf-8 -*-

In [1]:
from __future__ import print_function
from __future__ import division
from prettytable import PrettyTable
import textwrap
from tabulate import tabulate
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
import sys
import string
import subprocess
import cPickle
import editdistance
import nltk
import time
import math
from collections import Counter
import IPython
from IPython import display
from nltk.corpus import stopwords
from collections import defaultdict
from timeit import default_timer as timer
from matplotlib import rcParams
import networkx as nx
from IPython.display import display
from matplotlib.ticker import MultipleLocator, \
     FormatStrFormatter, AutoMinorLocator
%matplotlib inline

ZRT output utilities:

# Class - ZRTPrep

Prepare for ZRT experiments
- Generate:
 - lst file


In [7]:
class ZRTPrep(object):
    def __init__(self, config):
        self.base_config = config["base"]
        
    def read_file_list(self):
        """Placeholder with no implementation; calling it has no effect."""
        pass
    # Convert sph files to wav files
    def gen_sph2wav(self):
        """Placeholder for sph-to-wav conversion; not implemented here."""
        pass
    
    # Split audio into multiple channels
    def split_channels(self):
        """Placeholder for per-channel audio splitting; not implemented here."""
        pass
    
    def zrt_init_file_list(self, lst_file):
        self.b_wav_files = []
        self.b_base_name = []
        self.b_vad_files = []
        self.b_exp_name = os.path.splitext(os.path.basename(lst_file))[0]
        with open(lst_file,"r") as in_f:
            for fil in in_f:
                fil = fil.strip()
                self.b_wav_files.append(fil)
                self.b_base_name.append(os.path.splitext(os.path.basename(fil))[0])
#                 self.b_vad_files.append(fil.replace(".wav", ".vad"))
                self.b_vad_files.append(fil.replace(".wav", ".evad"))
        
    def zrt_create_exp_dirs(self, exp_path):
        self.b_exp_path = exp_path
        self.b_feats_path = os.path.join(exp_path, "feats")
        self.b_lsh_path = os.path.join(exp_path, "lsh")
        self.b_matches_path = os.path.join(exp_path, "matches")
        if not os.path.exists(self.b_feats_path):
            os.makedirs(self.b_feats_path)
        if not os.path.exists(self.b_lsh_path):
            os.makedirs(self.b_lsh_path)
        if not os.path.exists(os.path.join(exp_path, "results")):
            os.makedirs(os.path.join(exp_path, "results"))
        if not os.path.exists(os.path.join(exp_path, "matches")):
            os.makedirs(self.b_matches_path)
            
    def zrt_gen_files_base(self):
        if os.path.exists(self.b_exp_path):
            with open(os.path.join(self.b_exp_path, "files.base"), "w") as out_f:
                for fil_base_name in sorted(self.b_base_name):
                    out_f.write("{0:s}\n".format(fil_base_name))
            
            
    def zrt_gen_lsh_proj_file(self):
        self.b_proj_fil_name = os.path.join(self.b_exp_path, "proj_S64xD39_seed1")
        subprocess.call([self.base_config["lsh_genproj"], \
                         "-D","39","-S","64","-seed", \
                         "1","-projfile", self.b_proj_fil_name])
    
    def zrt_gen_plp_files(self):
        FEACALC = self.base_config["feacalc"]
        STANDFEAT = self.base_config["standfeat"]
        num_files = len(self.b_wav_files)
        for i, wav_fil in enumerate(self.b_wav_files):
            if i % 1000 == 0:
                print("Completed: {0:d} out of: {1:d}".format(i, num_files))
            feat_fil = os.path.join(self.b_feats_path, self.b_base_name[i] + ".binary")
            std_feat_fil = os.path.join(self.b_feats_path, self.b_base_name[i] + ".std.binary")
            #print(wav_fil, feat_fil, std_feat_fil, self.b_vad_files[i])
            
            # Generate feat file
#             print(" ".join([FEACALC,"-plp", \
#                             "12", "-cep", "13", "-dom", "cep", "-deltaorder", \
#                             "2", "-dither", "-frqaxis", "bark", "-samplerate", \
#                             "8000", "-win", "25", "-step", "10", "-ip", \
#                             "MSWAVE", "-rasta", "false", "-compress", \
#                             "true", "-op", "swappedraw", "-o", feat_fil, wav_fil]))
            subprocess.call([FEACALC,"-plp", \
                            "12", "-cep", "13", "-dom", "cep", "-deltaorder", \
                            "2", "-dither", "-frqaxis", "bark", "-samplerate", \
                            "8000", "-win", "25", "-step", "10", "-ip", \
                            "MSWAVE", "-rasta", "false", "-compress", \
                            "true", "-op", "swappedraw", "-o", feat_fil, wav_fil])

            # Standardize binary file, for VAD regions only
            subprocess.call([STANDFEAT, "-D", "39", "-infile", \
                            feat_fil, "-outfile", std_feat_fil, \
                            "-vadfile", self.b_vad_files[i]])
            
            print(print("Completed - FEAT generation"))
    
    def zrt_gen_lsh_files(self):
        LSH = self.base_config["lsh"]
        num_files = len(self.b_wav_files)
        for i, wav_fil in enumerate(self.b_wav_files):
            if i % 1000 == 0:
                print("Completed: {0:d} out of: {1:d}".format(i, num_files))
            lsh_fil = os.path.join(self.b_lsh_path, self.b_base_name[i] + ".std.lsh64")
            std_feat_fil = os.path.join(self.b_feats_path, self.b_base_name[i] + ".std.binary")
            if os.path.exists(std_feat_fil):
                pass
                subprocess.call([LSH, "-D", "39", "-S", "64", \
                                "-projfile", self.b_proj_fil_name, \
                                "-featfile", std_feat_fil, "-sigfile", \
                                lsh_fil, "-vadfile", self.b_vad_files[i]])
            else:
                print("File not found: %s" % std_feat_fil)
        print("Completed - LSH")
    
    def zrt_gen_disc_cmd(self, num_splits=1):
        disc_file = os.path.join(self.b_exp_path, "disc.cmd")
        disc_file_split_base = "disc_{0:d}.cmd"
        disc_file_split = os.path.join(self.b_exp_path, disc_file_split_base)
        disc_split_file = os.path.join(self.b_exp_path, "disc_split.txt")
        num_files = len(self.b_base_name)
        exp_local_path = os.path.join("exp", self.b_exp_name)
        cmd_string = "scripts/plebdisc_filepair \"{0:s}\" \"{1:s}\" {2:s} 39\n"
        
        total_lines = num_files * num_files
        lines_per_file = total_lines // num_splits
        smallfile = None
        curr_line = 0
        curr_file_num = 0
        
        for i in xrange(num_files) :
            if i % 1000 == 0:
                print("Progress: {0:d} out of: {1:d}".format(curr_line+1, total_lines))
            for j in xrange(num_files):
                out_line = cmd_string.format(self.b_base_name[i], \
                                                  self.b_base_name[j], \
                                                  exp_local_path)
                if curr_line % lines_per_file == 0:
                    if smallfile:
                        smallfile.close()
                    small_filename = disc_file_split.format(curr_file_num)
                    smallfile = open(small_filename, "w")
                    curr_file_num += 1
                smallfile.write(out_line)
                curr_line += 1
        if smallfile:
            smallfile.close()
        
        # Making a list of commands to execute the split disc list
        full_split_cmd_string = "nice sh {0:s} 1> {1:s} 2>{2:s} &\n"
        split_cmd = os.path.join(exp_local_path, "matches","{0:s}.{1:d}")
        with open(disc_split_file, "w") as out_f:
            for i in xrange(curr_file_num):
                curr_split_file = os.path.join(exp_local_path, disc_file_split_base.format(i))
                split_cmd_out = split_cmd.format("out", i)
                #split_cmd_err = split_cmd.format("err", i)
                split_cmd_err = "/dev/null"
                
                out_line = "nice sh "
                out_f.write(full_split_cmd_string.format(curr_split_file, \
                                                        split_cmd_out, \
                                                        split_cmd_err))
        
        print("Completed - disc.cmd")
    
    
    def zrt_calc_dur_from_evad(self):
        dur_ms = 0
        for fil in self.b_vad_files:
            with open(fil, "r") as in_f:
                for line in in_f:
                    line_items = map(int, line.strip().split())
                    dur_ms += ((line_items[1] - line_items[0]) * 10)
#         print("Total number of files: %d" % len(self.b_vad_files))
#         print("duration: %d (ms), %.2f (hours)" %(dur_ms, dur_ms / (1000 * 3600)))
        return dur_ms
    
    def zrt_init_out_folders(self, zrt_out_path):
        self.nodes_file = os.path.join(zrt_out_path, "master_graph.nodes")
        self.edges_file = os.path.join(zrt_out_path, "master_graph.edges")
        self.clusters_file = os.path.join(zrt_out_path, "master_graph.clusters")
        self.matches_file = os.path.join(zrt_out_path, "master_graph")
        print(self.nodes_file)
    
    
    pass


# Class - CallHomeZRTPrep

CallHome dataset-specific processing

In [6]:
'''
Data structure to store mapping between CallHome data, translations
and transcriptions
'''
class VadInfo(object):
    """One voice-activity region: start, end and channel id."""

    def __init__(self, start=0, end=0, chid=0):
        self.start, self.end, self.chid = start, end, chid

    def __str__(self):
        # Space-separated "start end chid".
        fields = (self.start, self.end, self.chid)
        return ' '.join(str(field) for field in fields)

    def __repr__(self):
        return str(self)

class TranscriptInfo(object):
    """A transcribed token (word or phone) with its start and end positions."""

    def __init__(self, start=0, end=0, word=''):
        self.start = start
        self.end = end
        self.word = word

    def __str__(self):
        # start/end are multiplied by 10 for display and labelled "(ms)".
        start_ms = self.start * 10
        end_ms = self.end * 10
        return "%d---%d (ms) :\t%s\n" % (start_ms, end_ms, self.word)

    def __repr__(self):
        return str(self)
        
class FileInfo(object):
    """Description of one utterance-level output file.

    Attributes:
        source_file: name of the source recording.
        target_file: output path without extension.
        vad: ``VadInfo`` holding start, end and channel id.
        trim_pairs: tuple of alternating start/end positions.
    """

    def __init__(self):
        self.source_file = ''
        self.target_file = ''
        self.vad = VadInfo()
        self.trim_pairs = ()

    def __str__(self):
        # Convert the VadInfo explicitly: applying the "s" format spec to an
        # arbitrary object raises TypeError on Python 3.
        return "{0:s} {1:s} {2:s}".format(self.source_file,
                                          self.target_file,
                                          str(self.vad))

    def __repr__(self):
        return str(self)
'''
Class to manage all CallHome specific data processing
'''
class CallHomeZRTPrep(ZRTPrep):
    '''
    Constructor - read config file
    Path to required files, folders and utilities
    '''
    def __init__(self, config_file):
        """Parse the JSON config file and keep its ``base`` and ``es`` sections.

        The parsed mapping is handed to the parent constructor (which stores
        ``base_config``); the ``"es"`` section is stored as ``ch_config``.
        """
        with open(config_file) as json_data_file:
            parsed_config = json.load(json_data_file)
        super(CallHomeZRTPrep, self).__init__(parsed_config)
        self.ch_config = parsed_config["es"]
    '''
    Initialize filemap: filename to integer id mapping
    This id is used to refer to all transcriptions
    '''
    def read_filemap(self):
        """Number the names listed in ``ch_config['filename_map']`` from 1.

        The result is stored in ``filename2i`` (stripped line -> 1-based line
        number); a name that appears twice keeps its last line number.
        """
        self.filename2i = {}
        map_path = self.ch_config['filename_map']
        with open(map_path, "r") as in_f:
            self.filename2i.update(
                (name.strip(), number)
                for number, name in enumerate(in_f, start=1))
    '''
    The fisher-callhome corpus merges some of the vad regions
    These regions need to be merged while generating single channel
    audio files and the names need to be consistent with the 
    transcriptions
    '''
    def read_mapping_for_vad_trans(self):
        """Read the utterance-to-VAD-region mapping file.

        Each non-blank line of ``ch_config["trans_map"]`` holds a source file
        name followed by one or more integer VAD ids joined with ``_``.  For
        every source file its lines are numbered 1, 2, ... in file order and
        two nested dicts are filled:

        * ``vad_map[file][n]``   -- list of the VAD ids on the n-th line
        * ``trans_map[file][n]`` -- zero-based index of that line in the
          mapping file (``create_file_dict`` uses it to index ``trans_lines``)

        Raises:
            IndexError: if a non-blank line has no second field.
            ValueError: if a VAD id is not an integer.
        """
        with open(self.ch_config["trans_map"], "r") as in_f:
            self.trans_map = {}
            self.vad_map = {}
            for line_idx, line in enumerate(in_f):
                line_items = line.split()
                if not line_items:
                    # A blank line maps nothing but still occupies its index.
                    continue
                sp_fil = line_items[0]
                # A real list: the ids are indexed later (vids[0]), which a
                # lazy map() object does not support on Python 3.
                vad_ids = [int(v) for v in line_items[1].split('_')]
                file_vads = self.vad_map.setdefault(sp_fil, {})
                file_trans = self.trans_map.setdefault(sp_fil, {})
                utt_num = len(file_vads) + 1
                file_vads[utt_num] = vad_ids
                file_trans[utt_num] = line_idx
    
    def read_vad_info(self, filename):
        """Parse one transcript file into ``{line number: VadInfo}``.

        Only lines with more than three whitespace-separated fields are
        used; the others are skipped but still counted, so keys are 1-based
        line numbers of the file.  The first two fields become start and end
        after their ``.`` is removed and the rest is read as an integer
        (presumably turning seconds with two decimals into centiseconds --
        TODO confirm).  The third field selects the channel: it must contain
        ``:`` plus ``A`` (channel 1) or ``B`` (channel 2).

        Returns:
            The dict of regions, or an empty dict as soon as a line without a
            recognisable channel marker is met.
        """
        regions = {}
        with open(filename, 'r') as in_f:
            for line_no, raw_line in enumerate(in_f, start=1):
                fields = raw_line.split(None, 3)
                if len(fields) <= 3:
                    continue
                start = int(fields[0].replace('.', ''))
                end = int(fields[1].replace('.', ''))
                speaker = fields[2]
                has_colon = ':' in speaker
                if has_colon and 'A' in speaker:
                    chid = 1
                elif has_colon and 'B' in speaker:
                    chid = 2
                else:
                    print("Channel id not found")
                    return {}
                regions[line_no] = VadInfo(start, end, chid)
        return regions
        
    def get_transcript_path(self, sp_fil):
        """Locate ``<sp_fil>.txt`` under the corpus folder.

        The ``train``, ``evltest`` and ``devtest`` sub-folders of
        ``ch_config["data_path"]`` are checked in that order.

        Returns:
            The first existing path, or ``''`` (after printing a message)
            when the transcript is in none of the sub-folders.
        """
        sp_fil_ext = sp_fil + ".txt"
        corpus_path = self.ch_config["data_path"]
        # A plain loop instead of any()/indexing over map(): on Python 3
        # map() is a one-shot iterator that cannot be indexed afterwards.
        for sub_folder in ("train", "evltest", "devtest"):
            candidate = os.path.join(corpus_path, sub_folder, sp_fil_ext)
            if os.path.isfile(candidate):
                return candidate
        print('%s not found' % sp_fil)
        return ''
        
    def read_file_info(self):
        """Collect VAD regions and source paths for every file in ``filename2i``.

        Fills ``vad_info`` (name -> result of ``read_vad_info``) and
        ``source_wav_full_path`` (name -> transcript path with ``.txt``
        removed).
        """
        self.vad_info = {}
        self.source_wav_full_path = {}
        for name in self.filename2i:
            # Resolve the transcript location first; both entries derive from it.
            transcript_path = self.get_transcript_path(name)
            regions = self.read_vad_info(transcript_path)
            self.vad_info[name] = regions
            self.source_wav_full_path[name] = transcript_path.replace('.txt', '')
    
    def read_translations(self):
        """Load all lines of ``ch_config["trans_file"]`` into ``trans_lines``.

        Lines keep their trailing newline.
        """
        translations_path = self.ch_config["trans_file"]
        with open(translations_path, "r") as in_f:
            self.trans_lines = list(in_f)
    
    def read_transcript(self, filename):
        """Read one alignment file into a list of ``TranscriptInfo``.

        Each line is expected to hold ``token start end`` separated by
        whitespace, with integer start and end.
        """
        entries = []
        with open(filename, "r") as in_f:
            for raw_line in in_f:
                fields = raw_line.strip().split()
                entry = TranscriptInfo(int(fields[1]),
                                       end=int(fields[2]),
                                       word=fields[0])
                entries.append(entry)
        return entries
    
    def read_transcripts(self, folder_path):
        """Read every ``*words`` / ``*phones`` file directly inside a folder.

        Returns:
            Dict mapping each file name without its extension to the list
            returned by ``read_transcript``.
        """
        transcripts = {}
        wanted_suffixes = ("words", "phones")
        for entry in os.listdir(folder_path):
            full_path = os.path.join(folder_path, entry)
            if not os.path.isfile(full_path):
                continue
            if not entry.endswith(wanted_suffixes):
                continue
            key = os.path.splitext(entry)[0]
            transcripts[key] = self.read_transcript(full_path)
        return transcripts
    
    def read_en_words(self):
        """Placeholder with no implementation; calling it has no effect."""
        pass
    
    def filter_es_content_words(self):
        """Build ``es_cnt_words_dict``: ``es_words_dict`` without stop words.

        For every utterance key of ``es_words_dict`` the entries whose
        lower-cased ``word`` is not in NLTK's Spanish stop-word list are
        kept, in their original order.  Progress is printed every 5000
        utterances.
        """
        print("Creating Spanish content words dictionary ...")
        # Fetch the stop words once and keep them in a set; asking NLTK for
        # the list and scanning it for every single token is needlessly slow.
        es_stopwords = set(stopwords.words('spanish'))
        self.es_cnt_words_dict = {}
        for i, sp_fil in enumerate(self.es_words_dict):
            if i % 5000 == 0:
                print("Processed %d speech utterances" % i)

            content_words = []
            for t in self.es_words_dict[sp_fil]:
                token = t.word.lower()
                # Byte strings (Python 2 str) are decoded before comparison;
                # text strings are compared as they are.
                if isinstance(token, bytes):
                    token = token.decode("utf-8")
                if token not in es_stopwords:
                    content_words.append(t)
            self.es_cnt_words_dict[sp_fil] = content_words
        print("Finished generating Spanish content words dict ...")
    
    def read_es_words(self):
        """Load the Spanish word alignments into ``es_words_dict``.

        Reads the folder named by ``ch_config["es_word_path"]``.
        """
        print("Reading Spanish word transcriptions ...")
        words_folder = self.ch_config["es_word_path"]
        self.es_words_dict = self.read_transcripts(words_folder)
        print("Finished reading Spanish transcriptions ...")

    def read_es_phones(self):
        """Load the Spanish phone alignments into ``es_phones_dict``.

        Reads the folder named by ``ch_config["es_phone_path"]``.
        """
        print("Reading Spanish phone transcriptions ...")
        phones_folder = self.ch_config["es_phone_path"]
        self.es_phones_dict = self.read_transcripts(phones_folder)
        print("Finished reading Spanish phone transcriptions ...")
    
    def create_file_dict(self):
        """Build the per-utterance file descriptions and English word lists.

        Requires ``vad_map``, ``trans_map``, ``vad_info``, ``filename2i`` and
        ``trans_lines`` to be loaded.  For every utterance of every source
        file an entry keyed ``"<file id>.<first vad id>"`` (both zero-padded
        to three digits) is written to:

        * ``file_dict``         -- ``FileInfo`` with source name, target path
          below ``ch_config["out_path"]``, start/end/channel and trim pairs
        * ``en_words_dict``     -- lower-cased translation words with ASCII
          punctuation removed
        * ``en_cnt_words_dict`` -- the same words minus NLTK's English stop
          words
        """
        self.file_dict = {}
        self.en_words_dict = {}
        self.en_cnt_words_dict = {}
        # Loaded once instead of once per word.
        en_stopwords = set(stopwords.words('english'))
        punctuation = set(string.punctuation)

        for sp_fil in self.vad_map:
            # Get integer name in 3 digits
            fil_id = "{0:03d}".format(self.filename2i[sp_fil])
            regions = self.vad_info[sp_fil]
            # Iterate by utterance number.  The number is also the key into
            # trans_map, so pairing must not depend on dict iteration order.
            for utt_num, vids in sorted(self.vad_map[sp_fil].items()):
                # Get starting vad id
                vad_id = "{0:03d}".format(vids[0])
                key = "%s.%s" % (fil_id, vad_id)
                if key not in self.file_dict:
                    self.file_dict[key] = FileInfo()

                start_time = regions[vids[0]].start
                # Flat list [start0, end0, start1, end1, ...] in region order.
                trim_pairs_list = []
                for vid in vids:
                    trim_pairs_list.append(regions[vid].start)
                    trim_pairs_list.append(regions[vid].end)

                # If a region ends after the next one starts, move the next
                # start up to that end so the pieces do not overlap.
                for h in range(1, len(trim_pairs_list) - 1, 2):
                    if trim_pairs_list[h] > trim_pairs_list[h + 1]:
                        trim_pairs_list[h + 1] = trim_pairs_list[h]
                        print(sp_fil)
                        print(trim_pairs_list)

                # Total duration of the (possibly adjusted) regions.
                total_time = 0
                for h in range(1, len(trim_pairs_list), 2):
                    total_time += (trim_pairs_list[h] - trim_pairs_list[h - 1])

                # Calculate end time based on duration
                end_time = start_time + total_time
                # Channel id
                chid = regions[vids[0]].chid

                # Set up file info object
                fil_info = self.file_dict[key]
                fil_info.source_file = sp_fil
                fil_info.target_file = os.path.join(self.ch_config["out_path"],
                                                    key)
                fil_info.vad.start = start_time
                fil_info.vad.end = end_time
                fil_info.vad.chid = chid
                fil_info.trim_pairs = tuple(trim_pairs_list)

                # Get translations.  Punctuation is dropped character by
                # character, which works on both byte and text strings.
                en_line = self.trans_lines[self.trans_map[sp_fil][utt_num]]
                en_line = "".join(ch for ch in en_line
                                  if ch not in punctuation)
                en_words = en_line.lower().strip().split()
                self.en_words_dict[key] = en_words

                # Get English content words
                en_content = []
                for word in en_words:
                    text = word.decode("utf-8") if isinstance(word, bytes) else word
                    if text not in en_stopwords:
                        en_content.append(word)
                self.en_cnt_words_dict[key] = en_content
    
    def read_speaker_info(self):
        """Placeholder with no implementation; calling it has no effect."""
        pass
    
    def read_call_info(self):
        """Placeholder with no implementation; calling it has no effect."""
        pass
    
    def create_out_folders(self):
        """Ensure the output folder and its ``tmp`` sub-folder exist.

        Returns:
            Tuple ``(out_path, tmp_path)`` where ``out_path`` is
            ``ch_config["out_path"]`` and ``tmp_path`` is ``out_path/tmp``.
        """
        out_path = self.ch_config["out_path"]
        tmp_path = os.path.join(out_path, "tmp")
        for folder in (out_path, tmp_path):
            if not os.path.exists(folder):
                os.makedirs(folder)
        return out_path, tmp_path
        
    def convert_sph_to_wav(self):
        """Convert every source ``.sph`` recording into per-channel wav files.

        For each entry of ``source_wav_full_path`` the following is written
        into the ``tmp`` output folder:

        1. ``sph2pipe`` converts ``<path>.sph`` to ``<name>.wav``
        2. ``sox`` resamples it to ``<name>_low.wav`` (signed 16-bit,
           2 channels, 8000 Hz, no dither)
        3. ``sox remix`` extracts ``<name>_1.wav`` and ``<name>_2.wav``

        The two intermediate files are deleted afterwards.  Tool exit codes
        are not checked; progress is printed every 10 files and "Finished"
        once at the end.
        """
        _, tmp_path = self.create_out_folders()
        for i, (sp_fil, sp_fil_path) in enumerate(self.source_wav_full_path.items()):
            if i % 10 == 0:
                print("Converted: %d files" % i)
            sph_file = sp_fil_path + ".sph"
            wav_file = os.path.join(tmp_path, sp_fil + ".wav")
            # If file exists, delete it
            if os.path.exists(wav_file):
                os.remove(wav_file)

            # Create wav file
            subprocess.call([self.base_config["sph2pipe"], "-f",
                             "rif", "-p", sph_file, wav_file])

            # Create low res wav file
            low_res_wav_file = os.path.join(tmp_path, sp_fil + "_low.wav")
            subprocess.call([self.base_config["sox"], "-t",
                             "wav", wav_file, "-t", "wav",
                             "-e", "signed-integer", "-b",
                             "16", "-c", "2", "-r", "8000",
                             "--no-dither", low_res_wav_file])

            # Create channel specific wav files
            ch1_wav_file = os.path.join(tmp_path, sp_fil + "_1.wav")
            ch2_wav_file = os.path.join(tmp_path, sp_fil + "_2.wav")
            subprocess.call([self.base_config["sox"],
                             low_res_wav_file, ch1_wav_file,
                             "remix", "1"])
            subprocess.call([self.base_config["sox"],
                             low_res_wav_file, ch2_wav_file,
                             "remix", "2"])

            # Delete the dual channel intermediates.  A tool that failed may
            # not have produced its file; that must not abort the whole batch.
            for intermediate in (wav_file, low_res_wav_file):
                if os.path.exists(intermediate):
                    os.remove(intermediate)
                else:
                    print("Expected file missing: %s" % intermediate)
        # Printed once for the whole run rather than after every file.
        print("Finished")
    
    def gen_wav_files(self):
        """Cut one wav per utterance and write its ``.vad`` and ``.evad`` files.

        Iterates over the keys of ``es_words_dict`` (not ``file_dict``, which
        also contains entries for corrupt recordings) and, for each key:

        1. runs ``sox ... trim`` on the channel wav in the ``tmp`` folder,
           using the entry's trim pairs divided by 100
        2. writes ``<target>.vad`` with the single line ``0 <end - start>``
        3. runs the ZRTools ``mark_energy.py`` script to write
           ``<target>.evad``

        Processing stops silently at the first missing channel wav.  Tool
        exit codes are not checked.
        """
        # Create output folder if it does not exist:
        _, tmp_path = self.create_out_folders()
        energy_script = "../../ZRTools/scripts/mark_energy.py"
        # Counted separately so the final message also works when there is
        # nothing to process (the loop variable would be unbound then).
        num_done = 0
        for i, sp_fil in enumerate(self.es_words_dict):
            # Read file info
            fil_info = self.file_dict[sp_fil]

            if i % 1000 == 0:
                print("Completed: %d files" % i)

            SOXBIN = self.base_config["sox"]
            source_file = fil_info.source_file
            chid = fil_info.vad.chid

            in_wav = os.path.join(tmp_path,
                                  "{0:s}_{1:d}.wav".format(source_file, chid))
            out_wav = fil_info.target_file + ".wav"
            out_vad = fil_info.target_file + ".vad"
            out_evad = fil_info.target_file + ".evad"

            if not os.path.exists(in_wav):
                print("File: %s not found!" % in_wav)
                return

            # Remove existing files
            if os.path.exists(out_wav):
                os.remove(out_wav)
            if os.path.exists(out_vad):
                os.remove(out_vad)

            # Generate list of trim positions: the first is a plain offset,
            # the following ones are prefixed with "=".  Dividing by 100.0
            # keeps the fractional part regardless of the division mode.
            GEN_CMD_BASE = [SOXBIN, in_wav, out_wav, "trim"]
            GEN_CMD_TRIM_INFPO = [str(fil_info.trim_pairs[0] / 100.0)]
            for t in fil_info.trim_pairs[1:]:
                GEN_CMD_TRIM_INFPO.append("={0:.2f}".format(t / 100.0))

            # Create wav
            GEN_CMD = GEN_CMD_BASE + GEN_CMD_TRIM_INFPO
            subprocess.call(GEN_CMD)

            # Create vad
            with open(out_vad, "w") as out_f:
                out_f.write("{0:d} {1:d}\n".format(
                    0, (fil_info.vad.end - fil_info.vad.start)))

            # Create energy based VAD
            subprocess.call(["python", energy_script, "-i", out_wav,
                             "-o", out_evad, "-s", "0.4", "-e", "0"])
            num_done = i + 1
        print("Finished %d files " % num_done)
    
    def get_file_list(self, to_save=False, num_speakers=0, start_speaker=0):
        """Return the sorted wav paths for the utterances in ``es_words_dict``.

        Utterance keys are grouped by the part before their first ``.``.
        Using ``es_words_dict`` means utterances missing from the alignments
        (the corrupted files) are left out.

        Args:
            to_save: when true, also write the list, one path per line, to
                ``ch_config["lst_file"]``.
            num_speakers: when positive, keep only this many groups, taken
                from the sorted list of group prefixes.
            start_speaker: index of the first group to keep in that case.
        """
        wanted_prefixes = {name.split('.')[0] for name in self.es_words_dict.keys()}
        if num_speakers > 0:
            ordered = sorted(wanted_prefixes)
            stop = start_speaker + num_speakers
            wanted_prefixes = set(ordered[start_speaker:stop])

        out_path = self.ch_config["out_path"]
        wav_fil_list = sorted(
            os.path.join(out_path, name + ".wav")
            for name in self.es_words_dict
            if name.split('.')[0] in wanted_prefixes)

        if to_save:
            with open(self.ch_config["lst_file"], "w") as out_f:
                out_f.writelines("%s\n" % path for path in wav_fil_list)

        return wav_fil_list
        

    def save_dicts(self):
        """Pickle the main dictionaries into ``ch_config["dicts_path"]``.

        ``file_dict``, ``en_words_dict``, ``es_words_dict`` and
        ``es_phones_dict`` are each written to the file name given for them
        in ``ch_config["dict_names"]``.  The folder is created if missing.
        """
        dicts_path = self.ch_config["dicts_path"]
        if not os.path.exists(dicts_path):
            os.makedirs(dicts_path)
        # Save dicts
        names_dict = self.ch_config["dict_names"]
        dict_name_map = [(self.file_dict, names_dict["file_info"]),
                         (self.en_words_dict, names_dict["en_words"]),
                         (self.es_words_dict, names_dict["es_words"]),
                         (self.es_phones_dict, names_dict["es_phones"])]
        for payload, fil_name in dict_name_map:
            print("Saving file: %s" % fil_name)
            # The with-block closes (and thereby flushes) the file even if
            # pickling fails.
            with open(os.path.join(dicts_path, fil_name), "wb") as out_f:
                cPickle.dump(payload, out_f)
        print("Finished saving all dictionaries")
    
    def save_state(self):
        """Pickle the prepared dictionaries to ``prep_state.dict``.

        The file is written into ``ch_config["exp_path"]`` and holds one dict
        with the keys ``file_info``, ``es_phones``, ``es_words``,
        ``en_words``, ``es_cnt_words`` and ``en_cnt_words``.
        """
        state_dict = {
            'file_info': self.file_dict,
            'es_phones': self.es_phones_dict,
            'es_words': self.es_words_dict,
            'en_words': self.en_words_dict,
            'es_cnt_words': self.es_cnt_words_dict,
            'en_cnt_words': self.en_cnt_words_dict,
        }

        print("Saving prep state dictionary ...")
        prep_state_dict_path = os.path.join(self.ch_config["exp_path"],
                                            "prep_state.dict")
        # The with-block closes (and thereby flushes) the file even if
        # pickling fails.
        with open(prep_state_dict_path, "wb") as out_f:
            cPickle.dump(state_dict, out_f)
        print("Finished saving prep state dictionary ...")

    
    def load_state(self):
        """Restore the dictionaries saved by ``save_state`` and set up ZRT paths.

        Reads ``prep_state.dict`` from ``ch_config["exp_path"]``, restores the
        six dictionaries, then creates the experiment folders, loads the lst
        file and derives the ZRT output file names.
        """
        print("Loading prep state dictionary ...")
        prep_state_dict_path = os.path.join(self.ch_config["exp_path"],
                                            "prep_state.dict")
        # NOTE(review): unpickling can execute arbitrary code; only load
        # state files produced by this notebook.
        with open(prep_state_dict_path, "rb") as in_f:
            state_dict = cPickle.load(in_f)
        self.file_dict = state_dict['file_info']
        self.es_phones_dict = state_dict['es_phones']
        self.es_words_dict = state_dict['es_words']
        self.en_words_dict = state_dict['en_words']
        self.es_cnt_words_dict = state_dict['es_cnt_words']
        self.en_cnt_words_dict = state_dict['en_cnt_words']

        print("Finished loading prep state dictionary ...")

        # Setting up ZRT base: set the exp path and the lst file
        self.zrt_create_exp_dirs(self.ch_config["exp_path"])
        self.zrt_init_file_list(self.ch_config["lst_file"])
        self.zrt_init_out_folders(self.ch_config['zrt_out_path'])
        
    def something_to_do(self):
        """Run the metadata preparation steps in order and save the result.

        Reads the file map, the VAD/translation mapping, the translations and
        the per-file VAD info, builds the per-utterance dictionaries, loads
        the Spanish word and phone alignments, writes the lst file for 20
        groups starting at index 41, sets up the ZRT experiment paths and
        finally pickles the state with ``save_state``.
        """
        # Read file map
        self.read_filemap()
        self.read_mapping_for_vad_trans()
        self.read_translations()
        self.read_file_info()
        self.create_file_dict()
        self.read_es_words()
        self.filter_es_content_words()
        self.read_es_phones()
        # Return value is not needed; the call is made for the lst file it writes.
        _ = self.get_file_list(to_save=True, num_speakers=20, start_speaker=41)
        # set the exp path and the lst file
        self.zrt_create_exp_dirs(self.ch_config["exp_path"])
        self.zrt_init_file_list(self.ch_config["lst_file"])
        self.zrt_init_out_folders(self.ch_config['zrt_out_path'])
        self.save_state()
    
    def something_else_to_do(self):
        """Convenience wrapper that only calls ``convert_sph_to_wav``."""
        self.convert_sph_to_wav()
    
    def some_more_stuff(self):
        """Convenience wrapper that only calls ``gen_wav_files``."""
        self.gen_wav_files()
        
    def ch_zrt_gen_exp_command(self):
        """Write ``files.base`` and the discovery command files (25 splits)."""
        # The calls to zrt_create_exp_dirs / zrt_init_file_list are disabled
        # here, so the experiment path and file list must already be set.
#         self.zrt_create_exp_dirs(self.ch_config["exp_path"])
#         self.zrt_init_file_list(self.ch_config["lst_file"])
        self.zrt_gen_files_base()
        self.zrt_gen_disc_cmd(num_splits=25)
        
    def ch_zrt_plp(self):  
        """Convenience wrapper that only calls ``zrt_gen_plp_files``."""
        self.zrt_gen_plp_files()
    
    def ch_zrt_lsh(self):
        """Create the LSH projection file, then the per-file LSH signatures."""
        # The projection file must be generated first: zrt_gen_lsh_files reads
        # the b_proj_fil_name attribute that zrt_gen_lsh_proj_file sets.
        self.zrt_gen_lsh_proj_file()
        self.zrt_gen_lsh_files()
        
    def print_config(self):
        """Print the values of the ``base`` and ``es`` config sections.

        Each section's values are printed one per line.  Values are
        formatted with ``%s`` so that non-string entries (for example the
        nested ``dict_names`` mapping) no longer make ``str.join`` raise
        ``TypeError``.
        """
        for section in (self.base_config, self.ch_config):
            print('\n'.join("%s" % (value,) for value in section.values()))
        

# Class CallHomeEval

Evaluate the ZRT on CallHome data

In [10]:
class NodeInfo(object):
    """A segment of one wav file together with its aligned transcript items."""

    def __init__(self, wav_fil, start, end):
        self.wav_fil = wav_fil
        self.start = start
        self.end = end
        # Transcript-derived tuples; empty until assigned from outside.
        self.es_words = ()
        self.es_cnt_words = ()
        self.es_phones = ()
        # Reserved for future use: speaker info and call info.
        self.spkrinfo = 0
        self.callinfo = 0

    def __str__(self):
        # Space-separated "wav_fil start end".
        parts = (self.wav_fil, self.start, self.end)
        return ' '.join(str(part) for part in parts)

    def __repr__(self):
        return str(self)

    def get_es_words(self):
        """Return the aligned words as one space-separated string."""
        separator = ' '
        return separator.join(self.es_words)

    def get_es_phones(self):
        """Return the aligned phones as one space-separated string."""
        separator = ' '
        return separator.join(self.es_phones)

    def get_dur_ms(self):
        """Return ``(end - start) * 10``."""
        span = self.end - self.start
        return span * 10

class PairInfo(object):
    """Placeholder class; the constructor accepts ``param_dict`` and ignores it."""
    def __init__(self, param_dict):
        # Nothing is stored yet.
        pass
    

class CallHomeEval(ZRTPrep):
    '''
    Constructor - read config file
    Path to required files, folders and utilities
    '''
    def __init__(self, config_file):
        """Load the JSON config and run the ZRT directory/file-list set-up.

        config_file: path to a JSON file; the parsed document is handed to the
        parent constructor and its "es" section is kept as ``self.ch_config``.
        """
        with open(config_file) as config_handle:
            parsed_config = json.load(config_handle)
        super(CallHomeEval, self).__init__(parsed_config)
        self.ch_config = parsed_config["es"]

        # Experiment directories first, then the list file, then output folders
        exp_path = self.ch_config["exp_path"]
        self.zrt_create_exp_dirs(exp_path)
        lst_file = self.ch_config["lst_file"]
        self.zrt_init_file_list(lst_file)
        zrt_out_path = self.ch_config['zrt_out_path']
        self.zrt_init_out_folders(zrt_out_path)
    
    def find_aligned_words(self, start, end, transcript_words):
        """Return the transcript entries whose time span touches [start, end].

        The first parameter used to be called ``wav_fil`` but, because this is
        an instance method invoked as ``self.find_aligned_words(start, end,
        words)``, it always received the instance; it is now named ``self``.
        Call sites are unaffected.

        start, end: bounds of the query interval (inclusive).
        transcript_words: iterable of objects exposing ``start``, ``end`` and
            ``word`` attributes.
        Returns a tuple with the ``word`` attribute of every matching entry,
        in input order.
        """
        aligned_words = []
        for word in transcript_words:
            # Either end of one interval lies inside the other (bounds inclusive)
            if (start <= word.start <= end) \
            or (start <= word.end <= end) \
            or (word.start <= start <= word.end) \
            or (word.start <= end <= word.end):
                aligned_words.append(word.word)
        return tuple(aligned_words)
    
    def read_nodes(self):
        """Build ``self.node_dict`` from the whitespace-separated nodes file.

        Each line supplies an utterance id followed by integer start and end
        values. Nodes are keyed by their 1-based line number and annotated with
        the ES words, ES content words and ES phones overlapping the segment.
        """
        self.node_dict = {}
        with open(self.nodes_file, "r") as nodes_handle:
            for node_id, raw_line in enumerate(nodes_handle, start=1):
                if node_id % 100000 == 0:
                    print("Processing line: %d" % node_id)
                fields = raw_line.strip().split()
                utt_id = fields[0]
                seg_start = int(fields[1])
                seg_end = int(fields[2])
                segment = NodeInfo(wav_fil=utt_id, start=seg_start, end=seg_end)
                # Attach every transcript tier that overlaps the segment
                segment.es_words = self.find_aligned_words(
                    seg_start, seg_end, self.es_words_dict[utt_id])
                segment.es_cnt_words = self.find_aligned_words(
                    seg_start, seg_end, self.es_cnt_words_dict[utt_id])
                segment.es_phones = self.find_aligned_words(
                    seg_start, seg_end, self.es_phones_dict[utt_id])
                self.node_dict[node_id] = segment

        print("Finished - reading nodes ...")

    
    def read_edges(self):
        """Load cluster membership and discovered edges from the ZRT output.

        Reads ``self.clusters_file`` (each line: a representative node id
        followed by the other members of its cluster) and ``self.edges_file``
        (each line: two node ids and an integer score, which is divided by
        1000). Every edge endpoint is replaced by its cluster representative.

        Sets:
            self.clusters_dict: node id -> representative node id.
            self.edges_dict: adjacency mapping, node -> {neighbour: score};
                each edge is stored in both directions.
            self.pairs_list: sorted list of unique (smaller id, larger id)
                tuples, one per discovered pair.

        Blank lines in either file are skipped, and the cluster line is parsed
        into a real list so it can be indexed on Python 3 as well as Python 2.
        """
        clusters_dict = {}
        # Pairs list is used for scoring discovered pairs: one tuple per pair
        self.pairs_list = []
        # Edges dict is used for label spreading/propagation: it holds an entry
        # for each node of a pair and serves as an adjacency list
        self.edges_dict = {}

        # process clusters file
        with open(self.clusters_file, "r") as in_f:
            for line in in_f:
                members = [int(token) for token in line.split()]
                if not members:
                    continue
                representative = members[0]
                for member in members:
                    clusters_dict[member] = representative

        # Read edges dict
        with open(self.edges_file, "r") as in_f:
            for i, line in enumerate(in_f):
                if i % 100000 == 0:
                    print("Processing line: %d" % (i+1))
                line_items = line.split()
                if not line_items:
                    continue
                dtw_val = float(line_items[2]) / 1000.0
                # Nodes missing from the clusters file are their own representative
                raw_1 = int(line_items[0])
                raw_2 = int(line_items[1])
                node_1 = clusters_dict.setdefault(raw_1, raw_1)
                node_2 = clusters_dict.setdefault(raw_2, raw_2)

                # Add to pairs list as a tuple
                self.pairs_list.append((min(node_1, node_2), max(node_1, node_2)))

                # Add to edges dict, in both directions
                self.edges_dict.setdefault(node_1, {})
                self.edges_dict.setdefault(node_2, {})
                self.edges_dict[node_1][node_2] = dtw_val
                self.edges_dict[node_2][node_1] = dtw_val

        print("Finished - reading edges ...")
        self.clusters_dict = clusters_dict

        print("Removing duplicates in pairs list")
        set_pairs = set(self.pairs_list)
        print("Set length: %d and List length: %d" %(len(set_pairs), len(self.pairs_list)))
        self.pairs_list = sorted(set_pairs)
        
    def similarity_edit_distance(self, w_list1, w_list2):
        """Return ``(similarity, distance)`` for two sequences.

        ``distance`` is ``editdistance.eval`` of the inputs and ``similarity``
        is ``(longest - distance) / longest``, where ``longest`` is the length
        of the longer input. When both inputs are empty the result is
        ``(0, -1)``.
        """
        distance = editdistance.eval(w_list1, w_list2)
        longest = max(len(w_list1), len(w_list2))
        if longest <= 0:
            # Nothing to compare: flag with a distance of -1
            return 0, -1
        similarity = (longest - distance) / (longest * 1.0)
        return similarity, distance
    
    def have_words_in_common(self, w_list1, w_list2):
        """Return 1 if the two collections share at least one item, else 0."""
        shared = set(w_list1).intersection(w_list2)
        return 1 if shared else 0
        
    def eval_pairs(self):
        """Score every discovered pair and build the evaluation data frames.

        For each ``(n1, n2)`` in ``self.pairs_list`` a dict is created holding
        the node ids, utterance ids, channel ids, the ZRT similarity, the
        shorter of the two node durations, ES word / content-word matches, the
        ES phone edit similarity, the English-translation similarities at
        time 0, the joined ES words of both nodes, and two flags:
        ``no_mtch`` (one side has no aligned transcript entries) and
        ``sil_only`` (one side aligned only to 'sil'/'sp').

        Sets ``self.eval_pairs_list`` (list of dicts), ``self.eval_df_full``
        (all pairs) and ``self.eval_df`` (pairs with neither flag set).

        An empty pairs list is handled: the summary no longer relies on the
        loop variable and the filter is skipped when there are no columns.
        """
        non_speech = set(['sil', 'sp'])
        total_pairs = len(self.pairs_list)
        self.eval_pairs_list = []
        no_match_count = 0
        sil_match_count = 0
        # loop through the pairs list
        for i, (n1, n2) in enumerate(self.pairs_list):
            if i % 3000 == 0:
                print("Processing line: %d" % (i+1))
            node_1 = self.node_dict[n1]
            node_2 = self.node_dict[n2]
            eval_dict = {}
            # Add node ids
            eval_dict['n1'] = n1
            eval_dict['n2'] = n2
            # Add speech files and their channel ids
            eval_dict['uid1'] = node_1.wav_fil
            eval_dict['uid2'] = node_2.wav_fil
            eval_dict['chid1'] = self.file_info_dict[node_1.wav_fil].vad.chid
            eval_dict['chid2'] = self.file_info_dict[node_2.wav_fil].vad.chid
            # Add similarity based on DTW from the ZRT output
            eval_dict['zrt_sim'] = self.edges_dict[n1][n2]
            # Use the minimum duration from the two nodes
            eval_dict['dur'] = min(node_1.get_dur_ms(), node_2.get_dur_ms())

            # Score over the es words, ignoring silence / short-pause labels
            esw_1 = node_1.es_words
            esw_2 = node_2.es_words
            set_esw_1 = set(esw_1) - non_speech
            set_esw_2 = set(esw_2) - non_speech
            eval_dict['es_w_sim'] = self.have_words_in_common(set_esw_1, set_esw_2)

            # Score over content es words
            content_words_n1 = set(node_1.es_cnt_words) - non_speech
            content_words_n2 = set(node_2.es_cnt_words) - non_speech
            eval_dict['cnt_es_w_sim'] = self.have_words_in_common(content_words_n1,
                                                                  content_words_n2)
            eval_dict['cnt_es_w_check'] = ((len(content_words_n1) > 0) and
                                           (len(content_words_n2) > 0))

            # Edit distance score over es phones
            eval_dict['es_p_sim'], _ = self.similarity_edit_distance(node_1.es_phones,
                                                                     node_2.es_phones)

            # Similarity based on english translations at time 0
            en_sim = self.en_w_hgr_sim_0[n1][n2]
            eval_dict['en_w_hgr_sim_0'] = en_sim
            eval_dict['en_w_count_sim_0'] = (0 if en_sim == 0 else 1)

            # Similarity based on english content-word translations at time 0
            en_cnt_sim = self.en_w_cnt_hgr_sim_0[n1][n2]
            eval_dict['en_w_cnt_hgr_sim_0'] = en_cnt_sim
            eval_dict['en_w_cnt_count_sim_0'] = (0 if en_cnt_sim == 0 else 1)

            # add es words
            eval_dict['es_w_n1'] = ' '.join(esw_1)
            eval_dict['es_w_n2'] = ' '.join(esw_2)

            # At least one node has no aligned transcript entries at all
            missing_transcript = (len(esw_1) == 0) or (len(esw_2) == 0)
            if missing_transcript:
                no_match_count += 1
            eval_dict['no_mtch'] = missing_transcript

            # At least one node aligned to something, but only to sil / sp
            silence_only = ((len(set(esw_1)) > 0) and (len(set_esw_1) == 0)) or \
                           ((len(set(esw_2)) > 0) and (len(set_esw_2) == 0))
            if silence_only:
                sil_match_count += 1
            eval_dict['sil_only'] = silence_only

            # Add eval pair
            self.eval_pairs_list.append(eval_dict)

        print("Total matches: {0:d}".format(total_pairs))
        print("matches with missing transcriptions: {0:d}".format(no_match_count))
        print("matches with only sil, sp: {0:d}".format(sil_match_count))
        print("Finished - evaluating %d pairs ..." % total_pairs)
        print("Setting up eval data frame")
        self.eval_df_full = pd.DataFrame(self.eval_pairs_list)
        if total_pairs == 0:
            # No rows means no columns, so the flag filter cannot be applied
            self.eval_df = self.eval_df_full.copy()
        else:
            self.eval_df = self.eval_df_full[(self.eval_df_full['no_mtch'] == False) & \
                                             (self.eval_df_full['sil_only'] == False)]
        print("Finished ...")
    
#     def play_audio_segment(self, fil, start, end):
#         start_time = str(start)
#         end_time = str(end)
#         start_time = start_time[:-2] + '.' + start_time[-2:]
#         end_time = "=" + end_time[:-2] + '.' + end_time[-2:]
#         play_params = "play " + fil + ' ' + 'trim ' + \
#                                     start_time + " " + end_time
#         print(play_params)
#         subprocess.call(["play", fil, \
#                          "trim", start_time, end_time])
    
#     def play_nodes_pair(self, indx):
#         n1 = self.node_dict[self.eval_pairs_list[indx]['n1']]
#         print("Playing node 1: %s" % str(n1))
#         print(self.es_words_dict[n1.wav_fil])
#         self.play_audio_segment(self.file_info_dict[n1.wav_fil].target_file+".wav", n1.start, n1.end)
#         time.sleep(0.5)
#         n2 = self.node_dict[self.eval_pairs_list[indx]['n2']]
#         print("Playing node 2: %s" % str(n2))
#         print(self.es_words_dict[n2.wav_fil])
#         self.play_audio_segment(self.file_info_dict[n2.wav_fil].target_file+".wav", n2.start, n2.end)
#         print("Finished playing ...")
    
    def gen_segment_wav(self, fil, start, end, out_fil):
        """Cut a segment out of ``fil`` into ``out_fil`` with ``sox ... trim``.

        ``start`` and ``end`` are divided by 100 and formatted with two
        decimals; the end position is prefixed with "=" before being handed
        to sox. The exit status of sox is not checked.
        """
        trim_from = "{0:0.2f}".format(start / 100.0)
        trim_to = "={0:0.2f}".format(end / 100.0)
        sox_command = ["sox", fil, out_fil, "trim", trim_from, trim_to]
        subprocess.call(sox_command)
    
    
    def play_node_wav(self, nid, detail=False):
        """Print details for one node and return an audio widget for its segment.

        nid: key into ``self.node_dict``.
        detail: when true, also print the full ES transcript table of the
            node's utterance.
        The segment wav is written to ``<zrt_out_path>/wavs/n<nid>.wav`` unless
        that file already exists. Returns ``IPython.display.Audio`` for it.
        """
        print("Playing node id: %d" % nid)
        wav_dir = os.path.join(self.ch_config["zrt_out_path"], "wavs")
        if not os.path.exists(wav_dir):
            os.makedirs(wav_dir)

        node = self.node_dict[nid]

        # Transcript of the node next to the translation of its whole utterance
        text_table = PrettyTable(["node id", "ES transcript", "EN translation"], hrules=True)
        text_table.padding_width = 1
        transcript = ' '.join(node.es_words)
        translation = textwrap.fill(' '.join(self.en_words_dict[node.wav_fil]),50)
        text_table.add_row([nid, transcript, translation])

        print("Transcript + Translation details")
        print(text_table)
        print("Node details")

        channel_id = self.file_info_dict[node.wav_fil].vad.chid
        utterance_id = node.wav_fil

        timing_table = PrettyTable(["node id", "uttrnce id", "spk id", "start(ms)", "end(ms)", "dur(ms)"], hrules=True)
        timing_table.add_row([nid, utterance_id, channel_id,
                              node.start*10, node.end*10, ((node.end - node.start)*10)])
        print(timing_table)

        if detail:
            print("ES transcript for node: %d" % nid)
            print(self.print_es_words(nid))

        # Reuse a previously generated segment file when there is one
        segment_path = os.path.join(wav_dir, "n{0:d}.wav".format(nid))
        if not os.path.exists(segment_path):
            source_wav = self.file_info_dict[node.wav_fil].target_file+".wav"
            self.gen_segment_wav(source_wav, node.start, node.end, segment_path)
        return IPython.display.Audio(segment_path)
    
    def play_pair_wav(self, pair_id, detail=False):
        """Print details for an evaluated pair and return audio of both segments.

        pair_id: index into ``self.eval_pairs_list``.
        detail: when true, also print the full ES transcript tables of both
            utterances.

        The merged audio (node 1, 0.5 s of padding, node 2) is written to
        ``<zrt_out_path>/wavs/p<pair_id>.wav`` unless it already exists.
        Returns ``IPython.display.Audio`` for that file.

        The two per-node wav files are temporary: they are removed, the pipe to
        the padding process is closed and that process is reaped even when the
        merge step raises ``subprocess.CalledProcessError``.
        """
        print("Playing evaluation pair id: %d" % pair_id)
        pair_wavs_path = os.path.join(self.ch_config["zrt_out_path"], "wavs")
        out_fil = os.path.join(self.ch_config["zrt_out_path"], "wavs", "p{0:d}.wav".format(pair_id))

        if not os.path.exists(pair_wavs_path):
            os.makedirs(pair_wavs_path)

        eval_pid_dict = self.eval_pairs_list[pair_id]
        n1 = self.node_dict[eval_pid_dict['n1']]
        n2 = self.node_dict[eval_pid_dict['n2']]

        out_table_1 = PrettyTable(["node id", "ES transcript", "EN translation"], hrules=True)
        out_table_2 = PrettyTable(["Similarity Type", "Similarity Value"], hrules=True)

        out_table_1.padding_width = 1
        out_table_2.padding_width = 1

        # Node transcript next to the translation of the node's whole utterance
        out_table_1.add_row([eval_pid_dict['n1'], eval_pid_dict['es_w_n1'],\
                             textwrap.fill(' '.join(self.en_words_dict[n1.wav_fil]),50)])
        out_table_1.add_row([eval_pid_dict['n2'], eval_pid_dict['es_w_n2'],\
                             textwrap.fill(' '.join(self.en_words_dict[n2.wav_fil]),50)])

        out_table_2.add_row(["ZRT", "{0:0.3f}".format(eval_pid_dict['zrt_sim'])])
        out_table_2.add_row(["ES content* word match", eval_pid_dict['cnt_es_w_sim']])
        out_table_2.add_row(["ES word match", eval_pid_dict['es_w_sim']])
        out_table_2.add_row(["ES phoneme Edit Similarity", "{0:0.3f}".format(eval_pid_dict['es_p_sim'])])
        out_table_2.add_row(["EN content* word match", eval_pid_dict['en_w_cnt_count_sim_0']])
        out_table_2.add_row(["EN content* words Hellinger Similarity", \
                             "{0:0.3f}".format(eval_pid_dict['en_w_cnt_hgr_sim_0'])])
        out_table_2.add_row(["EN word match", eval_pid_dict['en_w_count_sim_0']])
        out_table_2.add_row(["EN words Hellinger Similarity", "{0:0.3f}".format(eval_pid_dict['en_w_hgr_sim_0'])])

        print("Pair text details")
        print(out_table_1)
        print("Similarity metrics")
        print(out_table_2)
        print("Node details")

        out_table_3 = PrettyTable(["node id", "uttrnce id", "spk id", "start(ms)", "end(ms)", "dur(ms)"], hrules=True)
        out_table_3.add_row([eval_pid_dict['n1'], eval_pid_dict["uid1"], \
                             eval_pid_dict["chid1"], n1.start*10, n1.end*10, ((n1.end - n1.start)*10)])
        out_table_3.add_row([eval_pid_dict['n2'], eval_pid_dict["uid2"], \
                             eval_pid_dict["chid2"], n2.start*10, n2.end*10, ((n2.end - n2.start)*10)])

        print(out_table_3)

        if detail:
            print("ES transcript for node: %d" % eval_pid_dict['n1'])
            print(self.print_es_words(eval_pid_dict['n1']))
            print("\nES transcript for node: %d" % eval_pid_dict['n2'])
            print(self.print_es_words(eval_pid_dict['n2']))

        if not os.path.exists(out_fil):
            out_fil_n1 = os.path.join(pair_wavs_path, "n{0:d}.wav".format(eval_pid_dict['n1']))
            out_fil_n2 = os.path.join(pair_wavs_path, "n{0:d}.wav".format(eval_pid_dict['n2']))
            try:
                self.gen_segment_wav(self.file_info_dict[n1.wav_fil].target_file+".wav", \
                                     n1.start, n1.end, out_fil_n1)
                self.gen_segment_wav(self.file_info_dict[n2.wav_fil].target_file+".wav", \
                                     n2.start, n2.end, out_fil_n2)
                # Equivalent shell pipeline:
                #   sox n1.wav -p pad 0 0.5 | sox - n2.wav out.wav
                ps = subprocess.Popen(("sox", out_fil_n1, "-p", "pad", "0", "0.5"), stdout=subprocess.PIPE)
                try:
                    subprocess.check_output(("sox", "-", out_fil_n2, out_fil), stdin=ps.stdout)
                finally:
                    # Release the pipe and reap the first sox even on failure
                    ps.stdout.close()
                    ps.wait()
            finally:
                # The per-node files are only intermediates for the merge
                for tmp_wav in (out_fil_n1, out_fil_n2):
                    if os.path.exists(tmp_wav):
                        os.remove(tmp_wav)
        return IPython.display.Audio(out_fil)
    
    def print_es_words(self, nid):
        """Return (not print) a table of the ES transcript of the node's utterance.

        One row per transcript entry: start*10, end*10 and the word itself.
        """
        word_table = PrettyTable(["start(ms)", "end(ms)", "word"], hrules=True)
        utterance_id = self.node_dict[nid].wav_fil
        for entry in self.es_words_dict[utterance_id]:
            row = [entry.start*10, entry.end*10, entry.word]
            word_table.add_row(row)
        return word_table
    
    def play_node_source_wav(self, nid):
        """Print the path of the node's full source wav and return an audio widget.

        nid: key into ``self.node_dict``.
        Returns ``IPython.display.Audio`` for ``<target_file>.wav`` of the
        utterance the node belongs to.

        The message previously used ``%d`` with a string path, which raised
        ``TypeError`` before any audio could be returned; it now uses ``%s``.
        """
        source_wav = self.file_info_dict[self.node_dict[nid].wav_fil].target_file + ".wav"
        print("File: %s" % source_wav)
        return IPython.display.Audio(source_wav)
    
    def save_state(self):
        """Pickle the post-processing state to ``<zrt_out_path>/post_state.dict``.

        The saved dict holds the nodes, edges, pairs list, evaluated pairs,
        the word/index mappings and the English vocabulary statistics. The
        output file is opened in a ``with`` block so the handle is flushed and
        closed even if pickling fails.
        """
        state_dict = {
            'nodes': self.node_dict,
            'edges': self.edges_dict,
            'pairs_list': self.pairs_list,
            'eval_pairs': self.eval_pairs_list,
            'w2i': self.w2i,
            'i2w': self.i2w,
            'en_words_stats': self.en_words_stats,
            'en_vocab': self.en_vocab,
        }

        print("Saving state dictionary ...")
        post_state_dict_path = os.path.join(self.ch_config["zrt_out_path"], "post_state.dict")
        with open(post_state_dict_path, "wb") as out_f:
            cPickle.dump(state_dict, out_f)
        print("Finished saving state dictionary ...")

    
    def load_state(self, full=False):
        """Restore pickled state from disk.

        Always loads ``<exp_path>/prep_state.dict`` (file info, ES phones, EN
        words, ES words and the ES/EN content-word dictionaries).

        full: when true, additionally loads
            ``<zrt_out_path>/post_state.dict`` (nodes, edges, pairs, evaluated
            pairs, vocabulary), rebuilds ``self.eval_df_full`` /
            ``self.eval_df`` and, as a side effect, pickles the full data
            frame to ``<zrt_out_path>/zrt.df``.

        Both pickle files are opened in ``with`` blocks so the handles are
        closed promptly.
        """
        # NOTE(review): unpickling can execute arbitrary code -- only load
        # state files produced by this pipeline.
        print("Loading prep state dictionary ...")
        prep_state_dict_path = os.path.join(self.ch_config["exp_path"], "prep_state.dict")
        with open(prep_state_dict_path, "rb") as in_f:
            state_dict = cPickle.load(in_f)
        self.file_info_dict = state_dict['file_info']
        self.es_phones_dict = state_dict['es_phones']
        self.en_words_dict = state_dict['en_words']
        self.es_words_dict = state_dict['es_words']
        self.es_cnt_words_dict = state_dict['es_cnt_words']
        self.en_cnt_words_dict = state_dict['en_cnt_words']
        print("Finished loading prep state dictionary ...")

        if full:
            print("Loading post state dictionary ...")
            post_state_dict_path = os.path.join(self.ch_config["zrt_out_path"], "post_state.dict")
            with open(post_state_dict_path, "rb") as in_f:
                state_dict = cPickle.load(in_f)
            self.node_dict = state_dict['nodes']
            self.edges_dict = state_dict['edges']
            self.pairs_list = state_dict['pairs_list']
            self.eval_pairs_list = state_dict['eval_pairs']
            self.w2i = state_dict['w2i']
            self.i2w = state_dict['i2w']
            self.en_words_stats = state_dict['en_words_stats']
            self.en_vocab = state_dict['en_vocab']

            print("Finished loading post state dictionary ...")

            print("Setting up eval data frame ...")
            self.eval_df_full = pd.DataFrame(self.eval_pairs_list)
            # Keep only pairs where both nodes have a real (non-silence) transcript
            self.eval_df = self.eval_df_full[(self.eval_df_full['no_mtch'] == False) & \
                                             (self.eval_df_full['sil_only'] == False)]
            print("Saving dataframe ...")
            df_filename = os.path.join(self.ch_config["zrt_out_path"], "zrt.df")
            self.eval_df_full.to_pickle(df_filename)
            print("Finished saving dataframe ...")
        print("Finished ...")
    
    def eval_precision_recall(self, sim_col='zrt_sim', step=0.005):
        """Sweep a threshold over ``sim_col`` and compute precision/recall curves.

        A pair counts as "good" when its ``cnt_es_w_sim`` is > 0 and as "bad"
        when it equals 0. Thresholds run from ``min - step`` up to (but not
        including) ``max + step`` in increments of ``step``; a pair is
        retrieved at a threshold when its similarity is >= that threshold.

        sim_col: column of ``self.eval_df`` holding the similarity to sweep.
        step: threshold increment.

        Returns ``(precision, recall, inverse_recall)`` as numpy arrays, one
        value per threshold. ``inverse_recall`` is 1 - (bad pairs below the
        threshold / all bad pairs), i.e. the fraction of bad pairs retrieved.

        Ratios whose denominator is zero (nothing retrieved, no good pairs, or
        no bad pairs) are reported as 0.0 instead of raising
        ``ZeroDivisionError``.
        """
        # Hoist the column extraction out of the threshold loop
        sims = self.eval_df[sim_col].values
        is_good = (self.eval_df['cnt_es_w_sim'] > 0).values
        is_bad = (self.eval_df['cnt_es_w_sim'] == 0).values

        min_sim = self.eval_df[sim_col].min()
        max_sim = self.eval_df[sim_col].max()

        # Get the full count of good and bad scores
        num_good_scores = int(np.count_nonzero(is_good))
        num_bad_scores = int(np.count_nonzero(is_bad))
        print("# eval scores: %d" % len(self.eval_df))
        print("# good scores: %d" % num_good_scores)
        print("#  bad scores: %d" % num_bad_scores)

        # Generate a range of threshold values
        thresh_array = np.arange(min_sim-step, max_sim+step, step)
        precision_array = np.zeros(len(thresh_array))
        recall_array = np.zeros(len(thresh_array))
        inverse_recall_array = np.zeros(len(thresh_array))
        for i, thresh in enumerate(thresh_array):
            meets_thresh = sims >= thresh
            # Count of pairs which meet the threshold
            num_thresh_scores = int(np.count_nonzero(meets_thresh))
            # Count of retrieved pairs which are good (content word match)
            num_good_thresh_scores = int(np.count_nonzero(meets_thresh & is_good))
            if num_thresh_scores > 0:
                precision_array[i] = (num_good_thresh_scores * 1.0) / num_thresh_scores
            if num_good_scores > 0:
                recall_array[i] = (num_good_thresh_scores * 1.0) / num_good_scores

            # Bad pairs correctly rejected at this threshold
            num_bad_scores_thresh = int(np.count_nonzero((sims < thresh) & is_bad))
            if num_bad_scores > 0:
                inverse_recall_array[i] = \
                    1.0 - ((num_bad_scores_thresh * 1.0) / num_bad_scores)

        # np.trapz was renamed to np.trapezoid in NumPy 2.0; use whichever exists
        integrate = getattr(np, "trapezoid", None) or np.trapz
        print('Average Precision - using sim: %s' % sim_col)
        print('-' * 40)
        print('\t%0.5f' %
              integrate(precision_array[::-1], recall_array[::-1]))
        print('-' * 40)

        return precision_array, recall_array, inverse_recall_array
    
    def plot_precision_recall(self, both=False):
        """Draw threshold, precision-recall and ROC-style plots on one figure.

        both: when true, a second set of curves is computed from the
            'sim mixed' column and drawn next to / on top of the default
            ('zrt_sim') curves, labelled "interpolated" / "interp".

        The figure uses a 3x2 grid: row 0 shows precision, recall and inverse
        recall per threshold index, row 1 the precision-recall curve and
        row 2 recall against inverse recall (axes labelled TPR / FPR).
        The pyplot calls below act on the current axes, so their order matters.
        """
        fig = plt.figure()
        plt.tick_params(which='both', labelsize=14)
        fig.set_size_inches(18.5, 10.5)
        
        # Curves for the default similarity column
        p_vals, r_vals, ir_vals = self.eval_precision_recall()
        if both:
            # NOTE(review): assumes eval_df has a 'sim mixed' column -- confirm
            p_i_vals, r_i_vals, ir_i_vals = \
                        self.eval_precision_recall(sim_col='sim mixed')
        

        # Row 0, left: metrics against threshold index for the default column
        ax_0 = plt.subplot2grid((3,2),(0, 0))
        ax_0.plot(p_vals, label='Precision')
        ax_0.plot(r_vals, label='Recall')
        ax_0.plot(ir_vals, label='Inverse Recall')
        ax_0.set_ylabel("original", fontsize=20)
        ax_0.set_xlabel("Threshold", fontsize=20)
        plt.tick_params(which='both', labelsize=14)
        ax_0.legend(fontsize=14)
        
        if both:
            # Row 0, right: the same metrics for the 'sim mixed' column
            ax_i = plt.subplot2grid((3,2),(0, 1))
            ax_i.plot(p_i_vals, label='Precision')
            ax_i.plot(r_i_vals, label='Recall')
            ax_i.plot(ir_i_vals, label='Inverse Recall')
            ax_i.set_ylabel("interpolated", fontsize=20)
            ax_i.set_xlabel("Threshold", fontsize=20)
            plt.tick_params(which='both', labelsize=14)
            ax_i.legend(fontsize=14)
#         plt.legend(fontsize=14)
        
        # Row 1: precision against recall, spanning both columns
        ax_pr = plt.subplot2grid((3,2),(1, 0), colspan=2)
        ax_pr.plot(r_vals, p_vals, label='orig')
        if both:
            ax_pr.plot(r_i_vals, p_i_vals, label='interp')
        ax_pr.set_ylabel("Precision", fontsize=20)
        ax_pr.set_xlabel("Recall", fontsize=20)
        plt.xlim(0.0,1.0)
        plt.ylim(0.0,1.0)
        plt.tick_params(which='both', labelsize=14)
        ax_pr.legend(fontsize=14)
#         plt.legend(fontsize=14)

        # Row 2: recall (y, "TPR") against inverse recall (x, "FPR")
        ax_roc = plt.subplot2grid((3,2),(2, 0), colspan=2)
        ax_roc.plot(ir_vals, r_vals, label='orig')
        if both:
            ax_roc.plot(ir_i_vals, r_i_vals, label='interp')
        ax_roc.set_ylabel("TPR", fontsize=20)
        ax_roc.set_xlabel("FPR", fontsize=20)
        plt.xlim(0.0,1.0)
        plt.ylim(0.0,1.0)
        plt.tick_params(which='both', labelsize=14)
        ax_roc.legend(fontsize=14)
#         plt.legend(fontsize=14)

        fig.tight_layout()
    
    def gen_vocab(self):
        """Build the English vocabulary from the utterances that contain nodes.

        The English words of each node's utterance are pooled once per node
        (an utterance with several nodes contributes several times). Sets
        ``self.en_words_stats`` (Counter of the pooled words),
        ``self.en_vocab`` (set of distinct words) and the ``self.w2i`` /
        ``self.i2w`` word<->integer mappings.
        """
        print("Generating vocabulary of English words ... ")
        pooled_words = [
            word
            for node_info in self.node_dict.values()
            for word in self.en_words_dict[node_info.wav_fil]
        ]

        self.en_words_stats = Counter(pooled_words)
        self.en_vocab = set(pooled_words)

        # Number the vocabulary in set iteration order; i2w is the inverse map
        self.w2i = dict((word, index) for index, word in enumerate(self.en_vocab))
        self.i2w = dict((index, word) for word, index in self.w2i.items())
        print("Finished vocab ...")
    
    def gen_en_cnt_w_dict(self):
        """Build ``self.en_cnt_words_dict``: English words minus NLTK stopwords.

        For every utterance in ``self.en_words_dict`` the words whose
        lower-cased form is not an English stopword are kept, in their
        original order and spelling.

        The stopword list is loaded once into a set instead of being re-read
        from the corpus for every word, and byte strings are decoded as UTF-8
        only when needed so already-decoded text is accepted too.
        """
        print("Generating dictionary of English content words ... ")
        # Loop-invariant: load once and use O(1) membership tests
        english_stopwords = set(stopwords.words('english'))

        self.en_cnt_words_dict = {}
        for i, sp_fil in enumerate(self.en_words_dict):
            if i % 5000 == 0:
                print("Processed %d speech utterances" % i)
            content_words = []
            for word in self.en_words_dict[sp_fil]:
                lowered = word.lower()
                if isinstance(lowered, bytes):
                    lowered = lowered.decode("utf-8")
                if lowered not in english_stopwords:
                    content_words.append(word)

            self.en_cnt_words_dict[sp_fil] = content_words

        print("Finished generating English content word dict ...")
        
    def gen_en_words_belief(self):
        """Initialise each node's belief over English words.

        For every node, the English words (and, separately, the English
        content words) of its utterance are turned into a distribution keyed
        by word index (``self.w2i``): each word gets its relative frequency in
        the utterance. Results go to ``self.en_w_belief`` and
        ``self.en_w_cnt_belief``; an utterance without words yields ``{}``.

        Relative frequencies are used so that a word occurring several times
        keeps its full share. Previously each distinct word received 1/N once,
        so beliefs of utterances with repeated words summed to less than 1.
        """
        def relative_frequencies(words):
            # Counter merges repeats; an empty list gives {} without dividing
            total = float(len(words))
            return dict((self.w2i[word], count / total)
                        for word, count in Counter(words).items())

        print("Generating initial belief over English translations")
        self.en_w_belief = {}
        self.en_w_cnt_belief = {}
        for i, (nid, node_info) in enumerate(self.node_dict.items()):
            if i % 100000 == 0:
                print("Processed %d nodes" % (i+1))
            self.en_w_belief[nid] = \
                relative_frequencies(self.en_words_dict[node_info.wav_fil])
            # Same, restricted to the content words of the utterance
            self.en_w_cnt_belief[nid] = \
                relative_frequencies(self.en_cnt_words_dict[node_info.wav_fil])
            
    def similarity_hellinger(self, p1_dict, p2_dict):
        """Return ``1 - sqrt(max(0, 1 - BC))`` for two sparse distributions.

        ``BC`` is the sum of ``sqrt(p1[k] * p2[k])`` over the keys present in
        both dicts. The ``max`` guards against a slightly negative value under
        the square root.
        """
        shared_keys = set(p1_dict.keys()) & set(p2_dict.keys())
        overlap = sum((math.sqrt(p1_dict[k] * p2_dict[k]) for k in shared_keys), 0.0)
        distance = math.sqrt(max(0.0, (1.0 - overlap)))
        return 1.0 - distance
    
    def similarity_jaccard(self, p1_dict, p2_dict):
        """Return |shared keys| / |all keys| of the two dicts, or 0 if both are empty."""
        keys_1 = set(p1_dict.keys())
        keys_2 = set(p2_dict.keys())
        all_keys = keys_1 | keys_2
        if len(all_keys) == 0:
            return 0
        shared_keys = keys_1 & keys_2
        return len(shared_keys) / float(len(all_keys))
    
    def similarity_en_count(self, p1, p2):
        """Return 1 if the two dicts share at least one key, else 0."""
        shared_keys = set(p1.keys()).intersection(p2.keys())
        return 1 if shared_keys else 0
    
    def gen_en_words_sim(self, en_w_belief_dict):
        """Return Hellinger similarities of English beliefs for every edge.

        en_w_belief_dict: node id -> belief dict, as accepted by
            ``similarity_hellinger``.
        The result mirrors ``self.edges_dict`` (node -> {neighbour: value}).
        Two nodes from the same utterance always get 0; otherwise the value is
        the Hellinger similarity of their beliefs. (Jaccard and count based
        variants are currently disabled.)
        """
        print("Generating similarity between nodes based on English translations ...")
        hellinger_by_edge = {}

        for src, neighbours in self.edges_dict.items():
            row = hellinger_by_edge[src] = {}
            for dst in neighbours:
                same_utterance = (self.node_dict[src].wav_fil == self.node_dict[dst].wav_fil)
                if same_utterance:
                    # Nodes within one utterance share a translation: not informative
                    row[dst] = 0
                else:
                    row[dst] = self.similarity_hellinger(en_w_belief_dict[src],
                                                         en_w_belief_dict[dst])
        print("Finished generating similarity ...")
        return hellinger_by_edge
    
    
    def exp_wav_files_dur(self):
        """Print a table of total audio duration measured three ways.

        Rows: utterance-level VAD spans, transcript entries other than
        'sil'/'sp', and the value returned by ``zrt_calc_dur_from_evad()``.
        Spans are multiplied by 10 and then reported divided by 1000
        ("secs" column) and by 1000*3600 ("hours" column).
        """
        silence_labels = ['sil','sp']
        ms_per_hour = 1000*3600
        vad_total = 0
        transcript_total = 0
        for utt in sorted(self.b_base_name):
            # Duration of the non sil / sp entries in the transcription
            for entry in self.es_words_dict[utt]:
                if entry.word not in silence_labels:
                    transcript_total += (entry.end - entry.start)*10

            # Duration of the utterance level VAD span
            utt_info = self.file_info_dict[utt]
            vad_total += (utt_info.vad.end - utt_info.vad.start)*10

        print("Total wav files: %d" %(len(self.b_base_name)))
        summary = PrettyTable(["Type", "Total duration (secs)", "Total duration (hours)"], hrules=True)
        summary.add_row(["Utterance level VAD", vad_total//1000,
                         "{0:.2f}".format(vad_total/ms_per_hour)])
        summary.add_row(["Transcript non sil/sp word durations", transcript_total//1000,
                         "{0:.2f}".format(transcript_total/ms_per_hour)])

        evad_total = self.zrt_calc_dur_from_evad()
        summary.add_row(["Energy based", evad_total//1000,
                         "{0:.2f}".format(evad_total/ms_per_hour)])

        print(summary)

    
    def init_speaker_info(self):
        """Populate ``self.spkrinfo`` from the callinfo and spkrinfo tables.

        ``self.spkrinfo`` maps "<speech file>.<channel>" (channel 1 = A,
        2 = B) to a dict of attributes:

        * From ``ch_config["callinfo"]``: lines containing "siA ntalkers" or
          "siB ntalkers" contribute the ``key=value`` found in the third
          whitespace-separated field. File ids get an "sp_" prefix when they
          lack one, and only files present in the corpus are kept.
        * From ``ch_config["spkrinfo"]`` (comma separated): channel 1 gets
          ``age`` (third field, -1 when blank) and ``spid`` (last field);
          channel 2 always gets age -1 and a placeholder speaker id, because
          that information is only available for channel A.

        Also sets ``self.keys_callinfo`` and ``self.keys_spkrinfo``: the keys
        first created while reading the respective table.

        Known data caveats: more than one gender may be listed per channel,
        and age / speaker id are only available for some files.

        A line with an unrecognised channel id is now reported and skipped
        (it used to crash while formatting the key), and the second summary
        line names the table it actually describes.
        """
        print("Reading speaker and call info ...")
        # Set of files in the corpus
        keys_corpus_files = set([x.source_file for x in self.file_info_dict.values()])
        self.spkrinfo = {}
        keys_callinfo = []
        keys_spkrinfo = []
        channel_numbers = {"A": 1, "B": 2}

        # Process callinfo.tbl
        with open(self.ch_config["callinfo"], "r") as in_f:
            for line in in_f:
                if not any(marker in line for marker in ("siA ntalkers", "siB ntalkers")):
                    continue
                line_items = line.strip().split()
                sp_fil = line_items[0]
                if "sp_" not in sp_fil:
                    sp_fil = "sp_" + sp_fil
                if sp_fil not in keys_corpus_files:
                    continue
                chid = channel_numbers.get(line_items[1].replace('si', ''))
                if chid is None:
                    print("Incorrect chid")
                    continue
                dict_key = "{0:s}.{1:d}".format(sp_fil, chid)
                if dict_key not in self.spkrinfo:
                    self.spkrinfo[dict_key] = {}
                    keys_callinfo.append(dict_key)
                nGender, nSpkrs = line_items[2].split("=")
                self.spkrinfo[dict_key][nGender] = nSpkrs

        print("callinfo.tbl - # speakers: %d" % (len(set(keys_callinfo))/2))
        print("Finished call info ...")

        # Process spkrinfo.tbl
        with open(self.ch_config["spkrinfo"], "r") as in_f:
            for line in in_f:
                line_items = line.strip().split(',')
                sp_fil = line_items[0]
                if sp_fil not in keys_corpus_files:
                    continue
                # Channel A: age and speaker id come from the table
                dict_key = "{0:s}.{1:d}".format(sp_fil, 1)
                if dict_key not in self.spkrinfo:
                    self.spkrinfo[dict_key] = {}
                    keys_spkrinfo.append(dict_key)
                self.spkrinfo[dict_key]["age"] = int(-1 if line_items[2] == '' else line_items[2])
                self.spkrinfo[dict_key]["spid"] = line_items[-1]
                # Channel B: info not available, store placeholders
                dict_key = "{0:s}.{1:d}".format(sp_fil, 2)
                if dict_key not in self.spkrinfo:
                    self.spkrinfo[dict_key] = {}
                    keys_spkrinfo.append(dict_key)
                self.spkrinfo[dict_key]["age"] = -1
                self.spkrinfo[dict_key]["spid"] = "uknown_"+dict_key

        print("spkrinfo.tbl - # speakers: %d" % (len(set(keys_spkrinfo))))
        print("Finished speaker info ...")
        self.keys_callinfo = set(keys_callinfo)
        self.keys_spkrinfo = set(keys_spkrinfo)
        
    
    def init_label_prop(self):
        """Create the three ``LabelProp`` helpers held by this object.

        Each helper is seeded with one of this object's dictionaries and a
        fixed ``alpha``:

        * ``label_prop_edges``        <- ``edges_dict``,       alpha 0.1
        * ``label_prop_en_words``     <- ``en_w_belief``,      alpha 0.2
        * ``label_prop_en_cnt_words`` <- ``en_w_cnt_belief``,  alpha 0.2
        """
        propagators = (
            ("label_prop_edges", "edges_dict", 0.1),
            ("label_prop_en_words", "en_w_belief", 0.2),
            ("label_prop_en_cnt_words", "en_w_cnt_belief", 0.2),
        )
        for target_attr, belief_attr, alpha in propagators:
            setattr(self, target_attr,
                    LabelProp(getattr(self, belief_attr), alpha=alpha))
    
    
    def init_zrt(self, fast=False):
        """Initialise the evaluation state, either from scratch or from a saved state.

        When ``fast == False`` the state is loaded with ``full=False``, the
        nodes and edges are read, the English vocabulary is generated and, once
        everything has been computed, the state is saved.  For any other value
        of ``fast`` the state is loaded with ``full=True`` and nothing is
        re-read or saved.

        Both paths then build the English-translation beliefs, the two
        node-similarity matrices (``en_w_hgr_sim_0`` from ``en_w_belief`` and
        ``en_w_cnt_hgr_sim_0`` from ``en_w_cnt_belief``) and evaluate the pairs.

        Loading speaker info and creating the label-propagation helpers are
        currently disabled.
        """
        # Kept as an equality test on purpose: only values equal to False
        # (False, 0) select the full rebuild.
        rebuild = (fast == False)

        if rebuild:
            self.load_state(full=False)
            self.read_nodes()       # matches.nodes
            self.read_edges()       # matches.edges
            self.gen_vocab()        # English vocabulary
        else:
            self.load_state(full=True)

        # Uniform belief over the English translations of each discovered node.
        self.gen_en_words_belief()
        # Node-to-node similarity from the English translations: first over
        # en_w_belief, then over en_w_cnt_belief.
        self.en_w_hgr_sim_0 = self.gen_en_words_sim(self.en_w_belief)
        self.en_w_cnt_hgr_sim_0 = self.gen_en_words_sim(self.en_w_cnt_belief)
        self.eval_pairs()

        if rebuild:
            self.save_state()
    
    def interpolate_sim(self, alpha):
        """Blend the ZRT similarities with the English-translation similarities.

        For every edge ``(n1, n2)`` of ``self.edges_dict`` whose two nodes come
        from different wav files::

            sim(n1, n2) = (1 - alpha) * ZRT(n1, n2) + alpha * EN_sim(n1, n2)

        Edges between nodes of the same wav file keep their ZRT value.  Two
        results are stored: ``self.sim_mix`` (EN similarity taken from
        ``en_w_hgr_sim_0``) and ``self.sim_mix_cont`` (taken from
        ``en_w_cnt_hgr_sim_0``).

        Afterwards a ``'sim mixed'`` column, taken from ``sim_mix_cont`` for
        each entry of ``self.eval_pairs_list``, is written into
        ``self.eval_df_full``, and ``self.eval_df`` is rebuilt as the rows where
        both ``no_mtch`` and ``sil_only`` are False.

        Args:
            alpha: weight given to the English-translation similarity.
        """
        print("Interpolating similarity matrix ...")
        self.sim_mix = {}
        self.sim_mix_cont = {}
        for n1, neighbours in self.edges_dict.items():
            mixed_all = {}
            mixed_cont = {}
            self.sim_mix[n1] = mixed_all
            self.sim_mix_cont[n1] = mixed_cont
            for n2, zrt_sim in neighbours.items():
                cross_file = self.node_dict[n1].wav_fil != self.node_dict[n2].wav_fil
                if not cross_file:
                    # Same wav file: leave the ZRT score untouched.
                    mixed_all[n2] = zrt_sim
                    mixed_cont[n2] = zrt_sim
                    continue
                # Interpolate with en_w_hgr_sim_0 ...
                mixed_all[n2] = ((1-alpha)*zrt_sim) + (alpha*self.en_w_hgr_sim_0[n1][n2])
                # ... and with en_w_cnt_hgr_sim_0.
                mixed_cont[n2] = ((1-alpha)*zrt_sim) + (alpha*self.en_w_cnt_hgr_sim_0[n1][n2])

        print("Finished interpolating similarity matrix ...")

        print("Updating data frame ...")
        mixed_column = [self.sim_mix_cont[pair['n1']][pair['n2']]
                        for pair in self.eval_pairs_list]
        self.eval_df_full['sim mixed'] = pd.Series(mixed_column, index=self.eval_df_full.index)
        has_match = self.eval_df_full['no_mtch'] == False
        not_silence = self.eval_df_full['sil_only'] == False
        self.eval_df = self.eval_df_full[has_match & not_silence]
        print("Finished updating data frame ...")
    
    
    def calc_node_overlap(self, n1, n2):
        """Return an overlap score for two nodes based on their start/end values.

        The score is ``(len(n1) + len(n2)) / span - 1`` where ``span`` runs from
        the earlier start to the later end.  Identical intervals give 1,
        intervals that just touch give 0, and intervals separated by a gap give
        a negative value.  Relies on true division.

        Args:
            n1, n2: keys into ``self.node_dict``.
        """
        first = self.node_dict[n1]
        second = self.node_dict[n2]
        span = max(first.end, second.end) - min(first.start, second.start)
        covered = (first.end - first.start) + (second.end - second.start)
        return (covered / span) - 1
    
    def gen_zrt_df(self):
        """Build a presentation copy of ``self.eval_df_full``.

        The copy gains two columns, ``spk1`` and ``spk2``, formatted as
        ``"<chid>.<uid>"`` from the ``chid1``/``uid1`` and ``chid2``/``uid2``
        columns, and has several columns renamed to display labels.
        ``self.eval_df_full`` itself is not modified.

        Returns:
            (df_zrt, cols_to_show): the renamed copy and the list of column
            labels intended for display.
        """
        df_zrt = self.eval_df_full.copy(deep=True)

        # Side 1 is filled in before side 2, exactly as in the original code.
        for side in ('1', '2'):
            chid_col = 'chid' + side
            uid_col = 'uid' + side
            df_zrt['spk' + side] = df_zrt.apply(
                lambda row: ("%s.%s" % (row[chid_col], row[uid_col])), axis=1)

        cols_to_rename = {
            'zrt_sim': 'ZRT',
            u'es_w_n1': u'ES words n1',
            u'es_w_n2': u'ES words n2',
            u'es_w_sim': 'ES word match',
            u'es_p_sim': 'ES phone edit sim',
            u'cnt_es_w_sim': 'ES cont match',
            u'en_w_cnt_hgr_sim_0': 'EN sim',
        }
        cols_to_show = [
            u'n1', u'n2',
            'ZRT',
            'ES words n1', 'ES words n2',
            'ES word match', 'ES phone edit sim', 'ES cont match',
            'EN sim', 'sim mixed',
            u'spk1', u'spk2',
        ]
        return df_zrt.rename(columns=cols_to_rename), cols_to_show
    
    def label_prop_stuff(self):
        """Placeholder with no implementation yet; calling it does nothing."""
        pass
    
    
    pass

In [None]:
class LabelProp(object):
    """Iterative label spreading over a node-similarity graph.

    Beliefs are nested dicts ``{node_id: {label: weight}}``.  ``belief_0`` is
    the initial belief and never changes; ``belief_n`` is the current belief
    and is replaced by every call to :meth:`label_spread`.
    """

    def __init__(self, belief0, alpha):
        """Store the initial belief and the mixing weight.

        Args:
            belief0: initial belief ``{node_id: {label: weight}}``.  Only the
                outer dict is copied; the inner dicts are shared with the
                caller and are never mutated by this class.
            alpha: weight of the neighbour-derived belief in each update; the
                initial belief is weighted by ``1 - alpha``.
        """
        self.belief_0 = belief0.copy()
        self.belief_n = belief0.copy()
        self.alpha = alpha

    def label_spread(self, sim_dict):
        """Run one spreading step and store the result in ``self.belief_n``.

        For each node ``n1`` in ``sim_dict`` the beliefs of its neighbours are
        summed, weighted by similarity, normalised to sum to 1 (when the sum is
        positive) and mixed with the node's initial belief::

            new[n1][label] = alpha * spread[label] + (1 - alpha) * belief_0[n1][label]

        Nodes that have a belief but are not keys of ``sim_dict`` keep their
        current belief.  They used to be dropped, which made a later call fail
        with ``KeyError`` as soon as such a node showed up as a neighbour.

        Args:
            sim_dict: ``{n1: {n2: similarity}}``.

        Raises:
            KeyError: if a node of ``sim_dict`` (key or neighbour) has no
                entry in the current belief, or a key has no initial belief.
        """
        # Start from the current belief so nodes absent from sim_dict survive.
        new_belief = dict(self.belief_n)
        for n1, node_sim_dict in sim_dict.items():
            # Similarity-weighted sum of the neighbours' current beliefs.
            temp_belief = {}
            for n2, sim_val in node_sim_dict.items():
                for bid, bval in self.belief_n[n2].items():
                    temp_belief[bid] = temp_belief.get(bid, 0) + sim_val * bval
            # Normalise; a non-positive total is left untouched.
            sum_belief = sum(temp_belief.values())
            if sum_belief > 0:
                temp_belief = {k: (v / sum_belief) for (k, v) in temp_belief.items()}
            # Labels come from the node's current belief plus the spread
            # belief; the anchor values come from the initial belief.
            new_belief_keys = set(self.belief_n[n1].keys()) | set(temp_belief.keys())
            new_belief[n1] = {
                key: (self.alpha * temp_belief.get(key, 0) +
                      ((1 - self.alpha) * self.belief_0[n1].get(key, 0)))
                for key in new_belief_keys
            }

        # Update current belief
        self.belief_n = new_belief

    def normalize_dict(self, some_dict):
        """Min-max normalise all inner values of a nested dict to [0, 1].

        The minimum and maximum are taken over every inner value of every
        node.  The input is not modified.

        Edge cases (both used to raise):

        * no values at all (empty dict or only empty inner dicts): a copy is
          returned unchanged;
        * all values equal: every value maps to ``0.0``.

        Args:
            some_dict: ``{node_id: {key: number}}``.

        Returns:
            A new dict with the same keys and normalised values.
        """
        vals = [v for inner in some_dict.values() for v in inner.values()]
        if not vals:
            return {node_id: dict(inner) for node_id, inner in some_dict.items()}
        min_val = min(vals)
        span = max(vals) - min_val
        if span == 0:
            # Constant input: avoid dividing by zero.
            return {node_id: {k: 0.0 for k in inner}
                    for node_id, inner in some_dict.items()}
        return {node_id: {k: (v - min_val) / span for k, v in inner.items()}
                for node_id, inner in some_dict.items()}
        
        
    
    
    

In [None]:
def draw_uttr_graph(edges_dict, max_nodes=5):
    """Draw the first few nodes of ``edges_dict`` in a shell layout.

    Only the nodes are drawn; no edges are added to the graph.

    Args:
        edges_dict: dict keyed by node id.  Only its keys are used.
        max_nodes: number of keys (in iteration order) to draw; defaults to 5,
            the value that was previously hard-coded.
    """
    # list(...) instead of .keys()[:n]: dict views cannot be sliced on
    # Python 3, while this form gives the same keys on Python 2.
    nodes = set(list(edges_dict)[:max_nodes])
    G = nx.Graph()
    for node in nodes:
        G.add_node(node)

    pos = nx.shell_layout(G)
    nx.draw(G, pos)

# TODO:

- Add function to create files.base
- Add function to create wav files from pairs output, add a limit on the number of wavs created
- Start adding evaluation metrics

- Add a visualization to show connections between utterances, maybe like a heatmap? or a graph with nodes as the utterances, and edges as the number of pairs found

# ZRT Prep

In [None]:
# ch_prep = CallHomeZRTPrep("config.json")

In [None]:
# %%timeit -n1 -r1
# ch_prep.something_to_do()

In [None]:
# %%timeit -n1 -r1
# ch_prep.load_state()

In [None]:
# %%timeit -n1 -r1
# ch_prep.gen_wav_files()

In [None]:
# ch_prep.b_vad_files[:1]
# # ch_prep.b_vad_files[:-1]
# print(len(ch_prep.b_vad_files))
# print(ch_prep.b_vad_files[-1])

In [None]:
# %%timeit -n1 -r1
# ch_prep.ch_zrt_plp();

In [None]:
# %%timeit -n1 -r1
# ch_prep.ch_zrt_lsh();

In [None]:
# %%timeit -n1 -r1
# ch_prep.ch_zrt_gen_exp_command()

# ZRT Post

In [1]:
def callhome_eval_init(config_file = "config.json", fast=False, alpha=.1):
    """Create a ``CallHomeEval`` object and run the standard initialisation steps.

    Steps, in order: ``init_zrt(fast)``, ``zrt_calc_dur_from_evad()``,
    ``exp_wav_files_dur()`` and ``interpolate_sim(alpha)``.

    Args:
        config_file: path passed to the ``CallHomeEval`` constructor.
        fast: forwarded to ``init_zrt``.
        alpha: interpolation weight forwarded to ``interpolate_sim``; defaults
            to 0.1, the value that was previously hard-coded.

    Returns:
        The initialised ``CallHomeEval`` instance.
    """
    ch = CallHomeEval(config_file)
    ch.init_zrt(fast)
    ch.zrt_calc_dur_from_evad()
    ch.exp_wav_files_dur()
    ch.interpolate_sim(alpha)
    return ch

In [None]:
# ch = CallHomeEval("config.json")

In [None]:
# %%timeit -n1 -r1
# ch.init_zrt(fast=False)

In [None]:
# %%timeit -n1 -r1
# ch.init_zrt(fast=True)

In [None]:
# print(ch.b_vad_files[1])
# ch.zrt_calc_dur_from_evad()
# ch.exp_wav_files_dur()

In [None]:
# %%timeit -n1 -r1
# ch.interpolate_sim(.1)

In [None]:
# %%timeit -n1 -r1
# ch.init_label_prop()

In [None]:
# %%timeit -n1 -r1
# df_filename = os.path.join(ch.ch_config["zrt_out_path"], "zrt.df")
# print("Saving dataframe ...")
# ch.eval_df_full.to_pickle(df_filename)
# print("Finished saving dataframe ...")

In [None]:
# dedup_file = os.path.join(ch.ch_config["zrt_out_path"], "master_graph.dedups")
# cluster_depth = []
# with open(dedup_file, "r") as in_f:
#     for line in in_f:
#         cluster_depth.append(len(line.strip().split()))

#     pass

# print(np.max(cluster_depth))
# print(np.mean(sorted(cluster_depth, reverse=True)[1:]))
# print(sorted(cluster_depth, reverse=True)[:10])

# set_valid_ids = set(ch.eval_df['n1'].values) | set(ch.eval_df['n2'].values)
# edges_depth = [(k, len(v)) for k, v in ch.edges_dict.items() if k in set_valid_ids]
# print(sorted(edges_depth, reverse=True, key=lambda t:t[1])[3000:3010])
# # print(len(ch.eval_df))
# # print(ch.eval_df['n1'].values[:5]))
# print(len(set_valid_ids))

# possible_bad_entries = 0
# possible_good_entries = []
# for i, entry in enumerate(ch.eval_pairs_list):
#     if 'sil' in entry['es_w_n1'] or 'sil' in entry['es_w_n2'] or 'sp' in entry['es_w_n1'] or 'sp' in entry['es_w_n2']:
#         possible_bad_entries += 1
#     else:
#         possible_good_entries.append(i)

# print(possible_bad_entries, len(ch.eval_pairs_list))
# print(possible_good_entries[:10])

# set_valid_ids = set(ch.eval_df['n1'].ix[possible_good_entries].values) | set(ch.eval_df['n2'].ix[possible_good_entries].values)
# edges_depth = [(k, len(v)) for k, v in ch.edges_dict.items() if k in set_valid_ids]
# print(sorted(edges_depth, reverse=True, key=lambda t:t[1])[:10])
# # print(len(ch.eval_df))
# # print(ch.eval_df['n1'].values[:5]))
# print(len(set_valid_ids))


In [None]:
# !python ../../ZRTools/scripts/mark_energy.py -i ../../corpora/callhome_es/out/062.216.wav  -o ../../corpora/callhome_es/out/062.216.tmp -s 0.4 -e 0 
# !cat ../../corpora/callhome_es/out/062.216.tmp

# LP utilities
utilities to support label propagation over the ZRT output

Class lpexp:
- read pair wise matches information
- for each segment, read transcription info - both words and phonemes
- for each segment, read corresponding English translation in the vad region


# Notes:

- For config0.80-0.97-0.80-50, nodes 1317, and 1318 are from different speakers, and also male and female

## Notes - 1-Aug-2016

- For slideshow, watch: https://github.com/damianavila/RISE

## Notes - 27-Jul-2016

- For pairs which are good phonetic matches, but different semantically, how should we handle them? Just lowering the similarity may not be a good idea, depending upon the downstream application. For example, for translation, we might want to retain the discovered pair for pseudoword decoding, but we can append a unique id to each of the nodes, to indicate a different part and a common part. E.g. n1, n2 = operation, variation. The common part is r\*ation. We can then decode these nodes as follows: UNIQ_1 r\*ation, and UNIQ_2 r\*ation. Need to think about this.

## Notes - 22-Jul-2016

- Generated wav files and vad files after fixing the multiple utterances into one bug
- Transcription errors, where spoken words are missing from the transcriptions, are also causing evaluation problems. E.g. from speech file: "sp_0731", vad id: 156, "527.34 535.78 A: entonces tú ve y pídele al párroco de San Antonio todos los datos que se requieran, todo lo que se necesita para este". There is an extra "para" spoken after "Antonio". This is caught by ZRT, but evaluation fails, as there is no alignment for it.

## Notes - 18-jul-2016

- Pair 72, example where ES transcriptions are incorrect. The transcripts include words at times when there is only silence. This creates a problem as we are unable to filter out the pair just using
- Pair 0, example where we have a good DTW match, but different words. Here label spreading should help us filter out the pairs
- Pair 13, example with high DTW, but 0 EN sim. Label Spreading should lower the DTW value
- Pair 18973, example with low(ish) DTW: 0.859, and high EN sim. Label spreading should increase the DTW value
- There are very few examples with low DTW and high EN sim. Even among these, a majority are actually poor matches, corresponding to "sil", but with overlapping words in EN translations. ** Hyperparameter to control increasing DTW score should be low **


## Analysis:

1. Show pairs with missing matches, but no silence - done
2. Show pairs with NO/ZERO match on the English side. Can we filter these out?

## To do:

1. Add function to print:
 - ES transcript
 - EN transcript
 - File info
 - Return the wav file as IPython display audio object
 - Code snippet
 
    ```python
    print(ch.es_words_dict['001.202'])
    print(ch.es_words_dict['006.116'])
    print(ch.file_info_dict['001.202'])
    print(ch.file_info_dict['006.116'])
    ch.node_dict[71].es_words
    ```

2. Check ES transcripts for 'SIL' in the middle. Can see at the start and end. 'SP' occurs in the middle.
3. Expand tail end of DTW threshold. Right now, only looking at pairs with DTW of 0.80 and above. Maybe there are many poor pairs below this threshold? In the current set, the majority of pairs have DTW of 1.0
4. Plot cross speaker, and cross file matches
5. **FIX** bug in channel id parsing for transcripts. There are multiple speakers, therefore, only checking for "A:" or "B:" will be wrong, as "A1:" and other similar ids exist
6. Change the selected files to start from file id 41 onwards, as these are training files.
7. **FIX** when merging multiple transcript lines, into 1 sentence, compute the start and end time carefully
8. Compute the total duration of audio. Needs to be two parts, one from the vad regions used to split the wav files into utterances, and the second, for the actual non ['sil', 'sp'] parts in the transcriptions.

# Debug code

In [None]:
# !pwd