In [2]:
from __future__ import print_function
from __future__ import division
import os
import cPickle as pickle
import json
import subprocess
from IPython.display import display
from IPython.display import Audio
from collections import namedtuple
from itertools import izip
import sys
import nltk
from nltk.corpus import stopwords

# Preprocessing CALLHOME for ZRTools


- Created: 26-Oct-2016


## Create mapping for start time for each segment

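Format: nested dictionary  
file id: {segment id: start time}, where the start time is the value read from the segments file × 100, truncated to an integer  
*file: {file.seg: start time}*  
Files: segments.txt (input), segments.dict (pickled output)  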

In [20]:
def read_segments_file(seg_fname):
    """Build a mapping from file id to the start time of each of its segments.

    The first line of ``seg_fname`` is skipped. Every other line is split on
    whitespace: column 0 is the segment key, column 1 the file id and
    column 6 the segment start. The start is multiplied by 100 and truncated
    to an int (10 ms units if the column is in seconds -- TODO confirm).

    Lines with fewer than seven columns or a non-numeric start are reported
    with their (0-based) line index and skipped; they leave no entry in the
    result.

    Args:
        seg_fname: path of the whitespace-separated segments file.

    Returns:
        dict of the form ``{file_id: {seg_key: seg_start}}``.
    """
    segment_map = {}
    with open(seg_fname, "r") as seg_f:
        for i, line in enumerate(seg_f):
            if i == 0:
                # first line is not data
                continue
            line_items = line.strip().split()
            try:
                seg_key, file_id = line_items[0], line_items[1]
                seg_start = int(float(line_items[6]) * 100)
            except (ValueError, IndexError):
                # IndexError: too few columns; ValueError: start is not a number
                print("Incorrect line format at line: %d" % i)
                continue
            # Only touch the map once the whole line parsed, so that a bad
            # line never leaves an empty per-file dict behind.
            segment_map.setdefault(file_id, {})[seg_key] = seg_start
    return segment_map
        

### Read segment map

In [21]:
# Build the segment map from the segments listing and cache it as a pickle.
segment_map = read_segments_file('../segments.txt')
# Use a context manager so the pickle file is flushed and closed right away.
with open("../segments.dict", "wb") as segments_dict_f:
    pickle.dump(segment_map, segments_dict_f)

## Create VAD files for merged wavs

In [3]:
def create_merged_vad(vad_file_id, segment_map, seg_vad_path, merged_vad_path):
    """Merge the per-segment VAD files of one file into a single VAD file.

    For every segment of ``vad_file_id`` (visited in order of segment id),
    ``<seg_vad_path>/<seg_id>.vad`` is read. Each non-blank line holds integer
    fields, the first two being a region start and end relative to the
    segment. Both are shifted by the segment's start time and written as
    ``"<start> <end>"`` to ``<merged_vad_path>/<vad_file_id>.vad``.

    Args:
        vad_file_id: key into ``segment_map`` naming the file to merge.
        segment_map: ``{file_id: {seg_id: seg_start}}``.
        seg_vad_path: directory holding the per-segment ``.vad`` files.
        merged_vad_path: directory the merged ``.vad`` file is written to.

    Returns:
        ``(total, total_long)``: the summed length of all regions, and the
        summed length of only those regions that are at least 50 units long
        (the variable names indicate 10 ms units, i.e. >= 500 ms).
    """
    total_dur_10ms = 0
    total_dur_10ms_ge500ms = 0
    merged_fname = os.path.join(merged_vad_path, vad_file_id + ".vad")
    with open(merged_fname, "w") as vad_f:
        print("creating vad %s ..." % vad_file_id)
        segments = sorted(segment_map[vad_file_id].items(), key=lambda t: t[0])
        for seg_id, seg_start in segments:
            with open(os.path.join(seg_vad_path, seg_id + ".vad"), "r") as seg_vad_f:
                for line in seg_vad_f:
                    # Materialise as a list so the fields can be indexed.
                    line_items = [int(v) for v in line.split()]
                    if not line_items:
                        # tolerate blank lines (e.g. a trailing newline)
                        continue
                    start = seg_start + line_items[0]
                    end = seg_start + line_items[1]
                    duration = end - start
                    total_dur_10ms += duration
                    if duration >= 50:
                        total_dur_10ms_ge500ms += duration
                    vad_f.write("%d %d\n" % (start, end))
    return total_dur_10ms, total_dur_10ms_ge500ms

### Create new directory for merged vads

In [15]:
# Directory with the per-segment VAD files, and the directory that will
# receive the merged VAD files.
seg_vad_path = "../vad"
merged_vads_path = "../mergedVads"
# Create the output directory on first run.
if not os.path.exists(merged_vads_path):
    os.makedirs(merged_vads_path)

### Create merged vad for each file

In [16]:
# Merge the VADs of every file in segment_map and accumulate the two
# duration totals returned by create_merged_vad.
total_dur_10ms = 0
total_dur_10ms_ge500ms = 0
for vad_file_id in segment_map:
    file_totals = create_merged_vad(vad_file_id, segment_map, seg_vad_path, merged_vads_path)
    total_dur_10ms += file_totals[0]
    total_dur_10ms_ge500ms += file_totals[1]

creating vad 090 ...
creating vad 091 ...
creating vad 092 ...
creating vad 093 ...
creating vad 094 ...
creating vad 095 ...
creating vad 096 ...
creating vad 097 ...
creating vad 010 ...
creating vad 011 ...
creating vad 012 ...
creating vad 013 ...
creating vad 014 ...
creating vad 015 ...
creating vad 018 ...
creating vad 025 ...
creating vad 024 ...
creating vad 027 ...
creating vad 026 ...
creating vad 021 ...
creating vad 023 ...
creating vad 022 ...
creating vad 029 ...
creating vad 028 ...
creating vad 115 ...
creating vad 114 ...
creating vad 038 ...
creating vad 039 ...
creating vad 111 ...
creating vad 110 ...
creating vad 113 ...
creating vad 112 ...
creating vad 032 ...
creating vad 033 ...
creating vad 030 ...
creating vad 031 ...
creating vad 036 ...
creating vad 037 ...
creating vad 034 ...
creating vad 035 ...
creating vad 051 ...
creating vad 108 ...
creating vad 049 ...
creating vad 048 ...
creating vad 047 ...
creating vad 046 ...
creating vad 045 ...
creating vad 

In [19]:
# Raw totals first, then the same totals divided by 100 and by 3600
# (hours, if the totals are in 10 ms units), formatted to three decimals.
print(total_dur_10ms, total_dur_10ms_ge500ms)
print(["{0:.3f}".format(t / 100.0 / 3600) for t in (total_dur_10ms, total_dur_10ms_ge500ms)])

4724341 3091219
['13.123', '8.587']


## Create PLP features

In [11]:
# Input wav directory and the two feature output directories.
merged_wavs_path = "../mergeWavs"
plp_path = "../plp"
plp_norm_path = "../std_plp"
# Make sure both feature output directories exist.
for feat_dir in (plp_path, plp_norm_path):
    if not os.path.exists(feat_dir):
        os.makedirs(feat_dir)

In [8]:
# Parse config.json from the working directory into the global `config` dict.
with open("config.json") as json_data_file:
    config = json.loads(json_data_file.read())

In [14]:
def create_file_lst(file_lst_fname, prefix="../corpora/callhome/mergeWavs", wav_dir=None):
    """Write a newline-separated list of wav paths to ``file_lst_fname``.

    Every entry of ``wav_dir`` whose name ends in ``.wav`` is listed as
    ``<prefix>/<name>``. Names are sorted so the output does not depend on
    the arbitrary order of ``os.listdir``. No newline is written after the
    last entry.

    Args:
        file_lst_fname: path of the list file to (over)write.
        prefix: directory prepended to every wav name in the list.
        wav_dir: directory scanned for wav files; defaults to the
            notebook-level ``merged_wavs_path``.
    """
    if wav_dir is None:
        wav_dir = merged_wavs_path
    wav_names = sorted(name for name in os.listdir(wav_dir) if name.endswith(".wav"))
    wav_file_list_string = "\n".join(os.path.join(prefix, name) for name in wav_names)
    with open(file_lst_fname, "w") as out_f:
        out_f.write(wav_file_list_string)
    print("Finished writing files.lst")

In [15]:
# Write the wav list to the path stored under config["es"]["lst_file"].
create_file_lst(config["es"]["lst_file"])

Finished writing files.lst


In [21]:
def create_plp(wav_fname, plp_fname):
    """Run the configured feacalc executable on one wav file.

    The executable path is read from ``config['base']["feacalc"]``. It is
    invoked with a fixed set of options, with ``plp_fname`` as the ``-o``
    output and ``wav_fname`` as the input.

    Args:
        wav_fname: path of the input wav file.
        plp_fname: path of the feature file to produce.

    Returns:
        The exit status of the process. A non-zero status is also reported
        on stdout instead of being silently dropped.
    """
    FEACALC = config['base']["feacalc"]
    cmd = [FEACALC, "-plp",
           "12", "-cep", "13", "-dom", "cep", "-deltaorder",
           "2", "-dither", "-frqaxis", "bark", "-samplerate",
           "8000", "-win", "25", "-step", "10", "-ip",
           "MSWAVE", "-rasta", "false", "-compress",
           "true", "-op", "swappedraw", "-o", plp_fname, wav_fname]
    status = subprocess.call(cmd)
    if status != 0:
        print("feacalc exited with status %d for %s" % (status, wav_fname))
    return status

    
def normalize_plp(plp_fname, vad_fname, plp_norm_fname):
    """Run the configured standfeat executable on one feature file.

    The executable path is read from ``config['base']["standfeat"]``. It is
    called with ``-D 39``, the input feature file, the output file and the
    VAD file.

    Args:
        plp_fname: path of the input feature file (``-infile``).
        vad_fname: path of the VAD file (``-vadfile``).
        plp_norm_fname: path of the output file (``-outfile``).

    Returns:
        The exit status of the process. A non-zero status is also reported
        on stdout instead of being silently dropped.
    """
    STANDFEAT = config['base']["standfeat"]
    # Standardize binary file, for VAD regions only
    cmd = [STANDFEAT, "-D", "39", "-infile",
           plp_fname, "-outfile", plp_norm_fname,
           "-vadfile", vad_fname]
    status = subprocess.call(cmd)
    if status != 0:
        print("standfeat exited with status %d for %s" % (status, plp_fname))
    return status

In [44]:
# Create raw and normalized PLP features for every file in segment_map.
# Outputs that already exist on disk are left alone, so the cell can be
# re-run after an interruption.
for i, file_id in enumerate(segment_map):
    wav_fname = os.path.join(merged_wavs_path, file_id + ".wav")
    vad_fname = os.path.join(merged_vads_path, file_id + ".vad")
    plp_fname = os.path.join(plp_path, file_id + ".binary")
    plp_norm_fname = os.path.join(plp_norm_path, file_id + ".std.binary")

    # Progress is only reported for every 20th file.
    report_progress = (i % 20 == 0)

    if report_progress:
        print("plp for file %s " % file_id)
    if not os.path.exists(plp_fname):
        create_plp(wav_fname, plp_fname)

    if report_progress:
        print("normalizing plp %s" % file_id)
    if not os.path.exists(plp_norm_fname):
        normalize_plp(plp_fname, vad_fname, plp_norm_fname)
print("Completed!")

plp for file 090 
normalizing plp 090
plp for file 023 
normalizing plp 023
plp for file 051 
normalizing plp 051
plp for file 052 
normalizing plp 052
plp for file 072 
normalizing plp 072
plp for file 006 
normalizing plp 006
Completed!


## Create LSH files

In [45]:
# Input directories (same values as in the PLP section) and the LSH
# output directory with the projection file that lives inside it.
merged_wavs_path = "../mergeWavs"
plp_path = "../plp"
plp_norm_path = "../std_plp"
lsh_path = "../lsh"
lsh_proj_fname = os.path.join(lsh_path, "proj_S64xD39_seed1")
# Create the LSH directory on first run.
if not os.path.exists(lsh_path):
    os.makedirs(lsh_path)

In [56]:
def create_lsh_proj_file(lsh_proj_fname):
    """Run the configured projection generator to create ``lsh_proj_fname``.

    The executable path is read from ``config['base']["lsh_genproj"]``. It is
    called with ``-D 39 -S 64 -seed 1`` and ``lsh_proj_fname`` as the
    ``-projfile``.

    Args:
        lsh_proj_fname: path of the projection file to produce.

    Returns:
        The exit status of the process. A non-zero status is also reported
        on stdout instead of being silently dropped.
    """
    cmd = [config['base']["lsh_genproj"],
           "-D", "39", "-S", "64", "-seed",
           "1", "-projfile", lsh_proj_fname]
    status = subprocess.call(cmd)
    if status != 0:
        print("lsh_genproj exited with status %d for %s" % (status, lsh_proj_fname))
    return status

def create_lsh_file(plp_norm_fname, vad_fname, lsh_proj_fname, lsh_fname):
    """Run the configured LSH executable on one normalized feature file.

    The executable path is read from ``config['base']["lsh"]``. It is called
    with ``-D 39 -S 64``, the projection file, the feature file, the output
    signature file and the VAD file.

    Args:
        plp_norm_fname: path of the input feature file (``-featfile``).
        vad_fname: path of the VAD file (``-vadfile``).
        lsh_proj_fname: path of the projection file (``-projfile``).
        lsh_fname: path of the signature file to produce (``-sigfile``).

    Returns:
        The exit status of the process. A non-zero status is also reported
        on stdout instead of being silently dropped.
    """
    LSH = config['base']["lsh"]
    cmd = [LSH, "-D", "39", "-S", "64",
           "-projfile", lsh_proj_fname,
           "-featfile", plp_norm_fname, "-sigfile",
           lsh_fname, "-vadfile", vad_fname]
    status = subprocess.call(cmd)
    if status != 0:
        print("lsh exited with status %d for %s" % (status, plp_norm_fname))
    return status

In [57]:
# Generate the projection file, then display whether it now exists on disk.
create_lsh_proj_file(lsh_proj_fname)
os.path.exists(lsh_proj_fname)

True

In [58]:
# Create an LSH signature file for every file in segment_map.
# Signature files that already exist are not regenerated.
for i, file_id in enumerate(segment_map):
    wav_fname = os.path.join(merged_wavs_path, file_id + ".wav")
    vad_fname = os.path.join(merged_vads_path, file_id + ".vad")
    plp_norm_fname = os.path.join(plp_norm_path, file_id + ".std.binary")
    lsh_fname = os.path.join(lsh_path, file_id + ".std.lsh64")

    # Progress is only reported for every 20th file.
    if i % 20 == 0:
        print("lsh for file %s " % file_id)

    lsh_missing = not os.path.exists(lsh_fname)
    if lsh_missing:
        create_lsh_file(plp_norm_fname, vad_fname, lsh_proj_fname, lsh_fname)

print("Completed!")

lsh for file 090 
lsh for file 023 
lsh for file 051 
lsh for file 052 
lsh for file 072 
lsh for file 006 
Completed!


## Create ZRTools discovery command files

In [72]:
# Experiment name and the directory the command files are written to.
exp_name = 'callhome'
exp_path = '../exp'
if not os.path.exists(exp_path):
    os.makedirs(exp_path)

# List of wav files: the file ids of segment_map in sorted order
wav_file_list = sorted(segment_map)

In [78]:
# files.base holds one file id per line, in the order of wav_file_list.
files_base_fname = os.path.join(exp_path, 'files.base')
with open(files_base_fname, "w") as out_f:
    out_f.writelines(file_id + '\n' for file_id in wav_file_list)
print("Generated files.base")

Generated files.base


In [76]:
def create_discovery_cmd_scripts(exp_path, wav_file_list, exp_name, num_splits=1):
    """Write the pairwise discovery commands, split over several command files.

    One ``scripts/plebdisc_filepair`` line is generated for every ordered pair
    of entries in ``wav_file_list`` (including each entry with itself). The
    lines are distributed, in order, over the files ``disc_0.cmd``,
    ``disc_1.cmd``, ... inside ``exp_path``. A companion ``disc_split.txt``
    is written with one ``nice sh <cmd file> 1> <out file> 2>/dev/null &``
    line per command file; the paths inside it are relative to
    ``exp/<exp_name>``.

    Args:
        exp_path: directory the command files are written to.
        wav_file_list: list of file ids to pair up.
        exp_name: experiment name, used to build the ``exp/<exp_name>`` paths
            that appear inside the generated commands.
        num_splits: maximum number of command files to create. Fewer are
            written when there are fewer command lines than splits.

    Raises:
        ValueError: if ``num_splits`` is smaller than 1.
    """
    if num_splits < 1:
        raise ValueError("num_splits must be >= 1, got %r" % (num_splits,))

    disc_file_split_base = "disc_{0:d}.cmd"
    disc_file_split = os.path.join(exp_path, disc_file_split_base)
    disc_split_file = os.path.join(exp_path, "disc_split.txt")
    num_files = len(wav_file_list)
    exp_local_path = os.path.join("exp", exp_name)
    cmd_string = "scripts/plebdisc_filepair \"{0:s}\" \"{1:s}\" {2:s} 39\n"

    total_lines = num_files * num_files
    # Ceiling division: with floor division a remainder spills into an extra
    # (num_splits + 1)-th file, and more splits than lines gives a chunk size
    # of 0 and a division by zero below.
    lines_per_file = max(1, -(-total_lines // num_splits))
    smallfile = None
    curr_line = 0
    curr_file_num = 0

    try:
        for i, first_wav in enumerate(wav_file_list):
            if i % 20 == 0:
                print("Progress: {0:d} out of: {1:d}".format(curr_line+1, total_lines))
            for second_wav in wav_file_list:
                if curr_line % lines_per_file == 0:
                    # Current chunk is full (or this is the first line):
                    # roll over to the next command file.
                    if smallfile:
                        smallfile.close()
                    smallfile = open(disc_file_split.format(curr_file_num), "w")
                    curr_file_num += 1
                smallfile.write(cmd_string.format(first_wav, second_wav, exp_local_path))
                curr_line += 1
    finally:
        # Close the last chunk even if writing failed part-way through.
        if smallfile:
            smallfile.close()

    # Making a list of commands to execute the split disc list
    full_split_cmd_string = "nice sh {0:s} 1> {1:s} 2>{2:s} &\n"
    split_cmd = os.path.join(exp_local_path, "matches", "{0:s}.{1:d}")
    # stderr of every split is discarded
    split_cmd_err = "/dev/null"
    with open(disc_split_file, "w") as out_f:
        for split_num in range(curr_file_num):
            curr_split_file = os.path.join(exp_local_path, disc_file_split_base.format(split_num))
            split_cmd_out = split_cmd.format("out", split_num)
            out_f.write(full_split_cmd_string.format(curr_split_file,
                                                     split_cmd_out,
                                                     split_cmd_err))

    print("Completed - disc.cmd")

In [77]:
# Generate the discovery command files for all file pairs, requesting 25 splits.
create_discovery_cmd_scripts(exp_path=exp_path, wav_file_list=wav_file_list, exp_name=exp_name, num_splits=25)

Progress: 1 out of: 10816
Progress: 2081 out of: 10816
Progress: 4161 out of: 10816
Progress: 6241 out of: 10816
Progress: 8321 out of: 10816
Progress: 10401 out of: 10816
Completed - disc.cmd


# Read transcripts and translations into a dictionary

In [6]:
# One aligned word: the word itself plus its start and end time.
Align = namedtuple('Align', 'word start end')

In [30]:
def read_alignment_file(align_fname, stopwords_corpus=None):
    """Read a word alignment file into a list of ``Align`` tuples.

    Every line must hold exactly three whitespace-separated fields: a word,
    a start time and an end time. Both times are multiplied by 100 and
    truncated to an int.

    Args:
        align_fname: path of the alignment file.
        stopwords_corpus: optional collection of stop words. When given (and
            non-empty), words whose lower-cased form is in it are left out.

    Returns:
        List of ``Align(word, start, end)`` in file order, with the word kept
        exactly as read.

    Raises:
        ValueError: a line does not have exactly three fields, or a time is
            not a number.
        IOError: the kept entries are not ordered by start time. Files whose
            name ends in ``"en"`` are exempt from this check.
    """
    align_list = []
    with open(align_fname, "r") as align_f:
        for line_num, line in enumerate(align_f, 1):
            line_items = line.strip().split()
            if len(line_items) != 3:
                raise ValueError("%s:%d: expected 3 fields (word start end), got %d"
                                 % (align_fname, line_num, len(line_items)))
            word = line_items[0]
            start, end = [int(float(v) * 100) for v in line_items[1:3]]
            if stopwords_corpus:
                # Decode BEFORE lower-casing: lower() on a byte string only
                # folds ASCII letters, so an upper-case accented letter would
                # never match its stop word. Text that is already unicode is
                # used as is.
                text = word.decode("utf-8") if isinstance(word, bytes) else word
                if text.lower() in stopwords_corpus:
                    continue
            align_list.append(Align(word, start, end))
    if not align_fname.endswith("en") and \
            sorted(align_list, key=lambda t: t.start) != align_list:
        raise IOError("%s: entries are not ordered by start time" % align_fname)

    return align_list

In [3]:
# Directory with the per-segment ".es" word alignment files
es_words_path = '../wav2es-words/'
# Directory with the per-segment ".en" word alignment files
en_words_path = '../wav2eng-words/'
# Pickle file the alignment dictionary is saved to and loaded from
align_dict_fname = '../align_dict.p'

In [34]:
# test code
stopwords_es = set(stopwords.words('spanish'))
stopwords_en = set(stopwords.words('english'))
display(read_alignment_file('../wav2es-words/001.001.es'))
display(read_alignment_file('../wav2es-words/001.001.es', stopwords_corpus=stopwords_es))
display(read_alignment_file('../wav2eng-words/001.001.en'))
display(read_alignment_file('../wav2eng-words/001.001.en', stopwords_corpus=stopwords_en))

[Align(word='MECHITA', start=12, end=50),
 Align(word='QU\xc3\xa9', start=50, end=73),
 Align(word='LAS', start=109, end=126),
 Align(word='HA', start=126, end=129),
 Align(word='MANDADO', start=129, end=169),
 Align(word='A', start=169, end=176),
 Align(word='QUI\xc3\xa9N', start=176, end=192),
 Align(word='A', start=192, end=198),
 Align(word='POCHO', start=198, end=225)]

[Align(word='MECHITA', start=12, end=50),
 Align(word='MANDADO', start=129, end=169),
 Align(word='QUI\xc3\xa9N', start=176, end=192),
 Align(word='POCHO', start=198, end=225)]

[Align(word='MECHITA', start=12, end=50),
 Align(word='WHAT', start=50, end=73),
 Align(word='SENT', start=129, end=169),
 Align(word='IT', start=126, end=129),
 Align(word='TO', start=169, end=176),
 Align(word='WHOM', start=176, end=192),
 Align(word='TO', start=192, end=198),
 Align(word='POCHO', start=198, end=225)]

[Align(word='MECHITA', start=12, end=50),
 Align(word='SENT', start=129, end=169),
 Align(word='POCHO', start=198, end=225)]

In [37]:
# Scratch check of str.endswith on the first entry of the current directory listing.
os.listdir('.')[0].endswith('haa')

False

In [41]:
def get_file_list(file_path, file_ext):
    """List the entries of ``file_path`` that carry the extension ``file_ext``.

    The extension must follow a dot, so ``'es'`` selects ``001.001.es`` but
    not a file that merely ends in those letters (e.g. ``notes``). A leading
    dot in ``file_ext`` is optional. An empty ``file_ext`` selects every entry.

    Args:
        file_path: directory to list.
        file_ext: extension to select, with or without the leading dot.

    Returns:
        Sorted list of the matching names with their last extension removed.
    """
    suffix = "." + file_ext.lstrip(".") if file_ext else ""
    return sorted(os.path.splitext(name)[0]
                  for name in os.listdir(file_path)
                  if name.endswith(suffix))

In [59]:
# Segment ids that have an ".es" file and those that have an ".en" file
es_file_list = get_file_list(es_words_path, 'es')
en_file_list = get_file_list(en_words_path, 'en')

# Do both languages cover the same segments? If not, show the segments
# that only have a Spanish file.
same_segments = sorted(es_file_list) == sorted(en_file_list)
print(same_segments)
print(set(es_file_list).difference(en_file_list))

False
set(['009.025'])


In [35]:
def create_alignment_dict():
    """Read the word alignments of every segment listed in ``segment_map``.

    For each segment, ``<seg_id>.es`` in ``es_words_path`` and ``<seg_id>.en``
    in ``en_words_path`` are each read twice: once in full and once with the
    NLTK stop words of that language removed.

    Returns:
        ``{file_id: {seg_id: {"es": ..., "en": ..., "es_cnt": ..., "en_cnt": ...}}}``
        where ``es``/``en`` hold all words and ``es_cnt``/``en_cnt`` the words
        that are not stop words.
    """
    stopwords_es = set(stopwords.words('spanish'))
    stopwords_en = set(stopwords.words('english'))
    align_dict = {}
    for file_id, seg_ids in segment_map.items():
        file_aligns = {}
        align_dict[file_id] = file_aligns
        for seg_id in seg_ids:
            es_fname = os.path.join(es_words_path, seg_id + ".es")
            en_fname = os.path.join(en_words_path, seg_id + ".en")
            # full alignments first, then the stop-word filtered ones
            es_all = read_alignment_file(es_fname)
            en_all = read_alignment_file(en_fname)
            es_content = read_alignment_file(es_fname, stopwords_corpus=stopwords_es)
            en_content = read_alignment_file(en_fname, stopwords_corpus=stopwords_en)
            file_aligns[seg_id] = {
                "es": es_all,
                "en": en_all,
                "es_cnt": es_content,
                "en_cnt": en_content,
            }
    return align_dict
        

In [36]:
# Build the alignment dictionary and cache it as a pickle.
align_dict = create_alignment_dict()
# Use a context manager so the pickle file is flushed and closed right away.
with open(align_dict_fname, "wb") as align_dict_f:
    pickle.dump(align_dict, align_dict_f)

In [29]:
def create_gold_feats(align_dict, gold_feats_dict_fname, es_key="es"):
    """Collect the words of every segment and pickle them.

    Args:
        align_dict: ``{file_id: {seg_id: {key: [alignment, ...]}}}`` where
            every alignment has a ``word`` attribute.
        gold_feats_dict_fname: path of the pickle file to write.
        es_key: which alignment list to take the words from. When that list
            is empty for a segment, the segment's ``'es'`` list is used
            instead.

    Returns:
        ``{seg_id: [word, ...]}`` -- the same dictionary that was pickled.
    """
    gold_feats_dict = {}
    for fid in align_dict:
        for sid, seg_aligns in align_dict[fid].items():
            # Only es_cnt can be empty, in which case include stop words
            chosen = seg_aligns[es_key] or seg_aligns['es']
            gold_feats_dict[sid] = [w.word for w in chosen]
    print("Saving gold features using key: %s" % es_key)
    # Context manager: make sure the pickle is flushed and closed.
    with open(gold_feats_dict_fname, "wb") as gold_feats_f:
        pickle.dump(gold_feats_dict, gold_feats_f)
    print("finished ...")
    return gold_feats_dict
        

In [30]:
# Reload the cached alignment dictionary; the context manager closes the file.
# NOTE: pickle.load must only be used on files this notebook wrote itself.
with open(align_dict_fname, "rb") as align_dict_f:
    align_dict = pickle.load(align_dict_f)
gold_feats_dict_fname = config['es']['gold_feats']
# Gold features are built from the stop-word filtered Spanish words.
gold_feats_dict = create_gold_feats(align_dict, gold_feats_dict_fname, es_key="es_cnt")

Saving gold features using key: es_cnt
finished ...


In [28]:
# Inspect the gold features of one segment.
# align_dict['001']['001.001']['es_cnt']
gold_feats_dict['001.001']

['MECHITA', 'MANDADO', 'QUI\xc3\xa9N', 'POCHO']

In [143]:
# Show the first five file ids (slicing keys() relies on Python 2, where it
# returns a list) and the alignments stored for one segment.
display(segment_map.keys()[:5])
display(align_dict["001"]["001.001"])

['090', '091', '092', '093', '094']

{'en': [Align(es_word='MECHITA', start=12, end=50),
  Align(es_word='WHAT', start=50, end=73),
  Align(es_word='SENT', start=129, end=169),
  Align(es_word='IT', start=126, end=129),
  Align(es_word='TO', start=169, end=176),
  Align(es_word='WHOM', start=176, end=192),
  Align(es_word='TO', start=192, end=198),
  Align(es_word='POCHO', start=198, end=225)],
 'es': [Align(es_word='MECHITA', start=12, end=50),
  Align(es_word='QU\xc3\xa9', start=50, end=73),
  Align(es_word='LAS', start=109, end=126),
  Align(es_word='HA', start=126, end=129),
  Align(es_word='MANDADO', start=129, end=169),
  Align(es_word='A', start=169, end=176),
  Align(es_word='QUI\xc3\xa9N', start=176, end=192),
  Align(es_word='A', start=192, end=198),
  Align(es_word='POCHO', start=198, end=225)]}

In [144]:
# Print every file id present in segment_map.
print(segment_map.keys())

['090', '091', '092', '093', '094', '095', '096', '097', '010', '011', '012', '013', '014', '015', '018', '025', '024', '027', '026', '021', '023', '022', '029', '028', '115', '114', '038', '039', '111', '110', '113', '112', '032', '033', '030', '031', '036', '037', '034', '035', '051', '108', '049', '048', '047', '046', '045', '044', '043', '042', '041', '040', '058', '059', '103', '054', '056', '057', '050', '100', '052', '053', '101', '106', '107', '104', '105', '061', '060', '063', '062', '065', '064', '067', '066', '117', '116', '076', '077', '075', '072', '073', '070', '071', '078', '079', '119', '118', '089', '088', '083', '082', '081', '087', '086', '085', '084', '002', '001', '007', '006', '005', '009', '120']


In [54]:
# Sorted segment ids of files '001' and '002' combined (concatenating keys()
# relies on Python 2, where keys() returns a list).
(sorted(segment_map['001'].keys()+segment_map['002'].keys()))

['001.001',
 '001.002',
 '001.003',
 '001.004',
 '001.005',
 '001.006',
 '001.007',
 '001.008',
 '001.009',
 '001.010',
 '001.011',
 '001.012',
 '001.013',
 '001.014',
 '001.016',
 '001.017',
 '001.018',
 '001.019',
 '001.020',
 '001.021',
 '001.022',
 '001.023',
 '001.026',
 '001.027',
 '001.028',
 '001.029',
 '001.030',
 '001.031',
 '001.032',
 '001.033',
 '001.037',
 '001.038',
 '001.039',
 '001.041',
 '001.045',
 '001.046',
 '001.047',
 '001.048',
 '001.049',
 '001.050',
 '001.051',
 '001.052',
 '001.053',
 '001.054',
 '001.055',
 '001.056',
 '001.058',
 '001.059',
 '001.062',
 '001.066',
 '001.068',
 '001.069',
 '001.071',
 '001.072',
 '001.075',
 '001.076',
 '001.077',
 '001.078',
 '001.080',
 '001.081',
 '001.082',
 '001.083',
 '001.084',
 '001.085',
 '001.086',
 '001.087',
 '001.088',
 '001.089',
 '001.090',
 '001.092',
 '001.094',
 '001.096',
 '001.099',
 '001.100',
 '001.102',
 '001.103',
 '001.105',
 '001.107',
 '001.108',
 '001.109',
 '001.112',
 '001.113',
 '001.114',
 '00