In [69]:
from __future__ import print_function
from __future__ import division
import os
import cPickle as pickle
import json
import subprocess
from IPython.display import display
from IPython.display import Audio
from collections import namedtuple
from itertools import izip
import sys
import nltk
from nltk.corpus import stopwords

In [70]:
# Load the notebook-wide settings; later cells read file paths from
# config['es'] and external tool locations from config['base'].
with open("config.json") as json_data_file:
    config = json.load(json_data_file)

# Preprocessing CALLHOME for ZRTools


- Created: 26-Oct-2016


## Create mapping for start time for each segment

Format: Dictionary  
key: {key: value}  
*file: {file.seg: start time in 10 ms units}*  
Name: segment_start.dict, segment_start.txt  

In [3]:
def read_segments_file(seg_fname):
    """Build a per-recording map of segment start times from a segments file.

    The first line of the file is skipped as a header. Every other line is
    split on whitespace: column 0 is the segment key, column 1 the file id
    and column 6 the start time, which is multiplied by 100 and truncated
    to an int before being stored (presumably seconds -> 10 ms units;
    confirm against the segments file).

    Lines with too few columns (including blank lines) or with a
    non-numeric start time are reported and skipped; they no longer abort
    the read or leave an empty entry for their file id.

    Args:
        seg_fname: path of the segments file.

    Returns:
        dict mapping file id -> {segment key: start}.
    """
    segment_map = {}
    with open(seg_fname, "r") as seg_f:
        for i, line in enumerate(seg_f):
            if i == 0:
                # header row
                continue
            line_items = line.strip().split()
            try:
                seg_key = line_items[0]
                file_id = line_items[1]
                seg_start = int(float(line_items[6])*100)
            except (IndexError, ValueError):
                print("Incorrect line format at line: %d" % i)
                continue
            # Only touch the map once the whole line parsed cleanly.
            segment_map.setdefault(file_id, {})[seg_key] = seg_start
    return segment_map
        

### Read segment map

In [4]:
# Parse the segments file and cache the resulting map on disk.
segment_map = read_segments_file('../segments.txt')
# A context manager makes sure the pickle is flushed and the handle closed.
with open(config['es']['segment_dict_fname'], "wb") as segment_dict_f:
    pickle.dump(segment_map, segment_dict_f)

## Create VAD files for merged wavs

In [2]:
def create_merged_vad_from_ed(vad_file_id, segment_map, seg_vad_path, merged_vad_path):
    """Concatenate per-segment VAD files into one VAD file for a recording.

    For every segment of ``vad_file_id`` (visited in sorted segment-id
    order) the file ``<seg_vad_path>/<seg_id>.vad`` is read. Each non-blank
    line holds integer fields whose first two are the region start and end;
    both are shifted by the segment's start offset from ``segment_map`` and
    written to ``<merged_vad_path>/<vad_file_id>.vad``.

    Args:
        vad_file_id: id of the recording to build the merged VAD for.
        segment_map: dict of file id -> {segment id: segment start offset}.
        seg_vad_path: directory holding the per-segment ``.vad`` files.
        merged_vad_path: directory the merged ``.vad`` file is written to.

    Returns:
        (total duration of all regions, total duration of the regions that
        are at least 50 units long), in the same units as the VAD files.
    """
    total_dur_10ms = 0
    total_dur_10ms_ge500ms = 0
    with open(os.path.join(merged_vad_path, vad_file_id+".vad"), "w") as vad_f:
        print("creating vad %s ..." % vad_file_id)
        for seg_id, seg_start in sorted(segment_map[vad_file_id].items(), key=lambda t: t[0]):
            with open(os.path.join(seg_vad_path, seg_id+".vad"), "r") as seg_vad_f:
                for line in seg_vad_f:
                    # A real list (not map()) so indexing works everywhere.
                    values = [int(v) for v in line.split()]
                    if not values:
                        # tolerate blank / trailing empty lines
                        continue
                    start = seg_start + values[0]
                    end = seg_start + values[1]
                    duration = end - start
                    total_dur_10ms += duration
                    if duration >= 50:
                        total_dur_10ms_ge500ms += duration
                    vad_f.write("%d %d\n" % (start, end))
    return total_dur_10ms, total_dur_10ms_ge500ms

### Create new directory for merged vads

In [3]:
# merged_ed_vads_path = "../mergedVads"
# seg_vad_path = "../vad"
# if not os.path.exists(merged_ed_vads_path):
#     os.makedirs(merged_ed_vads_path)

### Create merged vad for each file

In [None]:
# total_dur_10ms, total_dur_10ms_ge500ms = 0, 0
# for vad_file_id in segment_map:
#     t1, t2 = create_merged_vad_from_ed(vad_file_id, segment_map, seg_vad_path, merged_ed_vads_path)
#     total_dur_10ms += t1
#     total_dur_10ms_ge500ms += t2

In [4]:
# print(total_dur_10ms, total_dur_10ms_ge500ms)
# print(map(lambda t: "{0:.3f}".format((t / 100.0 / 3600)), [total_dur_10ms, total_dur_10ms_ge500ms]))

## Create PLP features

In [5]:
# Input wavs and the two feature output directories (raw and standardized PLP).
merged_wavs_path = "../mergeWavs"
plp_path = "../plp"
plp_norm_path = "../std_plp"
# Make sure both output directories exist before any tool writes into them.
for feat_dir in (plp_path, plp_norm_path):
    if not os.path.exists(feat_dir):
        os.makedirs(feat_dir)

In [6]:
# Re-read config.json; this repeats the load done in the first code cell so
# the PLP section can be run without it.
with open("config.json") as json_data_file:
    config = json.load(json_data_file)

In [7]:
def create_file_lst(file_lst_fname, wav_dir=None, prefix="../corpora/callhome/mergeWavs"):
    """Write the list of merged wav files, one prefixed path per line.

    Every ``*.wav`` name found in ``wav_dir`` is joined onto ``prefix`` and
    the paths are written newline-separated (no trailing newline) to
    ``file_lst_fname``. Names are sorted so the list does not depend on the
    arbitrary order returned by ``os.listdir``.

    Args:
        file_lst_fname: path of the list file to create.
        wav_dir: directory to scan for wav files; defaults to the
            notebook-level ``merged_wavs_path``.
        prefix: directory prepended to each wav name in the list. Note it
            differs from the directory being scanned.
    """
    if wav_dir is None:
        # Backward-compatible default: the notebook global used originally.
        wav_dir = merged_wavs_path
    wav_names = sorted(name for name in os.listdir(wav_dir) if name.endswith(".wav"))
    with open(file_lst_fname, "w") as out_f:
        out_f.write("\n".join(os.path.join(prefix, name) for name in wav_names))
    print("Finished writing files.lst")

In [8]:
# Write the wav list to the location configured under config["es"]["lst_file"].
create_file_lst(config["es"]["lst_file"])

Finished writing files.lst


In [9]:
def create_plp(wav_fname, plp_fname):
    """Run the configured feacalc binary on one wav file.

    The executable path comes from ``config['base']['feacalc']``; the
    option list below is passed unchanged, with ``plp_fname`` as the
    ``-o`` output and ``wav_fname`` as the input.

    Args:
        wav_fname: input wav file.
        plp_fname: output feature file.

    Returns:
        The exit status of the process as returned by ``subprocess.call``.
        A non-zero status is also reported on stdout instead of being
        dropped silently.
    """
    FEACALC = config['base']["feacalc"]
    status = subprocess.call([FEACALC,"-plp", \
                    "12", "-cep", "13", "-dom", "cep", "-deltaorder", \
                    "2", "-dither", "-frqaxis", "bark", "-samplerate", \
                    "8000", "-win", "25", "-step", "10", "-ip", \
                    "MSWAVE", "-rasta", "false", "-compress", \
                    "true", "-op", "swappedraw", "-o", plp_fname, wav_fname])
    if status != 0:
        print("feacalc exited with status %d for %s" % (status, wav_fname))
    return status

    
def normalize_plp(plp_fname, vad_fname, plp_norm_fname):
    """Run the configured standfeat binary to standardize a feature file.

    The executable path comes from ``config['base']['standfeat']``. It is
    invoked with ``-D 39`` and the VAD file, so (per the original note)
    standardization is done for VAD regions only.

    Args:
        plp_fname: input binary feature file.
        vad_fname: VAD file passed as ``-vadfile``.
        plp_norm_fname: output file for the standardized features.

    Returns:
        The exit status of the process as returned by ``subprocess.call``.
        A non-zero status is also reported on stdout instead of being
        dropped silently.
    """
    STANDFEAT = config['base']["standfeat"]
    # Standardize binary file, for VAD regions only
    status = subprocess.call([STANDFEAT, "-D", "39", "-infile", \
                    plp_fname, "-outfile", plp_norm_fname, \
                    "-vadfile", vad_fname])
    if status != 0:
        print("standfeat exited with status %d for %s" % (status, plp_fname))
    return status

In [12]:
# Resolve the merged VAD directory here instead of relying on a cell much
# further down to have defined it (a fresh kernel would hit a NameError).
# NOTE: the <file_id>.vad files in that directory must already exist; they
# are written by create_merged_vad_from_alignment.
merged_fa_vads_path = config['es']['es_merge_fa_vad']

for i, file_id in enumerate(segment_map):
    wav_fname = os.path.join(merged_wavs_path, file_id+".wav")
    vad_fname = os.path.join(merged_fa_vads_path, file_id+".vad")
    plp_fname = os.path.join(plp_path, file_id+".binary")
    plp_norm_fname = os.path.join(plp_norm_path, file_id+".std.binary")
    
    #print(file_id, wav_fname, vad_fname, plp_fname, plp_norm_fname)
    
    # create PLP (progress is reported every 20th file)
    if i % 20 == 0:
        print("plp for file %s " % file_id)
    
    #if not os.path.exists(plp_fname):
    create_plp(wav_fname, plp_fname)
    
    if i % 20 == 0:
        print("normalizing plp %s" % file_id)
    
    #if not os.path.exists(plp_norm_fname):
    normalize_plp(plp_fname, vad_fname, plp_norm_fname)
print("Completed!")

plp for file 090 
normalizing plp 090
plp for file 023 
normalizing plp 023
plp for file 051 
normalizing plp 051
plp for file 052 
normalizing plp 052
plp for file 072 
normalizing plp 072
plp for file 006 
normalizing plp 006
Completed!


## Create LSH files

In [13]:
# Paths for the LSH stage; the first three repeat the values set in the PLP
# section so this section can be run on its own.
merged_wavs_path = "../mergeWavs"
plp_path = "../plp"
plp_norm_path = "../std_plp"
lsh_path = "../lsh"
if not os.path.exists(lsh_path):
    os.makedirs(lsh_path)
# Projection file shared by every create_lsh_file call below.
lsh_proj_fname = os.path.join(lsh_path, "proj_S64xD39_seed1")

In [14]:
def create_lsh_proj_file(lsh_proj_fname):
    """Run the configured lsh_genproj binary to create the projection file.

    The executable path comes from ``config['base']['lsh_genproj']``; it is
    called with ``-D 39 -S 64 -seed 1`` and ``lsh_proj_fname`` as the
    ``-projfile`` target.

    Args:
        lsh_proj_fname: path of the projection file to generate.

    Returns:
        The exit status of the process as returned by ``subprocess.call``.
        A non-zero status is also reported on stdout instead of being
        dropped silently.
    """
    status = subprocess.call([config['base']["lsh_genproj"], \
                     "-D","39","-S","64","-seed", \
                     "1","-projfile", lsh_proj_fname])
    if status != 0:
        print("lsh_genproj exited with status %d for %s" % (status, lsh_proj_fname))
    return status

def create_lsh_file(plp_norm_fname, vad_fname, lsh_proj_fname, lsh_fname):
    """Run the configured lsh binary to create the signature file for one recording.

    The executable path comes from ``config['base']['lsh']``; it is called
    with ``-D 39 -S 64`` plus the projection, feature, signature and VAD
    file arguments given here.

    Args:
        plp_norm_fname: standardized feature file (``-featfile``).
        vad_fname: VAD file (``-vadfile``).
        lsh_proj_fname: projection file (``-projfile``).
        lsh_fname: output signature file (``-sigfile``).

    Returns:
        The exit status of the process as returned by ``subprocess.call``.
        A non-zero status is also reported on stdout instead of being
        dropped silently.
    """
    LSH = config['base']["lsh"]
    status = subprocess.call([LSH, "-D", "39", "-S", "64", \
                    "-projfile", lsh_proj_fname, \
                    "-featfile", plp_norm_fname, "-sigfile", \
                    lsh_fname, "-vadfile", vad_fname])
    if status != 0:
        print("lsh exited with status %d for %s" % (status, plp_norm_fname))
    return status

In [15]:
# Generate the projection file, then display whether it now exists on disk.
create_lsh_proj_file(lsh_proj_fname)
os.path.exists(lsh_proj_fname)

True

In [16]:
# Resolve the merged VAD directory here instead of relying on a cell much
# further down to have defined it (a fresh kernel would hit a NameError).
# NOTE: the <file_id>.vad files in that directory must already exist; they
# are written by create_merged_vad_from_alignment.
merged_fa_vads_path = config['es']['es_merge_fa_vad']

for i, file_id in enumerate(segment_map):
    wav_fname = os.path.join(merged_wavs_path, file_id+".wav")
    vad_fname = os.path.join(merged_fa_vads_path, file_id+".vad")
    plp_norm_fname = os.path.join(plp_norm_path, file_id+".std.binary")
    lsh_fname = os.path.join(lsh_path, file_id+".std.lsh64")
    
    #print(file_id, wav_fname, vad_fname, plp_fname, plp_norm_fname)
    
    # create LSH (progress is reported every 20th file)
    if i % 20 == 0:
        print("lsh for file %s " % file_id)
    
    #if not os.path.exists(lsh_fname):
    create_lsh_file(plp_norm_fname, vad_fname, lsh_proj_fname, lsh_fname)
        
print("Completed!")

lsh for file 090 
lsh for file 023 
lsh for file 051 
lsh for file 052 
lsh for file 072 
lsh for file 006 
Completed!


## Create ZRTools discovery command files

In [17]:
# Directory that receives files.base and the discovery command files.
exp_path = '../exp'
if not os.path.exists(exp_path):
    os.makedirs(exp_path)

# List of wav files: the recording ids (segment_map keys) in sorted order
wav_file_list = sorted(segment_map.keys())
# Experiment name; used to build the "exp/<exp_name>" path inside the commands.
exp_name = 'callhome'

In [18]:
# Write one recording id per line to <exp_path>/files.base.
files_base_fname = os.path.join(exp_path, 'files.base')
with open(files_base_fname, "w") as out_f:
    out_f.writelines(wav_file + '\n' for wav_file in wav_file_list)
print("Generated files.base")

Generated files.base


In [19]:
def create_discovery_cmd_scripts(exp_path, wav_file_list, exp_name, num_splits=1):
    """Write the pairwise discovery commands, split over several .cmd files.

    One ``scripts/plebdisc_filepair`` command is generated for every ordered
    pair of entries in ``wav_file_list`` (including each entry paired with
    itself). The commands are distributed over at most ``num_splits`` files
    named ``disc_<k>.cmd`` inside ``exp_path``. A companion file
    ``disc_split.txt`` receives one ``nice sh ... &`` launcher line per
    split file; those lines refer to the files through the relative path
    ``exp/<exp_name>`` and send stderr to /dev/null.

    The per-file line count uses ceiling division. The previous floor
    division produced ``num_splits + 1`` files whenever the pair count was
    not a multiple of ``num_splits`` and divided by zero when
    ``num_splits`` exceeded the pair count.

    Args:
        exp_path: directory the command files are written to.
        wav_file_list: list of recording ids.
        exp_name: experiment name used in the relative ``exp/<exp_name>`` path.
        num_splits: maximum number of command files to create (values
            below 1 are treated as 1).
    """
    disc_file_split_base = "disc_{0:d}.cmd"
    disc_file_split = os.path.join(exp_path, disc_file_split_base)
    disc_split_file = os.path.join(exp_path, "disc_split.txt")
    exp_local_path = os.path.join("exp", exp_name)
    cmd_string = "scripts/plebdisc_filepair \"{0:s}\" \"{1:s}\" {2:s} 39\n"

    num_files = len(wav_file_list)
    total_lines = num_files * num_files
    num_splits = max(1, int(num_splits))
    # Ceiling division, never below 1 so the modulo below is always defined.
    lines_per_file = max(1, -(-total_lines // num_splits))

    smallfile = None
    curr_line = 0
    curr_file_num = 0
    try:
        for i, first_wav in enumerate(wav_file_list):
            if i % 20 == 0:
                print("Progress: {0:d} out of: {1:d}".format(curr_line+1, total_lines))
            for second_wav in wav_file_list:
                if curr_line % lines_per_file == 0:
                    # Current split is full (or this is the first line):
                    # roll over to the next disc_<k>.cmd file.
                    if smallfile:
                        smallfile.close()
                    smallfile = open(disc_file_split.format(curr_file_num), "w")
                    curr_file_num += 1
                smallfile.write(cmd_string.format(first_wav, second_wav, exp_local_path))
                curr_line += 1
    finally:
        # Close the last split even if formatting or writing failed.
        if smallfile:
            smallfile.close()

    # Making a list of commands to execute the split disc list
    full_split_cmd_string = "nice sh {0:s} 1> {1:s} 2>{2:s} &\n"
    split_cmd = os.path.join(exp_local_path, "matches","{0:s}.{1:d}")
    with open(disc_split_file, "w") as out_f:
        for split_num in range(curr_file_num):
            curr_split_file = os.path.join(exp_local_path, disc_file_split_base.format(split_num))
            split_cmd_out = split_cmd.format("out", split_num)
            #split_cmd_err = split_cmd.format("err", split_num)
            split_cmd_err = "/dev/null"
            out_f.write(full_split_cmd_string.format(curr_split_file, \
                                                    split_cmd_out, \
                                                    split_cmd_err))

    print("Completed - disc.cmd")

In [20]:
# Generate the discovery command files for all recording pairs, asking for 25 splits.
create_discovery_cmd_scripts(exp_path=exp_path, wav_file_list=wav_file_list, exp_name=exp_name, num_splits=25)

Progress: 1 out of: 10816
Progress: 2081 out of: 10816
Progress: 4161 out of: 10816
Progress: 6241 out of: 10816
Progress: 8321 out of: 10816
Progress: 10401 out of: 10816
Completed - disc.cmd


# Read transcripts, and translations into a dictionary

In [7]:
# One aligned word: the token plus its start and end times (stored by
# read_alignment_file as ints, i.e. the file values multiplied by 100).
# This name must be defined before unpickling any dict that contains Align records.
Align = namedtuple('Align', ['word', 'start', 'end'])

In [None]:
def read_alignment_file(align_fname, stopwords_corpus=None):
    """Read a word-alignment file into a list of ``Align`` records.

    Each line must have exactly three whitespace-separated fields: the
    word, its start time and its end time. Both times are multiplied by
    100 and truncated to ints.

    When ``stopwords_corpus`` is given, words whose lower-cased form is in
    it are dropped. The word is decoded from UTF-8 (if it is a byte
    string) *before* lower-casing so that non-ASCII capitals are folded
    correctly; the stored ``Align.word`` keeps the original text.

    Args:
        align_fname: path of the alignment file.
        stopwords_corpus: optional collection of lower-case stop words.

    Returns:
        list of ``Align(word, start, end)`` in file order.

    Raises:
        ValueError: a line does not have exactly three fields, or a time
            is not numeric.
        IOError: the records are not ordered by start time and the file
            name does not end in "en" (English alignments are exempt from
            the ordering check).
    """
    align_list = []
    with open(align_fname, "r") as align_f:
        for line_num, line in enumerate(align_f, 1):
            line_items = line.strip().split()
            if len(line_items) != 3:
                raise ValueError("%s, line %d: expected 3 fields, found %d"
                                 % (align_fname, line_num, len(line_items)))
            word = line_items[0]
            start, end = [int(float(v)*100) for v in line_items[1:3]]
            if stopwords_corpus:
                lookup = word.decode("utf-8") if isinstance(word, bytes) else word
                if lookup.lower() in stopwords_corpus:
                    continue
            align_list.append(Align(word, start, end))
    if sorted(align_list, key=lambda t: t.start) != align_list and not align_fname.endswith("en"):
        raise IOError("%s: alignments are not sorted by start time" % align_fname)

    return align_list

In [8]:
# Directories with the per-segment word alignment files (.es / .en) and the
# configured pickle path for the combined alignment dictionary.
es_words_path = '../wav2es-words/'
en_words_path = '../wav2eng-words/'
align_dict_fname = config['es']['align_dict_fname']

In [None]:
# Manual check: show one Spanish and one English alignment file, each with
# and without stop-word filtering (NLTK stop-word lists).
stopwords_es = set(stopwords.words('spanish'))
stopwords_en = set(stopwords.words('english'))
display(read_alignment_file('../wav2es-words/001.001.es'))
display(read_alignment_file('../wav2es-words/001.001.es', stopwords_corpus=stopwords_es))
display(read_alignment_file('../wav2eng-words/001.001.en'))
display(read_alignment_file('../wav2eng-words/001.001.en', stopwords_corpus=stopwords_en))

In [None]:
# Scratch check of str.endswith on the first entry of the current directory;
# nothing later in the notebook uses the result.
os.listdir('.')[0].endswith('haa')

In [None]:
def get_file_list(file_path, file_ext):
    """Return the sorted base names of the files in a directory with a given extension.

    The extension is compared against the real file extension, so ``'es'``
    matches ``001.001.es`` but no longer matches a name that merely ends in
    those letters (e.g. ``notes``). A leading dot in ``file_ext`` is
    accepted; an empty ``file_ext`` matches every file, as before.

    Args:
        file_path: directory to list.
        file_ext: extension to keep, with or without the leading dot.

    Returns:
        sorted list of file names with the extension stripped.
    """
    wanted_ext = ("." + file_ext.lstrip(".")) if file_ext else None
    stems = []
    for fname in os.listdir(file_path):
        stem, ext = os.path.splitext(fname)
        if wanted_ext is None or ext == wanted_ext:
            stems.append(stem)
    return sorted(stems)

In [None]:
# Collect the segment ids that have Spanish and English alignment files.
es_file_list = get_file_list(es_words_path, 'es')
en_file_list = get_file_list(en_words_path, 'en')

# Sanity check: both languages should cover the same segments; the second
# line lists segments that have a Spanish file but no English one.
print(sorted(es_file_list) == sorted(en_file_list))
print(set(es_file_list)-set(en_file_list))

In [None]:
def create_alignment_dict():
    """Load the Spanish and English word alignments of every known segment.

    Iterates over the notebook-level ``segment_map`` and, for each segment,
    reads ``<es_words_path>/<seg_id>.es`` and ``<en_words_path>/<seg_id>.en``
    twice: once unfiltered ("es" / "en") and once with the NLTK stop words
    of that language removed ("es_cnt" / "en_cnt").

    Returns:
        dict of file id -> segment id -> {"es", "en", "es_cnt", "en_cnt"},
        each value being the list returned by ``read_alignment_file``.
    """
    spanish_stops = set(stopwords.words('spanish'))
    english_stops = set(stopwords.words('english'))
    align_dict = {}
    for file_id in segment_map:
        #print("Processing file: %s" % file_id)
        per_file = align_dict[file_id] = {}
        for seg_id in segment_map[file_id]:
            es_fname = os.path.join(es_words_path, seg_id+".es")
            en_fname = os.path.join(en_words_path, seg_id+".en")
            per_file[seg_id] = {
                "es": read_alignment_file(es_fname),
                "en": read_alignment_file(en_fname),
                "es_cnt": read_alignment_file(es_fname, stopwords_corpus=spanish_stops),
                "en_cnt": read_alignment_file(en_fname, stopwords_corpus=english_stops),
            }
    return align_dict
        

In [None]:
# Build the alignment dictionary and cache it on disk.
align_dict = create_alignment_dict()
# A context manager makes sure the pickle is flushed and the handle closed.
with open(align_dict_fname, "wb") as align_dict_f:
    pickle.dump(align_dict, align_dict_f)

## Create VAD from alignments

In [51]:
# Reload the cached dictionaries, closing each file once it has been read.
# NOTE: pickle executes code from the file it loads; only use pickles this
# notebook wrote itself. The Align namedtuple must already be defined.
with open(align_dict_fname, "rb") as align_dict_f:
    align_dict = pickle.load(align_dict_f)
with open(config['es']['segment_dict_fname'], "rb") as segment_dict_f:
    segment_map = pickle.load(segment_dict_f)
has_500ms_fa_vad_dict_fname = config['es']['has_500ms_fa_vad_dict']

In [52]:
# Inspect the Spanish alignments of the first two segments of recording 001.
align_dict['001']['001.001']['es'], align_dict['001']['001.002']['es']

([Align(word='MECHITA', start=12, end=50),
  Align(word='QU\xc3\xa9', start=50, end=73),
  Align(word='LAS', start=109, end=126),
  Align(word='HA', start=126, end=129),
  Align(word='MANDADO', start=129, end=169),
  Align(word='A', start=169, end=176),
  Align(word='QUI\xc3\xa9N', start=176, end=192),
  Align(word='A', start=192, end=198),
  Align(word='POCHO', start=198, end=225)],
 [Align(word='LAS', start=25, end=48),
  Align(word='HA', start=48, end=56),
  Align(word='MANDADO', start=56, end=113),
  Align(word='AL', start=113, end=135),
  Align(word='AH', start=181, end=211)])

In [53]:
# Start offsets of the same two segments; these are added to the
# per-segment word times when the merged VAD is built.
segment_map['001']['001.001'], segment_map['001']['001.002']

(0, 258)

In [54]:
def create_uttr_vad_from_alignment(align_dict, vad_path):
    """Write one VAD file per segment, derived from its Spanish word alignment.

    For every segment, consecutive words whose end equals the next word's
    start are merged into a single region; each region is written as
    "start end" to ``<vad_path>/<seg_id>.vad`` using segment-relative times.

    Args:
        align_dict: file id -> segment id -> {"es": [records with
            ``start`` / ``end`` attributes], ...}.
        vad_path: directory the per-segment ``.vad`` files are written to.

    Returns:
        (total duration of all regions,
         total duration of regions at least 50 units long,
         dict of segment id -> True if the segment has at least one such region).
    """
    has_500ms_dur = {}
    total_dur_10ms = 0
    total_dur_10ms_ge500ms = 0
    for file_idx, vad_file_id in enumerate(align_dict):
        if file_idx % 20 == 0:
            print("Created vad for %d files id" % file_idx)
        for seg_id in align_dict[vad_file_id]:
            with open(os.path.join(vad_path, seg_id+".vad"), "w") as vad_f:
                words = align_dict[vad_file_id][seg_id]['es']

                # Collapse runs of back-to-back words into (start, end) regions.
                regions = []
                run_start = 0
                last_idx = len(words) - 1
                for idx, word in enumerate(words):
                    # A run ends at the last word or where a gap follows.
                    if idx == last_idx or word.end != words[idx+1].start:
                        regions.append((words[run_start].start, word.end))
                        run_start = idx + 1

                # Write the regions and accumulate this segment's durations.
                seg_dur = 0
                seg_dur_ge500ms = 0
                for start, end in regions:
                    length = end - start
                    seg_dur += length
                    if length >= 50:
                        seg_dur_ge500ms += length
                    vad_f.write("%d %d\n" % (start, end))

                # Flag segments that have at least one region of 50+ units.
                has_500ms_dur[seg_id] = (seg_dur_ge500ms > 0)
                total_dur_10ms += seg_dur
                total_dur_10ms_ge500ms += seg_dur_ge500ms
    return total_dur_10ms, total_dur_10ms_ge500ms, has_500ms_dur
    

In [55]:
def create_merged_vad_from_alignment(vad_file_id, align_dict, segment_map, vad_path):
    """Write the recording-level VAD file derived from Spanish word alignments.

    Segments of ``vad_file_id`` are visited in sorted segment-id order.
    Within a segment, consecutive words whose end equals the next word's
    start are merged into one region; the segment's start offset from
    ``segment_map`` is added to both ends and the region is written as
    "start end" to ``<vad_path>/<vad_file_id>.vad``.

    Args:
        vad_file_id: id of the recording.
        align_dict: file id -> segment id -> {"es": [records with
            ``start`` / ``end`` attributes], ...}.
        segment_map: file id -> {segment id: segment start offset}.
        vad_path: directory the merged ``.vad`` file is written to.

    Returns:
        (total duration of all regions,
         total duration of regions at least 50 units long).
    """
    total_dur_10ms = 0
    total_dur_10ms_ge500ms = 0
    with open(os.path.join(vad_path, vad_file_id+".vad"), "w") as vad_f:
        print("creating vad %s ..." % vad_file_id)
        ordered_segments = sorted(segment_map[vad_file_id].items(), key=lambda t: t[0])
        for seg_id, seg_start in ordered_segments:
            words = align_dict[vad_file_id][seg_id]['es']

            # Collapse runs of back-to-back words into offset-shifted regions.
            regions = []
            run_start = 0
            last_idx = len(words) - 1
            for idx, word in enumerate(words):
                # A run ends at the last word or where a gap follows.
                if idx == last_idx or word.end != words[idx+1].start:
                    regions.append((seg_start + words[run_start].start, seg_start + word.end))
                    run_start = idx + 1

            # Write the regions and accumulate durations.
            for start, end in regions:
                length = end - start
                total_dur_10ms += length
                if length >= 50:
                    total_dur_10ms_ge500ms += length
                vad_f.write("%d %d\n" % (start, end))
    return total_dur_10ms, total_dur_10ms_ge500ms
    

### Create new directory for merged vads

In [56]:
# Output directory for the per-segment (utterance-level) VAD files.
uttr_fa_vads_path = config['es']['es_uttr_fa_vad']
if not os.path.exists(uttr_fa_vads_path):
    os.makedirs(uttr_fa_vads_path)

# Output directory for the per-recording (merged) VAD files; the PLP and
# LSH loops earlier in the notebook read their .vad files from here.
merged_fa_vads_path = config['es']['es_merge_fa_vad']
if not os.path.exists(merged_fa_vads_path):
    os.makedirs(merged_fa_vads_path)


In [57]:
# Write the per-segment VAD files and report the totals in hours
# (durations are in 10 ms units, hence / 100 / 3600).
t1, t2, has_500ms_dur = create_uttr_vad_from_alignment(align_dict, uttr_fa_vads_path)
# print(t1, t2)
print(["{0:.3f} hrs".format(dur / 100.0 / 3600) for dur in (t1, t2)])
print("saving dict: %s" % has_500ms_fa_vad_dict_fname)
# A context manager makes sure the pickle is flushed and the handle closed.
with open(has_500ms_fa_vad_dict_fname, "wb") as has_500ms_f:
    pickle.dump(has_500ms_dur, has_500ms_f)

Created vad for 0 files id
Created vad for 20 files id
Created vad for 40 files id
Created vad for 60 files id
Created vad for 80 files id
Created vad for 100 files id
['12.704 hrs', '11.329 hrs']
saving dict: ../has_500ms_fa_vad_dict.p


In [44]:
# Check how many utterances have at least one VAD region of 500 ms or more.
print("total utterances: %d" % len(has_500ms_dur))
# Keep only the segments flagged True by create_uttr_vad_from_alignment.
uttrs_with_500ms = {k:v for k, v in has_500ms_dur.items() if v}
print("with 500ms: %d" % len(uttrs_with_500ms))

total utterances: 17394
with 500ms: 14410


In [58]:
# save dev file with only 500ms segments

In [72]:
# Copy the dev/test list, keeping only the lines whose (stripped) segment id
# is flagged in has_500ms_dur.
dev_500ms_fname = config['es']['mt_dev_500ms_files']
dev_fname = config['es']['mt_dev_test_files']
with open(dev_fname, "r") as dev_f, open(dev_500ms_fname, "w") as dev_500ms_f:
    dev_500ms_f.writelines(
        entry for entry in dev_f if has_500ms_dur.get(entry.strip())
    )

In [68]:
!wc $dev_500ms_fname

 2081  2081 16648 ../files-dev-500ms.txt


### Create merged vad for each file

In [None]:
# Write the merged VAD file of every recording and accumulate the totals.
# The helper defined above is create_merged_vad_from_alignment; the name
# previously used here was not defined anywhere in the notebook.
total_dur_10ms, total_dur_10ms_ge500ms = 0, 0
for vad_file_id in segment_map:
    t1, t2 = create_merged_vad_from_alignment(vad_file_id, align_dict, segment_map, merged_fa_vads_path)
    total_dur_10ms += t1
    total_dur_10ms_ge500ms += t2

In [None]:
# Report the merged-VAD totals, first as raw counts and then converted to
# hours (/ 100 / 3600).
print(total_dur_10ms, total_dur_10ms_ge500ms)
print(map(lambda t: "{0:.3f}".format((t / 100.0 / 3600)), [total_dur_10ms, total_dur_10ms_ge500ms]))

## Create features

In [None]:
def create_gold_feats(align_dict, gold_feats_dict_fname, es_key="es"):
    """Build and pickle the per-segment list of gold Spanish words.

    For every segment the words stored under ``es_key`` are used; when that
    list is empty (only the stop-word-filtered "es_cnt" list can be) the
    unfiltered "es" words are used instead, so no segment loses all of its
    words.

    Args:
        align_dict: file id -> segment id -> {"es": [...], es_key: [...]},
            where the lists hold records with a ``word`` attribute.
        gold_feats_dict_fname: path the resulting dict is pickled to.
        es_key: which alignment list to take the words from.

    Returns:
        dict of segment id -> list of words.
    """
    gold_feats_dict = {}
    for fid in align_dict:
        for sid in align_dict[fid]:
            seg_aligns = align_dict[fid][sid]
            if seg_aligns[es_key] == []:
                # Only es_cnt can be empty, in which case include stop words
                chosen = seg_aligns['es']
            else:
                chosen = seg_aligns[es_key]
            gold_feats_dict[sid] = [w.word for w in chosen]
    print("Saving gold features using key: %s" % es_key)
    # A context manager makes sure the pickle is flushed and the handle closed.
    with open(gold_feats_dict_fname, "wb") as gold_f:
        pickle.dump(gold_feats_dict, gold_f)
    print("finished ...")
    return gold_feats_dict
        

In [None]:
# Reload the alignment dictionary (closing the file once read) and build the
# gold features from the stop-word-filtered Spanish words.
# NOTE: pickle executes code from the file it loads; only use pickles this
# notebook wrote itself. The Align namedtuple must already be defined.
with open(align_dict_fname, "rb") as align_dict_f:
    align_dict = pickle.load(align_dict_f)
gold_feats_dict_fname = config['es']['gold_feats']
gold_feats_dict = create_gold_feats(align_dict, gold_feats_dict_fname, es_key="es_cnt")

In [77]:
# align_dict['001']['001.001']['es_cnt']
# Needs gold_feats_dict from the cell above; the NameError recorded below is
# from a run in which that cell had not been executed.
gold_feats_dict['001.001']

NameError: name 'gold_feats_dict' is not defined

In [None]:
# Peek at a few recording ids and one segment's alignment entry.
# NOTE: slicing .keys() relies on it returning a list (Python 2).
display(segment_map.keys()[:5])
display(align_dict["001"]["001.001"])

In [None]:
# List every recording id present in the segment map.
print(segment_map.keys())

In [None]:
# Sorted segment ids of recordings 001 and 002 combined.
# NOTE: concatenating .keys() with + relies on them being lists (Python 2).
(sorted(segment_map['001'].keys()+segment_map['002'].keys()))

In [30]:
# Show the configured location of files.lst (the file create_file_lst writes).
config["es"]["lst_file"]

u'../../../ZRTools/exp/callhome/files.lst'

## Check English translations

In [36]:
# English word alignments for one segment. Unlike the Spanish lists, these
# need not be ordered by start time (read_alignment_file skips the ordering
# check for files ending in "en").
align_dict['001']['001.001']['en']

[Align(word='MECHITA', start=12, end=50),
 Align(word='WHAT', start=50, end=73),
 Align(word='SENT', start=129, end=169),
 Align(word='IT', start=126, end=129),
 Align(word='TO', start=169, end=176),
 Align(word='WHOM', start=176, end=192),
 Align(word='TO', start=192, end=198),
 Align(word='POCHO', start=198, end=225)]

In [63]:
def _collect_words(key):
    """Return every word stored under ``key`` across all files and segments of align_dict."""
    collected = []
    for fid in align_dict:
        for sid in align_dict[fid]:
            collected.extend(a.word for a in align_dict[fid][sid][key])
    return collected

# Flat word lists: all words ("es"/"en") and stop-word-filtered ("*_cnt").
es_words = _collect_words('es')
es_cnt_words = _collect_words('es_cnt')
en_words = _collect_words('en')
en_cnt_words = _collect_words('en_cnt')

In [64]:
from collections import Counter

In [68]:
# Word frequency tables for each of the four word lists.
es_words_freq = Counter(es_words)
es_cnt_words_freq = Counter(es_cnt_words)
en_words_freq = Counter(en_words)
en_cnt_words_freq = Counter(en_cnt_words)

In [69]:
# Ten most frequent English words (stop words included).
print(en_words_freq.most_common(10))

[('THE', 5178), ('AND', 4629), ('THAT', 4080), ('I', 3849), ('TO', 3359), ('YES', 3345), ("'T", 2265), ('YOU', 2256), ('NO', 2135), ("'S", 2030)]


In [70]:
# Ten most frequent English words after stop-word filtering.
print(en_cnt_words_freq.most_common(10))

[('YES', 3345), ("'T", 2265), ("'S", 2030), ('WELL', 1829), ('AH', 1349), ('KNOW', 1100), ('OH', 1066), ('SEE', 934), ('YEAH', 904), ('LIKE', 889)]


In [71]:
# Ten most frequent Spanish words (stop words included).
print(es_words_freq.most_common(10))

[('QUE', 7089), ('NO', 6110), ('Y', 5037), ('A', 4310), ('DE', 4009), ('S\xc3\xad', 3667), ('LA', 3425), ('YA', 2782), ('EL', 2680), ('ES', 2587)]


In [72]:
# Ten most frequent Spanish words after stop-word filtering.
print(es_cnt_words_freq.most_common(10))

[('AH', 1831), ('PUES', 1236), ('BUENO', 1186), ('BIEN', 1183), ('SI', 1045), ('<LAUGH>', 987), ('MMM', 976), ('AS\xc3\xad', 781), ('ENTONCES', 775), ('CLARO', 729)]


In [73]:
# English tokens containing an apostrophe (clitic fragments such as 'T, 'S)
# that remain after stop-word filtering.
print([(w,f) for w, f in en_cnt_words_freq.items() if "'" in w])

[("'T", 2265), ("'S", 2030), ("'R", 1), ("'D", 34), ("'M", 627), ("'TS", 1), ("'VE", 184), ("'", 40), ("'OEUVRES", 1), ("'RE", 269), ("'CLOCK", 4), ("'LL", 540), ("O'CLOCK", 1), ("'AM", 8)]


In [75]:
# Angle-bracket tokens (e.g. <LAUGH>, <NOISE>) in the full Spanish word list.
print([(w,f) for w, f in es_words_freq.items() if "<" in w])

[('<SNEEZE>', 4), ('<COUGH>', 11), ('<LAUGH>', 987), ('<BREATH>', 16), ('<NOISE>', 450), ('<BACKGROUND>', 107)]


In [76]:
# The same angle-bracket tokens in the stop-word-filtered list; the output
# below matches the unfiltered counts, so filtering removes none of them.
print([(w,f) for w, f in es_cnt_words_freq.items() if "<" in w])

[('<SNEEZE>', 4), ('<COUGH>', 11), ('<LAUGH>', 987), ('<BREATH>', 16), ('<NOISE>', 450), ('<BACKGROUND>', 107)]
