In [1]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, modified_precision
from nltk.translate.chrf_score import sentence_chrf, corpus_chrf
from nltk.metrics import scores
import scipy.io.wavfile
from IPython.display import Audio
from IPython.display import display
from nltk.stem import *
# from nltk.stem.snowball import SnowballStemmer
from stemming.porter2 import stem
import stemming
from nltk.metrics.scores import recall

from basics import *

import sentencepiece as spm

from nltk.corpus import stopwords

%matplotlib inline

In [2]:
tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),    
             (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),    
             (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),    
             (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),    
             (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]    
# Scale the RGB values to the [0, 1] range, which is the format matplotlib accepts.    
for i in range(len(tableau20)):    
    r, g, b = tableau20[i]    
    tableau20[i] = (r / 255., g / 255., b / 255.)

In [3]:
smooth_fun = nltk.translate.bleu_score.SmoothingFunction()

In [4]:
from nmt_run import *

In [5]:
cfg_path = "sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3_lstm"

In [6]:
%%capture
last_epoch, model, optimizer, m_cfg, t_cfg = check_model(cfg_path)

### Load Fisher dataset

In [7]:
%%capture
# -------------------------------------------------------------------------
# get data dictionaries
# -------------------------------------------------------------------------
map_dict, vocab_dict, bucket_dict = get_data_dicts(m_cfg)
info_dict = pickle.load(open("fbanks_80dim_nltk/info.dict", "rb"))
sim_dict = pickle.load(open("./fbanks_80dim_nltk/mix_sim.dict", "rb"))

In [8]:
random.seed("meh")
# random.seed("haha")

### Train text

In [11]:
train_file = "../installs/fisher-callhome-corpus/corpus/ldc/fisher_train.en"

In [12]:
train_text = []

In [13]:
for u in map_dict["fisher_train"]:
    train_text.append(" ".join([w.decode() for w in map_dict["fisher_train"][u]["en_w"]]))

In [14]:
train_text[:10]

['hello',
 'hello',
 'hello',
 'hello',
 'with whom am i speaking',
 'eh silvia yes what is your name',
 'hello silvia eh my name is nicole',
 'ah nice to meet you',
 'nice to meet you em and where are you from',
 "eh i 'm in philadelphia"]

In [20]:
train_text_to_dump = "\n".join(train_text)

In [14]:
with open("../subword-nmt/fisher_train.en", "w") as out_f:
    out_f.write(train_text_to_dump)

In [15]:
# ./learn_joint_bpe_and_vocab.py --input {train_file}.L1 {train_file}.L2 -s {num_operations} -o {codes_file} --write-vocabulary {vocab_file}.L1 {vocab_file}.L2

In [16]:
# sp = spm.SentencePieceProcessor()

In [17]:
# sp.Load("test/test_model.model")

### word level analysis

In [15]:
min_word_len = 1
top_k = 100

In [16]:
stop_words = set(nltk.corpus.stopwords.words("english"))
len(stop_words)

127

In [17]:
es_stop_words = set(nltk.corpus.stopwords.words("spanish"))
len(es_stop_words)

313

In [18]:
def get_words(m_dict, key="en_w"):
    words = []
    for u in m_dict:
        if type(m_dict[u][key]) == list:
            words.extend([w.decode() for w in m_dict[u][key]])
        else:
            for ref in m_dict[u][key]:
                words.extend([w.decode() for w in ref])
    return Counter(words)

In [19]:
len(map_dict['fisher_dev'])

3979

In [20]:
# words in train
train_words = get_words(map_dict['fisher_train'])
train_words_top_k = [(w,f) for w, f in sorted(train_words.items(), reverse=True, key=lambda t:t[1]) 
                     if w not in stop_words and len(w) >= min_word_len][:top_k]

train_only_words = set(train_words.keys())

print("{0:20s} | {1:10d}".format("# train word types", len(train_words)))
print("{0:20s} | {1:10d}".format("# train word tokens", sum(train_words.values())))

dev_words = get_words(map_dict['fisher_dev'])
dev_words_top_k = [(w,f) for w, f in sorted(dev_words.items(), reverse=True, key=lambda t:t[1]) 
                     if w not in stop_words and len(w) >= min_word_len][:top_k]

dev_only_words = set(dev_words.keys())

print("-"*80)
print("{0:20s} | {1:10d}".format("# dev word types", len(dev_words)))
print("{0:20s} | {1:10d}".format("# dev word tokens", sum(dev_words.values())))

# train word types   |      17830
# train word tokens  |    1497352
--------------------------------------------------------------------------------
# dev word types     |       4835
# dev word tokens    |     165206


In [21]:
# words in train
es_train_words = get_words(map_dict['fisher_train'], key="es_w")
es_train_words_top_k = [(w,f) for w, f in sorted(es_train_words.items(), reverse=True, key=lambda t:t[1]) 
                     if w not in es_stop_words and len(w) >= min_word_len][:top_k]

es_train_only_words = set(es_train_words.keys())

print("{0:20s} | {1:10d}".format("# train word types", len(es_train_words)))
print("{0:20s} | {1:10d}".format("# train word tokens", sum(es_train_words.values())))

es_dev_words = get_words(map_dict['fisher_dev'], key="es_w")
es_dev_words_top_k = [(w,f) for w, f in sorted(es_dev_words.items(), reverse=True, key=lambda t:t[1]) 
                     if w not in es_stop_words and len(w) >= min_word_len][:top_k]

es_dev_only_words = set(es_dev_words.keys())

print("-"*80)
print("{0:20s} | {1:10d}".format("# dev word types", len(es_dev_words)))
print("{0:20s} | {1:10d}".format("# dev word tokens", sum(es_dev_words.values())))

# train word types   |      32860
# train word tokens  |    1496792
--------------------------------------------------------------------------------
# dev word types     |       4145
# dev word tokens    |      41098


In [22]:
train_words_top_k[:5], es_train_words_top_k[:5]

([('yes', 35054),
  ("'s", 24162),
  ("n't", 19184),
  ('like', 14334),
  ('well', 12354)],
 [('ah', 12325), ('eh', 11447), ('si', 9423), ('ajá', 7988), ('bueno', 7838)])

In [23]:
[(w,f) for w,f in train_words_top_k if "'" in w]

[("'s", 24162), ("n't", 19184), ("'m", 5546), ("'re", 2832), ("'ve", 2392)]

In [24]:
dev_words_top_k[:5], es_dev_words_top_k[:5]

([('yes', 3652), ("n't", 1999), ("'s", 1866), ('like', 1826), ('know', 1294)],
 [('ajá', 343), ('ah', 341), ('entonces', 249), ('si', 247), ('mhm', 236)])

In [25]:
oov_words = {w:f for w,f in dev_words.items() if w not in train_only_words}

print("{0:20s} | {1:10d}".format("# oov word types", len(oov_words)))
print("{0:20s} | {1:10d}".format("# oov word tokens", sum(oov_words.values())))

# oov word types     |       1011
# oov word tokens    |       1599


In [26]:
es_oov_words = {w:f for w,f in es_dev_words.items() if w not in es_train_only_words}

print("{0:20s} | {1:10d}".format("# oov word types", len(es_oov_words)))
print("{0:20s} | {1:10d}".format("# oov word tokens", sum(es_oov_words.values())))

# oov word types     |        448
# oov word tokens    |        525


In [27]:
"{0:.1f}%".format(sum(oov_words.values()) / sum(dev_words.values()) * 100)

'1.0%'

### Word level - get train, dev frequency, and utts in which they occur

In [28]:
len(train_only_words), len(set([stem(w) for w in train_only_words]))

(17830, 12011)

In [29]:
def get_word_level_details(word_key):
    word_utt_count = {"train": {}, "dev": {}, "train_utts": {}, "dev_utts": {}}
    for u in tqdm(map_dict["fisher_train"].keys()):
        for w in set(map_dict["fisher_train"][u][word_key]):
            curr_word = w.decode()
            if curr_word not in word_utt_count["train"]:
                word_utt_count["train"][curr_word] = 0
                word_utt_count["train_utts"][curr_word] = set()
            word_utt_count["train"][curr_word] += 1
            word_utt_count["train_utts"][curr_word].update({u})
        # end for words in current utt
    # end for all utts
    for u in tqdm(map_dict["fisher_dev"].keys()):
        if word_key == "en_w":
            for ref in map_dict["fisher_dev"][u][word_key]:
                for w in set(ref):
                    curr_word = w.decode()
                    if curr_word not in word_utt_count["dev"]:
                        word_utt_count["dev"][curr_word] = 0
                        word_utt_count["dev_utts"][curr_word] = set()
                    word_utt_count["dev"][curr_word] += 1            
                    word_utt_count["dev_utts"][curr_word].update({u})
                # end for words in current ref
            # end for all references
        # end if multiple references
        else:
            ref = map_dict["fisher_dev"][u][word_key]
            for w in set(ref):
                curr_word = w.decode()
                if curr_word not in word_utt_count["dev"]:
                    word_utt_count["dev"][curr_word] = 0
                    word_utt_count["dev_utts"][curr_word] = set()
                word_utt_count["dev"][curr_word] += 1            
                word_utt_count["dev_utts"][curr_word].update({u})
            
    # end for all utts
    all_train_utts = set()
    for w in word_utt_count["train_utts"]:
        all_train_utts.update(word_utt_count["train_utts"][w])
    # end for

    all_dev_utts = set()
    for w in word_utt_count["dev_utts"]:
        all_dev_utts.update(word_utt_count["dev_utts"][w])
    # end for
    
    return word_utt_count, all_train_utts, all_dev_utts
    

In [30]:
en_word_utt_count, en_train_utts, en_dev_utts = get_word_level_details("en_w")

100%|██████████| 138819/138819 [00:02<00:00, 63773.99it/s]
100%|██████████| 3979/3979 [00:00<00:00, 19655.30it/s]


In [31]:
len(en_train_utts), len(en_dev_utts)

(138795, 3979)

In [32]:
es_word_utt_count, es_train_utts, es_dev_utts = get_word_level_details("es_w")

100%|██████████| 138819/138819 [00:02<00:00, 58856.07it/s]
100%|██████████| 3979/3979 [00:00<00:00, 58109.60it/s]


In [42]:
len(es_train_utts), len(es_dev_utts)

(138797, 3977)

In [43]:
print("word types")
print(len(en_word_utt_count['train']), len(en_word_utt_count['dev']))
print("common word types")
en_common_words = set(en_word_utt_count['train'].keys()) & set(en_word_utt_count['dev'].keys())
len(en_common_words)

word types
17830 4835
common word types


3824

In [44]:
print("word types")
print(len(es_word_utt_count['train']), len(es_word_utt_count['dev']))
print("common word types")
es_common_words = set(es_word_utt_count['train'].keys()) & set(es_word_utt_count['dev'].keys())
len(es_common_words)

word types
32860 4145
common word types


3697

In [192]:
def get_details_for_words(words, common_words, word_utt_count, 
                          min_dev_freq, max_dev_freq, min_train_freq, max_train_freq, min_len):
    details = {"words": {}, "train_utts": set(), "dev_utts": set()}
    
    in_vocab_words = set(words) & set(common_words)
    print("number of in-vocab words = {0:d}".format(len(in_vocab_words)))

    for w in in_vocab_words:
        t_count, d_count = len(word_utt_count["train_utts"][w]), len(word_utt_count["dev_utts"][w])
        if ((d_count >= min_dev_freq) and 
            (d_count <= max_dev_freq) and
            (len(w) >= min_len) and
            (t_count >= min_train_freq) and 
            (t_count <= max_train_freq)):
            details["words"][w] = {"train": t_count, "dev": d_count}
            details["train_utts"].update(word_utt_count["train_utts"][w])
            details["dev_utts"].update(word_utt_count["dev_utts"][w])
        # end meets criteria
    # end for in-vocab word
    return details
# end function

In [37]:
def get_duration(utts, key):
    dur = 0
    utts_not_found = []
    for u in utts:
        if u not in info_dict[key]:
            #print("argh!", u)
            utts_not_found.append(u)
        else:
            dur += (info_dict[key][u]['sp'] * 10)
    dur = dur / 60 / 60 / 1000
    print("-"*80)
    print("{0:d} total utts".format(len(utts)))
    print("{0:d} not found".format(len(utts_not_found)))
    print("selected utts from {0:s} -- duration = {1:.2f} hours".format(key, dur))
    return dur

In [38]:
def create_vocab(words_list):
    out = {"w2i":{}, "i2w":{}, "freq":{}, "freq_dev":{}}
    START_VOCAB = [PAD, GO, EOS, UNK]
    for w in START_VOCAB:
        out['w2i'][w] = len(out["w2i"])
        out["freq"][w] = 1
    #for w in words_list['words']:
    sorted_w = sorted(words_list['words'].items(), reverse=True, key=lambda t: t[1]['train'])
    for w in sorted_w:
        encoded_word = w[0].encode()
        out["w2i"][encoded_word] = len(out["w2i"])
        out["freq"][encoded_word] = w[1]["train"]
        out["freq_dev"][encoded_word] = w[1]["dev"]

    out["i2w"] = {val:key for key, val in out["w2i"].items()}
    return out

In [39]:
train_dur, dev_dur = get_duration(en_train_utts, key="fisher_train"), get_duration(en_dev_utts, key="fisher_dev")

--------------------------------------------------------------------------------
138795 total utts
89 not found
selected utts from fisher_train -- duration = 161.62 hours
--------------------------------------------------------------------------------
3979 total utts
2 not found
selected utts from fisher_dev -- duration = 4.35 hours


### Task 0 - 500 randomly selected frequent words, minor filtering

In [None]:
min_dev_freq=10
max_dev_freq=10000
min_train_freq=50 
min_len=5

In [None]:
terms_of_interest = get_details_for_words(common_words, 
                                          min_dev_freq=min_dev_freq, 
                                          max_dev_freq=max_dev_freq, 
                                          min_train_freq=min_train_freq, 
                                          min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(terms_of_interest["words"])))

In [None]:
_, _ = get_duration(terms_of_interest["train_utts"], key="fisher_train"), get_duration(terms_of_interest["dev_utts"], key="fisher_dev")

In [None]:
random.seed("hmm")
sample_terms = random.sample(list(terms_of_interest["words"].keys()), 500)

In [None]:
sample_terms_details = get_details_for_words(sample_terms, 
                                          min_dev_freq=min_dev_freq, 
                                          max_dev_freq=max_dev_freq, 
                                          min_train_freq=min_train_freq, 
                                          min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(sample_terms_details["words"])))

In [None]:
_, _ = get_duration(sample_terms_details["train_utts"], key="fisher_train"), get_duration(sample_terms_details["dev_utts"], key="fisher_dev")

In [None]:
sample_terms[:10]

In [None]:
bow_top_500_words_vocab = create_vocab(sample_terms_details)

In [None]:
pickle.dump(bow_top_500_words_vocab, open(os.path.join(m_cfg['data_path'], "bow_top_500_words_vocab.dict"), "wb"))

### Task 1 - randomly selected frequent words

In [None]:
min_dev_freq=10 
max_dev_freq=100
min_train_freq=100
min_len=5

In [None]:
terms_of_interest = get_details_for_words(common_words, 
                                          min_dev_freq=min_dev_freq, 
                                          max_dev_freq=max_dev_freq, 
                                          min_train_freq=min_train_freq, 
                                          min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(terms_of_interest["words"])))

In [None]:
_, _ = get_duration(terms_of_interest["train_utts"], key="fisher_train"), get_duration(terms_of_interest["dev_utts"], key="fisher_dev")

In [None]:
random.seed("hmm")
sample_terms = random.sample(list(terms_of_interest["words"].keys()), 100)

In [None]:
sample_terms_details = get_details_for_words(sample_terms, 
                                          min_dev_freq=min_dev_freq, 
                                          max_dev_freq=max_dev_freq, 
                                          min_train_freq=min_train_freq, 
                                          min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(sample_terms_details["words"])))

In [None]:
_, _ = get_duration(sample_terms_details["train_utts"], key="fisher_train"), get_duration(sample_terms_details["dev_utts"], key="fisher_dev")

In [None]:
sample_terms[:10]

In [None]:
bow_top_100_words_vocab = create_vocab(sample_terms_details)

In [None]:
pickle.dump(bow_top_100_words_vocab, open(os.path.join(m_cfg['data_path'], "bow_top_100_words_vocab.dict"), "wb"))

### Task 2 - topics as keywords

In [None]:
train_text_fname= "../installs/fisher-callhome-corpus/corpus/ldc/fisher_train.en"
topics_fname = "../criseslex/fsp06_topics_in_english.txt"

In [None]:
topics = [ "peace", "Music", "Marriage", "Religion", "Cell phones", 
           "Dating", "Telemarketing and SPAM", "Politics", "Travel", 
           "Technical devices", "Healthcare", "Advertisements", "Power", 
           "Occupations", "Movies", "Welfare", "Breaking up", "Location", 
           "Justice", "Memories", "Crime", "Violence against women", "Equality", 
            "Housing", "Immigration",     
            # new topics
           "Interracial", "Christians", "muslims", "jews", "e-mail", 
           "phone", "democracy", "Democratic", "Republican", "technology", 
           "leadership", "community", "jury", "police", "inequality", 
           "renting", "Violence", "immigrants", "immigrant", "skilled", 
           "Telemarketing", "SPAM", "skill", "job", "health", "mobile", 
            "ads", "physical", "emotional", "bubble", "rent", "economy", 
            "abuse", "women", "city", "country", "suburban", "dollar", 
            "united states", "laws", "phone", "race", "biracial", "interracial", 
            "marriage", "lyrics", "sexuality", "medicine", "television", "european",
            "home", "protect", "spouse", "language", "cellphone", "money",
            "doctor", "insurance", "cigarettes", "alcohol", "income", "salary",
            "class", "censor", "rating", "programs", "government",
            "relationship", "legal", "event", "life", "safe", "victim", "cops",
            "wage", "illegal"
            ]
topics = list(set(t.lower() for t in topics))
topics_stem = [stem(t) for t in topics]

# add similar topic words
new_topics = []
# for t in topics:
#     if t.encode() in sim_dict['w']:
#         new_topics.extend([w.decode() for w in sim_dict['w'][t.encode()]])
topics.extend(new_topics)

In [None]:
len(topics)

In [None]:
topics_details = get_details_for_words(topics, en_common_words, en_word_utt_count, 
                                       min_dev_freq=5, 
                                       max_dev_freq=10000, 
                                       min_train_freq=10, 
                                       min_len=1)
print("total words meeting criteria = {0:d}".format(len(topics_details["words"])))

In [None]:
_, _ = get_duration(topics_details["train_utts"], key="fisher_train"), get_duration(topics_details["dev_utts"], key="fisher_dev")

In [None]:
print("\n".join(list(topics_details["words"].keys())))

In [None]:
topics_details['words']

In [None]:
bow_topics_vocab = create_vocab(topics_details)

In [None]:
haha = pickle.load(open(os.path.join(m_cfg['data_path'], "bow_topics_vocab.dict"), "rb"))

In [None]:
len(set(haha['w2i']) & set(bow_topics_vocab['w2i'].keys()))

In [None]:
pickle.dump(bow_topics_vocab, open(os.path.join(m_cfg['data_path'], "bow_topics_vocab.dict"), "wb"))

### Task 3 - crises terms as keywords

In [7]:
crises_lex_fname = "../criseslex/CrisisLexLexicon/CrisisLexRec.txt"

In [8]:
crises = set()
with open(crises_lex_fname, "r") as in_f:
    for line in in_f:
        crises.update(line.strip().split())
crises = list(crises)
crises_stem = [stem(w) for w in crises]

# new_crises = []
# for t in crises:
#     if t.encode() in sim_dict['w']:
#         new_crises.extend([w.decode() for w in sim_dict['w'][t.encode()]])
# crises.extend(new_crises)

In [9]:
len(crises)

288

In [46]:
crises_details = get_details_for_words(crises, en_common_words, en_word_utt_count,
                                       min_dev_freq=10, 
                                       max_dev_freq=1000, 
                                       min_train_freq=100, 
                                       min_len=1)
print("total words meeting criteria = {0:d}".format(len(crises_details["words"])))

number of in-vocab words = 123
total words meeting criteria = 40


In [None]:
_, _ = get_duration(crises_details["train_utts"], key="fisher_train"), get_duration(crises_details["dev_utts"], key="fisher_dev")

In [None]:
en_word_utt_count.keys()

In [None]:
len(en_word_utt_count['dev_utts']['people'])

In [None]:
print("\n".join(list(crises_details["words"].keys())))

In [None]:
crises_details["words"]

In [None]:
bow_crises_vocab = create_vocab(crises_details)

In [None]:
pickle.dump(bow_crises_vocab, open(os.path.join(m_cfg['data_path'], "bow_crises_vocab.dict"), "wb"))

### More Crisis

In [50]:
crises_details = get_details_for_words(crises, en_common_words, en_word_utt_count,
                                       min_dev_freq=5, 
                                       max_dev_freq=1000, 
                                       min_train_freq=50, 
                                       min_len=1)
print("total words meeting criteria = {0:d}".format(len(crises_details["words"])))

number of in-vocab words = 123
total words meeting criteria = 59


In [51]:
_, _ = get_duration(crises_details["train_utts"], key="fisher_train"), get_duration(crises_details["dev_utts"], key="fisher_dev")

--------------------------------------------------------------------------------
30331 total utts
11 not found
selected utts from fisher_train -- duration = 58.82 hours
--------------------------------------------------------------------------------
1138 total utts
0 not found
selected utts from fisher_dev -- duration = 2.08 hours


In [52]:
en_word_utt_count.keys()

dict_keys(['train', 'dev', 'train_utts', 'dev_utts'])

In [53]:
len(en_word_utt_count['dev_utts']['people'])

204

In [54]:
print("\n".join(list(crises_details["words"].keys())))

remember
case
stay
want
please
leave
city
huge
help
love
time
give
gets
kill
someone
need
lost
house
heart
people
large
another
force
saying
morning
black
terrible
situation
even
service
home
lives
watch
women
years
police
safe
life
coast
government
free
make
live
town
news
change
cost
first
coming
waiting
send
girl
name
found
return
areas
involved
high
brought


In [55]:
crises_details["words"]

{'another': {'dev': 45, 'train': 1215},
 'areas': {'dev': 7, 'train': 102},
 'black': {'dev': 8, 'train': 202},
 'brought': {'dev': 14, 'train': 98},
 'case': {'dev': 14, 'train': 432},
 'change': {'dev': 19, 'train': 425},
 'city': {'dev': 39, 'train': 1045},
 'coast': {'dev': 5, 'train': 60},
 'coming': {'dev': 20, 'train': 253},
 'cost': {'dev': 7, 'train': 175},
 'even': {'dev': 73, 'train': 1582},
 'first': {'dev': 20, 'train': 937},
 'force': {'dev': 8, 'train': 79},
 'found': {'dev': 15, 'train': 290},
 'free': {'dev': 7, 'train': 261},
 'gets': {'dev': 20, 'train': 370},
 'girl': {'dev': 34, 'train': 598},
 'give': {'dev': 37, 'train': 999},
 'government': {'dev': 9, 'train': 333},
 'heart': {'dev': 8, 'train': 53},
 'help': {'dev': 38, 'train': 529},
 'high': {'dev': 20, 'train': 255},
 'home': {'dev': 34, 'train': 537},
 'house': {'dev': 42, 'train': 1134},
 'huge': {'dev': 10, 'train': 119},
 'involved': {'dev': 5, 'train': 89},
 'kill': {'dev': 7, 'train': 108},
 'large': {

In [56]:
bow_crises_vocab = create_vocab(crises_details)

NameError: name 'create_vocab' is not defined

In [57]:
pickle.dump(bow_crises_vocab, open(os.path.join(m_cfg['data_path'], "bow_crises_vocab_more.dict"), "wb"))

NameError: name 'bow_crises_vocab' is not defined

### Task Spanish - 500 randomly selected frequent words, minor filtering

In [None]:
min_dev_freq=18
max_dev_freq=10000
min_train_freq=200
min_len=5

In [None]:
terms_of_interest = get_details_for_words(es_common_words,
                                          es_common_words,
                                          es_word_utt_count,
                                          min_dev_freq=min_dev_freq, 
                                          max_dev_freq=max_dev_freq, 
                                          min_train_freq=min_train_freq, 
                                          min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(terms_of_interest["words"])))

In [None]:
_, _ = get_duration(terms_of_interest["train_utts"], key="fisher_train"), get_duration(terms_of_interest["dev_utts"], key="fisher_dev")

In [None]:
random.seed("hmm")
sample_terms = random.sample(list(terms_of_interest["words"].keys()), 
                             min(len(terms_of_interest["words"].keys()), len(terms_of_interest["words"].keys())))

In [None]:
# sample_terms = ["bueno"]

In [None]:
sample_terms_details = get_details_for_words(sample_terms,
                                             es_common_words,
                                             es_word_utt_count,
                                              min_dev_freq=min_dev_freq, 
                                              max_dev_freq=max_dev_freq, 
                                              min_train_freq=min_train_freq, 
                                              min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(sample_terms_details["words"])))

In [None]:
_, _ = get_duration(sample_terms_details["train_utts"], key="fisher_train"), get_duration(sample_terms_details["dev_utts"], key="fisher_dev")

In [None]:
sample_terms

In [None]:
# sample_terms_details

In [None]:
bow_es_top_words_vocab = create_vocab(sample_terms_details)

In [None]:
bow_es_top_words_vocab.keys()

In [None]:
sample_terms_details.keys()

In [None]:
sample_terms_details["train_utts"][:10]

In [None]:
sample_terms_details['words']

In [None]:
sample_terms_details['words']['bueno']

In [None]:
bow_es_top_words_vocab['freq'][b'bueno'], bow_es_top_words_vocab['freq_dev'][b'bueno']

In [None]:
pickle.dump(bow_es_top_words_vocab, open(os.path.join(m_cfg['data_path'], "bow_es_100word_vocab.dict"), "wb"))

In [None]:
pickle.dump(bow_es_top_words_vocab, open(os.path.join(m_cfg['data_path'], "bow_es_1word_vocab.dict"), "wb"))

In [None]:
!su s1444673

In [None]:
pickle.dump(bow_es_top_words_vocab, open(os.path.join(m_cfg['data_path'], "bow_es_100word_vocab.dict"), "wb"))

In [None]:
pickle.dump(bow_es_top_words_vocab, open(os.path.join(m_cfg['data_path'], "bow_es_top_words_vocab.dict"), "wb"))

In [None]:
sample_terms_details['words']['colorado']

In [None]:
sorted([(w, sample_terms_details['words'][w]['train']) 
       for w in sample_terms_details['words']], reverse=True, key=lambda t: t[1])[:20]

In [None]:
sorted([(w, sample_terms_details['words'][w]['dev']) 
       for w in sample_terms_details['words']], reverse=True, key=lambda t: t[1])[:20]

In [None]:
m_cfg['data_path']

In [None]:
!ls fbanks_80dim_nltk

In [None]:
m_cfg["sim_dict"]

In [None]:
sim_dict = pickle.load(open(os.path.join(m_cfg['data_path'], 'pre_trained_sim.dict'), "rb"))

In [None]:
len([i for i in sim_dict['w'].values() if len(i)>1])

In [None]:
sim_dict['w'][b'sure']

In [None]:
sim_dict['w']

In [98]:
mix_sim_dict = pickle.load(open(os.path.join(m_cfg['data_path'], 'mix_sim.dict'), "rb"))

In [99]:
len([i for i in mix_sim_dict['w'].values() if len(i)>1])

7314

In [100]:
for w, i in [(w.decode(),[j.decode() for j in i]) for w, i in mix_sim_dict['w'].items() if len(i) > 2]:
    print(w, "  & ", ", ".join(set(i)-set([w])), " \\\\")

hello   &  olga, hi, ¡hi, jolanda, ¡hello, yolanda, josefina, ¿hello, patricia, jose  \\
silvia   &  olga, audria, auria, silvio, claudia, mariza, gloria, josé, josma, josefina  \\
nicole   &  alfredo, yosma, fernando, mauricio, nicolas, fernanda, josma, rodrigo, arturo, zulma  \\
where   &  ¿where, nowhere  \\
philadelphia   &  phila, yosma, eugenia  \\
study   &  undergraduate, studio, graduate, studies, studying  \\
included   &  include, including, includes, excluded  \\
people   &  peop, peo  \\
but   &  ¿but, although, though, however  \\
participated   &  participation, participating, participant, participants, participate  \\
hmm   &  hm, mmm, uhmm, hmmmm, hmmm, mm, mmmm, mmmhmm, mmhmm  \\
son   &  father, daughter, nephew, grandfather, eldest, grandson  \\
country   &  countryside, countrylike, countrymen  \\
hispanic   &  hispanics, latino  \\
parents   &  grandparents, parent  \\
sure   &  really, think  \\
very   &  quite, extremely  \\
cuban   &  puertorican, indian, cuba,

harms   &  q, harmful, strength, arms, unfair, harm  \\
josefina   &  olga, silvia, ¡hi, alvaro, joe, josé, miriam, josma, paulina, jose  \\
answered   &  answer, answering, answers  \\
winter   &  hot, hotter, summer, winters, autumn, cold, windy, temperature, humid, painter  \\
nearby   &  midtown, hampshire, ottawa, acapulco, downtown, aquarium, near, norfolk, pittsburgh, utah  \\
kansas   &  albuquerque, maryland, switzerland, cuarnabaca, ohio, acapulco, oaxaca, bolivia, orlando, utah  \\
movement   &  cement, equipment, movements  \\
receive   &  receives, deceive, received, receiving  \\
omaha   &  cuarnabaca, hampshire, bogota, acapulco, aquarium, lincoln, volcano, 2003, pittsburgh, utah  \\
awful   &  grateful, horrible  \\
diseases   &  circles, refugees, sacrifices, geniuses, obstacles, causes, phrases, disease, purchases, horses  \\
water   &  sweater, heater  \\
flood   &  flat, flooded, firewood, floods, floyd, vacuum, floor, floors  \\
dinner   &  dawn, dinners, lunch  \\

inclination   &  civilization, inspiration, combination, inclinations, nation, denomination, ambition, regulation, legislation, segregation, coalition  \\
political   &  polite, politic, politically, politician  \\
dis   &  distract, dislike, disc, disco, dispute, discos, di, disabled  \\
discover   &  discos, discovers, discovered, discovery  \\
defending   &  ending, deciding, responding, kidding, demanding, dealing, attacking, defend, instilling, hiding  \\
sundays   &  saturday, saturdays, sunday, monday, tuesdays, fridays, wednesdays, thursdays  \\
celebrations   &  locations, celebrating, reservations, celebration, celebrate, exceptions, generations  \\
wear   &  wears, swear  \\
uniform   &  unified, unique, unify, buddhist, condemn, ii  \\
shirt   &  shift, shuttle, cloth, shirts, shut, shit, tshirt, skirt  \\
forced   &  baptized, forcing, abandoned, devoted, convinced  \\
rules   &  forces, molecules, principles  \\
regulations   &  variations, limitations, traditions, denomi

expressing   &  expressed, dressing, express, messing, expression  \\
discrimination   &  combination, discriminatory, eliminate, denomination, discriminates, discriminating, denominations, imagination, racism, discriminate, discriminated, inclination, criminal  \\
obscene   &  conclusion, q, odd, vocabulary, phenomenon, chorus, fundamental, scene, admire, sadomasochism  \\
blacks   &  asians, lacks, blackberry, whites, black  \\
sell   &  buy, sellers, sells  \\
hate   &  gate, isolate  \\
dollar   &  doll, peso, dollars  \\
lorena   &  ¡hi, alvaro, jorge, marcela, mauricio, gloria, fernanda, josma, paulina, zulma  \\
antonio   &  juan, tijuana, vanesa, navy, francisco, pedro, san, oro, rodrigo, diego  \\
¿oh   &  odd, ooh, mmh, ¿uh, ¿yeah, uf, ¡oh, pyramids, ahhh, ohh  \\
finance   &  finances, unemployment, maintenance, financial  \\
engineering   &  engineer, engine, chemical, chemistry, engineers, biochemistry, studying, phd  \\
sugar   &  vulgar, egg, walmart  \\
tropical   &  va

terrorism   &  terrorists, fanaticism, terror, terrorist, errors, error, evil, domestic, sexism, evidences  \\
threats   &  threat, treats  \\
¿from   &  yosma, from, ontario, rafael, rodrigo, originally, zulma  \\
distracts   &  distract, distracted, distraction  \\
exercising   &  exercise, raising, forcing, rising, entertaining, exaggerating, swimming, screaming, meaning, hiring  \\
cheer   &  cheese, cheers  \\
artists   &  articles, artifacts, artist, artistic  \\
entertained   &  entertaining, entertain, entertainment  \\
mainly   &  mostly, primarily  \\
fort   &  fortworth, forth  \\
orlando   &  dc, florida, marcelo, fernando, ohio, paso, wilmington, oro, kansas, zulma  \\
airport   &  passport, airports, airplane  \\
windy   &  winter, temperatures, wind, autumn, cloudy, temperature, humid, rainy  \\
frozen   &  freezing, snowing, shift, freeze, slightly, autumn, freezes, drops, clouds, cloudy  \\
shower   &  showed, shoot, show, shown  \\
written   &  writes, write  \\
creat

korea   &  hopkins, cuarnabaca, oregon, korean, guadalajara, volcano, salmon, salesman, buffalo, zoo  \\
thailand   &  switzerland, finland, cuarnabaca, jamaica, acapulco, egypt, guadalajara, poland, holland, ibiza  \\
poland   &  albuquerque, cuernavaca, switzerland, valencia, cuarnabaca, arcadia, jamaica, thailand, volcano, utah  \\
educational   &  education, traditional, nutritional  \\
therefore   &  therapist, theft, thus, uf  \\
loving   &  arguing, moving, saving, riding, removing, adapting, facing, improving, hurting, hiding  \\
embarrassing   &  struggling, fascinating, embarrassed, adapting, analyzing, facing, managing, missing  \\
embarrassed   &  embarrasses, embassy, intrigued, embarrassing  \\
actor   &  factor, victor  \\
carries   &  dries, worries  \\
intimate   &  interfere, legitimate, interrupt, accumulate, interpret, adequate  \\
speaker   &  speak, speaks, speakers, speakerphone, spanglish  \\
method   &  ambulance, odd, fiancee, u2, dsl, 9, mp3s, methods, ehhh  

oregon   &  cuernavaca, albuquerque, cuarnabaca, jamaica, acapulco, oaxaca, guadalajara, mblas, volcano, utah  \\
nephews   &  nephew, niece, uncles  \\
annoys   &  annoy, annoying  \\
telemarketing   &  marketing, telemarketers, telemarketer  \\
trash   &  trailer, garbage, package  \\
winters   &  winter, summers  \\
preparatory   &  academic, mandatory, preparing  \\
fill   &  fulfill, filth, fills  \\
heating   &  quitting, beating, hitting, melting, exaggerating, shooting, cheating, cutting, eating, rating  \\
complained   &  complain, complaint, explained, combined, complaining, complaints, complains  \\
increases   &  decreasing, increased, decrease, increasing, increase, decreased, decreases  \\
disoriented   &  adopted, occupied, accustomed, disappointed, talented, terrified, amazed, occurred, disabled, oriented  \\
salaries   &  sacrifices, salary  \\
establish   &  reflect, stabilize, strategy, immoral, established, conduct  \\
budget   &  target, exercise, rush, fulfill, bu

contribute   &  condoms, con, contributes, contributed, conduct, imperialism, controversy, contradictory, controls, controversial  \\
donate   &  gate, imitate, assimilate, quote, senate, donating, accurate, debate, evaluate, isolate, adequate  \\
politician   &  political, politic, polite, politics, politicians  \\
satisfied   &  occupied, unified, traumatic, simplified  \\
complaints   &  complain, complaint, complaining, complained, complains  \\
questioning   &  questioned, question, questions  \\
action   &  inspiration, creation, obligation, destruction, devotion, reflection, infection, reaction, function, rejection  \\
constitution   &  civilization, evolution, abortion, institutions, pollution, institute, institutional, institution, democratic, organization  \\
communist   &  community, communism, communion, communists  \\
followed   &  follows, allowed, follow  \\
laia   &  audria, silvia, victoria, eugenia, auria, gloria, paula, cia, patricia, josma  \\
catalonia   &  alterna

accepting   &  existing, acceptable, accepts, insulting, accept  \\
ashamed   &  assumed, gained, harmed, combined, strongly, refined, succeed, terrified, corrupted  \\
cla   &  classmate, claus  \\
wanting   &  quitting, worrying, chatting, arguing, skying, adapting, facing, shooting, hoping, seeking  \\
whore   &  gore, whoa, who´s  \\
trapped   &  sued, kidnapped, occupied, accustomed, trained, assaulted, equipped, amazed, occurred, robbed  \\
bitch   &  switch, shuttle, witch  \\
unique   &  hypocrites, idyllic, assimilate, unify, chauvinistic, buddhist, hypocrisy, hypocrite, uniform, dialogue  \\
protestants   &  principle, catholics, baptists, protests, principles, prisoners, protects, commandments, principal, protestant, protestantism, protest  \\
wearing   &  quitting, drinking, wore, caring, wears, sitting, swimming, hitting, hearing, sharing, hiring, smoking  \\
spirit   &  jehovah, spiritual, jehova, jesus, spiritually  \\
structure   &  imperialism, structures, destructive,

weighed   &  weighs, weighing  \\
gained   &  fed, melted, conjoined, ashamed, combined, refined, terrified, rained, gaining  \\
liver   &  kidneys, kidney  \\
chains   &  chairs, chain, ruins  \\
burger   &  mcdonald´s, hamburger, mcdonalds, hamburgers, mcdonald  \\
assimilate   &  donate, imitate, unique, exclude, describe, excluded, accurate, assimilates, dialogue, tolerate, adequate  \\
manipulations   &  manipulating, manipulation  \\
filth   &  filter, fill, filters  \\
nuggets   &  puppets, upsets, yogurt, sweets, gadgets, cuts, eats  \\
weighs   &  weighed, weighing, weight, weights  \\
freezing   &  spring, climbing, freeze, snowboarding, autumn, melting, dropping, snowing, raining, frozen  \\
graduating   &  graduated, enrolling, graduation  \\
responsibilities   &  stability, flexibility, benefits, facilities, abilities, respond, responsibility, responsible, ability, possibilities  \\
17   &  30, 21, 18, twentyseven, 16, 9, 23, 19, 14, 4  \\
60   &  wisconsin, 21, 18, u2, mi

noticing   &  struggling, fascinating, criticizing, forcing, analyzing, frustrating, facing, repeating, managing, sacrificing  \\
sox   &  phillies, yankees  \\
remedy   &  chorus, remedies  \\
boots   &  pots, boats, walmart, idiots, bookstores  \\
shot   &  showed, shift, shocked, shoe, shots, shoot  \\
slightly   &  slip, fly, yep, somewhat, freezes, lightly, frozen  \\
terror   &  terrorist, errors, terry, terrorism, error, evil, terrorists  \\
arrange   &  range, orange  \\
sates   &  satellites, hates, plates, rates  \\
concrete   &  aquarium, autumn, ottawa, outrageous  \\
ivette   &  j, cassette  \\
afternoons   &  mornings, evenings  \\
wave   &  microwaves, microwave, waves  \\
simplified   &  length, ideology, traumatic, obliged, justified, impersonal, simple, passionate, qualified, individual  \\
suburbs   &  midtown, suburb, outskirts  \\
rushing   &  quitting, climbing, washing, hanging, fixing, adapting, screaming, swimming, settling, fishing, yards  \\
factor   &  facto

fairly   &  mn, extremely, quite, ugly, truthfully, fairy, slowly, fair  \\
nail   &  hail, fail, tail  \\
perfume   &  q, j, schwarzenegger, mp3s, uf, ¡oh  \\
individuals   &  exclude, rejects, prejudices, ignorance, liberals, extremists, individually, conflicts, individual, buddhism  \\
sacrificing   &  needing, struggling, sacrifices, producing, criticizing, forcing, proposing, refinancing, sacrificed, sacrifice, financing  \\
respectable   &  suitable, acceptable, respect, reflect, respectful, uncomfortable, inevitable, respected, disrespectful, respects  \\
diaper   &  paper, diapers, gallbladder  \\
implies   &  selfsufficient, procedures, prejudices, results, consequences, bureaucracy, enemies, issues, individual, individuals  \\
universe   &  universal, universities, university, diversity  \\
beneficial   &  selfsufficient, benefits, prohibitive, financial, mutual, efficient, benefit, official, sufficient, beneficiary  \\
whatsoever   &  fever, clever, whenever  \\
ego   &  gua

gardel   &  garden, boulevard, wooden, gardens, flavor, buffalo  \\
closing   &  losing, climbing, riding, kidding, kissing, missing, attacking, shooting, loosing, hiding  \\
buffalo   &  felipe, bogota, aquarium, guadalajara, volcano, kennedy, uptown, allentown, utah, zoo  \\
complaint   &  complain, complaining, complained, complaints, complains  \\
ingrid   &  olga, alfredo, silvia, ¡hi, alvaro, yosma, mercedes, josma, arturo, josefina  \\
complains   &  complain, complaint, complaining, complained, complaints  \\
transplants   &  transplant, transplantation  \\
myth   &  q, curriculum, hawaii, u2, mitsubishi, mr, dictatorship, sadomasochism  \\
sustain   &  maintain, remain, obtain  \\
mistreatment   &  mistreatments, mistreated, mistreating  \\
aspirins   &  tylenol, idiots, forgiveness  \\
thirtythree   &  thirty, twentyfive, 30, twentyseven, twentyfour, 9, twentyone, twentysix, thirtyseven, twentytwo  \\
consent   &  consume, resentment, continent, confirm, content, condemn, ext

In [84]:
pre_sim_dict = pickle.load(open(os.path.join(m_cfg['data_path'], 'pre_trained_sim.dict'), "rb"))

In [85]:
len([i for i in pre_sim_dict['w'].values() if len(i)>1])

3460

In [86]:
for w, i in [(w.decode(),[j.decode() for j in i]) for w, i in pre_sim_dict['w'].items() if len(i) > 2]:
    print(w, "  & ", ", ".join(set(i)-set([w])), " \\\\")

but   &  though, although, however  \\
son   &  father, daughter, nephew, grandfather, eldest, grandson  \\
hispanic   &  hispanics, latino  \\
sure   &  really, think  \\
very   &  quite, extremely  \\
father   &  brother, son, uncle, grandfather, stepfather, mother  \\
grandparents   &  grandparent, grandmother, parents  \\
eleven   &  six, fourteen, seven, fifteen, twelve, five, eight, thirteen, seventeen, nine  \\
nine   &  six, seven, fourteen, four, fifteen, five, eight, thirteen, twelve, eleven  \\
married   &  remarried, divorced  \\
thirty   &  fifty, eighty, fifteen, twenty, sixty, forty, ninety  \\
seven   &  six, four, fourteen, three, fifteen, five, eight, twelve, eleven, nine  \\
divorced   &  remarried, married, divorcing  \\
ten   &  nine, five, eight, fifteen  \\
twenty   &  thirty, fifty, eighty, fifteen, sixty, forty  \\
three   &  six, seven, four, five, eight, two, nine  \\
two   &  three, six, four  \\
think   &  honestly, definitely, anyway, really, sure  \\
marr

vaccine   &  vaccines, vaccinations, vaccination  \\
vaccinated   &  vaccinate, vaccinations, vaccination  \\
vaccinate   &  vaccinated, vaccinations, vaccination  \\
reimburse   &  reimbursed, reimbursement, reimbursing  \\
nonetheless   &  nevertheless, however  \\
paraguay   &  uruguay, bolivia  \\
divorcing   &  divorce, divorced  \\
paraguayan   &  paraguayans, uruguayan  \\
fiancee   &  fiance, girlfriend  \\
renovation   &  refurbishment, renovated, renovating  \\
investing   &  investments, investment, invest  \\
disconnection   &  disconnecting, disconnect, disconnects  \\
anyhow   &  anyways, anyway  \\
investment   &  investments, investors  \\
emotional   &  emotions, emotion  \\
sacrifices   &  sacrifice, sacrificed  \\
generalized   &  generalizing, generalize, generalised, generalization  \\
postgraduate   &  undergraduate, postgrade  \\
stimulating   &  stimulate, stimulates  \\
stimulate   &  stimulating, stimulates  \\
individualism   &  individualistic, individualist

In [87]:
pre_sim_dict['w'][b'sure']

[b'sure', b'really', b'think']

In [88]:
sim_dict = pickle.load(open(os.path.join(m_cfg['data_path'], 'sim.dict'), "rb"))

In [89]:
len([i for i in sim_dict['w'].values() if len(i)>1])

5379

In [90]:
for w, i in [(w.decode(),[j.decode() for j in i]) for w, i in sim_dict['w'].items() if len(i) > 2]:
    print(w, "  & ", ", ".join(set(i)-set([w])), " \\\\")

hello   &  hi, olga, ¡hi, jolanda, ¡hello, yolanda, ¿hello, jose, patricia, josefina  \\
silvia   &  olga, audria, silvio, auria, claudia, mariza, gloria, josé, josma, josefina  \\
nicole   &  alfredo, yosma, fernando, mauricio, nicolas, fernanda, josma, rodrigo, arturo, zulma  \\
where   &  ¿where, nowhere  \\
philadelphia   &  phila, yosma, eugenia  \\
study   &  undergraduate, studio, graduate, studies, studying  \\
included   &  include, including, includes, excluded  \\
people   &  peop, peo  \\
participated   &  participation, participating, participant, participants, participate  \\
hmm   &  hm, mmm, uhmm, hmmmm, hmmm, mm, mmmm, mmmhmm, mmhmm  \\
country   &  countryside, countrylike, countrymen  \\
parents   &  grandparents, parent  \\
cuban   &  puertorican, indian, cuba, dominican, peruvian, rican, mexican, newyorkian  \\
father   &  fathers, godfather, grandfather, mother  \\
havana   &  eliana, rodriguez, iliana, liliana, sulma, mauricio, ontario, rodrigo, diana, zulma  \\


mistake   &  mistakes, mistaken  \\
ricans   &  puertorican, koreans, ricardo, mexicans, dominicans, cubans, beans, rican, rica  \\
dominicans   &  puertorican, indians, peruvians, ricans, mexicans, dominican, cubans, domingo, ecuadorean, rica  \\
venezuela   &  venezuelan, venezuelans  \\
sixty   &  pesos, thirty, fifty, eighty, sixth, zero, seventy, forty, fiftyfive, fahrenheit  \\
americans   &  american, america, latinamerican  \\
remembers   &  members, remembering, remembered, remember  \\
fruits   &  sweets, fruit, walmart, suits, potatoes  \\
travel   &  traveled, traveling, travelled, travelling, travels  \\
ninety   &  eighty, ninth, nineteen, nineties, nine  \\
originally   &  ¿from, origin, origins, original, ecuadorian  \\
descendant   &  descendants, bohemian, descent, korean, guadalajara, arabic  \\
televisions   &  channels, television  \\
soaps   &  dvds, mtv, pornography, soap, dvd, hbo, pornographic  \\
programs   &  programmer, program, programming  \\
repetition   

christmas   &  christopher, christmases, chris  \\
gifts   &  sweets, gift  \\
meal   &  heal, crucial  \\
candles   &  sweets, vegetables, boxes, candle, shots, flowers  \\
ceremony   &  harmony, q, odd, eharmony, debauchery, mr, k, idiot, frivolous, sadomasochism  \\
serious   &  tedious, mysterious, seriously, curious, luxurious  \\
praying   &  worrying, carrying, dying, forcing, skying, practicing, facing, lying, hiding  \\
conflict   &  condoms, aspects, condemn, controversy, conflicts, unfair, sexism, individual, controversial, individuals  \\
observe   &  deserve, reserve, deserves, describe, confirm, condemn, serve, serves, exploit, conduct  \\
obviously   &  obvious, previously  \\
childhood   &  children, grandchildren, child  \\
continued   &  continue, continuing, continues, continent  \\
believing   &  believes, believed, unbelievable, belief, dividing, believer, practicing, beliefs, relief, believe  \\
fifties   &  twenties, sixties, thirties, forties, eighties, nineties

finishes   &  finishing, finish, finished  \\
enormous   &  tremendous, gorgeous, marvelous, insignificant, cosmopolitan, indigenous, outrageous  \\
compromise   &  imperialism, q, promise  \\
demanding   &  decreasing, demand, defending, demands, ending, responding, deciding, dealing  \\
yikes   &  likes, earthquakes, smokes, jokes  \\
depressed   &  pressed, stressed, impressed, rushed, soaked, messed, touched, pushed  \\
evil   &  terror, terrorist, terrorism, error, devil  \\
laughing   &  laugh, crying, laughed, laughs  \\
faces   &  races, circles, anxieties, factories, obstacles, robberies  \\
silence   &  sentence, fence, existence, coexistence, patience, violence, confidence, audience  \\
laughs   &  mommy, laughing, cry, laughter, laugh, laughed  \\
sadness   &  q, ¿oh, sickness, happiness, goodness, loneliness, forgiveness, shameless, illness, ¡oh  \\
afterwords   &  afterwards, i´ll, after  \\
vancouver   &  toronto, cuernavaca, vanesa, ontario, nevada, ohio, columbia, mont

juarez   &  tijuana, nuevo, chihuahua, felipe, acapulco, aquarium, lauderdale, oro, villa, utah  \\
smooth   &  shepherd, habits, mr, smell, tooth, habit, uf, flavor, knife, cuts  \\
ground   &  round, background, greyhound  \\
x   &  fbi, ebay, schwarzenegger  \\
oaxaca   &  cuernavaca, albuquerque, switzerland, cuarnabaca, hampshire, jamaica, acapulco, rebeca, guadalajara, utah  \\
paso   &  arlington, chihuahua, acapulco, wilmington, flix, oro, orlando, montana, kansas, utah  \\
necessity   &  curiosity, accessible, necessarily, unnecessary, necessary  \\
tmobile   &  automobile, mobile, cellular, mobiles  \\
informed   &  info, inform  \\
kn   &  know, knew  \\
technology   &  technical, technologies, tech, technological  \\
blocking   &  parking, flying, seeking, caring, roaming, knocking, walking, picking, swimming, backpacking  \\
realizing   &  rushing, realize, arguing, analyzing, adapting, realizes, facing, managing, missing  \\
roaming   &  jumping, climbing, chatting, fixin

sponsored   &  prohibited, permitted, required, accompanied, proposed, censored, submitted, sponsor, admitted, promoted  \\
residency   &  resident, residential, residents, presidents, residence, presidency, president  \\
citizenship   &  citizen, citizens  \\
citizens   &  citizen, citizenship  \\
passports   &  sports, transport, airports, reports, sport, passport, export  \\
closest   &  closet, closer, closed, close  \\
requirements   &  requires, selfsufficient, elements, require, improvements, agencies, efficient, equipments, required, sufficient  \\
admitted   &  adopted, committed, determined, prohibited, permitted, obliged, sponsored, submitted, obligated, promoted  \\
phd   &  upenn, genetics, pennsylvania, postdoctorate, chemistry, penn, biochemistry, studying, laboratory, postdoc  \\
specialty   &  specialist, specialized, special, specially  \\
harvard   &  albuquerque, freshmen, swarthmore, mitsubishi, lakeward, ¿no, lincoln, 2003, pittsburgh, ibiza  \\
zero   &  october,

current   &  ambient, currency, currently  \\
politics   &  politicians, polite, politic, politician  \\
basque   &  region, imperialism, basis  \\
initiative   &  q, alternative, perspective, hawaii, collective, unfaithful, objective, imperialism, neutral, inferior  \\
murderers   &  f, manners, q, barriers, shelters, pastors, u2, murders, hormones, sadomasochism  \\
measures   &  measure, structures, procedures, failures  \\
zulma   &  alfredo, yosma, ¡hi, alvaro, sulma, fernando, mauricio, josma, rodrigo, vilma  \\
prize   &  print, pri, determine, prison, principal, admit, pride, declare, privilege, prosecutor  \\
receives   &  receive, deceive, received, receiving  \\
member   &  chamber, september, remember  \\
smarter   &  cnn, gangster, smart, shepherd, soso, rochester, quarter  \\
prove   &  profit, proud, improve, pro, provide, provider, provoke, approve, protect, prosecutor  \\
shoes   &  shoe, shots  \\
pots   &  boots, idiots  \\
hundreds   &  peso, pesos, hundred, dollars

balanced   &  balance, obliged  \\
discriminate   &  eliminated, eliminate, criminals, hypocrisy, racist, racism, rejected, discrimination, discriminated, criminal  \\
regard   &  regards, immoral, regardless, regarding, controversial  \\
unfortunate   &  fortunately, unfortunately  \\
desire   &  describe, desires, design  \\
aspect   &  respect, perspective, aspects, respectful, respects  \\
strength   &  length, circus, struggle, strategy, mutual, due, sexism, individual, bureaucratic, buddhism  \\
dictator   &  privileged, iraqi, wisconsin, iraqis, schwarzenegger, devoted, dictatorship, osama, ii  \\
insist   &  therapist, instill  \\
vices   &  devices, services, es  \\
addiction   &  inspiration, selection, recreation, reproduction, infection, sensation, dictionary, function, addition, action  \\
inconvenience   &  selfsufficient, indispensable, inconvenient, convenience, inevitable, convenient  \\
promote   &  profit, proof, prohibited, profile, pros, products, manipulate, aids,

terry   &  terror, ordinary  \\
regulated   &  regulate, established, legitimate, excluded, motivated, isolated, distributed  \\
mmh   &  ahm, ¿oh, mmhm, mhmh, umm, mmmm, ¡oh, mmmhmm, mmhmm, ohh  \\
debate   &  gate, illiterate, imitate, donate, accurate, vibrate, evaluate, demonstrate, desperate, adequate  \\
argument   &  disagreement, management, resentment, arguments, content, harassment, agreement, treatment, entertainment  \\
consequences   &  circumstances, prejudices, implies, conscience, enemies, sciences, references, consumers, consequence, evidences  \\
16   &  13, 21, 18, twentyseven, 14, 9, 23, 19, 17, thirtyseven  \\
finishing   &  rushing, nursing, finished, finishes, studding, finish, fishing  \\
va   &  cuarnabaca, cuernavaca, switzerland, acapulco  \\
exercise   &  exploit, budget, exercising  \\
who´s   &  whoa, whore  \\
intervene   &  intense, interfere, interference, interview, interest, interviews, interrupt, interior, interpret, interests  \\
affair   &  q, dete

wawa   &  barbie, boise, shift, ooh, shuttle, switch, aquarium, mp3s, barbeque, pittsburgh  \\
thomas   &  mblas, felipe  \\
fishing   &  finishing, rushing, washing, settling  \\
interfere   &  intervene, interview, integrate, intend, interests, interviews, interrupt, interior, interpret, interference  \\
injustice   &  corruption, corrupt, flexible, prejudice, justify, justified, absurd, imperialism, justice, unfair  \\
they´ve   &  therapy, theft, therapist  \\
humanitarian   &  veterinarian, chauvinistic, castilian, sebastian  \\
subjective   &  object, objective, subjects, subject  \\
predominant   &  principle, fanaticism, dominant, insignificant, dominate, ii  \\
civilization   &  inspiration, unification, authorization, ambition, legislation, corporation, coalition, foundation, inclination, organization  \\
acceptable   &  flexible, accepts, unbelievable, uncomfortable, unthinkable, unfaithful, accepting, inevitable, respectable, accept  \\
wasted   &  lasted, permitted, invest

marcos   &  audria, maría, marcelo, marc, marcela, mariza, mauricio, marisa, marco, paulina  \\
reimburse   &  500, retire  \\
sticks   &  sucks, stick  \\
range   &  strange, arrange, orange, stranger  \\
rely   &  entirely, sincerely  \\
mateo   &  maría, mariachis, mariza, mate, mam  \\
horses   &  circles, refugees, cafes, geniuses, boxes, diseases  \\
passion   &  passions, inspiration, recreation, devotion, tension, sensation, aggression, repetition, passionate, depression  \\
knife   &  exwife, shepherd, 21, smooth, u2, k, 23, guadalupe, uf, wife  \\
suggestion   &  unification, introduction, discussion, object, suggest, question, justify  \\
concern   &  concerned, concept, concerns, concert, concerts  \\
rated   &  hated, exaggerated, coated, saturated, frustrated, operated, animated  \\
supervise   &  supervisor, supervision, exploit, superior  \\
pg13   &  13, odd, hbo, tube, pg, k, 23, mp3s, adults, 6  \\
playstation   &  station, stations, radiation, invitation, quotation,

bum   &  bump, laura  \\
nicaragua   &  cuernavaca, paraguay, indiana, cuarnabaca, paraguayan, jamaica, guadalajara, utah, ecuadorian, honduras  \\
socialize   &  socialism, socialized, socially, social, imperialism, socialist  \\
ski   &  sky, skirt  \\
exclusive   &  offensive, massive, incentive, exclude, excessive, authentic, aggressive, unsafe, sensitive  \\
insult   &  fault, obliged, inspired, accuse, insulin, obligated  \\
stork   &  storm, store  \\
claus   &  claim, cla  \\
conjoined   &  sued, kidnapped, occupied, gained, abandoned, joined, condemned, fled  \\
hmo   &  q, u2, mitsubishi, mr, u, fbi, k, mp3s, uf, ¡oh  \\
tylenol   &  q, odd, u2, k, aspirins, mp3s, ¡oh, asthma, gallbladder, sadomasochism  \\
returns   &  return, returning, returned  \\
coasts   &  plantains, chiapas, coast, mountains, museums  \\
nudist   &  frequently, capitalist, journalist, asylum, hawaii, delinquency, parallel, mississippi, distrust, atheist  \\
humble   &  flexible, acceptable, unbelievab

In [113]:
pre_words = [w for w, i in pre_sim_dict['w'].items() if len(i)>1]
fisher_words = [w for w, i in sim_dict['w'].items() if len(i)>1]

In [114]:
len(pre_words), len(fisher_words)

(3460, 5379)

In [128]:
pre_only = set(pre_words) - set(fisher_words)
fisher_only = set(fisher_words) - set(pre_words)
common_only = set(pre_words) & set(fisher_words)

In [129]:
len(pre_only), len(fisher_only),  len(common_only), (len(set(pre_words) | set(fisher_words)))

(1935, 3854, 1525, 7314)

In [135]:
for w, i in sorted([(w.decode(),[j.decode() for j in i]) for w, i in pre_sim_dict['w'].items() if len(i) > 2 and w in pre_only]):
    print(w, "  & ", ", ".join(set(i)-set([w])), " \\\\")

accusing   &  accused, accuse, accuses  \\
acquitted   &  convicted, guilty  \\
adolescent   &  adolescents, adolescence  \\
advertize   &  advertizement, advertise  \\
afternoons   &  mornings, evenings  \\
agrees   &  asks, decides, refuses, convinces  \\
aim   &  aiming, aims  \\
airlines   &  airline, airways  \\
allowing   &  allows, allow  \\
almost   &  nearly, virtually  \\
ambitions   &  ambition, aspirations  \\
announcing   &  announce, announcement  \\
anyhow   &  anyways, anyway  \\
anyone   &  anybody, anything  \\
anythign   &  somethign, everythign  \\
anything   &  anythin, something, nothing, anyone  \\
argentine   &  argentinian, uruguayan, argentinean  \\
argentineans   &  argentines, argentinians, argentinean  \\
argentines   &  argentinians, chileans, argentineans, uruguayans  \\
attempt   &  attempted, attempts  \\
attempted   &  tried, attempt  \\
babysit   &  babysits, babysitting, babysitter  \\
babysits   &  babysit, babysitting, babysitter  \\
babysitting   

In [136]:
for w, i in sorted([(w.decode(),[j.decode() for j in i]) for w, i in sim_dict['w'].items() if len(i) > 2 and w in fisher_only]):
    print(w, "  & ", ", ".join(set(i)-set([w])), " \\\\")

'chavo   &  pizarro, japollo, brazilian, anthony, chayanne, rodriguez, eharmony, chambao, banderas, malaguena  \\
'i   &  j, barbie, idiot, notebook, viagra, k, he/she, phrase, uf, mb  \\
000   &  maximum, 80, 100, kilometers, 2003, 500, millionaire, 40, acre, 50  \\
10   &  pesos, 12, maximum, pm, 15, 6, 11, 40, fiftyfive, 8  \\
100   &  equivalent, ambulance, kilometers, 60, 000, 9, mp3s, 500, 80, costume  \\
11   &  10, 3, 9, 14, 40, pm, 2003, 50, 4, 8  \\
12   &  10, 13, 30, 11, 15, 9, 23, 6, pm, 14  \\
13   &  fourteen, 18, pg13, 16, sixteen, 9, 23, thirteen, 17, 14  \\
14   &  12, 13, 21, 18, 16, 9, 23, 11, 4, 8  \\
15   &  12, 10, 30, 18, 9, 6, 5, 11, 14, 8  \\
16   &  13, 21, 18, twentyseven, 14, 9, 23, 19, 17, thirtyseven  \\
17   &  30, 4, 21, 18, twentyseven, 16, 9, 23, 14, 19  \\
18   &  30, 21, 16, 14, 23, 25, 19, 17, 4, 8  \\
19   &  30, 21, 18, 16, 17, 9, 14, 23, 25, 7  \\
2   &  30, 3, 18, 15, 25, 2003, 19, 14, 4, 8  \\
20   &  30, 11, 18, 16, 9, 14, 25, 19, 7, 4  \\
20

claus   &  claim, cla  \\
cleaner   &  clean, salt  \\
cleaning   &  exercising, owning, burning, meaning, hiring  \\
clever   &  fever, whatsoever  \\
client   &  treatment, print, clients  \\
clients   &  client, treatments, arguments, patients  \\
climbing   &  rushing, jumping, freezing, quitting, fixing, fooling, roaming, screaming, swimming, settling  \\
close   &  closet, closest, closer, closed  \\
closed   &  closet, closest, closer, close  \\
closer   &  closet, closest, closed, close  \\
closest   &  closet, closer, closed, close  \\
closet   &  closest, closer, closed, close  \\
closing   &  losing, climbing, riding, kidding, attacking, shooting, kissing, missing, loosing, hiding  \\
cloth   &  clothes, shirt, skirt, clothing  \\
clouds   &  shift, shuttle, drop, freezes, shirts, clock, drops, cloudy, frozen  \\
cloudy   &  shift, drop, autumn, windy, freezes, bottle, drops, clouds, sunny, frozen  \\
clubs   &  club, bars  \\
cnn   &  albuquerque, aruba, volunteer, mba, u2,

dominican   &  puertorican, indian, dominant, cuban, dominguez, dominicans, domingo, rican, rica  \\
dominicans   &  puertorican, indians, peruvians, ricans, mexicans, dominican, cubans, domingo, ecuadorean, rica  \\
don't   &  dsl, k, do, uf, ¡oh  \\
doors   &  door, floors  \\
dot   &  w, yahoo  \\
double   &  doubled, doubts  \\
doubts   &  debts, doubt, double  \\
downloaded   &  mp3, downloading, aol, download, pc, yahoo  \\
downloading   &  mp3, advertising, fashionable, downloaded, download, pc  \\
downtown   &  outskirts, midtown, ottawa, allentown, lansdowne, lake, uptown, nearby  \\
drama   &  aruba, sadomasochism  \\
dramatic   &  idyllic, traumatic, autistic, drastic, verbal, fundamental, domestic, agnostic, exotic, bureaucratic  \\
drastic   &  plastic, fantastic, autistic, dramatic, fairy, fundamental, authentic, domestic, agnostic, exotic  \\
draw   &  puff, ooh, tube, gallbladder, homeless, kiss, fbi, k, ¡oh, raw  \\
dreaming   &  jumping, skiing, roaming, skying, screa

grammar   &  cnn, grad, grabs, grammys, grades, soso, granny  \\
grand   &  granddaughter, grandchild, granny, grandma, grandfather, grandparents, gray, grandmothers, grant, grandmother  \\
grandma   &  granddaughter, grandchild, granny, grand, grandfather, grandparents, gray, grandmothers, grandmother  \\
granny   &  granddaughter, grabs, grammys, grandchild, gray, grand, grandma, grant, grandmother, grammar  \\
grant   &  granny, grand  \\
grass   &  grabbed, gray, grabs, grab  \\
gray   &  granddaughter, grabs, grad, grandchildren, grandchild, granny, grand, grandma, grass, grandmother  \\
greatest   &  great, greater  \\
greyhound   &  round, ground, switzerland, ottawa, samsun, surrounded, aquarium, lincoln, volcano, pittsburgh  \\
gringo   &  gringos, italian  \\
ground   &  round, background, greyhound  \\
grow   &  growth, grown, grows  \\
grown   &  grow, growth, brown, grows  \\
grows   &  growth, grow, grown  \\
growth   &  grow, grown, grows  \\
guadalajara   &  guanajuato,

ma   &  maría, mam  \\
ma'am   &  maría, johnny, mariachis, mafia, mitsubishi, ¿i, marco, ¡oh, marvel, mam  \\
machines   &  machine, lines  \\
mafia   &  guillermo, bacteria, map, phobia, cia, ma'am, nigeria, nokia, marvel, mam  \\
magazine   &  magazines, cinema  \\
magazines   &  lines, magazine  \\
magical   &  logic, pharmaceutical, typical, logical, psychological, hysterical, radical, technical, physical  \\
mail   &  mailbox, hotmail, email, mails, emails, gmail, send  \\
mailbox   &  hotmail, address, junk, mail, addresses, mails, pc, gmail, box  \\
maintain   &  sustain, maintaining, maintained, obtain  \\
maintained   &  maintaining, determined, maintain  \\
maintaining   &  entertaining, decreasing, maintained, maintain  \\
maintenance   &  ambulance, employer, balance, unemployment, emphasis, finance, vigilance, employment  \\
majority   &  minority, major, priority  \\
maker   &  makes, makeup, make  \\
makeup   &  shake, maker  \\
making   &  taking, seeking, waking  \\
m

pc   &  merchandise, keyboard, mp3, aol, fedex, downloaded, mp3s, vehicle, msn, yahoo  \\
peacefully   &  unsafe, quietly, quieter, peaceful  \\
peanut   &  mcdonald´s, alert, barbie, zorro, shift, pinochet, mr, barbeque, uf, shut  \\
peas   &  irony, jerusalem, corona, surroundings, landscapes, buffalo  \\
pedro   &  dallas, juan, tijuana, antonio, ego, navy, mblas, francisco, juarez, diego  \\
penalty   &  pen, debt, declare, deposit, permit  \\
peninsula   &  cnn, aruba, mba, mitsubishi, mississippi, nigeria, petroleum, delinquency, osama  \\
penn   &  pennsylvania, postdoctorate, upenn, phd  \\
pennsylvania   &  upenn, postdoc, university, postdoctorate, phila, penn, phd  \\
peo   &  peop, people  \\
peop   &  peo, people  \\
people   &  peop, peo  \\
perceive   &  percent, deceive, percentage  \\
percent   &  perceive, cent, percentage  \\
perfect   &  perfection, perfectly  \\
perfection   &  perfect, reflection, infection, section, function, rejection, affection  \\
perfume   & 

rural   &  federal, neutral  \\
rush   &  screw, fulfill, confront, push, budget  \\
rushed   &  admitted, sued, occupied, harmed, equipped, occurred, pushed, submitted, screwed, cashed  \\
russian   &  vatican, korean, arabic, arab, asian  \\
russians   &  asians, galicians, italians, indians, lesbians, europeans, koreans, germans, veterinarians, fans  \\
sacramento   &  tijuana, cuarnabaca, bogota, acapulco, francisco, capitol, medellin, allentown, ibiza, incas  \\
sacred   &  hired, kicked, denied, occupied, murdered, scattered, equipped, crowded, occurred, fucked  \\
sadness   &  q, ¿oh, sickness, happiness, goodness, loneliness, forgiveness, shameless, illness, ¡oh  \\
sadomasochism   &  idyllic, q, odd, aruba, catastrophe, debauchery, u2, idiot, delinquency, ii  \\
safe   &  safer, unsafe, cafe, safety  \\
safer   &  unsafe, safe  \\
safety   &  capacity, ethnicity, 80, quantity, unsafe, safe  \\
saints   &  jehovah, mormons, baptists, witnesses, principles, jehova, evangelicals,

tolerance   &  exclude, ignorance, liberals, excluded, entrance, imperialism, tolerant, vigilance, individual, individuals  \\
tolerate   &  moderate, exclude, assimilate, strategy, integrate, tolerant, demonstrate, rate, desperate, illiterate  \\
tomato   &  bullet, pi, tube, yogurt, pig, earpiece, crash, tape, vehicle, sauce  \\
tomorrow   &  borrow, thursday  \\
tongue   &  bachata, regueton, merengue, reggaeton, bolero, salsa, shakira, reggae, jazz, ballads  \\
tonight   &  midnight, tight, night, overnight  \\
tool   &  aol, fool  \\
tools   &  pools, consumers, tobacco  \\
topics   &  discussions, topic  \\
toronto   &  rio, ontario, ohio, acapulco, columbia, nevada, montreal, rafael, kansas, vancouver  \\
touched   &  sued, melted, rushed, hooked, occupied, attached, soaked, scattered, amazed, occurred  \\
touching   &  rushing, quitting, reaching, catching, kidding, screaming, shooting, crawling, smoking, seeking  \\
tourism   &  tourist, tourists, touristic  \\
town   &  midto

In [137]:
for w, i in sorted([(w.decode(),[j.decode() for j in i]) for w, i in pre_sim_dict['w'].items() 
                    if (len(i) > 1 and w in common_only)]):
    print(w, "  & ", ", ".join(set(i)-set([w])), " \\\\")

abandon   &  abandoning  \\
abortion   &  abortions  \\
absurd   &  ridiculous, absurdity  \\
abused   &  abusing  \\
abuser   &  abusers  \\
abusers   &  abuser  \\
abusing   &  abused  \\
accent   &  accents  \\
accents   &  accent  \\
acceptable   &  unacceptable  \\
accumulate   &  accumulating  \\
accuse   &  accusing  \\
accused   &  accusing  \\
achieve   &  achieving  \\
actors   &  actresses  \\
adapt   &  adapting  \\
adapting   &  adapt  \\
addiction   &  addictions  \\
address   &  addresses  \\
addresses   &  address  \\
adolescents   &  adolescent  \\
adopted   &  adopting  \\
ads   &  advertisements  \\
advantages   &  disadvantages  \\
advertise   &  advertize  \\
advertisement   &  advertisements  \\
advertisements   &  ads, advertising, advertisement  \\
advice   &  advices  \\
affecting   &  affects  \\
affects   &  affecting  \\
afternoon   &  morning, evening  \\
afterwards   &  afterward  \\
agent   &  agents  \\
agree   &  disagree, concur  \\
agreement   &  agre

entertain   &  entertains  \\
entirely   &  completely  \\
entity   &  entities  \\
envelope   &  envelopes  \\
envelopes   &  envelope  \\
epidemic   &  cholera, epidemics  \\
equipment   &  equipments  \\
equipments   &  equipment  \\
error   &  errors  \\
errors   &  error  \\
escape   &  escaping  \\
especially   &  particularly  \\
essentially   &  basically  \\
ethics   &  ethical  \\
evangelical   &  lutheran, pentecostal, evangelicals  \\
evangelicals   &  evangelical  \\
evening   &  afternoon, morning  \\
eventually   &  finally, ultimately  \\
everybody   &  everyone  \\
everyone   &  everybody  \\
evidence   &  evidences  \\
evidences   &  evidence  \\
evidently   &  apparently  \\
exaggerated   &  exaggeration, exaggerating, exaggerate  \\
exaggerating   &  exaggeration, exaggerated, exaggerate  \\
example   &  instance  \\
exception   &  exceptions  \\
exceptions   &  exception  \\
excuses   &  excuse  \\
experiment   &  experiments  \\
experiments   &  experiment  \\
exp

morning   &  afternoon, evening  \\
mornings   &  evenings, afternoons  \\
mortgage   &  mortgages  \\
mortgages   &  mortgage  \\
mother   &  father, aunt, stepmother, grandmother  \\
moths   &  moth  \\
motivation   &  motivating, motivations  \\
mountain   &  mountains  \\
mountains   &  mountain  \\
moved   &  relocated  \\
movement   &  movements  \\
movements   &  movement  \\
movie   &  film, movies  \\
movies   &  movie  \\
mozart   &  beethoven  \\
murder   &  murders, murdering  \\
murdered   &  murdering  \\
murders   &  murder  \\
muslim   &  muslims  \\
muslims   &  christians, muslim  \\
necessary   &  needed  \\
negative   &  positive  \\
neighborhood   &  neighborhoods  \\
neighborhoods   &  neighborhood  \\
neither   &  nor  \\
nephew   &  brother, cousin, son, uncle, grandfather, grandson, nephews  \\
nephews   &  nephew, uncles  \\
nerves   &  nerve  \\
network   &  networks  \\
nevertheless   &  nonetheless, however  \\
newspaper   &  newspapers  \\
newspapers   &  

slower   &  faster  \\
slowly   &  rapidly, gradually  \\
smaller   &  larger  \\
smell   &  smelling  \\
smelling   &  smell  \\
snowboarding   &  skiing, snowboard  \\
soaked   &  soaking  \\
soap   &  soaps  \\
soaps   &  soap  \\
socialism   &  liberalism, communism  \\
socialist   &  socialists  \\
socialize   &  socializing  \\
sold   &  bought, purchased  \\
soldiers   &  troops  \\
solution   &  solutions  \\
solve   &  solving  \\
somebody   &  anybody, someone  \\
someone   &  somebody  \\
something   &  anything, thing  \\
somethings   &  somethink  \\
sometimes   &  often, usually  \\
somewhat   &  slightly  \\
song   &  songs  \\
songs   &  song  \\
south   &  north  \\
southern   &  northern  \\
specialization   &  specializations  \\
specialty   &  specialties  \\
speed   &  speeds  \\
sphere   &  spheres  \\
spiritual   &  spirituality  \\
spirituality   &  spiritual  \\
sponsor   &  sponsorship, sponsors  \\
stabilize   &  stabilized  \\
stains   &  stain  \\
stairs   

In [138]:
for w, i in sorted([(w.decode(),[j.decode() for j in i]) for w, i in sim_dict['w'].items() 
                    if (len(i) > 1 and w in common_only)]):
    print(w, "  & ", ", ".join(set(i)-set([w])), " \\\\")

abandon   &  pizarro, shepherd, abandoned, 23, arnie  \\
abortion   &  civilization, inspiration, authorization, devotion, ambition, portion, corporation, migration, repetition, institution  \\
absurd   &  idyllic, q, debauchery, mutual, molecule, imperialism, vietnam, condemn, agnostic, sadomasochism  \\
abused   &  abuse, abuser, abuses, accused, physically, abusers  \\
abuser   &  abuse, abuses, abused, abusive, physically, abusers  \\
abusers   &  abuse, abuser, abuses, abused, prisoners, abusive  \\
abusing   &  promoting, confusing, proposing, abusive, attacking, causing  \\
accent   &  accents  \\
accents   &  adolescents, accent  \\
acceptable   &  flexible, accepts, unbelievable, uncomfortable, unthinkable, unfaithful, accepting, inevitable, respectable, accept  \\
accumulate   &  regulate, compensate, survive, available, accurate, manipulate, evaluate, exploit, isolate, adequate  \\
accuse   &  accused, insult, refuse, advise  \\
accused   &  caused, occupied, focused, accust

expressing   &  dressing, express, expression, messing  \\
extremely   &  extremes, extremists, extremist, extreme, fairly  \\
extremist   &  extremely, extremes, eliminate, fanaticism, prejudice, extremists, extreme, individually, individual, individuals  \\
extremists   &  extremely, extremes, prejudices, prejudice, individually, extremist, extreme, conflicts, individual, individuals  \\
facilities   &  personalities, authorities, anxieties, qualities, abilities, minorities, possibilities, activities, inequalities, humanities  \\
facility   &  stability, flexibility, infidelity, possibility, creativity, responsibility, entity, ability, probability, ethnicity  \\
factories   &  personalities, authorities, factors, prejudices, facts, facilities, stories, series, enemies, robberies  \\
factory   &  factor, mandatory, territory  \\
fairly   &  mn, extremely, ugly, truthfully, fairy, slowly, fair  \\
fanaticism   &  hypocrites, eliminate, denomination, hypocrisy, denominations, individual

perception   &  deception, corruption, ambition, corporation, segregation, percentage, coalition  \\
period   &  permit  \\
permit   &  permission, period, afford, permitted, penalty, determine, admit, forbidden  \\
permitted   &  committed, determined, declared, resolved, assaulted, disappointed, submitted, permit, obligated, admitted  \\
perspective   &  aspect, respectful, unfaithful, objective, effective, initiative, disrespectful, respectable, aspects, effectively  \\
peru   &  ecuadorean, ecuadorian, honduras  \\
peruvian   &  brazilian, bohemian, indian, barbarian, cuban, peruvians, korean, colombian, newyorkian, ecuadorian  \\
peruvians   &  brazilians, lesbians, koreans, salvadorans, uruguayans, colombians, cubans, salvadorians, ecuadorean, ecuadorian  \\
pharmaceutical   &  temporal, selfsufficient, scientific, emphasis, scientifically, radical, hysterical, fundamental, petroleum, magical  \\
phd   &  upenn, genetics, pennsylvania, postdoctorate, chemistry, penn, biochemistry

terrible   &  horribly, terribly, horrible  \\
terribly   &  horribly, terrible, horrible  \\
terrified   &  assumed, admitted, determined, occupied, failed, resolved, justified, occurred, solved, disoriented  \\
terrorist   &  terror, terrorism, error, evil, terrorists  \\
terrorists   &  terror, terrorist, errors, institutions, terrorism, error  \\
that´s   &  what´s  \\
theater   &  heater, theaters  \\
theory   &  supreme  \\
therapist   &  idyllic, q, therapy, theft, odd, buddhist, rapist, k, ii, sadomasochism  \\
therapy   &  selfsufficient, they´ve, q, theft, results, therapist, confirm, individually, exploit, verify  \\
therefore   &  therapist, theft, uf  \\
thirteen   &  13, fourteen, fifteen, eighteen, nineteen, seventeen, sixteen, thirtyseven  \\
thirties   &  twenties, sixties, forties, eighties, fifties, nineties, seventies  \\
thirty   &  fortyfive, twentyfive, fifty, twentyseven, twenty, twentysix, forty, fiftyfive, thirtyseven, thirtythree  \\
though   &  thoughts, alt

In [44]:
def check_word(curr_set, word_type, max_len=1):
    found_count = 0
    eng_tokens = []
    for utt in map_dict[curr_set]:
        if word_type.encode() in map_dict[curr_set][utt]["es_w"] and len(map_dict[curr_set][utt]["es_w"]) <= max_len:
            found_count+=1
            if curr_set == "fisher_train":
                eng_tokens.append(" ".join([w.decode() for w in map_dict[curr_set][utt]["en_w"]]))
            else:
                for r in map_dict[curr_set][utt]["en_w"]:
                    eng_tokens.append(" ".join([w.decode() for w in r]))
    print(found_count, len(map_dict[curr_set]), "{0:.2f}".format(found_count / len(map_dict[curr_set]) * 100))
    print(len(set(eng_tokens)))
    return Counter(eng_tokens)

In [None]:
t = check_word("fisher_train", "si", 1)
t

In [None]:
d = check_word("fisher_dev", "mhm", 1)
d

In [None]:
t.most_common(5)

In [None]:
", ".join(set([i[0] for i in t.most_common(10)]) and set([i[0] for i in d.most_common(10)]))

In [43]:
# found_count = 0
eng_tokens = []
curr_set= "fisher_train"
for utt in map_dict[curr_set]:
    if b"claro" in map_dict[curr_set][utt]["es_w"] and len(map_dict[curr_set][utt]["es_w"]) <= 1:
        found_count+=1
        eng_tokens.append(" ".join([w.decode() for w in map_dict[curr_set][utt]["en_w"]]))

NameError: name 'found_count' is not defined

In [None]:
found_count, len(map_dict[curr_set]), found_count / len(map_dict[curr_set]) * 100

In [None]:
Counter(eng_tokens)

In [None]:
found_count = 0
eng_tokens = []
curr_set= "fisher_dev"
for utt in map_dict[curr_set]:
    if b"claro" in map_dict[curr_set][utt]["es_w"] and len(map_dict[curr_set][utt]["es_w"]) <= 1:
        found_count+=1
        eng_tokens.append(" ".join([w.decode() for w in map_dict[curr_set][utt]["en_w"]]))

In [None]:
found_count, len(map_dict[curr_set]), found_count / len(map_dict[curr_set]) * 100

In [None]:
Counter(eng_tokens)

In [None]:
len(set(eng_tokens))

In [None]:
sim_dict['w']

### Evaluation dictionaries

### eval 1 - 500 randomly selected frequent words, minor filtering

In [225]:
min_dev_freq=10
max_dev_freq=10000
min_train_freq=10
max_train_freq=10000
min_len=5

In [226]:
len(en_common_words)

3824

In [227]:
en_content_words = (en_common_words - (es_stop_words | stop_words))
en_content_words = {w for w in en_content_words if '¿' not in w}

In [228]:
terms_of_interest = get_details_for_words(en_content_words, en_content_words, en_word_utt_count, 
                                          min_dev_freq=min_dev_freq, 
                                          max_dev_freq=max_dev_freq, 
                                          min_train_freq=min_train_freq, 
                                          max_train_freq=max_train_freq,
                                          min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(terms_of_interest["words"])))

number of in-vocab words = 3642
total words meeting criteria = 273


In [229]:
_, _ = get_duration(terms_of_interest["train_utts"], key="fisher_train"), get_duration(terms_of_interest["dev_utts"], key="fisher_dev")

--------------------------------------------------------------------------------
73097 total utts
37 not found
selected utts from fisher_train -- duration = 120.45 hours
--------------------------------------------------------------------------------
2570 total utts
2 not found
selected utts from fisher_dev -- duration = 3.66 hours


In [230]:
random.seed("hmm")
sample_terms = random.sample(list(terms_of_interest["words"].keys()), 
                             min(len(terms_of_interest["words"]), 500))

In [231]:
sample_terms_details = get_details_for_words(sample_terms, en_common_words, en_word_utt_count,
                                          min_dev_freq=min_dev_freq, 
                                          max_dev_freq=max_dev_freq, 
                                          min_train_freq=min_train_freq,
                                          max_train_freq=max_train_freq,
                                          min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(sample_terms_details["words"])))

number of in-vocab words = 273
total words meeting criteria = 273


In [232]:
_, _ = (get_duration(sample_terms_details["train_utts"], key="fisher_train"), 
       get_duration(sample_terms_details["dev_utts"], key="fisher_dev"))

--------------------------------------------------------------------------------
73097 total utts
37 not found
selected utts from fisher_train -- duration = 120.45 hours
--------------------------------------------------------------------------------
2570 total utts
2 not found
selected utts from fisher_dev -- duration = 3.66 hours


In [233]:
" -- ".join(sample_terms)

'email -- listened -- eight -- happens -- enough -- father -- rican -- english -- fifteen -- whatever -- speaking -- usually -- american -- student -- really -- lives -- puerto -- horrible -- anyone -- dollars -- although -- thank -- nobody -- going -- around -- crazy -- twenty -- large -- history -- different -- something -- listen -- canada -- family -- still -- quite -- without -- receive -- anything -- starts -- ricans -- world -- center -- person -- chicago -- calling -- matter -- waiting -- florida -- hundred -- watch -- listening -- sorry -- thought -- relax -- might -- bought -- sometimes -- times -- imagine -- longer -- finish -- program -- begin -- specially -- reggaeton -- often -- church -- later -- everything -- wants -- knows -- saying -- mexico -- talking -- start -- heard -- making -- states -- spanish -- catholic -- understand -- especially -- stayed -- needs -- money -- coming -- speak -- brought -- places -- hello -- asked -- parents -- careful -- cause -- remember -

In [234]:
eval_freq_content = create_vocab(sample_terms_details)

In [235]:
pickle.dump(eval_freq_content, 
            open(os.path.join(m_cfg['data_path'], 
                              "eval_en_freq_vocab.dict"), "wb"))

### eval 2 - 500 randomly selected infrequent words, minor filtering

In [336]:
min_dev_freq=2
max_dev_freq=10
min_train_freq=2
max_train_freq=10
min_len=5

In [337]:
len(en_common_words)

3824

In [338]:
en_content_words = (en_common_words - (es_stop_words | stop_words))
en_content_words = {w for w in en_content_words if '¿' not in w}

In [339]:
len(en_content_words)

3642

In [340]:
terms_of_interest = get_details_for_words(en_content_words, en_content_words, en_word_utt_count, 
                                          min_dev_freq=min_dev_freq, 
                                          max_dev_freq=max_dev_freq, 
                                          min_train_freq=min_train_freq,
                                          max_train_freq=max_train_freq,
                                          min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(terms_of_interest["words"])))

number of in-vocab words = 3642
total words meeting criteria = 225


In [341]:
_, _ = get_duration(terms_of_interest["train_utts"], key="fisher_train"), get_duration(terms_of_interest["dev_utts"], key="fisher_dev")

--------------------------------------------------------------------------------
1230 total utts
1 not found
selected utts from fisher_train -- duration = 2.45 hours
--------------------------------------------------------------------------------
486 total utts
0 not found
selected utts from fisher_dev -- duration = 0.85 hours


In [342]:
random.seed("hmm")
sample_terms = random.sample(list(terms_of_interest["words"].keys()), 
                             min(len(terms_of_interest["words"]), 500))

In [343]:
sample_terms_details = get_details_for_words(sample_terms, en_common_words, en_word_utt_count,
                                          min_dev_freq=min_dev_freq, 
                                          max_dev_freq=max_dev_freq, 
                                          min_train_freq=min_train_freq,
                                          max_train_freq=max_train_freq,
                                          min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(sample_terms_details["words"])))

number of in-vocab words = 225
total words meeting criteria = 225


In [344]:
_, _ = (get_duration(sample_terms_details["train_utts"], key="fisher_train"), 
       get_duration(sample_terms_details["dev_utts"], key="fisher_dev"))

--------------------------------------------------------------------------------
1230 total utts
1 not found
selected utts from fisher_train -- duration = 2.45 hours
--------------------------------------------------------------------------------
486 total utts
0 not found
selected utts from fisher_dev -- duration = 0.85 hours


In [345]:
" -- ".join(sample_terms)

"mister -- africanamerican -- addition -- installed -- knock -- awhile -- precautions -- intimidated -- wakes -- distractions -- madam -- safety -- gossip -- grabbed -- interpreting -- dresses -- illness -- rocks -- agitated -- graduating -- protest -- distract -- attraction -- coupons -- assault -- abandon -- score -- shots -- evangelic -- verify -- javier -- views -- deeds -- elvis -- difficulty -- whichever -- nasty -- prisoners -- carries -- entertainment -- waves -- skirt -- ahhhh -- speaker -- doctrine -- deserted -- largest -- monterrey -- serving -- approval -- defined -- required -- quieter -- rythm -- rainy -- plains -- crespo -- policemen -- released -- pointing -- painted -- cornell -- haiti -- exwife -- sinner -- barbecues -- twentyeight -- twentythree -- landscapes -- reunite -- requested -- olaya -- optional -- landscape -- eighth -- mixture -- rocker -- queen -- custody -- lessons -- insecurity -- downer -- stadium -- unsafe -- ma'am -- barbeque -- frequently -- reincar

In [324]:
eval_content = create_vocab(sample_terms_details)

In [325]:
pickle.dump(eval_content, 
            open(os.path.join(m_cfg['data_path'], 
                              "eval_en_rare_vocab.dict"), "wb"))

### eval 3 - common es, en words

In [370]:
min_dev_freq=2
max_dev_freq=10000
min_train_freq=2
max_train_freq=100000
min_len=5

In [371]:
len(en_common_words)

3824

In [372]:
en_content_words = (en_common_words - (es_stop_words | stop_words))
en_content_words = {w for w in en_content_words if '¿' not in w}

In [373]:
len(en_content_words)

3642

In [374]:
es_en_common_words = (es_common_words & en_common_words)  - (es_stop_words | stop_words)

In [375]:
len(es_en_common_words)

467

In [376]:
terms_of_interest = get_details_for_words(es_en_common_words, en_content_words, en_word_utt_count, 
                                          min_dev_freq=min_dev_freq, 
                                          max_dev_freq=max_dev_freq, 
                                          min_train_freq=min_train_freq,
                                          max_train_freq=max_train_freq,
                                          min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(terms_of_interest["words"])))

number of in-vocab words = 464
total words meeting criteria = 173


In [377]:
_, _ = get_duration(terms_of_interest["train_utts"], key="fisher_train"), get_duration(terms_of_interest["dev_utts"], key="fisher_dev")

--------------------------------------------------------------------------------
44796 total utts
15 not found
selected utts from fisher_train -- duration = 78.14 hours
--------------------------------------------------------------------------------
1798 total utts
2 not found
selected utts from fisher_dev -- duration = 2.76 hours


In [378]:
random.seed("hmm")
sample_terms = random.sample(list(terms_of_interest["words"].keys()), 
                             min(len(terms_of_interest["words"]), 500))

In [379]:
sample_terms_details = get_details_for_words(sample_terms, en_common_words, en_word_utt_count,
                                          min_dev_freq=min_dev_freq, 
                                          max_dev_freq=max_dev_freq, 
                                          min_train_freq=min_train_freq,
                                          max_train_freq=max_train_freq,
                                          min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(sample_terms_details["words"])))

number of in-vocab words = 173
total words meeting criteria = 173


In [380]:
_, _ = (get_duration(sample_terms_details["train_utts"], key="fisher_train"), 
       get_duration(sample_terms_details["dev_utts"], key="fisher_dev"))

--------------------------------------------------------------------------------
44796 total utts
15 not found
selected utts from fisher_train -- duration = 78.14 hours
--------------------------------------------------------------------------------
1798 total utts
2 not found
selected utts from fisher_dev -- duration = 2.76 hours


In [381]:
" -- ".join(sample_terms)

'color -- think -- everybody -- philly -- latina -- brutal -- vancouver -- speak -- hello -- health -- monterrey -- chiapas -- three -- around -- hospital -- carmen -- going -- sorry -- hotmail -- australia -- close -- boleros -- philadelphia -- tejana -- elvis -- university -- always -- person -- apart -- alcohol -- ciudad -- rancheras -- salvador -- queen -- horrible -- computers -- annie -- krishna -- capital -- daddy -- bachata -- barbeque -- latinos -- super -- south -- fault -- idaho -- everything -- radio -- karma -- topic -- animal -- columbia -- olaya -- white -- chicago -- microsoft -- spanish -- terrible -- sweet -- montreal -- colombia -- falls -- machine -- emails -- norma -- miles -- exact -- marketing -- north -- still -- maybe -- internet -- california -- aruba -- barranquilla -- nebraska -- alright -- crazy -- david -- family -- grand -- quiet -- betsy -- money -- maira -- simple -- rises -- people -- relax -- yahoo -- virginia -- alaska -- email -- tickets -- pennsylv

In [297]:
eval_content = create_vocab(sample_terms_details)

In [298]:
pickle.dump(eval_content, 
            open(os.path.join(m_cfg['data_path'], 
                              "eval_en_es_common_vocab.dict"), "wb"))

In [224]:
es_en_common_words = [w for w in es_common_words & en_common_words if len(w) >= 5]

In [105]:
len(es_en_common_words)

260

In [107]:
es_en_common_words

['terrible',
 'florida',
 'houston',
 'super',
 'mcdonald',
 'republicans',
 'exact',
 'personal',
 'channel',
 'texmex',
 'federal',
 'health',
 'oaxaca',
 'wisconsin',
 'favor',
 'georgia',
 'mercedes',
 'cristóbal',
 'miami',
 'state',
 'piano',
 'attachment',
 'grand',
 'where',
 'rises',
 'ciudad',
 'please',
 'civil',
 'everything',
 'shows',
 'university',
 'nigeria',
 'exactly',
 'alright',
 'tañon',
 'brutal',
 'boricua',
 'cristina',
 'costa',
 'colorado',
 'josefina',
 'colombia',
 'goodness',
 'house',
 'altar',
 'aruba',
 'pesos',
 'group',
 'regular',
 'playstation',
 'marketing',
 'money',
 'think',
 'teenager',
 'chatrooms',
 'nebraska',
 'tejana',
 'school',
 'atlanta',
 'philly',
 'california',
 'guatemala',
 'chiapas',
 'ticket',
 'gmail',
 'arizona',
 'miles',
 'probable',
 'hotmail',
 'people',
 'elvis',
 'maternity',
 'latinos',
 'computers',
 'pennsylvania',
 'columbia',
 'miguel',
 'going',
 'bachata',
 'guagua',
 'santo',
 'their',
 'corridos',
 'alaska',
 'bet