In [1]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, modified_precision
from nltk.translate.chrf_score import sentence_chrf, corpus_chrf
from nltk.metrics import scores
import scipy.io.wavfile
from IPython.display import Audio
from IPython.display import display
from nltk.stem import *
# from nltk.stem.snowball import SnowballStemmer
from stemming.porter2 import stem
import stemming
from nltk.metrics.scores import recall

from basics import *

import sentencepiece as spm

from nltk.corpus import stopwords

%matplotlib inline

In [2]:
# The "Tableau 20" colour palette as 8-bit (R, G, B) triples.
tableau20 = [
    (31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
    (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),
    (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),
    (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
    (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229),
]
# matplotlib takes colour channels as floats in [0, 1], so rescale every triple.
tableau20 = [(red / 255., green / 255., blue / 255.) for (red, green, blue) in tableau20]

In [3]:
smooth_fun = nltk.translate.bleu_score.SmoothingFunction()

In [4]:
from nmt_run import *

In [16]:
cfg_path = "interspeech_new_vocab/sp_160hrs/"

In [17]:
%%capture
last_epoch, model, optimizer, m_cfg, t_cfg = check_model(cfg_path)

### Load Fisher dataset

In [18]:
%%capture
# -------------------------------------------------------------------------
# get data dictionaries
# -------------------------------------------------------------------------
map_dict, vocab_dict, bucket_dict = get_data_dicts(m_cfg)
# Open the pickles in `with` blocks so the file handles are closed right away.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# produced by this project.
with open("fbanks_80dim_nltk/info.dict", "rb") as in_f:
    info_dict = pickle.load(in_f)
with open("./fbanks_80dim_nltk/mix_sim.dict", "rb") as in_f:
    sim_dict = pickle.load(in_f)

In [19]:
random.seed("meh")
# random.seed("haha")

### Train text

In [None]:
train_file = "../installs/fisher-callhome-corpus/corpus/ldc/fisher_train.en"

In [None]:
train_text = []

In [None]:
# Rebuild the list from scratch (instead of appending to the list created in
# an earlier cell) so that re-running this cell does not duplicate sentences.
# Each training utterance's "en_w" byte tokens are decoded and space-joined.
train_text = [" ".join(w.decode() for w in utt_entry["en_w"])
              for utt_entry in map_dict["fisher_train"].values()]

In [None]:
train_text[:10]

In [None]:
train_text_to_dump = "\n".join(train_text)

In [None]:
with open("../subword-nmt/fisher_train.en", "w") as out_f:
    out_f.write(train_text_to_dump)

In [None]:
# ./learn_joint_bpe_and_vocab.py --input {train_file}.L1 {train_file}.L2 -s {num_operations} -o {codes_file} --write-vocabulary {vocab_file}.L1 {vocab_file}.L2

In [None]:
# sp = spm.SentencePieceProcessor()

In [None]:
# sp.Load("test/test_model.model")

### Word-level analysis

In [None]:
min_word_len = 1
top_k = 100

In [None]:
stop_words = set(nltk.corpus.stopwords.words("english"))
len(stop_words)

In [None]:
es_stop_words = set(nltk.corpus.stopwords.words("spanish"))
len(es_stop_words)

In [None]:
def get_words(m_dict, key="en_w"):
    """Count decoded word occurrences over every utterance in ``m_dict``.

    ``m_dict[utt][key]`` is read for each utterance. When that value is a
    plain ``list`` it is treated as one flat sequence of byte tokens;
    otherwise it is iterated as a collection of such sequences
    (presumably several reference translations -- confirm against the data).

    Returns a ``Counter`` mapping decoded word -> number of occurrences.
    """
    word_counts = Counter()
    for entry in m_dict.values():
        tokens = entry[key]
        if type(tokens) is list:
            word_counts.update(tok.decode() for tok in tokens)
            continue
        # Not a plain list: every element is itself a token sequence.
        for reference in tokens:
            word_counts.update(tok.decode() for tok in reference)
    return word_counts

In [None]:
len(map_dict['fisher_dev'])

In [None]:
# words in train
train_words = get_words(map_dict['fisher_train'])
train_words_top_k = [(w,f) for w, f in sorted(train_words.items(), reverse=True, key=lambda t:t[1]) 
                     if w not in stop_words and len(w) >= min_word_len][:top_k]

train_only_words = set(train_words.keys())

print("{0:20s} | {1:10d}".format("# train word types", len(train_words)))
print("{0:20s} | {1:10d}".format("# train word tokens", sum(train_words.values())))

dev_words = get_words(map_dict['fisher_dev'])
dev_words_top_k = [(w,f) for w, f in sorted(dev_words.items(), reverse=True, key=lambda t:t[1]) 
                     if w not in stop_words and len(w) >= min_word_len][:top_k]

dev_only_words = set(dev_words.keys())

print("-"*80)
print("{0:20s} | {1:10d}".format("# dev word types", len(dev_words)))
print("{0:20s} | {1:10d}".format("# dev word tokens", sum(dev_words.values())))

In [None]:
# words in train
es_train_words = get_words(map_dict['fisher_train'], key="es_w")
es_train_words_top_k = [(w,f) for w, f in sorted(es_train_words.items(), reverse=True, key=lambda t:t[1]) 
                     if w not in es_stop_words and len(w) >= min_word_len][:top_k]

es_train_only_words = set(es_train_words.keys())

print("{0:20s} | {1:10d}".format("# train word types", len(es_train_words)))
print("{0:20s} | {1:10d}".format("# train word tokens", sum(es_train_words.values())))

es_dev_words = get_words(map_dict['fisher_dev'], key="es_w")
es_dev_words_top_k = [(w,f) for w, f in sorted(es_dev_words.items(), reverse=True, key=lambda t:t[1]) 
                     if w not in es_stop_words and len(w) >= min_word_len][:top_k]

es_dev_only_words = set(es_dev_words.keys())

print("-"*80)
print("{0:20s} | {1:10d}".format("# dev word types", len(es_dev_words)))
print("{0:20s} | {1:10d}".format("# dev word tokens", sum(es_dev_words.values())))

In [None]:
train_words_top_k[:5], es_train_words_top_k[:5]

In [None]:
[(w,f) for w,f in train_words_top_k if "'" in w]

In [None]:
dev_words_top_k[:5], es_dev_words_top_k[:5]

In [None]:
oov_words = {w:f for w,f in dev_words.items() if w not in train_only_words}

print("{0:20s} | {1:10d}".format("# oov word types", len(oov_words)))
print("{0:20s} | {1:10d}".format("# oov word tokens", sum(oov_words.values())))

In [None]:
es_oov_words = {w:f for w,f in es_dev_words.items() if w not in es_train_only_words}

print("{0:20s} | {1:10d}".format("# oov word types", len(es_oov_words)))
print("{0:20s} | {1:10d}".format("# oov word tokens", sum(es_oov_words.values())))

In [None]:
"{0:.1f}%".format(sum(oov_words.values()) / sum(dev_words.values()) * 100)

### Word level - get train, dev frequency, and utts in which they occur

In [None]:
len(train_only_words), len(set([stem(w) for w in train_only_words]))

In [None]:
def get_word_level_details(word_key):
    """Collect per-word occurrence statistics for the Fisher train and dev sets.

    Reads ``map_dict["fisher_train"]`` and ``map_dict["fisher_dev"]``.
    For ``word_key == "en_w"`` each dev entry is iterated as a collection of
    references; for any other key the dev entry is one token sequence.

    Returns ``(word_utt_count, all_train_utts, all_dev_utts)`` where
    ``word_utt_count`` has four sub-dicts keyed by decoded word:
      * ``"train"`` / ``"dev"``: number of token sequences (utterances, or
        individual dev references) containing the word;
      * ``"train_utts"`` / ``"dev_utts"``: set of utterance ids containing it.
    The two returned sets are the unions of all those utterance-id sets.
    """
    word_utt_count = {"train": {}, "dev": {}, "train_utts": {}, "dev_utts": {}}

    def _tally(split, utt_id, tokens):
        # A word is counted at most once per token sequence, hence set(tokens).
        counts = word_utt_count[split]
        utt_sets = word_utt_count[split + "_utts"]
        for token in set(tokens):
            word = token.decode()
            if word not in counts:
                counts[word] = 0
                utt_sets[word] = set()
            counts[word] += 1
            utt_sets[word].add(utt_id)

    train_set = map_dict["fisher_train"]
    for utt_id in tqdm(train_set.keys()):
        _tally("train", utt_id, train_set[utt_id][word_key])

    dev_set = map_dict["fisher_dev"]
    for utt_id in tqdm(dev_set.keys()):
        entry = dev_set[utt_id][word_key]
        # English dev entries hold several references; others hold just one.
        references = entry if word_key == "en_w" else [entry]
        for reference in references:
            _tally("dev", utt_id, reference)

    # Every utterance id that contains at least one word, per split.
    all_train_utts = set().union(*word_utt_count["train_utts"].values())
    all_dev_utts = set().union(*word_utt_count["dev_utts"].values())

    return word_utt_count, all_train_utts, all_dev_utts
    

In [None]:
en_word_utt_count, en_train_utts, en_dev_utts = get_word_level_details("en_w")

In [None]:
len(en_train_utts), len(en_dev_utts)

In [None]:
es_word_utt_count, es_train_utts, es_dev_utts = get_word_level_details("es_w")

In [None]:
len(es_train_utts), len(es_dev_utts)

In [None]:
print("word types")
print(len(en_word_utt_count['train']), len(en_word_utt_count['dev']))
print("common word types")
en_common_words = set(en_word_utt_count['train'].keys()) & set(en_word_utt_count['dev'].keys())
len(en_common_words)

In [None]:
print("word types")
print(len(es_word_utt_count['train']), len(es_word_utt_count['dev']))
print("common word types")
es_common_words = set(es_word_utt_count['train'].keys()) & set(es_word_utt_count['dev'].keys())
len(es_common_words)

In [None]:
def get_details_for_words(words, common_words, word_utt_count, 
                          min_dev_freq, max_dev_freq, min_train_freq,
                          max_train_freq=float("inf"), min_len=1):
    """Select the words that satisfy utterance-frequency and length limits.

    Args:
        words: candidate words (any iterable of str).
        common_words: words present in both train and dev; candidates outside
            this collection are ignored.
        word_utt_count: dict with ``"train_utts"`` and ``"dev_utts"`` entries
            mapping word -> set of utterance ids containing it.
        min_dev_freq, max_dev_freq: inclusive bounds on the number of dev
            utterances containing the word.
        min_train_freq, max_train_freq: inclusive bounds on the number of
            train utterances containing the word. ``max_train_freq`` defaults
            to no upper bound, because no caller in this notebook passes it.
        min_len: minimum word length in characters (default 1).

    Returns:
        dict with ``"words"`` (word -> ``{"train": n, "dev": n}``) and the
        unions ``"train_utts"`` / ``"dev_utts"`` of the utterances containing
        any selected word.
    """
    details = {"words": {}, "train_utts": set(), "dev_utts": set()}
    
    in_vocab_words = set(words) & set(common_words)
    print("number of in-vocab words = {0:d}".format(len(in_vocab_words)))

    # Iterate in sorted order: set order of str depends on hash randomisation,
    # which would make the key order of details["words"] (and any seeded
    # sampling from it) differ between kernel sessions.
    for w in sorted(in_vocab_words):
        train_utts = word_utt_count["train_utts"][w]
        dev_utts = word_utt_count["dev_utts"][w]
        t_count, d_count = len(train_utts), len(dev_utts)
        if (min_dev_freq <= d_count <= max_dev_freq and
                len(w) >= min_len and
                min_train_freq <= t_count <= max_train_freq):
            details["words"][w] = {"train": t_count, "dev": d_count}
            details["train_utts"].update(train_utts)
            details["dev_utts"].update(dev_utts)
        # end meets criteria
    # end for in-vocab word
    return details
# end function

In [None]:
def get_duration(utts, key):
    """Print and return the total duration, in hours, of ``utts``.

    Each utterance's ``info_dict[key][utt]['sp']`` value is multiplied by 10
    and summed (presumably a frame count at 10 ms per frame -- confirm); the
    total is then converted from milliseconds to hours. Utterances missing
    from ``info_dict[key]`` are skipped and only counted in the report.
    """
    missing_utts = []
    total = 0
    for utt_id in utts:
        if u_known(utt_id, key):
            total += (info_dict[key][utt_id]['sp'] * 10)
        else:
            missing_utts.append(utt_id)
    hours = total / 60 / 60 / 1000
    print("-"*80)
    print("{0:d} total utts".format(len(utts)))
    print("{0:d} not found".format(len(missing_utts)))
    print("selected utts from {0:s} -- duration = {1:.2f} hours".format(key, hours))
    return hours


def u_known(utt_id, key):
    """Return True when ``utt_id`` has an entry in ``info_dict[key]``."""
    return utt_id in info_dict[key]

In [None]:
def create_vocab(words_list):
    """Build a vocabulary dict from the output of ``get_details_for_words``.

    The special symbols PAD, GO, EOS and UNK take the first ids (each with a
    train frequency of 1 and no dev frequency). The words in
    ``words_list['words']`` follow, ordered by decreasing train count, and
    are stored as encoded bytes.

    Returns a dict with ``"w2i"``, ``"i2w"``, ``"freq"`` (train counts) and
    ``"freq_dev"`` (dev counts).
    """
    word_to_id, train_freq, dev_freq = {}, {}, {}

    for special in [PAD, GO, EOS, UNK]:
        word_to_id[special] = len(word_to_id)
        train_freq[special] = 1

    ranked = sorted(words_list['words'].items(),
                    key=lambda item: item[1]['train'],
                    reverse=True)
    for word, counts in ranked:
        token = word.encode()
        word_to_id[token] = len(word_to_id)
        train_freq[token] = counts["train"]
        dev_freq[token] = counts["dev"]

    return {
        "w2i": word_to_id,
        "i2w": {idx: token for token, idx in word_to_id.items()},
        "freq": train_freq,
        "freq_dev": dev_freq,
    }

In [None]:
train_dur, dev_dur = get_duration(en_train_utts, key="fisher_train"), get_duration(en_dev_utts, key="fisher_dev")

### Task 0 - 500 randomly selected frequent words, minor filtering

In [None]:
min_dev_freq=10
max_dev_freq=10000
min_train_freq=50 
min_len=5

In [None]:
# The candidates are all English words seen in both train and dev. The
# original call passed an undefined `common_words` as the candidate list and
# omitted the required common_words / word_utt_count / max_train_freq args.
# NOTE(review): English is inferred from the output file name and the
# neighbouring English tasks -- confirm.
terms_of_interest = get_details_for_words(en_common_words,
                                          en_common_words,
                                          en_word_utt_count,
                                          min_dev_freq=min_dev_freq, 
                                          max_dev_freq=max_dev_freq, 
                                          min_train_freq=min_train_freq, 
                                          max_train_freq=float("inf"),  # no upper bound
                                          min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(terms_of_interest["words"])))

In [None]:
_, _ = get_duration(terms_of_interest["train_utts"], key="fisher_train"), get_duration(terms_of_interest["dev_utts"], key="fisher_dev")

In [None]:
random.seed("hmm")
# Sort before sampling: the key order comes from iterating a set of strings,
# which changes between kernel sessions (hash randomisation), so the seed
# alone would not make the sample reproducible.
candidate_terms = sorted(terms_of_interest["words"].keys())
# Cap the sample size so the cell does not raise ValueError when fewer than
# 500 words pass the filters.
sample_terms = random.sample(candidate_terms, min(500, len(candidate_terms)))

In [None]:
# Recompute the details for the sampled English terms only. The original call
# omitted the required common_words / word_utt_count / max_train_freq args.
sample_terms_details = get_details_for_words(sample_terms,
                                             en_common_words,
                                             en_word_utt_count,
                                             min_dev_freq=min_dev_freq, 
                                             max_dev_freq=max_dev_freq, 
                                             min_train_freq=min_train_freq, 
                                             max_train_freq=float("inf"),  # no upper bound
                                             min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(sample_terms_details["words"])))

In [None]:
_, _ = get_duration(sample_terms_details["train_utts"], key="fisher_train"), get_duration(sample_terms_details["dev_utts"], key="fisher_dev")

In [None]:
sample_terms[:10]

In [None]:
bow_top_500_words_vocab = create_vocab(sample_terms_details)

In [None]:
pickle.dump(bow_top_500_words_vocab, open(os.path.join(m_cfg['data_path'], "bow_top_500_words_vocab.dict"), "wb"))

### Task 1 - randomly selected frequent words

In [None]:
min_dev_freq=10 
max_dev_freq=100
min_train_freq=100
min_len=5

In [None]:
# The candidates are all English words seen in both train and dev. The
# original call passed an undefined `common_words` as the candidate list and
# omitted the required common_words / word_utt_count / max_train_freq args.
# NOTE(review): English is inferred from the output file name and the
# neighbouring English tasks -- confirm.
terms_of_interest = get_details_for_words(en_common_words,
                                          en_common_words,
                                          en_word_utt_count,
                                          min_dev_freq=min_dev_freq, 
                                          max_dev_freq=max_dev_freq, 
                                          min_train_freq=min_train_freq, 
                                          max_train_freq=float("inf"),  # no upper bound
                                          min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(terms_of_interest["words"])))

In [None]:
_, _ = get_duration(terms_of_interest["train_utts"], key="fisher_train"), get_duration(terms_of_interest["dev_utts"], key="fisher_dev")

In [None]:
random.seed("hmm")
# Sort before sampling: the key order comes from iterating a set of strings,
# which changes between kernel sessions (hash randomisation), so the seed
# alone would not make the sample reproducible.
candidate_terms = sorted(terms_of_interest["words"].keys())
# Cap the sample size so the cell does not raise ValueError when fewer than
# 100 words pass the filters.
sample_terms = random.sample(candidate_terms, min(100, len(candidate_terms)))

In [None]:
# Recompute the details for the sampled English terms only. The original call
# omitted the required common_words / word_utt_count / max_train_freq args.
sample_terms_details = get_details_for_words(sample_terms,
                                             en_common_words,
                                             en_word_utt_count,
                                             min_dev_freq=min_dev_freq, 
                                             max_dev_freq=max_dev_freq, 
                                             min_train_freq=min_train_freq, 
                                             max_train_freq=float("inf"),  # no upper bound
                                             min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(sample_terms_details["words"])))

In [None]:
_, _ = get_duration(sample_terms_details["train_utts"], key="fisher_train"), get_duration(sample_terms_details["dev_utts"], key="fisher_dev")

In [None]:
sample_terms[:10]

In [None]:
bow_top_100_words_vocab = create_vocab(sample_terms_details)

In [None]:
pickle.dump(bow_top_100_words_vocab, open(os.path.join(m_cfg['data_path'], "bow_top_100_words_vocab.dict"), "wb"))

### Task 2 - topics as keywords

In [None]:
train_text_fname= "../installs/fisher-callhome-corpus/corpus/ldc/fisher_train.en"
topics_fname = "../criseslex/fsp06_topics_in_english.txt"

In [None]:
# Hand-written list of topic keywords. Case and duplicates do not matter
# here: the list is lower-cased and de-duplicated right after it is defined.
# NOTE(review): multi-word entries ("cell phones", "united states", ...) are
# kept as single strings; presumably they can never match a single-word
# vocabulary entry -- confirm whether they should be split.
topics = [ "peace", "Music", "Marriage", "Religion", "Cell phones", 
           "Dating", "Telemarketing and SPAM", "Politics", "Travel", 
           "Technical devices", "Healthcare", "Advertisements", "Power", 
           "Occupations", "Movies", "Welfare", "Breaking up", "Location", 
           "Justice", "Memories", "Crime", "Violence against women", "Equality", 
            "Housing", "Immigration",     
            # new topics
           "Interracial", "Christians", "muslims", "jews", "e-mail", 
           "phone", "democracy", "Democratic", "Republican", "technology", 
           "leadership", "community", "jury", "police", "inequality", 
           "renting", "Violence", "immigrants", "immigrant", "skilled", 
           "Telemarketing", "SPAM", "skill", "job", "health", "mobile", 
            "ads", "physical", "emotional", "bubble", "rent", "economy", 
            "abuse", "women", "city", "country", "suburban", "dollar", 
            "united states", "laws", "phone", "race", "biracial", "interracial", 
            "marriage", "lyrics", "sexuality", "medicine", "television", "european",
            "home", "protect", "spouse", "language", "cellphone", "money",
            "doctor", "insurance", "cigarettes", "alcohol", "income", "salary",
            "class", "censor", "rating", "programs", "government",
            "relationship", "legal", "event", "life", "safe", "victim", "cops",
            "wage", "illegal"
            ]
# Lower-case and de-duplicate; the resulting list order is not fixed.
topics = list(set(t.lower() for t in topics))
# Stemmed forms of the topics, index-aligned with `topics` at this point.
topics_stem = [stem(t) for t in topics]

# add similar topic words
# The expansion through sim_dict is disabled (commented out below), so
# new_topics stays empty and the extend() call is a no-op.
new_topics = []
# for t in topics:
#     if t.encode() in sim_dict['w']:
#         new_topics.extend([w.decode() for w in sim_dict['w'][t.encode()]])
topics.extend(new_topics)

In [None]:
len(topics)

In [None]:
topics_details = get_details_for_words(topics, en_common_words, en_word_utt_count, 
                                       min_dev_freq=5, 
                                       max_dev_freq=10000, 
                                       min_train_freq=10, 
                                       min_len=1)
print("total words meeting criteria = {0:d}".format(len(topics_details["words"])))

In [None]:
_, _ = get_duration(topics_details["train_utts"], key="fisher_train"), get_duration(topics_details["dev_utts"], key="fisher_dev")

In [None]:
print("\n".join(list(topics_details["words"].keys())))

In [None]:
topics_details['words']

In [None]:
bow_topics_vocab = create_vocab(topics_details)

In [None]:
haha = pickle.load(open(os.path.join(m_cfg['data_path'], "bow_topics_vocab.dict"), "rb"))

In [None]:
len(set(haha['w2i']) & set(bow_topics_vocab['w2i'].keys()))

In [None]:
pickle.dump(bow_topics_vocab, open(os.path.join(m_cfg['data_path'], "bow_topics_vocab.dict"), "wb"))

### Task 3 - crisis terms as keywords

In [None]:
crises_lex_fname = "../criseslex/CrisisLexLexicon/CrisisLexRec.txt"

In [None]:
crises = set()
with open(crises_lex_fname, "r") as in_f:
    for line in in_f:
        crises.update(line.strip().split())
crises = list(crises)
crises_stem = [stem(w) for w in crises]

# new_crises = []
# for t in crises:
#     if t.encode() in sim_dict['w']:
#         new_crises.extend([w.decode() for w in sim_dict['w'][t.encode()]])
# crises.extend(new_crises)

In [None]:
len(crises)

In [None]:
crises_details = get_details_for_words(crises, en_common_words, en_word_utt_count,
                                       min_dev_freq=10, 
                                       max_dev_freq=1000, 
                                       min_train_freq=100, 
                                       min_len=1)
print("total words meeting criteria = {0:d}".format(len(crises_details["words"])))

In [None]:
_, _ = get_duration(crises_details["train_utts"], key="fisher_train"), get_duration(crises_details["dev_utts"], key="fisher_dev")

In [None]:
en_word_utt_count.keys()

In [None]:
len(en_word_utt_count['dev_utts']['people'])

In [None]:
print("\n".join(list(crises_details["words"].keys())))

In [None]:
crises_details["words"]

In [None]:
bow_crises_vocab = create_vocab(crises_details)

In [None]:
pickle.dump(bow_crises_vocab, open(os.path.join(m_cfg['data_path'], "bow_crises_vocab.dict"), "wb"))

### Task 3b - more crisis terms (relaxed frequency thresholds)

In [None]:
crises_details = get_details_for_words(crises, en_common_words, en_word_utt_count,
                                       min_dev_freq=5, 
                                       max_dev_freq=1000, 
                                       min_train_freq=50, 
                                       min_len=1)
print("total words meeting criteria = {0:d}".format(len(crises_details["words"])))

In [None]:
_, _ = get_duration(crises_details["train_utts"], key="fisher_train"), get_duration(crises_details["dev_utts"], key="fisher_dev")

In [None]:
en_word_utt_count.keys()

In [None]:
len(en_word_utt_count['dev_utts']['people'])

In [None]:
print("\n".join(list(crises_details["words"].keys())))

In [None]:
crises_details["words"]

In [None]:
bow_crises_vocab = create_vocab(crises_details)

In [None]:
pickle.dump(bow_crises_vocab, open(os.path.join(m_cfg['data_path'], "bow_crises_vocab_more.dict"), "wb"))

### Task Spanish - 500 randomly selected frequent words, minor filtering

In [None]:
min_dev_freq=18
max_dev_freq=10000
min_train_freq=200
min_len=5

In [None]:
terms_of_interest = get_details_for_words(es_common_words,
                                          es_common_words,
                                          es_word_utt_count,
                                          min_dev_freq=min_dev_freq, 
                                          max_dev_freq=max_dev_freq, 
                                          min_train_freq=min_train_freq, 
                                          min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(terms_of_interest["words"])))

In [None]:
_, _ = get_duration(terms_of_interest["train_utts"], key="fisher_train"), get_duration(terms_of_interest["dev_utts"], key="fisher_dev")

In [None]:
random.seed("hmm")
# Sort before shuffling: the key order comes from iterating a set of strings,
# which changes between kernel sessions (hash randomisation), so the seed
# alone would not make the order reproducible.
candidate_terms = sorted(terms_of_interest["words"].keys())
# Every qualifying word is kept (the sample size equals the population size),
# so this is a seeded random permutation of the candidates.
sample_terms = random.sample(candidate_terms, len(candidate_terms))

In [None]:
# sample_terms = ["bueno"]

In [None]:
sample_terms_details = get_details_for_words(sample_terms,
                                             es_common_words,
                                             es_word_utt_count,
                                              min_dev_freq=min_dev_freq, 
                                              max_dev_freq=max_dev_freq, 
                                              min_train_freq=min_train_freq, 
                                              min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(sample_terms_details["words"])))

In [None]:
_, _ = get_duration(sample_terms_details["train_utts"], key="fisher_train"), get_duration(sample_terms_details["dev_utts"], key="fisher_dev")

In [None]:
sample_terms

In [None]:
# sample_terms_details

In [None]:
bow_es_top_words_vocab = create_vocab(sample_terms_details)

In [None]:
bow_es_top_words_vocab.keys()

In [None]:
sample_terms_details.keys()

In [None]:
# "train_utts" is a set and cannot be sliced directly (TypeError); preview
# the first ten utterance ids in sorted order instead.
sorted(sample_terms_details["train_utts"])[:10]

In [None]:
sample_terms_details['words']

In [None]:
sample_terms_details['words']['bueno']

In [None]:
bow_es_top_words_vocab['freq'][b'bueno'], bow_es_top_words_vocab['freq_dev'][b'bueno']

In [None]:
pickle.dump(bow_es_top_words_vocab, open(os.path.join(m_cfg['data_path'], "bow_es_100word_vocab.dict"), "wb"))

In [None]:
pickle.dump(bow_es_top_words_vocab, open(os.path.join(m_cfg['data_path'], "bow_es_1word_vocab.dict"), "wb"))

In [None]:
# NOTE(review): removed the interactive shell call `su <user>` from this
# cell. It waits for a password (blocking "Restart & Run All") and, running
# in a child shell, cannot change the user of the notebook kernel anyway.
# Fix the permissions on the output directory outside the notebook instead.

In [None]:
pickle.dump(bow_es_top_words_vocab, open(os.path.join(m_cfg['data_path'], "bow_es_100word_vocab.dict"), "wb"))

In [None]:
pickle.dump(bow_es_top_words_vocab, open(os.path.join(m_cfg['data_path'], "bow_es_top_words_vocab.dict"), "wb"))

In [None]:
sample_terms_details['words']['colorado']

In [None]:
sorted([(w, sample_terms_details['words'][w]['train']) 
       for w in sample_terms_details['words']], reverse=True, key=lambda t: t[1])[:20]

In [None]:
sorted([(w, sample_terms_details['words'][w]['dev']) 
       for w in sample_terms_details['words']], reverse=True, key=lambda t: t[1])[:20]

In [None]:
m_cfg['data_path']

In [None]:
!ls fbanks_80dim_nltk

In [None]:
m_cfg["sim_dict"]

In [None]:
sim_dict = pickle.load(open(os.path.join(m_cfg['data_path'], 'pre_trained_sim.dict'), "rb"))

In [None]:
len([i for i in sim_dict['w'].values() if len(i)>1])

In [None]:
sim_dict['w'][b'sure']

In [None]:
sim_dict['w']

In [None]:
mix_sim_dict = pickle.load(open(os.path.join(m_cfg['data_path'], 'mix_sim.dict'), "rb"))

In [None]:
len([i for i in mix_sim_dict['w'].values() if len(i)>1])

In [None]:
for w, i in [(w.decode(),[j.decode() for j in i]) for w, i in mix_sim_dict['w'].items() if len(i) > 2]:
    print(w, "  & ", ", ".join(set(i)-set([w])), " \\\\")

In [None]:
pre_sim_dict = pickle.load(open(os.path.join(m_cfg['data_path'], 'pre_trained_sim.dict'), "rb"))

In [None]:
len([i for i in pre_sim_dict['w'].values() if len(i)>1])

In [None]:
for w, i in [(w.decode(),[j.decode() for j in i]) for w, i in pre_sim_dict['w'].items() if len(i) > 2]:
    print(w, "  & ", ", ".join(set(i)-set([w])), " \\\\")

In [None]:
pre_sim_dict['w'][b'sure']

In [None]:
sim_dict = pickle.load(open(os.path.join(m_cfg['data_path'], 'sim.dict'), "rb"))

In [None]:
len([i for i in sim_dict['w'].values() if len(i)>1])

In [None]:
for w, i in [(w.decode(),[j.decode() for j in i]) for w, i in sim_dict['w'].items() if len(i) > 2]:
    print(w, "  & ", ", ".join(set(i)-set([w])), " \\\\")

In [None]:
pre_words = [w for w, i in pre_sim_dict['w'].items() if len(i)>1]
fisher_words = [w for w, i in sim_dict['w'].items() if len(i)>1]

In [None]:
len(pre_words), len(fisher_words)

In [None]:
pre_only = set(pre_words) - set(fisher_words)
fisher_only = set(fisher_words) - set(pre_words)
common_only = set(pre_words) & set(fisher_words)

In [None]:
len(pre_only), len(fisher_only),  len(common_only), (len(set(pre_words) | set(fisher_words)))

In [None]:
for w, i in sorted([(w.decode(),[j.decode() for j in i]) for w, i in pre_sim_dict['w'].items() if len(i) > 2 and w in pre_only]):
    print(w, "  & ", ", ".join(set(i)-set([w])), " \\\\")

In [None]:
for w, i in sorted([(w.decode(),[j.decode() for j in i]) for w, i in sim_dict['w'].items() if len(i) > 2 and w in fisher_only]):
    print(w, "  & ", ", ".join(set(i)-set([w])), " \\\\")

In [None]:
for w, i in sorted([(w.decode(),[j.decode() for j in i]) for w, i in pre_sim_dict['w'].items() 
                    if (len(i) > 1 and w in common_only)]):
    print(w, "  & ", ", ".join(set(i)-set([w])), " \\\\")

In [None]:
for w, i in sorted([(w.decode(),[j.decode() for j in i]) for w, i in sim_dict['w'].items() 
                    if (len(i) > 1 and w in common_only)]):
    print(w, "  & ", ", ".join(set(i)-set([w])), " \\\\")

In [None]:
def check_word(curr_set, word_type, max_len=1):
    """Collect English translations of short utterances containing a Spanish word.

    Scans ``map_dict[curr_set]`` for utterances whose ``"es_w"`` tokens
    include ``word_type`` and number at most ``max_len``. For
    ``"fisher_train"`` the single ``"en_w"`` token list is joined into a
    sentence; for any other set every reference in ``"en_w"`` is joined.

    Prints the match count, the set size, the match percentage and the
    number of distinct English sentences, then returns a ``Counter`` of the
    English sentences.
    """
    target = word_type.encode()
    utterances = map_dict[curr_set]
    single_reference = (curr_set == "fisher_train")

    matches = 0
    english_sentences = []
    for entry in utterances.values():
        spanish_tokens = entry["es_w"]
        if target not in spanish_tokens or len(spanish_tokens) > max_len:
            continue
        matches += 1
        references = [entry["en_w"]] if single_reference else entry["en_w"]
        for reference in references:
            english_sentences.append(" ".join([tok.decode() for tok in reference]))

    total = len(utterances)
    print(matches, total, "{0:.2f}".format(matches / total * 100))
    print(len(set(english_sentences)))
    return Counter(english_sentences)

In [None]:
t = check_word("fisher_train", "si", 1)
t

In [None]:
d = check_word("fisher_dev", "mhm", 1)
d

In [None]:
t.most_common(5)

In [None]:
# Translations that appear in BOTH top-10 lists. `a and b` on two sets just
# returns `b` whenever `a` is non-empty, so use the set intersection `&`;
# sorted() keeps the displayed order stable between runs.
", ".join(sorted(set([i[0] for i in t.most_common(10)]) & set([i[0] for i in d.most_common(10)])))

In [None]:
# Count the training utterances that consist only of "claro" and collect
# their English translations. The counter must be reset here: with the
# initialisation commented out the cell raises NameError on a fresh kernel
# and otherwise keeps adding to a stale count.
found_count = 0
eng_tokens = []
curr_set= "fisher_train"
for utt in map_dict[curr_set]:
    if b"claro" in map_dict[curr_set][utt]["es_w"] and len(map_dict[curr_set][utt]["es_w"]) <= 1:
        found_count+=1
        eng_tokens.append(" ".join([w.decode() for w in map_dict[curr_set][utt]["en_w"]]))

In [None]:
found_count, len(map_dict[curr_set]), found_count / len(map_dict[curr_set]) * 100

In [None]:
Counter(eng_tokens)

In [None]:
# Count the dev utterances that consist only of "claro" and collect their
# English translations.
found_count = 0
eng_tokens = []
curr_set= "fisher_dev"
for utt in map_dict[curr_set]:
    if b"claro" in map_dict[curr_set][utt]["es_w"] and len(map_dict[curr_set][utt]["es_w"]) <= 1:
        found_count+=1
        # In the dev set "en_w" holds several reference translations (see
        # check_word / get_word_level_details), so join each one separately
        # instead of calling .decode() on the reference lists themselves.
        for ref in map_dict[curr_set][utt]["en_w"]:
            eng_tokens.append(" ".join([w.decode() for w in ref]))

In [None]:
found_count, len(map_dict[curr_set]), found_count / len(map_dict[curr_set]) * 100

In [None]:
Counter(eng_tokens)

In [None]:
len(set(eng_tokens))

In [None]:
sim_dict['w']

### Evaluation dictionaries

In [None]:
ref_names = ["ref_min-0_max-300.en{0:d}".format(i) for i in range(4)]

In [None]:
# Paths of the reference files under the two output directories.
google_ref_names = [os.path.join("google", ref_name) for ref_name in ref_names]
edin_ref_names = [
    os.path.join("./sp2enw_hyp_search/sp_1.0_l2e-4_rnn-3_drpt-0.5_cnn_96-2-2", ref_name)
    for ref_name in ref_names
]

# Google references first, then the Edinburgh ones.
all_ref_names = google_ref_names + edin_ref_names

In [None]:
def read_all_lines_in_file(fname):
    """Return one set of whitespace-separated words per line of ``fname``.

    Blank lines yield an empty set, so the result stays aligned with the
    file's line numbers.
    """
    with open(fname, "r") as in_f:
        return [set(raw_line.strip().split()) for raw_line in in_f]

In [None]:
def get_all_words_in_file(fname):
    """Return the set of distinct whitespace-separated tokens in a file.

    Args:
        fname: path of the text file to read.

    Returns:
        set[str]: every token that occurs on any line of the file.
    """
    vocab = set()
    with open(fname, "r") as in_f:
        for row in in_f:
            vocab.update(row.strip().split())
    return vocab

In [None]:
# Parse each reference file once (the previous loop read every file twice),
# then derive the words shared by all files and the words seen in any file.
all_ref_word_sets = [get_all_words_in_file(r) for r in all_ref_names]
common_words_ref = set.intersection(*all_ref_word_sets)
all_words_ref = set.union(*all_ref_word_sets)

In [None]:
len(common_words_ref), len(all_words_ref)

In [None]:
# Parse each google reference file once (the previous loop read every file
# twice), then derive the words shared by all files and the words seen in any.
google_ref_word_sets = [get_all_words_in_file(r) for r in google_ref_names]
google_common_words_ref = set.intersection(*google_ref_word_sets)
google_all_words_ref = set.union(*google_ref_word_sets)
print(len(google_common_words_ref), len(google_all_words_ref))

In [None]:
# Parse each edin reference file once (the previous loop read every file
# twice), then derive the words shared by all files and the words seen in any.
edin_ref_word_sets = [get_all_words_in_file(r) for r in edin_ref_names]
edin_common_words_ref = set.intersection(*edin_ref_word_sets)
edin_all_words_ref = set.union(*edin_ref_word_sets)
print(len(edin_common_words_ref), len(edin_all_words_ref))

In [None]:
len(google_common_words_ref & edin_common_words_ref)

In [None]:
common_ref_words = google_common_words_ref & edin_common_words_ref

In [None]:
len(common_ref_words)

In [None]:
len(google_all_words_ref - edin_all_words_ref), len(edin_all_words_ref - google_all_words_ref)

In [None]:
# Map reference index -> per-line token sets for each edin reference file.
all_ref_lines = {
    idx: read_all_lines_in_file(path) for idx, path in enumerate(edin_ref_names)
}
    

In [None]:
# For every line index, keep only the words that occur in that line of every
# reference. set.intersection builds a NEW set; the earlier `&=` form shrank
# the sets stored in all_ref_lines[0] in place, corrupting that data for any
# later use. Iterating over the dict keys also removes the hard-coded count of
# four references.
words_in_all_refs = [
    set.intersection(*(all_ref_lines[j][i] for j in sorted(all_ref_lines)))
    for i in range(len(all_ref_lines[0]))
]

In [None]:
# Count, per word, how many lines contain it in all references, restricted to
# common_ref_words and excluding both stop-word sets.
word_freq = {}

for ref in words_in_all_refs:
    # `-` binds tighter than `&`; the parentheses spell out that grouping.
    now_words = ref & ((common_ref_words - stop_words) - es_stop_words)
    for w in now_words:
        word_freq[w] = word_freq.get(w, 0) + 1

In [None]:
len(word_freq)

In [None]:
# (word, count) pairs for words counted 2 to 5 times that are at least 8
# characters long.
rare_words = [
    (w, f)
    for w, f in word_freq.items()
    if 2 <= f <= 5 and len(w) >= 8
]

### eval 1 - 500 randomly selected frequent words, minor filtering

In [None]:
min_dev_freq=25
max_dev_freq=10000
min_train_freq=25
max_train_freq=10000
min_len=5

In [None]:
len(en_common_words)

In [None]:
en_content_words = ((en_common_words & common_ref_words) - (es_stop_words | stop_words))
en_content_words = {w for w in en_content_words if '¿' not in w and "'" not in w}

In [None]:
terms_of_interest = get_details_for_words(en_content_words, en_content_words, en_word_utt_count, 
                                          min_dev_freq=min_dev_freq, 
                                          max_dev_freq=max_dev_freq, 
                                          min_train_freq=min_train_freq, 
                                          max_train_freq=max_train_freq,
                                          min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(terms_of_interest["words"])))

In [None]:
_, _ = get_duration(terms_of_interest["train_utts"], key="fisher_train"), get_duration(terms_of_interest["dev_utts"], key="fisher_dev")

In [None]:
# Fixed seed so the same sample is drawn on every run.
random.seed("hmm")
# NOTE(review): the section heading says 500 words, but at most 100 terms are
# sampled here — confirm which number is intended.
sample_terms = random.sample(list(terms_of_interest["words"].keys()), 
                             min(len(terms_of_interest["words"]), 100))

In [None]:
sample_terms_details = get_details_for_words(sample_terms, en_common_words, en_word_utt_count,
                                          min_dev_freq=min_dev_freq, 
                                          max_dev_freq=max_dev_freq, 
                                          min_train_freq=min_train_freq,
                                          max_train_freq=max_train_freq,
                                          min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(sample_terms_details["words"])))

In [None]:
_, _ = (get_duration(sample_terms_details["train_utts"], key="fisher_train"), 
       get_duration(sample_terms_details["dev_utts"], key="fisher_dev"))

In [None]:
" -- ".join(sample_terms)

In [None]:
eval_freq_content = create_vocab(sample_terms_details)

In [None]:
# Write eval_freq_content to disk. The context manager closes (and flushes) the
# file; passing a bare open() to pickle.dump left the handle unclosed.
with open(os.path.join(m_cfg['data_path'], "eval_en_freq_vocab.dict"), "wb") as out_f:
    pickle.dump(eval_freq_content, out_f)

### eval 2 - 500 randomly selected infrequent words, minor filtering

In [None]:
min_dev_freq=2
max_dev_freq=10
min_train_freq=2
max_train_freq=25
min_len=5

In [None]:
len(en_common_words)

In [None]:
# en_content_words = (en_common_words - (es_stop_words | stop_words))
# en_content_words = {w for w in en_content_words if '¿' not in w}

In [None]:
len(en_content_words)

In [None]:
terms_of_interest = get_details_for_words(en_content_words, en_content_words, en_word_utt_count, 
                                          min_dev_freq=min_dev_freq, 
                                          max_dev_freq=max_dev_freq, 
                                          min_train_freq=min_train_freq,
                                          max_train_freq=max_train_freq,
                                          min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(terms_of_interest["words"])))

In [None]:
_, _ = get_duration(terms_of_interest["train_utts"], key="fisher_train"), get_duration(terms_of_interest["dev_utts"], key="fisher_dev")

In [None]:
random.seed("hmm")
sample_terms = random.sample(list(terms_of_interest["words"].keys()), 
                             min(len(terms_of_interest["words"]), 500))

In [None]:
sample_terms_details = get_details_for_words(sample_terms, en_common_words, en_word_utt_count,
                                          min_dev_freq=min_dev_freq, 
                                          max_dev_freq=max_dev_freq, 
                                          min_train_freq=min_train_freq,
                                          max_train_freq=max_train_freq,
                                          min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(sample_terms_details["words"])))

In [None]:
_, _ = (get_duration(sample_terms_details["train_utts"], key="fisher_train"), 
       get_duration(sample_terms_details["dev_utts"], key="fisher_dev"))

In [None]:
" -- ".join(sample_terms)

In [None]:
eval_content = create_vocab(sample_terms_details)

In [None]:
# Write eval_content to disk. The context manager closes (and flushes) the
# file; passing a bare open() to pickle.dump left the handle unclosed.
with open(os.path.join(m_cfg['data_path'], "eval_en_rare_vocab.dict"), "wb") as out_f:
    pickle.dump(eval_content, out_f)

### eval 3 - common es, en words

In [None]:
min_dev_freq=2
max_dev_freq=10000
min_train_freq=2
max_train_freq=100000
min_len=5

In [None]:
len(en_common_words)

In [None]:
# en_content_words = (en_common_words - (es_stop_words | stop_words))
# en_content_words = {w for w in en_content_words if '¿' not in w}

In [None]:
len(en_content_words)

In [None]:
es_en_common_words = (es_common_words & en_common_words & common_ref_words)  - (es_stop_words | stop_words)

In [None]:
len(es_en_common_words)

In [None]:
terms_of_interest = get_details_for_words(es_en_common_words, en_content_words, en_word_utt_count, 
                                          min_dev_freq=min_dev_freq, 
                                          max_dev_freq=max_dev_freq, 
                                          min_train_freq=min_train_freq,
                                          max_train_freq=max_train_freq,
                                          min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(terms_of_interest["words"])))

In [None]:
_, _ = get_duration(terms_of_interest["train_utts"], key="fisher_train"), get_duration(terms_of_interest["dev_utts"], key="fisher_dev")

In [None]:
random.seed("hmm")
sample_terms = random.sample(list(terms_of_interest["words"].keys()), 
                             min(len(terms_of_interest["words"]), 500))

In [None]:
sample_terms_details = get_details_for_words(sample_terms, en_common_words, en_word_utt_count,
                                          min_dev_freq=min_dev_freq, 
                                          max_dev_freq=max_dev_freq, 
                                          min_train_freq=min_train_freq,
                                          max_train_freq=max_train_freq,
                                          min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(sample_terms_details["words"])))

In [None]:
_, _ = (get_duration(sample_terms_details["train_utts"], key="fisher_train"), 
       get_duration(sample_terms_details["dev_utts"], key="fisher_dev"))

In [None]:
" -- ".join(sample_terms)

In [None]:
eval_content = create_vocab(sample_terms_details)

In [None]:
# Write eval_content to disk. The context manager closes (and flushes) the
# file; passing a bare open() to pickle.dump left the handle unclosed.
with open(os.path.join(m_cfg['data_path'], "eval_en_es_common_vocab.dict"), "wb") as out_f:
    pickle.dump(eval_content, out_f)

In [None]:
# NOTE(review): this rebinds es_en_common_words (a set expression in the
# "eval 3" cell above, with stop words and non-reference words removed) to a
# plain list filtered only by length >= 5. Re-running the eval 3 cells after
# this one will see the list instead — confirm that is intended.
es_en_common_words = [w for w in es_common_words & en_common_words if len(w) >= 5]

In [None]:
len(es_en_common_words)

In [None]:
es_en_common_words

### Eval crisis


In [None]:
crises_lex_fname = "../criseslex/CrisisLexLexicon/CrisisLexRec.txt"

In [None]:
# Collect every distinct whitespace-separated token from the lexicon file.
crises = set()
with open(crises_lex_fname, "r") as in_f:
    for line in in_f:
        crises.update(line.strip().split())
# sorted() instead of list(): iteration order of a set of strings changes
# between interpreter runs, which made the term order non-reproducible.
crises = sorted(crises)
# Stemmed form of each term, in the same order as `crises`.
crises_stem = [stem(w) for w in crises]

# new_crises = []
# for t in crises:
#     if t.encode() in sim_dict['w']:
#         new_crises.extend([w.decode() for w in sim_dict['w'][t.encode()]])
# crises.extend(new_crises)

In [None]:
len(crises)

In [None]:
crises_details = get_details_for_words(crises, (en_common_words & common_ref_words), en_word_utt_count,
                                       min_dev_freq=2, 
                                       max_dev_freq=5000, 
                                       min_train_freq=2,
                                       max_train_freq=5000,
                                       min_len=1)
print("total words meeting criteria = {0:d}".format(len(crises_details["words"])))

In [None]:
_, _ = get_duration(crises_details["train_utts"], key="fisher_train"), get_duration(crises_details["dev_utts"], key="fisher_dev")

In [None]:
en_word_utt_count.keys()

In [None]:
len(en_word_utt_count['dev_utts']['people'])

In [None]:
len(set(crises_details["words"]))

In [None]:
print(" -- ".join(list(crises_details["words"].keys())))

In [None]:
# crises_details["words"]

In [None]:
bow_crises_vocab = create_vocab(crises_details)

In [None]:
# Write bow_crises_vocab to disk. The context manager closes (and flushes) the
# file; passing a bare open() to pickle.dump left the handle unclosed.
with open(os.path.join(m_cfg['data_path'], "eval_en_crisis_vocab.dict"), "wb") as out_f:
    pickle.dump(bow_crises_vocab, out_f)

### NLTK wordnet

In [None]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet

In [None]:
en_words = set(en_word_utt_count['train'].keys())

In [None]:
len(en_words)

In [None]:
list(en_words)[:5]

In [None]:
"howdy".encode() in vocab_dict["en_w"]["w2i"]

In [None]:
# For each word in en_words, gather the WordNet lemma names (across all of its
# synsets) that are themselves in en_words, plus the word itself.
syns = {}

for w in tqdm(en_words):
    related = {
        lm
        for item in wn.synsets(w)
        for lm in item.lemma_names()
        if lm in en_words
    }
    related.add(w)  # every word is always listed among its own synonyms
    syns[w] = list(related)

In [None]:
# Two parallel lookups built from `syns`: "w" maps each (bytes) vocabulary word
# to its synonyms as bytes, "i" maps the word's vocabulary id to synonym ids.
syn_sim = {"w": {}, "i": {}}

en_w2i = vocab_dict["en_w"]["w2i"]
for w in tqdm(en_w2i.keys()):
    # Words without an entry in `syns` fall back to being their own synonym.
    synonyms = syns.get(w.decode(), [w.decode()])
    syn_sim["w"][w] = [syn.encode() for syn in synonyms]
    i = en_w2i[w]
    syn_sim["i"][i] = [en_w2i[j] for j in syn_sim["w"][w]]
    

In [None]:
list(syn_sim["w"].items())[:5]

In [None]:
# Write syn_sim to disk. The context manager closes (and flushes) the file;
# passing a bare open() to pickle.dump left the handle unclosed.
with open(os.path.join(m_cfg['data_path'], "en_syns_train.dict"), "wb") as out_f:
    pickle.dump(syn_sim, out_f)

In [None]:
haha = {w:v for w, v in syns.items() if len(v) >= 2}

In [None]:
len(haha)

In [None]:
vocab_dict["en_w"]["i2w"][14261]

In [None]:
sim_keys = set([w for w in sim_dict['w'] if len(sim_dict["w"][w]) >= 2])
syn_keys = set([w for w in syn_sim['w'] if len(syn_sim["w"][w]) >= 2])
print(len(sim_keys), len(syn_keys))

In [None]:
" -- ".join([w.decode() for w in syn_sim["w"][b'run']])

In [None]:
syn_keys - sim_keys

In [None]:
len(syns)

In [None]:
# Scratch check for a single word: append every WordNet lemma name of "hello"
# that is in en_words to the existing syns["hello"] list.
# NOTE(review): this mutates syns in place (re-running appends duplicates) and
# presumably requires "hello" to already be a key of syns — confirm.
w = "hello"
w_syn = wn.synsets(w)
for item in w_syn:
    for lm in item.lemma_names():
        if lm in en_words:
            syns[w].append(lm)

In [None]:
# NOTE(review): this rebinds `syns`, which the cells above use as a
# word -> synonym-list dict, to the value returned by wordnet.synsets('car').
# Re-running those earlier cells after this one will operate on the wrong
# object; consider a distinct name here and in the cells below that read it.
syns = wordnet.synsets('car')

In [None]:
# Flatten the lemma names of every synset of "hello" into one list
# (duplicates kept, synset order preserved).
syn_set = [
    item
    for synset in wn.synsets("hello")
    for item in synset.lemma_names()
]
print(syn_set)

In [None]:
syns[1].lemmas()[3].name()

In [None]:
# Print the name of the first lemma of each synset. `name` is a method (the
# cell above calls `.name()`); without the call this printed bound-method reprs.
for s in syns:
    print(s.lemmas()[0].name())

In [None]:
# First lemma of each synset. `lemmas` is a method and must be called before
# indexing; subscripting the method itself raises TypeError.
[s.lemmas()[0] for s in syns]

### ES, EN common words

In [20]:
def clean_out_str(out_str):
    """Strip quoting/punctuation artefacts from a string.

    Removes, in this order, every backtick, double quote, inverted question
    mark and pair of consecutive single quotes, then trims surrounding
    whitespace. A lone single quote (as in "don't") is kept.

    Args:
        out_str: the string to clean.

    Returns:
        str: the cleaned string.
    """
    # Order matters: dropping a double quote can bring two single quotes
    # together, which the final pattern then removes.
    for junk in ("`", '"', '¿', "''"):
        out_str = out_str.replace(junk, "")
    return out_str.strip()
    

In [26]:
def get_out_str(h):
    """Join a sequence of byte tokens into one cleaned text string.

    Each token is decoded and appended with a leading space, except tokens
    that start with an apostrophe or equal "n't", which attach directly to the
    preceding token. The joined text is passed through clean_out_str.

    Args:
        h: iterable of objects with a ``decode()`` method (bytes tokens).

    Returns:
        str: the cleaned, joined string.
    """
    pieces = []
    for tok in h:
        w = tok.decode()
        # Clitic-like tokens attach to the previous word without a space.
        glue = "" if (w.startswith("'") or w == "n't") else " "
        pieces.append(glue + w)
    return clean_out_str("".join(pieces))

In [42]:
def match_es_en(es_words, en_words):
    """Count word occurrences on each side and the overlap between them.

    Args:
        es_words: iterable of words from the Spanish side.
        en_words: iterable of words from the English side.

    Returns:
        dict with keys, in this order:
            "common": dict mapping each word present on both sides to the
                smaller of its two counts,
            "es": Counter of ``es_words``,
            "en": Counter of ``en_words``.
    """
    es_counts = Counter(es_words)
    en_counts = Counter(en_words)
    shared = {
        w: min(es_counts[w], en_counts[w])
        for w in set(es_counts.keys()) & set(en_counts.keys())
    }
    return {"common": shared, "es": es_counts, "en": en_counts}

In [115]:
def corpus_level_es_en(set_key, ref_num=0):
    """Accumulate per-utterance es/en word statistics over a whole data set.

    For every utterance in ``map_dict[set_key]`` the "es_w" and "en_w" tokens
    are turned into word lists via get_out_str, compared with match_es_en, and
    the resulting counts are summed per word.

    Args:
        set_key: key into the module-level ``map_dict`` (e.g. "fisher_train").
        ref_num: which English reference to use; only consulted when
            ``set_key == "fisher_dev"``, where "en_w" is indexed by reference.

    Returns:
        dict with keys "common", "es" and "en", each mapping word -> summed
        count over all utterances.
    """
    es_en_stats = {"common": {}, "es": {}, "en": {}}
    for u in tqdm(map_dict[set_key], ncols=80):
        entry = map_dict[set_key][u]
        es_words = get_out_str(entry["es_w"]).strip().split()
        # Only fisher_dev stores several English references per utterance.
        en_tokens = entry["en_w"][ref_num] if set_key == "fisher_dev" else entry["en_w"]
        en_words = get_out_str(en_tokens).strip().split()

        utt_stats = match_es_en(es_words, en_words)

        # Fold this utterance's counts into the running corpus totals.
        for k, counts in utt_stats.items():
            bucket = es_en_stats[k]
            for w, c in counts.items():
                bucket[w] = bucket.get(w, 0) + c
    return es_en_stats

In [138]:
def show_details(common_results, show=20, min_len=0, filter_stop=False):
    """Print type/token totals and the most frequent shared words.

    Args:
        common_results: dict with at least the keys "common", "es" and "en",
            each mapping word -> count (as returned by corpus_level_es_en).
        show: maximum number of top common words to print.
        min_len: minimum word length for a word to be listed. This is now
            honoured regardless of ``filter_stop``; previously it was silently
            ignored unless ``filter_stop`` was True.
        filter_stop: when True, drop NLTK English and Spanish stop words from
            the listing.

    Returns:
        None. Everything is written to stdout.
    """
    rule = "-" * 60

    for k in common_results:
        print("{0:10s} = {1:>10d} types".format(k, len(common_results[k])))

    tot = {}
    print(rule)
    for k in common_results:
        tot[k] = sum(common_results[k].values())
        print("{0:10s} = {1:>10d} tokens".format(k, tot[k]))

    print(rule)
    for side in ("es", "en"):
        # An empty side would otherwise raise ZeroDivisionError; report 0%.
        pct = tot["common"] / tot[side] * 100 if tot[side] else 0.0
        print("common / {0:s} = {1:.2f}%".format(side, pct))
    print(rule)

    common_words = sorted(common_results["common"].items(), reverse=True, key=lambda t: t[1])

    if min_len > 0:
        common_words = [(w, c) for w, c in common_words if len(w) >= min_len]

    if filter_stop:
        # Load the stop-word lists only when they are actually needed.
        all_stop_words = (set(nltk.corpus.stopwords.words("english"))
                          | set(nltk.corpus.stopwords.words("spanish")))
        common_words = [(w, c) for w, c in common_words if w not in all_stop_words]

    common_words = common_words[:show]

    print(rule)
    print("Top common words")
    for w, c in common_words:
        print("{0:20s} || {1:10d}".format(w, c))
        
    

In [139]:
train_common = corpus_level_es_en("fisher_train", 0)

100%|████████████████████████████████| 138819/138819 [00:08<00:00, 16526.41it/s]


In [140]:
show_details(train_common, show=10, min_len=5, filter_stop=True)

common     =       4737 types
es         =      32185 types
en         =      18140 types
------------------------------------------------------------
common     =      92445 tokens
es         =    1494776 tokens
en         =    1440914 tokens
------------------------------------------------------------
common / es = 6.18%
common / en = 6.42%
------------------------------------------------------------
------------------------------------------------------------
Top common words
internet             ||        754
puerto               ||        470
argentina            ||        452
chile                ||        421
miami                ||        291
venezuela            ||        279
colombia             ||        260
texas                ||        236
chicago              ||        233
right                ||        230


In [141]:
dev_common = corpus_level_es_en("fisher_dev", 0)

100%|████████████████████████████████████| 3979/3979 [00:00<00:00, 14880.47it/s]


In [142]:
show_details(dev_common, show=10, min_len=5, filter_stop=True)

common     =        596 types
es         =       4079 types
en         =       2998 types
------------------------------------------------------------
common     =       3396 tokens
es         =      40969 tokens
en         =      40041 tokens
------------------------------------------------------------
common / es = 8.29%
common / en = 8.48%
------------------------------------------------------------
------------------------------------------------------------
Top common words
puerto               ||         46
chicago              ||         20
general              ||         14
idaho                ||         13
colorado             ||         13
colombia             ||         12
salsa                ||         12
philly               ||         12
hello                ||         11
florida              ||         10


In [30]:
es_words = get_out_str(map_dict["fisher_dev"]["20051009_182032_217_fsp-B-2"]["es_w"])

'mi nombre es carmen de chicago y tu'