In [1]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, modified_precision
from nltk.translate.chrf_score import sentence_chrf, corpus_chrf
from nltk.metrics import scores
import scipy.io.wavfile
from IPython.display import Audio
from IPython.display import display
from nltk.stem import *
# from nltk.stem.snowball import SnowballStemmer
from stemming.porter2 import stem
import stemming
from nltk.metrics.scores import recall

from basics import *

import sentencepiece as spm

from nltk.corpus import stopwords

%matplotlib inline

In [2]:
tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),    
             (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),    
             (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),    
             (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),    
             (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]    
# Scale the RGB values to the [0, 1] range, which is the format matplotlib accepts.    
for i in range(len(tableau20)):    
    r, g, b = tableau20[i]    
    tableau20[i] = (r / 255., g / 255., b / 255.)

In [3]:
smooth_fun = nltk.translate.bleu_score.SmoothingFunction()

In [4]:
from nmt_run import *

In [5]:
cfg_path = "sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3_lstm"

In [6]:
%%capture
last_epoch, model, optimizer, m_cfg, t_cfg = check_model(cfg_path)

### Load Fisher dataset

In [7]:
%%capture
# -------------------------------------------------------------------------
# get data dictionaries
# -------------------------------------------------------------------------
map_dict, vocab_dict, bucket_dict = get_data_dicts(m_cfg)
info_dict = pickle.load(open("fbanks_80dim_nltk/info.dict", "rb"))
sim_dict = pickle.load(open("./fbanks_80dim_nltk/mix_sim.dict", "rb"))

In [8]:
random.seed("meh")
# random.seed("haha")

### Train text

In [9]:
train_file = "../installs/fisher-callhome-corpus/corpus/ldc/fisher_train.en"

In [10]:
train_text = []

In [11]:
for u in map_dict["fisher_train"]:
    train_text.append(" ".join([w.decode() for w in map_dict["fisher_train"][u]["en_w"]]))

In [12]:
train_text[:10]

['hello',
 'hello',
 'hello',
 'hello',
 'with whom am i speaking',
 'eh silvia yes what is your name',
 'hello silvia eh my name is nicole',
 'ah nice to meet you',
 'nice to meet you em and where are you from',
 "eh i 'm in philadelphia"]

In [13]:
train_text_to_dump = "\n".join(train_text)

In [14]:
with open("../subword-nmt/fisher_train.en", "w") as out_f:
    out_f.write(train_text_to_dump)

In [15]:
# ./learn_joint_bpe_and_vocab.py --input {train_file}.L1 {train_file}.L2 -s {num_operations} -o {codes_file} --write-vocabulary {vocab_file}.L1 {vocab_file}.L2

In [16]:
# sp = spm.SentencePieceProcessor()

In [17]:
# sp.Load("test/test_model.model")

### word level analysis

In [18]:
min_word_len = 1
top_k = 100

In [19]:
stop_words = set(nltk.corpus.stopwords.words("english"))
len(stop_words)

127

In [20]:
es_stop_words = set(nltk.corpus.stopwords.words("spanish"))
len(es_stop_words)

313

In [21]:
def get_words(m_dict, key="en_w"):
    words = []
    for u in m_dict:
        if type(m_dict[u][key]) == list:
            words.extend([w.decode() for w in m_dict[u][key]])
        else:
            for ref in m_dict[u][key]:
                words.extend([w.decode() for w in ref])
    return Counter(words)

In [22]:
# words in train
train_words = get_words(map_dict['fisher_train'])
train_words_top_k = [(w,f) for w, f in sorted(train_words.items(), reverse=True, key=lambda t:t[1]) 
                     if w not in stop_words and len(w) >= min_word_len][:top_k]

train_only_words = set(train_words.keys())

print("{0:20s} | {1:10d}".format("# train word types", len(train_words)))
print("{0:20s} | {1:10d}".format("# train word tokens", sum(train_words.values())))

dev_words = get_words(map_dict['fisher_dev'])
dev_words_top_k = [(w,f) for w, f in sorted(dev_words.items(), reverse=True, key=lambda t:t[1]) 
                     if w not in stop_words and len(w) >= min_word_len][:top_k]

dev_only_words = set(dev_words.keys())

print("-"*80)
print("{0:20s} | {1:10d}".format("# dev word types", len(dev_words)))
print("{0:20s} | {1:10d}".format("# dev word tokens", sum(dev_words.values())))

# train word types   |      17830
# train word tokens  |    1497352
--------------------------------------------------------------------------------
# dev word types     |       4835
# dev word tokens    |     165206


In [23]:
# words in train
es_train_words = get_words(map_dict['fisher_train'], key="es_w")
es_train_words_top_k = [(w,f) for w, f in sorted(es_train_words.items(), reverse=True, key=lambda t:t[1]) 
                     if w not in es_stop_words and len(w) >= min_word_len][:top_k]

es_train_only_words = set(es_train_words.keys())

print("{0:20s} | {1:10d}".format("# train word types", len(es_train_words)))
print("{0:20s} | {1:10d}".format("# train word tokens", sum(es_train_words.values())))

es_dev_words = get_words(map_dict['fisher_dev'], key="es_w")
es_dev_words_top_k = [(w,f) for w, f in sorted(es_dev_words.items(), reverse=True, key=lambda t:t[1]) 
                     if w not in es_stop_words and len(w) >= min_word_len][:top_k]

es_dev_only_words = set(es_dev_words.keys())

print("-"*80)
print("{0:20s} | {1:10d}".format("# dev word types", len(es_dev_words)))
print("{0:20s} | {1:10d}".format("# dev word tokens", sum(es_dev_words.values())))

# train word types   |      32860
# train word tokens  |    1496792
--------------------------------------------------------------------------------
# dev word types     |       4145
# dev word tokens    |      41098


In [24]:
train_words_top_k[:5], es_train_words_top_k[:5]

([('yes', 35054),
  ("'s", 24162),
  ("n't", 19184),
  ('like', 14334),
  ('well', 12354)],
 [('ah', 12325), ('eh', 11447), ('si', 9423), ('ajá', 7988), ('bueno', 7838)])

In [25]:
[(w,f) for w,f in train_words_top_k if "'" in w]

[("'s", 24162), ("n't", 19184), ("'m", 5546), ("'re", 2832), ("'ve", 2392)]

In [26]:
dev_words_top_k[:5], es_dev_words_top_k[:5]

([('yes', 3652), ("n't", 1999), ("'s", 1866), ('like', 1826), ('know', 1294)],
 [('ajá', 343), ('ah', 341), ('entonces', 249), ('si', 247), ('mhm', 236)])

In [27]:
oov_words = {w:f for w,f in dev_words.items() if w not in train_only_words}

print("{0:20s} | {1:10d}".format("# oov word types", len(oov_words)))
print("{0:20s} | {1:10d}".format("# oov word tokens", sum(oov_words.values())))

# oov word types     |       1011
# oov word tokens    |       1599


In [28]:
es_oov_words = {w:f for w,f in es_dev_words.items() if w not in es_train_only_words}

print("{0:20s} | {1:10d}".format("# oov word types", len(es_oov_words)))
print("{0:20s} | {1:10d}".format("# oov word tokens", sum(es_oov_words.values())))

# oov word types     |        448
# oov word tokens    |        525


In [29]:
"{0:.1f}%".format(sum(oov_words.values()) / sum(dev_words.values()) * 100)

'1.0%'

### Word level - get train, dev frequency, and utts in which they occur

In [30]:
len(train_only_words), len(set([stem(w) for w in train_only_words]))

(17830, 12011)

In [31]:
def get_word_level_details(word_key):
    word_utt_count = {"train": {}, "dev": {}, "train_utts": {}, "dev_utts": {}}
    for u in tqdm(map_dict["fisher_train"].keys()):
        for w in set(map_dict["fisher_train"][u][word_key]):
            curr_word = w.decode()
            if curr_word not in word_utt_count["train"]:
                word_utt_count["train"][curr_word] = 0
                word_utt_count["train_utts"][curr_word] = set()
            word_utt_count["train"][curr_word] += 1
            word_utt_count["train_utts"][curr_word].update({u})
        # end for words in current utt
    # end for all utts
    for u in tqdm(map_dict["fisher_dev"].keys()):
        if word_key == "en_w":
            for ref in map_dict["fisher_dev"][u][word_key]:
                for w in set(ref):
                    curr_word = w.decode()
                    if curr_word not in word_utt_count["dev"]:
                        word_utt_count["dev"][curr_word] = 0
                        word_utt_count["dev_utts"][curr_word] = set()
                    word_utt_count["dev"][curr_word] += 1            
                    word_utt_count["dev_utts"][curr_word].update({u})
                # end for words in current ref
            # end for all references
        # end if multiple references
        else:
            ref = map_dict["fisher_dev"][u][word_key]
            for w in set(ref):
                curr_word = w.decode()
                if curr_word not in word_utt_count["dev"]:
                    word_utt_count["dev"][curr_word] = 0
                    word_utt_count["dev_utts"][curr_word] = set()
                word_utt_count["dev"][curr_word] += 1            
                word_utt_count["dev_utts"][curr_word].update({u})
            
    # end for all utts
    all_train_utts = set()
    for w in word_utt_count["train_utts"]:
        all_train_utts.update(word_utt_count["train_utts"][w])
    # end for

    all_dev_utts = set()
    for w in word_utt_count["dev_utts"]:
        all_dev_utts.update(word_utt_count["dev_utts"][w])
    # end for
    
    return word_utt_count, all_train_utts, all_dev_utts
    

In [32]:
en_word_utt_count, en_train_utts, en_dev_utts = get_word_level_details("en_w")

100%|██████████| 138819/138819 [00:02<00:00, 64727.23it/s]
100%|██████████| 3979/3979 [00:00<00:00, 19567.59it/s]


In [33]:
len(en_train_utts), len(en_dev_utts)

(138795, 3979)

In [34]:
es_word_utt_count, es_train_utts, es_dev_utts = get_word_level_details("es_w")

100%|██████████| 138819/138819 [00:02<00:00, 62389.22it/s]
100%|██████████| 3979/3979 [00:00<00:00, 62016.27it/s]


In [35]:
len(es_train_utts), len(es_dev_utts)

(138797, 3977)

In [36]:
print("word types")
print(len(en_word_utt_count['train']), len(en_word_utt_count['dev']))
print("common word types")
en_common_words = set(en_word_utt_count['train'].keys()) & set(en_word_utt_count['dev'].keys())
len(en_common_words)

word types
17830 4835
common word types


3824

In [37]:
print("word types")
print(len(es_word_utt_count['train']), len(es_word_utt_count['dev']))
print("common word types")
es_common_words = set(es_word_utt_count['train'].keys()) & set(es_word_utt_count['dev'].keys())
len(es_common_words)

word types
32860 4145
common word types


3697

In [38]:
def get_details_for_words(words, common_words, word_utt_count, 
                          min_dev_freq, max_dev_freq, min_train_freq, min_len):
    details = {"words": {}, "train_utts": set(), "dev_utts": set()}
    
    in_vocab_words = set(words) & set(common_words)
    print("number of in-vocab words = {0:d}".format(len(in_vocab_words)))

    for w in in_vocab_words:
        t_count, d_count = len(word_utt_count["train_utts"][w]), len(word_utt_count["dev_utts"][w])
        if ((d_count >= min_dev_freq) and 
            (d_count <= max_dev_freq) and
            (len(w) >= min_len) and
            (t_count >= min_train_freq)):
            details["words"][w] = {"train": t_count, "dev": d_count}
            details["train_utts"].update(word_utt_count["train_utts"][w])
            details["dev_utts"].update(word_utt_count["dev_utts"][w])
        # end meets criteria
    # end for in-vocab word
    return details
# end function

In [39]:
def get_duration(utts, key):
    dur = 0
    utts_not_found = []
    for u in utts:
        if u not in info_dict[key]:
            #print("argh!", u)
            utts_not_found.append(u)
        else:
            dur += (info_dict[key][u]['sp'] * 10)
    dur = dur / 60 / 60 / 1000
    print("-"*80)
    print("{0:d} total utts".format(len(utts)))
    print("{0:d} not found".format(len(utts_not_found)))
    print("selected utts from {0:s} -- duration = {1:.2f} hours".format(key, dur))
    return dur

In [40]:
def create_vocab(words_list):
    out = {"w2i":{}, "i2w":{}, "freq":{}, "freq_dev":{}}
    START_VOCAB = [PAD, GO, EOS, UNK]
    for w in START_VOCAB:
        out['w2i'][w] = len(out["w2i"])
        out["freq"][w] = 1
    #for w in words_list['words']:
    sorted_w = sorted(words_list['words'].items(), reverse=True, key=lambda t: t[1]['train'])
    for w in sorted_w:
        encoded_word = w[0].encode()
        out["w2i"][encoded_word] = len(out["w2i"])
        out["freq"][encoded_word] = w[1]["train"]
        out["freq_dev"][encoded_word] = w[1]["dev"]

    out["i2w"] = {val:key for key, val in out["w2i"].items()}
    return out

In [41]:
train_dur, dev_dur = get_duration(en_train_utts, key="fisher_train"), get_duration(en_dev_utts, key="fisher_dev")

--------------------------------------------------------------------------------
138795 total utts
89 not found
selected utts from fisher_train -- duration = 161.62 hours
--------------------------------------------------------------------------------
3979 total utts
2 not found
selected utts from fisher_dev -- duration = 4.35 hours


### Task 0 - 500 randomly selected frequent words, minor filtering

In [57]:
min_dev_freq=10
max_dev_freq=10000
min_train_freq=50 
min_len=5

In [40]:
terms_of_interest = get_details_for_words(common_words, 
                                          min_dev_freq=min_dev_freq, 
                                          max_dev_freq=max_dev_freq, 
                                          min_train_freq=min_train_freq, 
                                          min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(terms_of_interest["words"])))

number of in-vocab words = 3824
total words meeting criteria = 557


In [41]:
_, _ = get_duration(terms_of_interest["train_utts"], key="fisher_train"), get_duration(terms_of_interest["dev_utts"], key="fisher_dev")

--------------------------------------------------------------------------------
84953 total utts
46 not found
selected utts from fisher_train -- duration = 133.57 hours
--------------------------------------------------------------------------------
2759 total utts
2 not found
selected utts from fisher_dev -- duration = 3.82 hours


In [42]:
random.seed("hmm")
sample_terms = random.sample(list(terms_of_interest["words"].keys()), 500)

In [43]:
sample_terms_details = get_details_for_words(sample_terms, 
                                          min_dev_freq=min_dev_freq, 
                                          max_dev_freq=max_dev_freq, 
                                          min_train_freq=min_train_freq, 
                                          min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(sample_terms_details["words"])))

number of in-vocab words = 500
total words meeting criteria = 500


In [44]:
_, _ = get_duration(sample_terms_details["train_utts"], key="fisher_train"), get_duration(sample_terms_details["dev_utts"], key="fisher_dev")

--------------------------------------------------------------------------------
82928 total utts
43 not found
selected utts from fisher_train -- duration = 131.58 hours
--------------------------------------------------------------------------------
2700 total utts
2 not found
selected utts from fisher_dev -- duration = 3.78 hours


In [45]:
sample_terms[:10]

['neither',
 'children',
 'dancing',
 'knows',
 'immigration',
 'found',
 'belong',
 'politics',
 'computer',
 'miles']

In [46]:
bow_top_500_words_vocab = create_vocab(sample_terms_details)

In [47]:
pickle.dump(bow_top_500_words_vocab, open(os.path.join(m_cfg['data_path'], "bow_top_500_words_vocab.dict"), "wb"))

### Task 1 - randomly selected frequent words

In [73]:
min_dev_freq=10 
max_dev_freq=100
min_train_freq=100
min_len=5

In [74]:
terms_of_interest = get_details_for_words(common_words, 
                                          min_dev_freq=min_dev_freq, 
                                          max_dev_freq=max_dev_freq, 
                                          min_train_freq=min_train_freq, 
                                          min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(terms_of_interest["words"])))

number of in-vocab words = 3824
total words meeting criteria = 372


In [75]:
_, _ = get_duration(terms_of_interest["train_utts"], key="fisher_train"), get_duration(terms_of_interest["dev_utts"], key="fisher_dev")

--------------------------------------------------------------------------------
62610 total utts
35 not found
selected utts from fisher_train -- duration = 109.48 hours
--------------------------------------------------------------------------------
2084 total utts
1 not found
selected utts from fisher_dev -- duration = 3.30 hours


In [76]:
random.seed("hmm")
sample_terms = random.sample(list(terms_of_interest["words"].keys()), 100)

In [77]:
sample_terms_details = get_details_for_words(sample_terms, 
                                          min_dev_freq=min_dev_freq, 
                                          max_dev_freq=max_dev_freq, 
                                          min_train_freq=min_train_freq, 
                                          min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(sample_terms_details["words"])))

number of in-vocab words = 100
total words meeting criteria = 100


In [78]:
_, _ = get_duration(sample_terms_details["train_utts"], key="fisher_train"), get_duration(sample_terms_details["dev_utts"], key="fisher_dev")

--------------------------------------------------------------------------------
26354 total utts
15 not found
selected utts from fisher_train -- duration = 51.20 hours
--------------------------------------------------------------------------------
1026 total utts
0 not found
selected utts from fisher_dev -- duration = 1.83 hours


In [79]:
sample_terms[:10]

['child',
 'rican',
 'lived',
 'strange',
 'thousand',
 'alone',
 'spend',
 'whole',
 'doing',
 'married']

In [81]:
bow_top_100_words_vocab = create_vocab(sample_terms_details)

In [82]:
pickle.dump(bow_top_100_words_vocab, open(os.path.join(m_cfg['data_path'], "bow_top_100_words_vocab.dict"), "wb"))

### Task 2 - topics as keywords

In [43]:
train_text_fname= "../installs/fisher-callhome-corpus/corpus/ldc/fisher_train.en"
topics_fname = "../criseslex/fsp06_topics_in_english.txt"

In [61]:
topics = [ "peace", "Music", "Marriage", "Religion", "Cell phones", 
           "Dating", "Telemarketing and SPAM", "Politics", "Travel", 
           "Technical devices", "Healthcare", "Advertisements", "Power", 
           "Occupations", "Movies", "Welfare", "Breaking up", "Location", 
           "Justice", "Memories", "Crime", "Violence against women", "Equality", 
            "Housing", "Immigration",     
            # new topics
           "Interracial", "Christians", "muslims", "jews", "e-mail", 
           "phone", "democracy", "Democratic", "Republican", "technology", 
           "leadership", "community", "jury", "police", "inequality", 
           "renting", "Violence", "immigrants", "immigrant", "skilled", 
           "Telemarketing", "SPAM", "skill", "job", "health", "mobile", 
            "ads", "physical", "emotional", "bubble", "rent", "economy", 
            "abuse", "women", "city", "country", "suburban", "dollar", 
            "united states", "laws", "phone", "race", "biracial", "interracial", 
            "marriage", "lyrics", "sexuality", "medicine", "television", "european",
            "home", "protect", "spouse", "language", "cellphone", "money",
            "doctor", "insurance", "cigarettes", "alcohol", "income", "salary",
            "class", "censor", "rating", "programs", "government",
            "relationship", "legal", "event", "life", "safe", "victim", "cops",
            "wage", "illegal"
            ]
topics = list(set(t.lower() for t in topics))
topics_stem = [stem(t) for t in topics]

# add similar topic words
new_topics = []
# for t in topics:
#     if t.encode() in sim_dict['w']:
#         new_topics.extend([w.decode() for w in sim_dict['w'][t.encode()]])
topics.extend(new_topics)

In [62]:
len(topics)

98

In [63]:
topics_details = get_details_for_words(topics, en_common_words, en_word_utt_count, 
                                       min_dev_freq=5, 
                                       max_dev_freq=10000, 
                                       min_train_freq=10, 
                                       min_len=1)
print("total words meeting criteria = {0:d}".format(len(topics_details["words"])))

number of in-vocab words = 60
total words meeting criteria = 42


In [64]:
_, _ = get_duration(topics_details["train_utts"], key="fisher_train"), get_duration(topics_details["dev_utts"], key="fisher_dev")

--------------------------------------------------------------------------------
13605 total utts
8 not found
selected utts from fisher_train -- duration = 28.43 hours
--------------------------------------------------------------------------------
518 total utts
0 not found
selected utts from fisher_dev -- duration = 0.99 hours


In [65]:
print("\n".join(list(topics_details["words"].keys())))

life
christians
jury
race
alcohol
welfare
class
home
safe
crime
government
religion
doctor
rent
income
medicine
laws
politics
job
justice
television
spam
police
movies
immigration
insurance
money
health
language
women
illegal
phone
programs
interracial
travel
music
country
lyrics
marriage
relationship
city
protect


In [67]:
topics_details['words']

{'alcohol': {'dev': 9, 'train': 38},
 'christians': {'dev': 12, 'train': 86},
 'city': {'dev': 144, 'train': 1045},
 'class': {'dev': 31, 'train': 119},
 'country': {'dev': 93, 'train': 1341},
 'crime': {'dev': 35, 'train': 169},
 'doctor': {'dev': 8, 'train': 195},
 'government': {'dev': 33, 'train': 333},
 'health': {'dev': 8, 'train': 243},
 'home': {'dev': 73, 'train': 537},
 'illegal': {'dev': 5, 'train': 85},
 'immigration': {'dev': 12, 'train': 137},
 'income': {'dev': 7, 'train': 32},
 'insurance': {'dev': 7, 'train': 310},
 'interracial': {'dev': 8, 'train': 66},
 'job': {'dev': 33, 'train': 476},
 'jury': {'dev': 25, 'train': 185},
 'justice': {'dev': 8, 'train': 85},
 'language': {'dev': 10, 'train': 230},
 'laws': {'dev': 6, 'train': 170},
 'life': {'dev': 93, 'train': 933},
 'lyrics': {'dev': 8, 'train': 43},
 'marriage': {'dev': 24, 'train': 281},
 'medicine': {'dev': 15, 'train': 97},
 'money': {'dev': 132, 'train': 1217},
 'movies': {'dev': 10, 'train': 329},
 'music': 

In [None]:
bow_topics_vocab = create_vocab(topics_details)

In [None]:
haha = pickle.load(open(os.path.join(m_cfg['data_path'], "bow_topics_vocab.dict"), "rb"))

In [None]:
len(set(haha['w2i']) & set(bow_topics_vocab['w2i'].keys()))

In [None]:
pickle.dump(bow_topics_vocab, open(os.path.join(m_cfg['data_path'], "bow_topics_vocab.dict"), "wb"))

### Task 3 - crises terms as keywords

In [58]:
crises_lex_fname = "../criseslex/CrisisLexLexicon/CrisisLexRec.txt"

In [59]:
crises = set()
with open(crises_lex_fname, "r") as in_f:
    for line in in_f:
        crises.update(line.strip().split())
crises = list(crises)
crises_stem = [stem(w) for w in crises]

# new_crises = []
# for t in crises:
#     if t.encode() in sim_dict['w']:
#         new_crises.extend([w.decode() for w in sim_dict['w'][t.encode()]])
# crises.extend(new_crises)

In [60]:
len(crises)

288

In [63]:
crises_details = get_details_for_words(crises, en_common_words, en_word_utt_count,
                                       min_dev_freq=10, 
                                       max_dev_freq=1000, 
                                       min_train_freq=100, 
                                       min_len=1)
print("total words meeting criteria = {0:d}".format(len(crises_details["words"])))

number of in-vocab words = 123
total words meeting criteria = 40


In [64]:
_, _ = get_duration(crises_details["train_utts"], key="fisher_train"), get_duration(crises_details["dev_utts"], key="fisher_dev")

--------------------------------------------------------------------------------
28608 total utts
11 not found
selected utts from fisher_train -- duration = 55.51 hours
--------------------------------------------------------------------------------
1076 total utts
0 not found
selected utts from fisher_dev -- duration = 1.96 hours


In [65]:
en_word_utt_count.keys()

dict_keys(['train', 'dev', 'train_utts', 'dev_utts'])

In [66]:
len(en_word_utt_count['dev_utts']['people'])

204

In [67]:
print("\n".join(list(crises_details["words"].keys())))

want
life
another
people
first
stay
need
case
city
found
watch
remember
town
girl
even
coming
gets
high
home
women
huge
saying
news
make
lives
morning
house
change
name
give
terrible
love
years
someone
waiting
send
time
help
leave
live


In [68]:
crises_details["words"]

{'another': {'dev': 45, 'train': 1215},
 'case': {'dev': 14, 'train': 432},
 'change': {'dev': 19, 'train': 425},
 'city': {'dev': 39, 'train': 1045},
 'coming': {'dev': 20, 'train': 253},
 'even': {'dev': 73, 'train': 1582},
 'first': {'dev': 20, 'train': 937},
 'found': {'dev': 15, 'train': 290},
 'gets': {'dev': 20, 'train': 370},
 'girl': {'dev': 34, 'train': 598},
 'give': {'dev': 37, 'train': 999},
 'help': {'dev': 38, 'train': 529},
 'high': {'dev': 20, 'train': 255},
 'home': {'dev': 34, 'train': 537},
 'house': {'dev': 42, 'train': 1134},
 'huge': {'dev': 10, 'train': 119},
 'leave': {'dev': 25, 'train': 404},
 'life': {'dev': 25, 'train': 933},
 'live': {'dev': 84, 'train': 1979},
 'lives': {'dev': 24, 'train': 362},
 'love': {'dev': 25, 'train': 712},
 'make': {'dev': 62, 'train': 968},
 'morning': {'dev': 11, 'train': 184},
 'name': {'dev': 32, 'train': 1064},
 'need': {'dev': 49, 'train': 770},
 'news': {'dev': 11, 'train': 187},
 'people': {'dev': 204, 'train': 5781},
 'r

In [69]:
bow_crises_vocab = create_vocab(crises_details)

In [70]:
pickle.dump(bow_crises_vocab, open(os.path.join(m_cfg['data_path'], "bow_crises_vocab.dict"), "wb"))

### Task Spanish - 500 randomly selected frequent words, minor filtering

In [42]:
min_dev_freq=18
max_dev_freq=10000
min_train_freq=200
min_len=5

In [43]:
terms_of_interest = get_details_for_words(es_common_words,
                                          es_common_words,
                                          es_word_utt_count,
                                          min_dev_freq=min_dev_freq, 
                                          max_dev_freq=max_dev_freq, 
                                          min_train_freq=min_train_freq, 
                                          min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(terms_of_interest["words"])))

number of in-vocab words = 3697
total words meeting criteria = 100


In [44]:
_, _ = get_duration(terms_of_interest["train_utts"], key="fisher_train"), get_duration(terms_of_interest["dev_utts"], key="fisher_dev")

--------------------------------------------------------------------------------
74435 total utts
37 not found
selected utts from fisher_train -- duration = 122.08 hours
--------------------------------------------------------------------------------
2076 total utts
0 not found
selected utts from fisher_dev -- duration = 3.25 hours


In [45]:
random.seed("hmm")
sample_terms = random.sample(list(terms_of_interest["words"].keys()), 
                             min(len(terms_of_interest["words"].keys()), len(terms_of_interest["words"].keys())))

In [46]:
# sample_terms = ["bueno"]

In [47]:
sample_terms_details = get_details_for_words(sample_terms,
                                             es_common_words,
                                             es_word_utt_count,
                                              min_dev_freq=min_dev_freq, 
                                              max_dev_freq=max_dev_freq, 
                                              min_train_freq=min_train_freq, 
                                              min_len=min_len)
print("total words meeting criteria = {0:d}".format(len(sample_terms_details["words"])))

number of in-vocab words = 100
total words meeting criteria = 100


In [48]:
_, _ = get_duration(sample_terms_details["train_utts"], key="fisher_train"), get_duration(sample_terms_details["dev_utts"], key="fisher_dev")

--------------------------------------------------------------------------------
74435 total utts
37 not found
selected utts from fisher_train -- duration = 122.08 hours
--------------------------------------------------------------------------------
2076 total utts
0 not found
selected utts from fisher_dev -- duration = 3.25 hours


In [49]:
sample_terms

['tanto',
 'imagino',
 'quiero',
 'mejor',
 'estados',
 'gente',
 'bastante',
 'alguien',
 'nosotros',
 'tiempo',
 'puedes',
 'solamente',
 'cinco',
 'bonito',
 'mucha',
 'cuando',
 'dicen',
 'claro',
 'muchas',
 'mismo',
 'cierto',
 'ayuda',
 'persona',
 'tener',
 'estar',
 'español',
 'quiere',
 'estaba',
 'religión',
 'había',
 'llama',
 'estoy',
 'cuatro',
 'verdad',
 'todos',
 'gusta',
 'ellos',
 'estás',
 'dónde',
 'hacen',
 'nueva',
 'gustaría',
 'tienes',
 'hasta',
 'puerto',
 'chicago',
 'ejemplo',
 'familia',
 'música',
 'menos',
 'nunca',
 'sabes',
 'ahora',
 'decir',
 'tampoco',
 'mundo',
 'todavía',
 '¿verdad',
 'niños',
 'buena',
 'siempre',
 'dinero',
 'parece',
 'están',
 'personas',
 'universidad',
 'trabajo',
 'tiene',
 'antes',
 'muchos',
 'porque',
 'entonces',
 'bueno',
 'después',
 'esposo',
 'españa',
 'tengo',
 'hablando',
 'ciudad',
 'mucho',
 'unidos',
 'estado',
 'también',
 'usted',
 'difícil',
 'tenía',
 'poquito',
 'tienen',
 'buenas',
 'puede',
 'hacer',


In [50]:
# sample_terms_details

In [51]:
bow_es_top_words_vocab = create_vocab(sample_terms_details)

In [52]:
bow_es_top_words_vocab.keys()

dict_keys(['w2i', 'i2w', 'freq', 'freq_dev'])

In [53]:
sample_terms_details.keys()

dict_keys(['words', 'train_utts', 'dev_utts'])

In [54]:
sample_terms_details["train_utts"][:10]

TypeError: 'set' object is not subscriptable

In [55]:
sample_terms_details['words']

{'ahora': {'dev': 65, 'train': 2955},
 'alguien': {'dev': 23, 'train': 811},
 'antes': {'dev': 28, 'train': 962},
 'ayuda': {'dev': 19, 'train': 386},
 'bastante': {'dev': 26, 'train': 955},
 'bonito': {'dev': 19, 'train': 388},
 'buena': {'dev': 23, 'train': 562},
 'buenas': {'dev': 25, 'train': 627},
 'bueno': {'dev': 158, 'train': 7215},
 'chicago': {'dev': 20, 'train': 223},
 'cierto': {'dev': 19, 'train': 868},
 'cinco': {'dev': 20, 'train': 923},
 'ciudad': {'dev': 32, 'train': 1028},
 'claro': {'dev': 116, 'train': 5207},
 'cosas': {'dev': 81, 'train': 2881},
 'cuando': {'dev': 159, 'train': 4045},
 'cuatro': {'dev': 18, 'train': 610},
 'decir': {'dev': 23, 'train': 1239},
 'después': {'dev': 35, 'train': 1715},
 'dicen': {'dev': 21, 'train': 1016},
 'difícil': {'dev': 20, 'train': 949},
 'dinero': {'dev': 30, 'train': 990},
 'donde': {'dev': 50, 'train': 1527},
 'dónde': {'dev': 31, 'train': 1278},
 'ejemplo': {'dev': 23, 'train': 1972},
 'ellos': {'dev': 67, 'train': 2858},
 '

In [56]:
sample_terms_details['words']['bueno']

{'dev': 158, 'train': 7215}

In [57]:
bow_es_top_words_vocab['freq'][b'bueno'], bow_es_top_words_vocab['freq_dev'][b'bueno']

(7215, 158)

In [58]:
pickle.dump(bow_es_top_words_vocab, open(os.path.join(m_cfg['data_path'], "bow_es_100word_vocab.dict"), "wb"))

In [79]:
pickle.dump(bow_es_top_words_vocab, open(os.path.join(m_cfg['data_path'], "bow_es_1word_vocab.dict"), "wb"))

In [None]:
!su s1444673

Password: 

In [117]:
pickle.dump(bow_es_top_words_vocab, open(os.path.join(m_cfg['data_path'], "bow_es_100word_vocab.dict"), "wb"))

PermissionError: [Errno 13] Permission denied: 'fbanks_80dim_nltk/bow_es_100word_vocab.dict'

In [216]:
pickle.dump(bow_es_top_words_vocab, open(os.path.join(m_cfg['data_path'], "bow_es_top_words_vocab.dict"), "wb"))

In [217]:
sample_terms_details['words']['colorado']

{'dev': 11, 'train': 93}

In [96]:
sorted([(w, sample_terms_details['words'][w]['train']) 
       for w in sample_terms_details['words']], reverse=True, key=lambda t: t[1])[:20]

[('porque', 10039),
 ('bueno', 7215),
 ('entonces', 6340),
 ('también', 5385),
 ('claro', 5207),
 ('gente', 4106),
 ('cuando', 4045),
 ('mucho', 4017),
 ('tiene', 3884),
 ('tengo', 3078),
 ('estoy', 2957),
 ('ahora', 2955),
 ('cosas', 2881),
 ('ellos', 2858),
 ('verdad', 2677),
 ('están', 2339),
 ('veces', 2221),
 ('tienen', 2154),
 ('siempre', 1993),
 ('ejemplo', 1972)]

In [97]:
sorted([(w, sample_terms_details['words'][w]['dev']) 
       for w in sample_terms_details['words']], reverse=True, key=lambda t: t[1])[:20]

[('porque', 295),
 ('entonces', 222),
 ('también', 168),
 ('cuando', 159),
 ('bueno', 158),
 ('gente', 126),
 ('mucho', 123),
 ('claro', 116),
 ('tiene', 110),
 ('música', 105),
 ('sabes', 93),
 ('cosas', 81),
 ('tengo', 79),
 ('siempre', 75),
 ('ellos', 67),
 ('veces', 67),
 ('tienen', 66),
 ('ahora', 65),
 ('tiempo', 54),
 ('gusta', 54)]

In [None]:
m_cfg['data_path']

In [None]:
!ls fbanks_80dim_nltk