In [73]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, modified_precision
from nltk.translate.chrf_score import sentence_chrf, corpus_chrf
from nltk.metrics import scores
import scipy.io.wavfile
from IPython.display import Audio
from IPython.display import display
from nltk.stem import *
# from nltk.stem.snowball import SnowballStemmer
from stemming.porter2 import stem
import stemming
from nltk.metrics.scores import recall

from basics import *

from nltk.corpus import stopwords

%matplotlib inline

In [2]:
tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),    
             (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),    
             (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),    
             (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),    
             (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]    
# Scale the RGB values to the [0, 1] range, which is the format matplotlib accepts.    
for i in range(len(tableau20)):    
    r, g, b = tableau20[i]    
    tableau20[i] = (r / 255., g / 255., b / 255.)

In [3]:
smooth_fun = nltk.translate.bleu_score.SmoothingFunction()

In [4]:
from nmt_run import *

In [5]:
cfg_path = "sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3_lstm"

In [6]:
%%capture
last_epoch, model, optimizer, m_cfg, t_cfg = check_model(cfg_path)

### Load Fisher dataset

In [7]:
%%capture
# -------------------------------------------------------------------------
# get data dictionaries
# -------------------------------------------------------------------------
map_dict, vocab_dict, bucket_dict = get_data_dicts(m_cfg)
info_dict = pickle.load(open("fbanks_80dim_nltk/info.dict", "rb"))
sim_dict = pickle.load(open("./fbanks_80dim_nltk/mix_sim.dict", "rb"))

In [8]:
random.seed("meh")
# random.seed("haha")

### word level analysis

In [9]:
min_word_len = 1
top_k = 100

In [10]:
stop_words = set(nltk.corpus.stopwords.words("english"))
len(stop_words)

127

In [11]:
def get_words(m_dict):
    words = []
    for u in m_dict:
        if type(m_dict[u]['en_w']) == list:
            words.extend([w.decode() for w in m_dict[u]['en_w']])
        else:
            for ref in m_dict[u]['en_w']:
                words.extend([w.decode() for w in ref])
    return Counter(words)

In [12]:
# words in train
train_words = get_words(map_dict['fisher_train'])
train_words_top_k = [(w,f) for w, f in sorted(train_words.items(), reverse=True, key=lambda t:t[1]) 
                     if w not in stop_words and len(w) >= min_word_len][:top_k]

train_only_words = set(train_words.keys())

print("{0:20s} | {1:10d}".format("# train word types", len(train_words)))
print("{0:20s} | {1:10d}".format("# train word tokens", sum(train_words.values())))

# train word types   |      17830
# train word tokens  |    1497352


In [13]:
train_words_top_k[:5]

[('yes', 35054),
 ("'s", 24162),
 ("n't", 19184),
 ('like', 14334),
 ('well', 12354)]

In [14]:
[(w,f) for w,f in train_words_top_k if "'" in w]

[("'s", 24162), ("n't", 19184), ("'m", 5546), ("'re", 2832), ("'ve", 2392)]

In [15]:
dev_words = get_words(map_dict['fisher_dev'])
dev_words_top_k = [(w,f) for w, f in sorted(dev_words.items(), reverse=True, key=lambda t:t[1]) 
                     if w not in stop_words and len(w) >= min_word_len][:top_k]

dev_only_words = set(dev_words.keys())

In [16]:
dev_words_top_k[:5]

[('yes', 3652), ("n't", 1999), ("'s", 1866), ('like', 1826), ('know', 1294)]

In [17]:
oov_words = {w:f for w,f in dev_words.items() if w not in train_only_words}

In [18]:
print("{0:20s} | {1:10d}".format("# dev word types", len(dev_only_words)))
print("{0:20s} | {1:10d}".format("# dev word tokens", sum(dev_words.values())))

print("{0:20s} | {1:10d}".format("# oov word types", len(oov_words)))
print("{0:20s} | {1:10d}".format("# oov word tokens", sum(oov_words.values())))


# dev word types     |       4835
# dev word tokens    |     165206
# oov word types     |       1011
# oov word tokens    |       1599


In [19]:
"{0:.1f}%".format(sum(oov_words.values()) / sum(dev_words.values()) * 100)

'1.0%'

### Word level - get train, dev frequency, and utts in which they occur

In [20]:
word_utt_count = {"train": {}, "dev": {}, "train_utts": {}, "dev_utts": {}}

In [21]:
len(train_only_words), len(set([stem(w) for w in train_only_words]))

(17830, 12011)

In [22]:
for u in tqdm(map_dict["fisher_train"].keys()):
    for w in set(map_dict["fisher_train"][u]["en_w"]):
        curr_word = w.decode()
        if curr_word not in word_utt_count["train"]:
            word_utt_count["train"][curr_word] = 0
            word_utt_count["train_utts"][curr_word] = set()
        word_utt_count["train"][curr_word] += 1
        word_utt_count["train_utts"][curr_word].update({u})
    # end for words in current utt
# end for all utts

100%|██████████| 138819/138819 [00:02<00:00, 52107.80it/s]


In [23]:
for u in tqdm(map_dict["fisher_dev"].keys()):
    for ref in map_dict["fisher_dev"][u]["en_w"]:
        for w in set(ref):
            curr_word = w.decode()
            if curr_word not in word_utt_count["dev"]:
                word_utt_count["dev"][curr_word] = 0
                word_utt_count["dev_utts"][curr_word] = set()
            word_utt_count["dev"][curr_word] += 1            
            word_utt_count["dev_utts"][curr_word].update({u})
        # end for words in current ref
    # end for all references
# end for all utts

100%|██████████| 3979/3979 [00:00<00:00, 15016.78it/s]


In [24]:
all_train_utts = set()
for w in word_utt_count["train_utts"]:
    all_train_utts.update(word_utt_count["train_utts"][w])
# end for

all_dev_utts = set()
for w in word_utt_count["dev_utts"]:
    all_dev_utts.update(word_utt_count["dev_utts"][w])
# end for

In [25]:
len(all_train_utts), len(all_dev_utts)

(138795, 3979)

In [26]:
print("word types")
len(word_utt_count['train']), len(word_utt_count['dev'])

word types


(17830, 4835)

In [27]:
print("common word types")
common_words = set(word_utt_count['train'].keys()) & set(word_utt_count['dev'].keys())
len(common_words)

common word types


3824

In [28]:
def get_details_for_words(words, min_dev_freq, max_dev_freq, min_train_freq, min_len):
    details = {"words": {}, "train_utts": set(), "dev_utts": set()}
    
    in_vocab_words = set(words) & set(common_words)
    print("number of in-vocab words = {0:d}".format(len(in_vocab_words)))

    for w in in_vocab_words:
        t_count, d_count = word_utt_count["train"][w], word_utt_count["dev"][w]
        if ((d_count >= min_dev_freq) and 
            (d_count <= max_dev_freq) and
            (len(w) >= min_len) and
            (t_count >= min_train_freq)):
            details["words"][w] = {"train": t_count, "dev": d_count}
            details["train_utts"].update(word_utt_count["train_utts"][w])
            details["dev_utts"].update(word_utt_count["dev_utts"][w])
        # end meets criteria
    # end for in-vocab word
    return details
# end function

In [29]:
def get_duration(utts, key):
    dur = 0
    utts_not_found = []
    for u in utts:
        if u not in info_dict[key]:
            #print("argh!", u)
            utts_not_found.append(u)
        else:
            dur += (info_dict[key][u]['sp'] * 10)
    dur = dur / 60 / 60 / 1000
    print("-"*80)
    print("{0:d} total utts".format(len(utts)))
    print("{0:d} not found".format(len(utts_not_found)))
    print("selected utts from {0:s} -- duration = {1:.2f} hours".format(key, dur))
    return dur

In [114]:
def create_vocab(words_list):
    out = {"w2i":{}, "i2w":{}, "freq":{}, "freq_dev":{}}
    START_VOCAB = [PAD, GO, EOS, UNK]
    for w in START_VOCAB:
        out['w2i'][w] = len(out["w2i"])
        out["freq"][w] = 1
    #for w in words_list['words']:
    sorted_w = sorted(words_list['words'].items(), reverse=True, key=lambda t: t[1]['train'])
    for w in sorted_w:
        encoded_word = w[0].encode()
        out["w2i"][encoded_word] = len(out["w2i"])
        out["freq"][encoded_word] = w[1]["train"]
        out["freq_dev"][encoded_word] = w[1]["dev"]

    out["i2w"] = {val:key for key, val in out["w2i"].items()}
    return out

In [115]:
train_dur, dev_dur = get_duration(all_train_utts, key="fisher_train"), get_duration(all_dev_utts, key="fisher_dev")

--------------------------------------------------------------------------------
138795 total utts
89 not found
selected utts from fisher_train -- duration = 161.62 hours
--------------------------------------------------------------------------------
3979 total utts
2 not found
selected utts from fisher_dev -- duration = 4.35 hours


### Task 1 - randomly selected frequent words

In [116]:
terms_of_interest = get_details_for_words(common_words, 
                                          min_dev_freq=10, 
                                          max_dev_freq=100, 
                                          min_train_freq=100, 
                                          min_len=5)

number of in-vocab words = 3824


In [117]:
print("total words meeting criteria = {0:d}".format(len(terms_of_interest["words"])))

total words meeting criteria = 372


In [118]:
_, _ = get_duration(terms_of_interest["train_utts"], key="fisher_train"), get_duration(terms_of_interest["dev_utts"], key="fisher_dev")

--------------------------------------------------------------------------------
62610 total utts
35 not found
selected utts from fisher_train -- duration = 109.48 hours
--------------------------------------------------------------------------------
2084 total utts
1 not found
selected utts from fisher_dev -- duration = 3.30 hours


In [119]:
random.seed("hmm")
sample_terms = random.sample(list(terms_of_interest["words"].keys()), 100)

In [120]:
sample_terms_details = get_details_for_words(sample_terms, 
                                              min_dev_freq=10, 
                                              max_dev_freq=100, 
                                              min_train_freq=100, 
                                              min_len=5)

number of in-vocab words = 100


In [121]:
print("total words meeting criteria = {0:d}".format(len(sample_terms_details["words"])))

total words meeting criteria = 100


In [122]:
_, _ = get_duration(sample_terms_details["train_utts"], key="fisher_train"), get_duration(sample_terms_details["dev_utts"], key="fisher_dev")

--------------------------------------------------------------------------------
23284 total utts
15 not found
selected utts from fisher_train -- duration = 44.29 hours
--------------------------------------------------------------------------------
999 total utts
1 not found
selected utts from fisher_dev -- duration = 1.77 hours


In [123]:
sample_terms[:10]

['around',
 'outside',
 'salsa',
 'america',
 'classes',
 'terrible',
 'atlanta',
 'least',
 'email',
 'father']

In [124]:
bow_top_100_words_vocab = create_vocab(sample_terms_details)

In [125]:
pickle.dump(bow_top_100_words_vocab, open(os.path.join(m_cfg['data_path'], "bow_top_100_words_vocab.dict"), "wb"))

### Task 2 - topics as keywords

In [126]:
train_text_fname= "../installs/fisher-callhome-corpus/corpus/ldc/fisher_train.en"
topics_fname = "../criseslex/fsp06_topics_in_english.txt"

In [127]:
topics = [ "peace", "Music", "Marriage", "Religion", "Cell phones", 
           "Dating", "Telemarketing and SPAM", "Politics", "Travel", 
           "Technical devices", "Healthcare", "Advertisements", "Power", 
           "Occupations", "Movies", "Welfare", "Breaking up", "Location", 
           "Justice", "Memories", "Crime", "Violence against women", "Equality", 
            "Housing", "Immigration",     
            # new topics
           "Interracial", "Christians", "muslims", "jews", "e-mail", 
           "phone", "democracy", "Democratic", "Republican", "technology", 
           "leadership", "community", "jury", "police", "inequality", 
           "renting", "Violence", "immigrants", "immigrant", "skilled", 
           "Telemarketing", "SPAM", "skill", "job", "health", "mobile", 
            "ads", "physical", "emotional", "bubble", "rent", "economy", 
            "abuse", "women", "city", "country", "suburban", "dollar", 
            "united states", "laws", "phone", "race", "biracial", "interracial", 
            "marriage", "lyrics", "sexuality", "medicine", "television", "european",
            "home", "protect", "spouse", "language", "cellphone", "money",
            "doctor", "insurance", "cigarettes", "alcohol", "income", "salary",
            "class", "censor", "rating", "programs", "government",
            "relationship", "legal", "event", "life", "safe", "victim", "cops",
            "wage", "illegal"
            ]
topics = list(set(t.lower() for t in topics))
topics_stem = [stem(t) for t in topics]

# add similar topic words
new_topics = []
for t in topics:
    if t.encode() in sim_dict['w']:
        new_topics.extend([w.decode() for w in sim_dict['w'][t.encode()]])
topics.extend(new_topics)

In [128]:
len(topics)

555

In [129]:
topics_details = get_details_for_words(topics, 
                                       min_dev_freq=10, 
                                       max_dev_freq=100, 
                                       min_train_freq=100, 
                                       min_len=1)
print("total words meeting criteria = {0:d}".format(len(topics_details["words"])))

number of in-vocab words = 170
total words meeting criteria = 30


In [130]:
_, _ = get_duration(topics_details["train_utts"], key="fisher_train"), get_duration(topics_details["dev_utts"], key="fisher_dev")

--------------------------------------------------------------------------------
9595 total utts
4 not found
selected utts from fisher_train -- duration = 20.29 hours
--------------------------------------------------------------------------------
312 total utts
0 not found
selected utts from fisher_dev -- duration = 0.59 hours


In [131]:
print("\n".join(list(topics_details["words"].keys())))

tv
dollars
country
language
job
police
religious
religions
relationship
phone
race
movies
politics
immigration
crime
program
rent
government
class
life
classes
television
marriage
travel
programs
christian
women
jury
home
europe


In [132]:
bow_topics_vocab = create_vocab(topics_details)

In [133]:
pickle.dump(bow_topics_vocab, open(os.path.join(m_cfg['data_path'], "bow_topics_vocab.dict"), "wb"))

### Task 3 - crises terms as keywords

In [134]:
crises_lex_fname = "../criseslex/CrisisLexLexicon/CrisisLexRec.txt"

In [135]:
crises = set()
with open(crises_lex_fname, "r") as in_f:
    for line in in_f:
        crises.update(line.strip().split())
crises = list(crises)
crises_stem = [stem(w) for w in crises]

# new_crises = []
# for t in crises:
#     if t.encode() in sim_dict['w']:
#         new_crises.extend([w.decode() for w in sim_dict['w'][t.encode()]])
# crises.extend(new_crises)

In [136]:
len(crises)

288

In [137]:
crises_details = get_details_for_words(crises, 
                                       min_dev_freq=10, 
                                       max_dev_freq=100, 
                                       min_train_freq=100, 
                                       min_len=1)
print("total words meeting criteria = {0:d}".format(len(crises_details["words"])))

number of in-vocab words = 123
total words meeting criteria = 42


In [138]:
_, _ = get_duration(crises_details["train_utts"], key="fisher_train"), get_duration(crises_details["dev_utts"], key="fisher_dev")

--------------------------------------------------------------------------------
14444 total utts
5 not found
selected utts from fisher_train -- duration = 29.37 hours
--------------------------------------------------------------------------------
602 total utts
0 not found
selected utts from fisher_dev -- duration = 1.17 hours


In [139]:
print("\n".join(list(crises_details["words"].keys())))

lives
free
lost
police
stay
morning
high
town
change
make
black
situation
return
give
waiting
areas
public
huge
service
found
number
kill
love
news
government
gets
coming
life
terrible
send
remember
saying
died
women
girl
first
case
leave
home
water
watch
need


In [140]:
bow_crises_vocab = create_vocab(crises_details)

In [141]:
pickle.dump(bow_crises_vocab, open(os.path.join(m_cfg['data_path'], "bow_crises_vocab.dict"), "wb"))

In [142]:
m_cfg['data_path']

'fbanks_80dim_nltk'

In [112]:
!ls fbanks_80dim_nltk

bow_crises_vocab.dict	      info.dict
bow_top_100_words_vocab.dict  kaldi_segment_map.dict
bow_topics_vocab.dict	      map.dict
buckets_sp.dict		      mix_sim.dict
callhome_devtest	      pre_trained_sim.dict
callhome_evltest	      rev_map.dict
callhome_train		      sim.dict
ch_train_vocab.dict	      train_reduced_vocab_enw.dict
fisher_dev		      train_top_K_enw_1000.dict
fisher_dev2		      train_top_K_enw.dict
fisher_test		      train_vocab.dict
fisher_train
