In [1]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, modified_precision
from nltk.translate.chrf_score import sentence_chrf, corpus_chrf
from nltk.metrics import scores
import scipy.io.wavfile
from IPython.display import Audio
from IPython.display import display
from nltk.stem import *
# from nltk.stem.snowball import SnowballStemmer
from stemming.porter2 import stem
import stemming
from nltk.metrics.scores import recall

from nltk.corpus import stopwords

%matplotlib inline

In [2]:
# 20-colour palette, written as 0-255 RGB triples and rescaled in place below.
tableau20 = [
    (31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
    (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),
    (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),
    (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
    (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229),
]
# matplotlib expects each RGB channel as a float in [0, 1], so divide every channel by 255.
for i, (r, g, b) in enumerate(tableau20):
    tableau20[i] = (r / 255., g / 255., b / 255.)

In [3]:
smooth_fun = nltk.translate.bleu_score.SmoothingFunction()

In [4]:
from nmt_run import *

In [None]:
cfg_path = "sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3_lstm"

In [None]:
%%capture
last_epoch, model, optimizer, m_cfg, t_cfg = check_model(cfg_path)

### Load Fisher dataset

In [None]:
%%capture
# -------------------------------------------------------------------------
# get data dictionaries
# -------------------------------------------------------------------------
map_dict, vocab_dict, bucket_dict = get_data_dicts(m_cfg)
# Read the pickles through context managers so both file handles are closed
# as soon as loading finishes (the bare open() calls were never closed).
# NOTE(review): pickle.load can execute arbitrary code -- only load files you trust.
with open("fbanks_80dim_nltk/info.dict", "rb") as info_f:
    info_dict = pickle.load(info_f)
with open("./fbanks_80dim_nltk/mix_sim.dict", "rb") as sim_f:
    sim_dict = pickle.load(sim_f)

In [None]:
random.seed("meh")
# random.seed("haha")

### word level analysis

In [None]:
min_word_len = 1
top_k = 100

In [None]:
stop_words = set(nltk.corpus.stopwords.words("english"))
len(stop_words)

In [None]:
def get_words(m_dict):
    """Count every word token stored under ``'en_w'`` in ``m_dict``.

    For each key of ``m_dict`` the ``'en_w'`` entry is read.  When that entry
    is exactly a ``list`` it is treated as one flat sequence of encoded
    words; otherwise it is iterated as a collection of references, each of
    which is itself a sequence of encoded words.  Every word is passed
    through ``.decode()`` before being counted.

    Returns:
        A ``Counter`` mapping decoded word -> number of occurrences.
    """
    counts = Counter()
    for utt_id in m_dict:
        en_words = m_dict[utt_id]['en_w']
        # Wrap the flat (plain list) layout so both layouts share one inner loop.
        references = [en_words] if type(en_words) == list else en_words
        for reference in references:
            counts.update(token.decode() for token in reference)
    return counts

In [None]:
# words in train
train_words = get_words(map_dict['fisher_train'])
# most_common() lists (word, count) pairs by descending count; keep the first
# top_k that are long enough and are not stop words.
train_words_top_k = [(w, f) for w, f in train_words.most_common()
                     if len(w) >= min_word_len and w not in stop_words][:top_k]

# Set of all word types seen in train (used later for the OOV lookup).
train_only_words = set(train_words)

for label, value in (("# train word types", len(train_words)),
                     ("# train word tokens", sum(train_words.values()))):
    print("{0:20s} | {1:10d}".format(label, value))

In [None]:
train_words_top_k[:5]

In [None]:
[(w,f) for w,f in train_words_top_k if "'" in w]

In [None]:
# words in dev
dev_words = get_words(map_dict['fisher_dev'])
# most_common() lists (word, count) pairs by descending count; keep the first
# top_k that are long enough and are not stop words.
dev_words_top_k = [(w, f) for w, f in dev_words.most_common()
                   if len(w) >= min_word_len and w not in stop_words][:top_k]

# Set of all word types seen in dev.
dev_only_words = set(dev_words)

In [None]:
dev_words_top_k[:5]

In [None]:
oov_words = {w:f for w,f in dev_words.items() if w not in train_only_words}

In [None]:
print("{0:20s} | {1:10d}".format("# dev word types", len(dev_only_words)))
print("{0:20s} | {1:10d}".format("# dev word tokens", sum(dev_words.values())))

print("{0:20s} | {1:10d}".format("# oov word types", len(oov_words)))
print("{0:20s} | {1:10d}".format("# oov word tokens", sum(oov_words.values())))


In [None]:
"{0:.1f}%".format(sum(oov_words.values()) / sum(dev_words.values()) * 100)

### Word level - get train, dev frequency, and utts in which they occur

In [None]:
word_utt_count = {"train": {}, "dev": {}, "train_utts": {}, "dev_utts": {}}

In [None]:
len(train_only_words), len(set([stem(w) for w in train_only_words]))

In [None]:
# For every training word, count the utterances that contain it and remember
# which utterances those are.
# Start from empty tables: without this reset, running the cell a second time
# would add to the previous totals and silently double every count.
word_utt_count["train"] = {}
word_utt_count["train_utts"] = {}
for u in tqdm(map_dict["fisher_train"].keys()):
    # set(): a word repeated inside one utterance counts once for that utterance
    for w in set(map_dict["fisher_train"][u]["en_w"]):
        curr_word = w.decode()
        word_utt_count["train"][curr_word] = word_utt_count["train"].get(curr_word, 0) + 1
        word_utt_count["train_utts"].setdefault(curr_word, set()).add(u)
    # end for words in current utt
# end for all utts

In [None]:
# For every dev word, count the utterances that contain it and remember which
# utterances those are.
# Start from empty tables so that re-running the cell does not double the counts.
word_utt_count["dev"] = {}
word_utt_count["dev_utts"] = {}
for u in tqdm(map_dict["fisher_dev"].keys()):
    # A dev utterance carries several references.  Pool their words first so a
    # word is counted once per utterance (as in the train loop) rather than
    # once per reference, which inflated the dev counts.
    utt_words = set()
    for ref in map_dict["fisher_dev"][u]["en_w"]:
        utt_words.update(ref)
    # end for all references
    for w in utt_words:
        curr_word = w.decode()
        word_utt_count["dev"][curr_word] = word_utt_count["dev"].get(curr_word, 0) + 1
        word_utt_count["dev_utts"].setdefault(curr_word, set()).add(u)
    # end for words in current utt
# end for all utts

In [None]:
# Union of the per-word utterance sets: every train utterance that was indexed
# under at least one word.
all_train_utts = set()
for w, utt_ids in word_utt_count["train_utts"].items():
    all_train_utts |= utt_ids

# Same union for dev.
all_dev_utts = set()
for w, utt_ids in word_utt_count["dev_utts"].items():
    all_dev_utts |= utt_ids

In [None]:
len(all_train_utts), len(all_dev_utts)

In [None]:
print("word types")
len(word_utt_count['train']), len(word_utt_count['dev'])

In [None]:
print("common word types")
common_words = set(word_utt_count['train'].keys()) & set(word_utt_count['dev'].keys())
len(common_words)

In [None]:
def get_details_for_words(words, min_dev_freq, max_dev_freq, min_train_freq, min_len):
    """Select the in-vocabulary words that satisfy the frequency/length limits.

    Only words present in the module-level ``common_words`` are considered;
    their counts and utterance sets are read from the module-level
    ``word_utt_count``.  The number of in-vocabulary words is printed.

    Args:
        words: iterable of candidate words.
        min_dev_freq: smallest accepted ``word_utt_count["dev"]`` value.
        max_dev_freq: largest accepted ``word_utt_count["dev"]`` value.
        min_train_freq: smallest accepted ``word_utt_count["train"]`` value.
        min_len: smallest accepted word length in characters.

    Returns:
        A dict with three entries:
        ``"words"``      -- word -> ``{"train": count, "dev": count}`` for each
                            accepted word, inserted in sorted word order;
        ``"train_utts"`` -- union of the train utterance sets of accepted words;
        ``"dev_utts"``   -- union of the dev utterance sets of accepted words.
    """
    details = {"words": {}, "train_utts": set(), "dev_utts": set()}

    in_vocab_words = set(words) & set(common_words)
    print("number of in-vocab words = {0:d}".format(len(in_vocab_words)))

    # Walk the words in sorted order.  Iterating the raw set yields an order
    # that depends on string-hash randomisation, which made the key order of
    # details["words"] (and any seeded sampling from it) differ between
    # kernel restarts.
    for w in sorted(in_vocab_words):
        t_count, d_count = word_utt_count["train"][w], word_utt_count["dev"][w]
        if (min_dev_freq <= d_count <= max_dev_freq and
                len(w) >= min_len and
                t_count >= min_train_freq):
            details["words"][w] = {"train": t_count, "dev": d_count}
            details["train_utts"].update(word_utt_count["train_utts"][w])
            details["dev_utts"].update(word_utt_count["dev_utts"][w])
        # end meets criteria
    # end for in-vocab word
    return details
# end function

In [None]:
def get_duration(utts, key):
    """Sum the duration of ``utts`` and print a short report.

    For each utterance id found in the module-level ``info_dict[key]`` the
    value stored under ``'sp'`` is multiplied by 10 and accumulated; ids that
    are absent are only counted.  The total is then divided by 60, 60 and
    1000, and the report labels the result as hours.
    NOTE(review): this is consistent with 'sp' being a count of 10 ms frames --
    confirm against the code that builds info_dict.

    Args:
        utts: sized iterable of utterance ids.
        key: name of the data split to look up in ``info_dict``.

    Returns:
        The accumulated duration after the division above.
    """
    total = 0
    n_missing = 0
    for utt_id in utts:
        if utt_id in info_dict[key]:
            total += (info_dict[key][utt_id]['sp'] * 10)
        else:
            n_missing += 1
    dur = total / 60 / 60 / 1000
    report = [
        "-" * 80,
        "{0:d} total utts".format(len(utts)),
        "{0:d} not found".format(n_missing),
        "selected utts from {0:s} -- duration = {1:.2f} hours".format(key, dur),
    ]
    for report_line in report:
        print(report_line)
    return dur

In [None]:
train_dur, dev_dur = get_duration(all_train_utts, key="fisher_train"), get_duration(all_dev_utts, key="fisher_dev")

### Task 1 - randomly selected frequent words

In [None]:
terms_of_interest = get_details_for_words(common_words, 
                                          min_dev_freq=10, 
                                          max_dev_freq=100, 
                                          min_train_freq=100, 
                                          min_len=5)

In [None]:
print("total words meeting criteria = {0:d}".format(len(terms_of_interest["words"])))

In [None]:
_, _ = get_duration(terms_of_interest["train_utts"], key="fisher_train"), get_duration(terms_of_interest["dev_utts"], key="fisher_dev")

In [None]:
random.seed("hmm")
# Draw from a sorted population.  The keys of terms_of_interest["words"] may
# come out in an order that depends on string-hash randomisation, in which
# case the seed alone would not give the same sample after a kernel restart.
sample_terms = random.sample(sorted(terms_of_interest["words"].keys()), 100)

In [None]:
sample_terms_details = get_details_for_words(sample_terms, 
                                              min_dev_freq=10, 
                                              max_dev_freq=100, 
                                              min_train_freq=100, 
                                              min_len=5)

In [None]:
print("total words meeting criteria = {0:d}".format(len(sample_terms_details["words"])))

In [None]:
_, _ = get_duration(sample_terms_details["train_utts"], key="fisher_train"), get_duration(sample_terms_details["dev_utts"], key="fisher_dev")

In [None]:
sample_terms[:10]

### Task 2 - topics as keywords

In [None]:
train_text_fname= "../installs/fisher-callhome-corpus/corpus/ldc/fisher_train.en"
topics_fname = "../criseslex/fsp06_topics_in_english.txt"

In [None]:
# Hand-picked topic keywords; case and duplicates are normalised just below.
topics = [
    "peace", "Music", "Marriage", "Religion", "Cell phones",
    "Dating", "Telemarketing and SPAM", "Politics", "Travel",
    "Technical devices", "Healthcare", "Advertisements", "Power",
    "Occupations", "Movies", "Welfare", "Breaking up", "Location",
    "Justice", "Memories", "Crime", "Violence against women", "Equality",
    "Housing", "Immigration",
    # new topics
    "Interracial", "Christians", "muslims", "jews", "e-mail",
    "phone", "democracy", "Democratic", "Republican", "technology",
    "leadership", "community", "jury", "police", "inequality",
    "renting", "Violence", "immigrants", "immigrant", "skilled",
    "Telemarketing", "SPAM", "skill", "job", "health", "mobile",
    "ads", "physical", "emotional", "bubble", "rent", "economy",
    "abuse", "women", "city", "country", "suburban", "dollar",
    "united states", "laws", "phone", "race", "biracial", "interracial",
    "marriage", "lyrics", "sexuality", "medicine", "television", "european",
    "home", "protect", "spouse", "language", "cellphone", "money",
    "doctor", "insurance", "cigarettes", "alcohol", "income", "salary",
    "class", "censor", "rating", "programs", "government",
    "relationship", "legal", "event", "life", "safe", "victim", "cops",
    "wage", "illegal",
]
# Lower-case everything and drop the duplicates that this produces.
topics = list({t.lower() for t in topics})
topics_stem = [stem(t) for t in topics]

# add similar topic words
new_topics = []
for t in topics:
    topic_key = t.encode()  # sim_dict['w'] is looked up with encoded words
    if topic_key in sim_dict['w']:
        new_topics.extend([w.decode() for w in sim_dict['w'][topic_key]])
topics += new_topics

In [None]:
len(topics)

In [None]:
topics_details = get_details_for_words(topics, 
                                       min_dev_freq=10, 
                                       max_dev_freq=100, 
                                       min_train_freq=100, 
                                       min_len=1)
print("total words meeting criteria = {0:d}".format(len(topics_details["words"])))

In [None]:
_, _ = get_duration(topics_details["train_utts"], key="fisher_train"), get_duration(topics_details["dev_utts"], key="fisher_dev")

In [None]:
print("\n".join(list(topics_details["words"].keys())))

### Task 3 - crisis terms as keywords

In [None]:
crises_lex_fname = "../criseslex/CrisisLexLexicon/CrisisLexRec.txt"

In [None]:
# Collect every whitespace-separated token of the lexicon file, without duplicates.
crises_vocab = set()
with open(crises_lex_fname, "r") as in_f:
    for line in in_f:
        # split() with no argument already ignores leading/trailing whitespace
        crises_vocab.update(line.split())
crises = list(crises_vocab)
crises_stem = [stem(w) for w in crises]

# Expansion with similar words is left disabled for the crisis lexicon:
# new_crises = []
# for t in crises:
#     if t.encode() in sim_dict['w']:
#         new_crises.extend([w.decode() for w in sim_dict['w'][t.encode()]])
# crises.extend(new_crises)

In [None]:
len(crises)

In [None]:
crises_details = get_details_for_words(crises, 
                                       min_dev_freq=10, 
                                       max_dev_freq=100, 
                                       min_train_freq=100, 
                                       min_len=1)
print("total words meeting criteria = {0:d}".format(len(crises_details["words"])))

In [None]:
_, _ = get_duration(crises_details["train_utts"], key="fisher_train"), get_duration(crises_details["dev_utts"], key="fisher_dev")

In [None]:
print("\n".join(list(crises_details["words"].keys())))