In [1]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, modified_precision
from nltk.translate.chrf_score import sentence_chrf, corpus_chrf
from nltk.metrics import scores
import scipy.io.wavfile
from IPython.display import Audio
from IPython.display import display
from nltk.stem import *
# from nltk.stem.snowball import SnowballStemmer
from stemming.porter2 import stem
import stemming
from nltk.metrics.scores import recall

from nltk.corpus import stopwords

%matplotlib inline

In [2]:
tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),    
             (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),    
             (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),    
             (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),    
             (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]    
# Scale the RGB values to the [0, 1] range, which is the format matplotlib accepts.    
for i in range(len(tableau20)):    
    r, g, b = tableau20[i]    
    tableau20[i] = (r / 255., g / 255., b / 255.)

In [3]:
smooth_fun = nltk.translate.bleu_score.SmoothingFunction()

In [4]:
from nmt_run import *

In [5]:
cfg_path = "sp2enw_mel-80_vocab-nltk/sp_1.0_h-256_e-128_drpt-rnn-.3_lstm"

In [6]:
%%capture
last_epoch, model, optimizer, m_cfg, t_cfg = check_model(cfg_path)

### Load Fisher dataset

In [56]:
%%capture
# -------------------------------------------------------------------------
# get data dictionaries
# -------------------------------------------------------------------------
map_dict, vocab_dict, bucket_dict = get_data_dicts(m_cfg)
info_dict = pickle.load(open("fbanks_80dim_nltk/info.dict", "rb"))
sim_dict = pickle.load(open("./fbanks_80dim_nltk/mix_sim.dict", "rb"))

In [11]:
random.seed("meh")
# random.seed("haha")

### word level analysis

In [12]:
min_word_len = 1
top_k = 100

In [13]:
stop_words = set(nltk.corpus.stopwords.words("english"))
len(stop_words)

127

In [14]:
def get_words(m_dict):
    words = []
    for u in m_dict:
        if type(m_dict[u]['en_w']) == list:
            words.extend([w.decode() for w in m_dict[u]['en_w']])
        else:
            for ref in m_dict[u]['en_w']:
                words.extend([w.decode() for w in ref])
    return Counter(words)

In [15]:
# words in train
train_words = get_words(map_dict['fisher_train'])
train_words_top_k = [(w,f) for w, f in sorted(train_words.items(), reverse=True, key=lambda t:t[1]) 
                     if w not in stop_words and len(w) >= min_word_len][:top_k]

train_only_words = set(train_words.keys())

print("{0:20s} | {1:10d}".format("# train word types", len(train_words)))
print("{0:20s} | {1:10d}".format("# train word tokens", sum(train_words.values())))

# train word types   |      17830
# train word tokens  |    1497352


In [16]:
train_words_top_k[:5]

[('yes', 35054),
 ("'s", 24162),
 ("n't", 19184),
 ('like', 14334),
 ('well', 12354)]

In [17]:
[(w,f) for w,f in train_words_top_k if "'" in w]

[("'s", 24162), ("n't", 19184), ("'m", 5546), ("'re", 2832), ("'ve", 2392)]

In [18]:
dev_words = get_words(map_dict['fisher_dev'])
dev_words_top_k = [(w,f) for w, f in sorted(dev_words.items(), reverse=True, key=lambda t:t[1]) 
                     if w not in stop_words and len(w) >= min_word_len][:top_k]

dev_only_words = set(dev_words.keys())

In [19]:
dev_words_top_k[:5]

[('yes', 3652), ("n't", 1999), ("'s", 1866), ('like', 1826), ('know', 1294)]

In [20]:
oov_words = {w:f for w,f in dev_words.items() if w not in train_only_words}

In [21]:
print("{0:20s} | {1:10d}".format("# dev word types", len(dev_only_words)))
print("{0:20s} | {1:10d}".format("# dev word tokens", sum(dev_words.values())))

print("{0:20s} | {1:10d}".format("# oov word types", len(oov_words)))
print("{0:20s} | {1:10d}".format("# oov word tokens", sum(oov_words.values())))


# dev word types     |       4835
# dev word tokens    |     165206
# oov word types     |       1011
# oov word tokens    |       1599


In [22]:
"{0:.1f}%".format(sum(oov_words.values()) / sum(dev_words.values()) * 100)

'1.0%'

### Word level - get train, dev frequency, and utts in which they occur

In [23]:
word_utt_count = {"train": {}, "dev": {}, "train_utts": {}, "dev_utts": {}}

In [24]:
len(train_only_words), len(set([stem(w) for w in train_only_words]))

(17830, 12011)

In [25]:
for u in tqdm(map_dict["fisher_train"].keys()):
    for w in set(map_dict["fisher_train"][u]["en_w"]):
        curr_word = w.decode()
        if curr_word not in word_utt_count["train"]:
            word_utt_count["train"][curr_word] = 0
            word_utt_count["train_utts"][curr_word] = set()
        word_utt_count["train"][curr_word] += 1
        word_utt_count["train_utts"][curr_word].update({u})
    # end for words in current utt
# end for all utts

100%|██████████| 138819/138819 [00:02<00:00, 52023.06it/s]


In [26]:
for u in tqdm(map_dict["fisher_dev"].keys()):
    for ref in map_dict["fisher_dev"][u]["en_w"]:
        for w in set(ref):
            curr_word = w.decode()
            if curr_word not in word_utt_count["dev"]:
                word_utt_count["dev"][curr_word] = 0
                word_utt_count["dev_utts"][curr_word] = set()
            word_utt_count["dev"][curr_word] += 1            
            word_utt_count["dev_utts"][curr_word].update({u})
        # end for words in current ref
    # end for all references
# end for all utts

100%|██████████| 3979/3979 [00:00<00:00, 14711.07it/s]


In [27]:
all_train_utts = set()
for w in word_utt_count["train_utts"]:
    all_train_utts.update(word_utt_count["train_utts"][w])
# end for

all_dev_utts = set()
for w in word_utt_count["dev_utts"]:
    all_dev_utts.update(word_utt_count["dev_utts"][w])
# end for

In [28]:
len(all_train_utts), len(all_dev_utts)

(138795, 3979)

In [29]:
print("word types")
len(word_utt_count['train']), len(word_utt_count['dev'])

word types


(17830, 4835)

In [30]:
print("common word types")
common_words = set(word_utt_count['train'].keys()) & set(word_utt_count['dev'].keys())
len(common_words)

common word types


3824

In [31]:
def get_details_for_words(words, min_dev_freq, max_dev_freq, min_train_freq, min_len):
    details = {"words": {}, "train_utts": set(), "dev_utts": set()}
    
    in_vocab_words = set(words) & set(common_words)
    print("number of in-vocab words = {0:d}".format(len(in_vocab_words)))

    for w in in_vocab_words:
        t_count, d_count = word_utt_count["train"][w], word_utt_count["dev"][w]
        if ((d_count >= min_dev_freq) and 
            (d_count <= max_dev_freq) and
            (len(w) >= min_len) and
            (t_count >= min_train_freq)):
            details["words"][w] = {"train": t_count, "dev": d_count}
            details["train_utts"].update(word_utt_count["train_utts"][w])
            details["dev_utts"].update(word_utt_count["dev_utts"][w])
        # end meets criteria
    # end for in-vocab word
    return details
# end function

In [32]:
def get_duration(utts, key):
    dur = 0
    utts_not_found = []
    for u in utts:
        if u not in info_dict[key]:
            #print("argh!", u)
            utts_not_found.append(u)
        else:
            dur += (info_dict[key][u]['sp'] * 10)
    dur = dur / 60 / 60 / 1000
    print("-"*80)
    print("{0:d} total utts".format(len(utts)))
    print("{0:d} not found".format(len(utts_not_found)))
    print("selected utts from {0:s} -- duration = {1:.2f} hours".format(key, dur))
    return dur

In [33]:
train_dur, dev_dur = get_duration(all_train_utts, key="fisher_train"), get_duration(all_dev_utts, key="fisher_dev")

--------------------------------------------------------------------------------
138795 total utts
89 not found
selected utts from fisher_train -- duration = 161.62 hours
--------------------------------------------------------------------------------
3979 total utts
2 not found
selected utts from fisher_dev -- duration = 4.35 hours


### Task 1 - randomly selected frequent words

In [34]:
terms_of_interest = get_details_for_words(common_words, 
                                          min_dev_freq=10, 
                                          max_dev_freq=100, 
                                          min_train_freq=100, 
                                          min_len=5)

number of in-vocab words = 3824


In [35]:
print("total words meeting criteria = {0:d}".format(len(terms_of_interest["words"])))

total words meeting criteria = 372


In [36]:
_, _ = get_duration(terms_of_interest["train_utts"], key="fisher_train"), get_duration(terms_of_interest["dev_utts"], key="fisher_dev")

--------------------------------------------------------------------------------
62610 total utts
35 not found
selected utts from fisher_train -- duration = 109.48 hours
--------------------------------------------------------------------------------
2084 total utts
1 not found
selected utts from fisher_dev -- duration = 3.30 hours


In [37]:
random.seed("hmm")
sample_terms = random.sample(list(terms_of_interest["words"].keys()), 100)

In [38]:
sample_terms_details = get_details_for_words(sample_terms, 
                                              min_dev_freq=10, 
                                              max_dev_freq=100, 
                                              min_train_freq=100, 
                                              min_len=5)

number of in-vocab words = 100


In [39]:
print("total words meeting criteria = {0:d}".format(len(sample_terms_details["words"])))

total words meeting criteria = 100


In [40]:
_, _ = get_duration(sample_terms_details["train_utts"], key="fisher_train"), get_duration(sample_terms_details["dev_utts"], key="fisher_dev")

--------------------------------------------------------------------------------
25108 total utts
10 not found
selected utts from fisher_train -- duration = 49.19 hours
--------------------------------------------------------------------------------
985 total utts
0 not found
selected utts from fisher_dev -- duration = 1.83 hours


In [83]:
sample_terms[:10]

['white',
 'married',
 'moment',
 'phone',
 'attention',
 'terrible',
 'companies',
 'prefer',
 'changed',
 'until']

### Task 2 - topics as keywords

In [42]:
train_text_fname= "../installs/fisher-callhome-corpus/corpus/ldc/fisher_train.en"
topics_fname = "../criseslex/fsp06_topics_in_english.txt"

In [65]:
topics = [ "peace", "Music", "Marriage", "Religion", "Cell phones", 
           "Dating", "Telemarketing and SPAM", "Politics", "Travel", 
           "Technical devices", "Healthcare", "Advertisements", "Power", 
           "Occupations", "Movies", "Welfare", "Breaking up", "Location", 
           "Justice", "Memories", "Crime", "Violence against women", "Equality", 
            "Housing", "Immigration",     
            # new topics
           "Interracial", "Christians", "muslims", "jews", "e-mail", 
           "phone", "democracy", "Democratic", "Republican", "technology", 
           "leadership", "community", "jury", "police", "inequality", 
           "renting", "Violence", "immigrants", "immigrant", "skilled", 
           "Telemarketing", "SPAM", "skill", "job", "health", "mobile", 
            "ads", "physical", "emotional", "bubble", "rent", "economy", 
            "abuse", "women", "city", "country", "suburban", "dollar", 
            "united states", "laws", "phone", "race", "biracial", "interracial", 
            "marriage", "lyrics", "sexuality", "medicine", "television", "european",
            "home", "protect", "spouse", "language", "cellphone", "money",
            "doctor", "insurance", "cigarettes", "alcohol", "income", "salary",
            "class", "censor", "rating", "programs", "government",
            "relationship", "legal", "event", "life", "safe", "victim", "cops",
            "wage", "illegal"
            ]
topics = list(set(t.lower() for t in topics))
topics_stem = [stem(t) for t in topics]

# add similar topic words
new_topics = []
for t in topics:
    if t.encode() in sim_dict['w']:
        new_topics.extend([w.decode() for w in sim_dict['w'][t.encode()]])
topics.extend(new_topics)

In [66]:
len(topics)

555

In [67]:
topics_details = get_details_for_words(topics, 
                                       min_dev_freq=10, 
                                       max_dev_freq=100, 
                                       min_train_freq=100, 
                                       min_len=1)
print("total words meeting criteria = {0:d}".format(len(topics_details["words"])))

number of in-vocab words = 170
total words meeting criteria = 30


In [68]:
_, _ = get_duration(topics_details["train_utts"], key="fisher_train"), get_duration(topics_details["dev_utts"], key="fisher_dev")

--------------------------------------------------------------------------------
9595 total utts
4 not found
selected utts from fisher_train -- duration = 20.29 hours
--------------------------------------------------------------------------------
312 total utts
0 not found
selected utts from fisher_dev -- duration = 0.59 hours


In [85]:
print("\n".join(list(topics_details["words"].keys())))

jury
country
relationship
tv
movies
home
government
rent
politics
program
classes
women
phone
marriage
religions
race
class
immigration
christian
job
language
dollars
travel
life
crime
television
europe
police
programs
religious


### Task 3 - crises terms as keywords

In [48]:
crises_lex_fname = "../criseslex/CrisisLexLexicon/CrisisLexRec.txt"

In [77]:
crises = set()
with open(crises_lex_fname, "r") as in_f:
    for line in in_f:
        crises.update(line.strip().split())
crises = list(crises)
crises_stem = [stem(w) for w in crises]

# new_crises = []
# for t in crises:
#     if t.encode() in sim_dict['w']:
#         new_crises.extend([w.decode() for w in sim_dict['w'][t.encode()]])
# crises.extend(new_crises)

In [78]:
len(crises)

288

In [79]:
crises_details = get_details_for_words(crises, 
                                       min_dev_freq=10, 
                                       max_dev_freq=100, 
                                       min_train_freq=100, 
                                       min_len=1)
print("total words meeting criteria = {0:d}".format(len(crises_details["words"])))

number of in-vocab words = 123
total words meeting criteria = 42


In [80]:
_, _ = get_duration(crises_details["train_utts"], key="fisher_train"), get_duration(crises_details["dev_utts"], key="fisher_dev")

--------------------------------------------------------------------------------
14444 total utts
5 not found
selected utts from fisher_train -- duration = 29.37 hours
--------------------------------------------------------------------------------
602 total utts
0 not found
selected utts from fisher_dev -- duration = 1.17 hours


In [88]:
print("\n".join(list(crises_details["words"].keys())))

change
saying
died
give
make
case
remember
home
coming
water
return
lost
government
women
send
need
free
leave
service
gets
girl
huge
public
love
waiting
found
watch
news
situation
stay
terrible
lives
high
morning
life
kill
black
areas
number
police
first
town
