In [None]:
from __future__ import print_function
from __future__ import division
import os
import pickle
import json
import subprocess
from IPython.display import display
from IPython.display import Audio
import bisect
from collections import namedtuple
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import textwrap
import scipy.io.wavfile
from python_speech_features import mfcc
import tqdm

%run ../antonissameer/dba.py
%run ../antonissameer/utils.py

from matplotlib.ticker import MultipleLocator, \
     FormatStrFormatter, AutoMinorLocator
%matplotlib inline

In [None]:
# Read the notebook's settings (data file locations, etc.) from config.json.
with open("config.json") as config_file:
    config = json.load(config_file)

In [None]:
# Record type for one aligned word: its label plus start/end positions.
Align = namedtuple('Align', 'word start end')

In [None]:
# Load the segment map and the alignment dictionary from the paths given in
# config['es'].  Context managers make sure both file handles are closed.
# NOTE: pickle can execute arbitrary code while loading; only open files
# that come from a trusted source.
with open(config['es']['segment_dict_fname'], "rb") as segment_file:
    segment_map = pickle.load(segment_file)
with open(config['es']['align_dict_fname'], "rb") as align_file:
    align_dict = pickle.load(align_file, encoding='utf-8')

### Find words spoken across speakers, and within speakers

Find words whose aligned duration exceeds a threshold (`dur_thresh`, expressed in 10 ms frames).

In [None]:
# Minimum word duration for the filter below, in 10 ms units
# (get_word_details reports it as dur_thresh*10 ms, i.e. 40 -> 400 ms).
dur_thresh = 40 # 400 ms
# Key selecting which alignment list of each utterance is scanned.
word_type = "es_cnt"

def get_word_details(dur_thresh, word_type):
    """Index every sufficiently long word occurrence in ``align_dict``.

    Walks ``align_dict[call][utterance][word_type]`` and keeps the entries
    whose ``end - start`` is strictly greater than ``dur_thresh``.

    Returns a dict keyed by word.  Each value holds the parallel lists
    ``calls``, ``uttrs`` and ``ixs`` (one element per kept occurrence) and
    the summary fields ``word_count`` (number of occurrences),
    ``call_count`` and ``uttr_count`` (number of distinct calls/utterances).
    A short summary is printed as a side effect.
    """
    word_details = {}
    for call_id in align_dict:
        for uttr_id in align_dict[call_id]:
            entries = align_dict[call_id][uttr_id][word_type]
            for position, entry in enumerate(entries):
                if (entry.end - entry.start) > dur_thresh:
                    occurrences = word_details.setdefault(
                        entry.word, {"calls": [], "uttrs": [], "ixs": []})
                    occurrences["calls"].append(call_id)
                    occurrences["uttrs"].append(uttr_id)
                    occurrences["ixs"].append(position)

    # Derive the per-word summary counts from the collected lists.
    for occurrences in word_details.values():
        occurrences["word_count"] = len(occurrences["ixs"])
        occurrences["call_count"] = len(set(occurrences["calls"]))
        occurrences["uttr_count"] = len(set(occurrences["uttrs"]))

    print("duration filter: {0:d} ms".format(dur_thresh*10))
    print("{0:10s} --- {1:5d}".format("total words", len(word_details)))

    return word_details

In [None]:
def print_word_details(word_details, w):
    """Print the occurrence, call and utterance counts stored for word ``w``.

    ``word_details`` is a dict keyed by word whose values provide the
    ``word_count``, ``call_count`` and ``uttr_count`` entries.
    """
    print("details for: {0:10s}".format(w))
    labelled_keys = (
        ("word count", "word_count"),
        ("call count", "call_count"),
        ("uttr count", "uttr_count"),
    )
    for label, key in labelled_keys:
        print("{0:10s} | {1:5d}".format(label, word_details[w][key]))

In [None]:
# Build the per-word occurrence index using the duration filter and
# alignment key chosen above.
word_details = get_word_details(dur_thresh, word_type)

In [None]:
# Show the five words with the highest occurrence counts.
# Sorting on the negated count is a stable descending sort, so ties keep
# the order in which the words appear in word_details.
most_common = sorted(
    ((w, details["word_count"]) for w, details in word_details.items()),
    key=lambda pair: -pair[1],
)[:5]
print("{0:20s} --- {1:10s}".format("word", "count"))
print("".join("{0:20s} --- {1:5d}\n".format(w, n) for w, n in most_common))

In [None]:
# Rank words by the number of distinct calls they occur in (descending) and
# print the top ten together with their total occurrence counts.
most_common_by_calls = sorted(
    ((w, details["word_count"], details["call_count"])
     for w, details in word_details.items()),
    key=lambda row: -row[2],
)
print("{0:10s} | {1:5} | {2:5} ".format("word", "count", "calls"))
print("".join("{0:10s} | {1:5d} | {2:5d} \n".format(w, n_words, n_calls)
              for w, n_words, n_calls in most_common_by_calls[:10]))

In [None]:
# Word whose occurrences are inspected in the cells below.
es_word = 'TELéFONO'
# Path template for the per-utterance WAV files; the placeholder is filled
# with an utterance id by get_segment_wav.
uttr_wavs_path_string = "../uttr_fa_vad_wavs/uttr_wavs/{0:s}.wav"

In [None]:
# Print the occurrence / call / utterance counts for the selected word.
print_word_details(word_details, es_word)

In [None]:
def get_segment_wav(sid, start_10ms, end_10ms):
    """Read one utterance WAV and return the samples of a time slice.

    Parameters
    ----------
    sid : str
        Utterance id substituted into ``uttr_wavs_path_string``.
    start_10ms, end_10ms : int
        Slice boundaries expressed in 10 ms units.

    Returns
    -------
    (sample_rate, samples)
        The file's sample rate and the slice ``samples[start:end]``.
    """
    sr1, y1 = scipy.io.wavfile.read(uttr_wavs_path_string.format(sid))
    # Multiply before dividing: truncating the samples-per-10ms factor first
    # (e.g. 220.5 -> 220 at 22050 Hz) makes the offsets drift with position.
    # For sample rates that are multiples of 100 the result is unchanged.
    start = int(start_10ms * sr1 // 100)
    end = int(end_10ms * sr1 // 100)
    return sr1, y1[start:end]

In [None]:
def get_segment_mfcc(sid, start_10ms, end_10ms):
    """Return the MFCC features of one slice of utterance ``sid``.

    The slice boundaries are given in 10 ms units and are resolved by
    ``get_segment_wav``; the resulting samples and sample rate are passed
    to ``mfcc`` and its result is returned unchanged.
    """
    sample_rate, samples = get_segment_wav(sid, start_10ms, end_10ms)
    return mfcc(samples, sample_rate)

In [None]:
def get_start_end_10ms(f, s, ind, word_type="es_cnt"):
    """Return ``(start, end)`` of one aligned entry in ``align_dict``.

    Parameters
    ----------
    f : call id (first-level key of ``align_dict``).
    s : utterance id (second-level key).
    ind : int
        Position of the entry inside the selected alignment list.
    word_type : str, optional
        Which alignment list of the utterance to read.  Defaults to
        ``"es_cnt"``, the list this function always used, so existing
        callers are unaffected; pass the same key that produced ``ind``.
    """
    entry = align_dict[f][s][word_type][ind]
    return entry.start, entry.end

In [None]:
def compute_pairwise_dtw(loc_tuples):
    """Compute the symmetric matrix of pairwise DTW costs between segments.

    Parameters
    ----------
    loc_tuples : sequence of (call_id, utterance_id, index)
        Locations of the word occurrences to compare; each one is resolved
        to a time span with ``get_start_end_10ms`` and turned into MFCC
        features with ``get_segment_mfcc``.

    Returns
    -------
    numpy.ndarray of shape (d, d), where ``d == len(loc_tuples)``.
        ``cost[i, j] == cost[j, i]`` holds the value returned by
        ``dtw(features[i], features[j])`` for ``j <= i``.
    """
    # The notebook's `import tqdm` binds the module, which is not callable,
    # so the progress-bar class is imported explicitly here.
    from tqdm import tqdm as progress_bar

    d = len(loc_tuples)
    cost = np.zeros((d, d))

    # Extract the features of every segment once instead of re-reading the
    # audio for every pair.
    features = []
    for f, s, ind in loc_tuples:
        start, end = get_start_end_10ms(f, s, ind)
        features.append(get_segment_mfcc(s, start, end))

    # The matrix is filled symmetrically, so each unordered pair (including
    # the diagonal) only needs to be evaluated once.
    # NOTE(review): `dtw` is not defined in this notebook; presumably it is
    # provided by the %run'd dba.py / utils.py — confirm.
    with progress_bar(total=d * (d + 1) // 2) as pbar:
        for i in range(d):
            for j in range(i + 1):
                pair_cost = dtw(features[i], features[j])
                cost[i, j] = cost[j, i] = pair_cost
                pbar.update(1)
    return cost


In [None]:
def get_calls_for_word(words_details, word):
    """Count, per call, how many entries of ``words_details`` match ``word``.

    ``words_details`` is iterated as 4-tuples ``(word, call, utterance,
    index)``.  Returns a dict mapping each call that contains ``word`` to
    the number of matching entries, and prints "done" when finished.
    """
    word_segments = {}
    for entry_word, call_id, _uttr_id, _position in words_details:
        if entry_word != word:
            continue
        word_segments[call_id] = word_segments.get(call_id, 0) + 1
    print("done")
    return word_segments

In [None]:
def play_audio_seg(words_details, w, ix):
    """Display an audio player for the ``ix``-th stored occurrence of ``w``.

    Looks up the call, utterance and alignment index of the occurrence in
    ``words_details[w]``, resolves its time span, extracts the matching
    samples and shows them with an IPython ``Audio`` widget.
    """
    occurrence = words_details[w]
    call = occurrence["calls"][ix]
    uttr = occurrence["uttrs"][ix]
    index = occurrence["ixs"][ix]
    span = get_start_end_10ms(call, uttr, index)
    sample_rate, samples = get_segment_wav(uttr, span[0], span[1])
    display(Audio(samples, rate=sample_rate))

In [None]:
# Play up to five occurrences of the selected word.  The count is capped at
# the number of stored occurrences so that rarer words do not raise
# IndexError.
n_examples = min(5, len(word_details[es_word]["uttrs"]))
for i in range(n_examples):
    print("{0:10s} | {1:7s}".format(es_word, word_details[es_word]["uttrs"][i]))
    play_audio_seg(word_details, es_word, i)

In [None]:
# NOTE(review): y1 and sr1 are not assigned at notebook level in any visible
# cell (they only appear as locals of get_segment_wav), so unless the %run'd
# scripts define them this cell raises NameError on a fresh kernel —
# confirm where they come from or remove this scratch cell.
y1.shape, y1.shape[0] / sr1 * 100, 227 * (sr1/100)

In [None]:
# NOTE(review): mfcc1 is not assigned in any visible cell, so unless the
# %run'd scripts define it this cell raises NameError on a fresh kernel —
# confirm where it comes from or remove this scratch cell.
mfcc1.shape

In [None]:
# Run the external `soxi` command on one utterance WAV file.
# NOTE(review): the path repeats the directory used in uttr_wavs_path_string
# — keep the two in sync.
!soxi "../uttr_fa_vad_wavs/uttr_wavs/108.030.wav"