In [210]:
from __future__ import print_function
from __future__ import division
import os
import pickle
import json
import subprocess
from IPython.display import display
from IPython.display import Audio
import bisect
from collections import namedtuple
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import textwrap
import scipy.io.wavfile
from python_speech_features import mfcc

%run ../antonissameer/dba.py
%run ../antonissameer/utils.py

from matplotlib.ticker import MultipleLocator, \
     FormatStrFormatter, AutoMinorLocator
%matplotlib inline

In [175]:
with open("config.json") as json_data_file:
    config = json.load(json_data_file)

In [176]:
Align = namedtuple('Align', ['word', 'start', 'end'])

In [177]:
segment_map = pickle.load(open(config['es']['segment_dict_fname'], "rb"))
align_dict = pickle.load(open(config['es']['align_dict_fname'], "rb"), encoding='utf-8')

### Find words spoken across speakers, and within speakers

Find words longer than *threshold* ms in duration

In [178]:
dur_thresh = 40 # 800 ms

es_words = [a.word for fid in align_dict 
            for sid in align_dict[fid] for a in align_dict[fid][sid]['es'] 
            if (a.end-a.start) > dur_thresh]
es_cnt_words = [(a.word, fid, sid, i) for fid in align_dict 
                for sid in align_dict[fid] 
                for i, a in enumerate(align_dict[fid][sid]['es_cnt']) 
                if (a.end-a.start) > dur_thresh]

print("duration filter: {0:d} ms".format(dur_thresh*10))
print("{0:20s} | {1:5d}".format("total es words", len(es_words)))

print("{0:20s} | {1:5d}".format("total es cnt words", len(es_cnt_words)))

duration filter: 400 ms
total es words       | 34263
total es cnt words   | 28087


In [179]:
es_cnt_word_count = Counter([w for w, f, _, _ in es_cnt_words])
es_cnt_word_calls = {}
es_cnt_word_call_count = {}
for w, f, _, _ in es_cnt_words:
    if w not in es_cnt_word_calls:
        es_cnt_word_calls[w] = set([f])
    else:
        es_cnt_word_calls[w].update([f])

for w, f, _, _ in es_cnt_words:
    es_cnt_word_call_count[w] = len(es_cnt_word_calls[w])
    
es_cnt_word_details = [(w, es_cnt_word_count[w], es_cnt_word_call_count[w]) 
                       for w in es_cnt_word_count]
es_cnt_word_loc = {}
for w, f, s, i in es_cnt_words:
    if w not in es_cnt_word_loc:
        es_cnt_word_loc[w] = [(f,s,i)]
    else:
        es_cnt_word_loc[w].append((f,s,i))


In [180]:
# display the most common words
print("{0:20s} --- {1:10s}".format("word", "count"))
print("".join(["{0:20s} --- {1:5d}\n".format(w, f) for w, f in es_cnt_word_count.most_common(5)]))

word                 --- count     
MMM                  ---   504
<LAUGH>              ---   499
ENTONCES             ---   450
CLARO                ---   363
AH                   ---   298



In [181]:
es_cnt_word_details = sorted(es_cnt_word_details, reverse=True, key=lambda t:t[2])
print("{0:20s} --- {1:5} --- {2:5}".format("word", "count", "files"))
print("".join(["{0:20s} --- {1:5d} --- {2:5d}\n".format(w,c,f) 
              for w, c, f in es_cnt_word_details[:10]]))

word                 --- count --- files
ENTONCES             ---   450 ---    91
MMM                  ---   504 ---    88
<LAUGH>              ---   499 ---    79
AH                   ---   298 ---    79
BUENO                ---   291 ---    78
BIEN                 ---   247 ---    77
CLARO                ---   363 ---    71
TELéFONO             ---    94 ---    55
EH                   ---   163 ---    54
DESPUéS              ---    70 ---    50



In [182]:
es_word = 'TELéFONO'

In [207]:
def get_segment_mfcc(sid, start_10ms, end_10ms):
    sr1, y1 = scipy.io.wavfile.read("../uttr_fa_vad_wavs/uttr_wavs/{0:s}.wav".format(sid))
    start = start_10ms * (sr1 / 100)
    end = end_10ms * (sr1 / 100)
    mfcc_segment = mfcc(y1[start:end], sr1)
    return mfcc_segment

In [None]:
def compute_pairwise_dtw(es_cnt_word_loc, w):
    cost = 
    f1, s1, i1 = es_cnt_word_loc[w]

In [183]:
print("  ".join(es_cnt_word_calls[es_word]))

108  029  078  085  090  070  012  100  082  043  095  015  033  048  116  045  113  007  115  032  110  049  083  071  021  073  088  094  092  041  101  009  006  063  065  052  075  086  030  018  084  087  120  038  054  027  081  059  014  093  047  056  053  072  001


In [184]:
es_cnt_word_loc[es_word][:5]

[('108', '108.030', 1),
 ('078', '078.146', 6),
 ('090', '090.018', 10),
 ('090', '090.018', 12),
 ('090', '090.015', 1)]

In [205]:
align_dict['108']['108.030']['es_cnt']

[Align(word='CONTESTABA', start=172, end=216),
 Align(word='TELéFONO', start=227, end=282),
 Align(word='AYER', start=282, end=319),
 Align(word='EH', start=319, end=340)]

In [186]:
get_segment_mfcc(es_cnt_word_loc[es_word], start_10ms, end_10ms)

In [203]:
y1.shape, y1.shape[0] / sr1 * 100, 227 * (sr1/100)

((27360,), 342.0, 18160.0)

In [204]:
mfcc1.shape

(341, 13)

In [192]:
!soxi "../uttr_fa_vad_wavs/uttr_wavs/108.030.wav"


Input File     : '../uttr_fa_vad_wavs/uttr_wavs/108.030.wav'
Channels       : 1
Sample Rate    : 8000
Precision      : 16-bit
Duration       : 00:00:03.42 = 27360 samples ~ 256.5 CDDA sectors
File Size      : 54.8k
Bit Rate       : 128k
Sample Encoding: 16-bit Signed Integer PCM

