In [None]:
import pandas as pd
import glob
import os
import librosa
import numpy as np
import scripts
import matplotlib.pyplot as plt
%matplotlib inline

# Loading data

## Annotations

In [None]:
# Collect annotation file paths per file extension using the project helper.
elan_annotation_paths, praat_annotation_paths = (
    scripts.find_annotation_paths(extension) for extension in ('.txt', '.TextGrid')
)

In [None]:
# Show the annotation paths found for the '.txt' extension.
elan_annotation_paths

In [None]:
# Show the annotation paths found for the '.TextGrid' extension.
praat_annotation_paths

In [None]:
# Load one annotation table per '.txt' path (project helper in `scripts`).
elan_annotations = scripts.get_elan_annotations(elan_annotation_paths)

In [None]:
# Preview the head of the last ELAN annotation table.
elan_annotations[-1].head()

In [None]:
# Load one annotation table per '.TextGrid' path (project helper in `scripts`).
praat_annotations = scripts.get_praat_annotations(praat_annotation_paths)

In [None]:
# Preview the head of the last Praat annotation table.
praat_annotations[-1].head()

In [None]:
# Preview the head of the second Praat annotation table (index 1).
praat_annotations[1].head()

In [None]:
# Single list of annotation tables: ELAN entries first, Praat entries after them.
annotations = [*elan_annotations, *praat_annotations]

In [None]:
# Paths in the same order as `annotations`: ELAN paths first, Praat paths after them.
annotations_paths = [*elan_annotation_paths, *praat_annotation_paths]

## Audio

In [None]:
# annotations_paths = annotations_paths[:3]

In [None]:
# Show the combined list of annotation paths.
annotations_paths

In [None]:
# Resolve the audio file belonging to each annotation path (project helper),
# then display the result for a visual check.
audio_paths = scripts.find_audio_from_annotations_paths(annotations_paths)
audio_paths

In [None]:
# Number of audio files found.
len(audio_paths)

In [None]:
# Debug alternative that loads a single recording only (index 12):
# audios, sr = scripts.get_audios(audio_paths[12:13])
# Load every recording; the helper returns the audio list and `sr`
# (presumably the common sampling rate - confirm in `scripts.get_audios`).
audios, sr = scripts.get_audios(audio_paths)

In [None]:
# Number of loaded recordings.
len(audios)

## Demo examples of classes

In [None]:
# Demo clip for class 'a': stack every 'a' segment of recording 12 and save it.
# Annotation and audio are taken at the same index, because `annotations` and
# `audios` are aligned by position; `audios[-1]` is recording 12 only when a
# single file (audio_paths[12:13]) was loaded.
# NOTE: `librosa.output.write_wav` is only available in librosa < 0.8.
data_a = scripts.get_label_data(annotations[12], audios[12], 'a', sr)
merged_data_a = np.hstack(data_a)
librosa.output.write_wav('../demo/label_a_tmp.wav', merged_data_a, sr)

In [None]:
# Demo clip for class 'b': stack every 'b' segment of recording 12 and save it.
# Annotation and audio are taken at the same index, because `annotations` and
# `audios` are aligned by position; `audios[-1]` is recording 12 only when a
# single file (audio_paths[12:13]) was loaded.
# NOTE: `librosa.output.write_wav` is only available in librosa < 0.8.
data_b = scripts.get_label_data(annotations[12], audios[12], 'b', sr)
merged_data_b = np.hstack(data_b)
librosa.output.write_wav('../demo/label_b_tmp.wav', merged_data_b, sr)

In [None]:
# Demo clip for the remaining class: stack every segment of recording 12 that is
# labelled neither 'a' nor 'b' and save it.
# Annotation and audio are taken at the same index, because `annotations` and
# `audios` are aligned by position; `audios[-1]` is recording 12 only when a
# single file (audio_paths[12:13]) was loaded.
# NOTE: `librosa.output.write_wav` is only available in librosa < 0.8.
data_c = scripts.get_unlabel_data(annotations[12], audios[12], ['a', 'b'], sr)
merged_data_c = np.hstack(data_c)
librosa.output.write_wav('../demo/label_c_tmp.wav', merged_data_c, sr)

## Creating dataset

In [None]:
# Sliding-window geometry in seconds: window length and hop between windows.
interval_time, step_time = 0.3, 0.05

# The same quantities in samples. No `sr` is passed, so librosa's default
# sampling rate is used for the conversion.
interval_len, step_len = (
    librosa.time_to_samples(seconds) for seconds in (interval_time, step_time)
)

In [None]:
# Window length and step, in samples.
interval_len, step_len

In [None]:
def get_label_data(annotation, audio, labels, sr=22050):
    """Cut every interval annotated with one of ``labels`` into fixed-size windows.

    Each matching interval is scanned with a window of ``interval_len`` samples
    that advances by ``step_len`` samples (both are notebook-level globals).
    The remainder that is shorter than a full window is emitted as one extra
    sample, right-padded with its own mean up to ``interval_len`` samples.

    Parameters
    ----------
    annotation : pandas.DataFrame
        Table with ``label``, ``start`` and ``finish`` columns; ``start`` and
        ``finish`` are converted with ``librosa.time_to_samples``.
    audio : sequence
        Signal that is sliced by sample index (presumably a 1-D array loaded
        at ``sr`` - confirm against ``scripts.get_audios``).
    labels : iterable of str
        Labels to extract; each must be one of 'a', 'b', 'c'.
    sr : int
        Sampling rate used to convert annotation times to sample indices.
        NOTE: ``interval_len`` / ``step_len`` are computed elsewhere in the
        notebook and must correspond to the same rate.

    Returns
    -------
    data : list of numpy.ndarray
        The extracted windows.
    targets : list of int
        Class index of each window (a -> 0, b -> 1, c -> 2).
    addings : list of int
        Number of padded samples at the end of each window (0 for full windows).

    Raises
    ------
    KeyError
        If a label is not one of 'a', 'b', 'c'.
    """
    label_map = {'a': 0, 'b': 1, 'c': 2}
    data, targets, addings = [], [], []

    for label in labels:
        target = label_map[label]
        rows = annotation[annotation.label == label]
        # `sr` is passed by keyword so the call works on librosa releases
        # where it is keyword-only as well as on older ones.
        starts = librosa.time_to_samples(rows.start.values, sr=sr)
        finishes = librosa.time_to_samples(rows.finish.values, sr=sr)

        for s, f in zip(starts, finishes):
            n = f - s  # samples of the interval not yet covered by a window start
            i = 0
            while n >= interval_len:
                offset = s + i * step_len
                data.append(np.array(audio[offset:offset + interval_len]))
                targets.append(target)
                addings.append(0)
                n -= step_len
                i += 1

            # Here n < interval_len always holds, so a padded tail window follows.
            # Degenerate intervals (finish <= start) leave nothing to pad, and
            # np.pad(..., 'mean') cannot extend an empty slice - skip them.
            if n <= 0:
                continue
            pad = interval_len - n
            data.append(np.pad(audio[s + i * step_len:f], (0, pad), 'mean'))
            targets.append(target)
            addings.append(pad)
    return data, targets, addings

In [None]:
# Try the extraction on the last recording:
# d = windows, t = class targets, a = padding lengths.
d, t, a = get_label_data(annotations[-1], audios[-1], ['a', 'b'])

In [None]:
# Histogram of the padding lengths (in samples) of the extracted windows.
plt.hist(a, bins=20);

In [None]:
# Class balance of the extracted windows. `t` is a plain list, so wrap it in a
# Series instead of calling the unbound method on a non-Series object.
pd.Series(t).value_counts()

In [None]:
# Distinct window lengths; a single value means every window has the same size.
{len(window) for window in d}

In [None]:
# Waveform of one extracted window (10th from the end).
plt.plot(d[-10])

In [None]:
from IPython.display import Audio

In [None]:
# Audio player for the window plotted above, played back at 22050 Hz.
display(Audio(d[-10], rate=22050))

In [None]:
def get_unlabel_data(annotation, audio, labels, additions, sr=22050):
    """Cut every interval NOT annotated with one of ``labels`` into windows.

    Matching intervals are split into consecutive, non-overlapping windows of
    ``interval_len`` samples (a notebook-level global); a remainder shorter
    than one window is dropped. For each window a pad length is drawn at random
    from ``additions``: that many trailing samples are replaced by the mean of
    the remaining ones, so these windows show the same padding pattern as the
    labelled ones. (An earlier variant drew the pad length from an exponential
    distribution; sampling the observed lengths replaced it.)

    Parameters
    ----------
    annotation : pandas.DataFrame
        Table with ``label``, ``start`` and ``finish`` columns.
    audio : sequence
        Signal that is sliced by sample index.
    labels : list of str
        Labels to exclude.
    additions : sequence of int
        Pool of pad lengths to sample from. When empty, no window is padded.
    sr : int
        Sampling rate used to convert annotation times to sample indices.

    Returns
    -------
    data : list of numpy.ndarray
        The extracted windows.
    targets : list of int
        Class index 2 repeated once per window.
    """
    background = annotation[~annotation.label.isin(labels)]
    # `sr` is passed by keyword so the call works on librosa releases
    # where it is keyword-only as well as on older ones.
    starts = librosa.time_to_samples(background.start.values, sr=sr)
    finishes = librosa.time_to_samples(background.finish.values, sr=sr)

    data = []
    for s, f in zip(starts, finishes):
        n = f - s
        i = 0
        while n >= interval_len:
            crop = np.array(audio[s + i * interval_len:s + (i + 1) * interval_len])
            # np.random.choice raises on an empty pool, so fall back to "no padding".
            sl = np.random.choice(additions) if len(additions) else 0
            # Pad only when at least one original sample survives the cut;
            # np.pad(..., 'mean') cannot extend an empty slice.
            if 0 < sl < len(crop):
                crop = np.pad(crop[:-sl], (0, sl), 'mean')
            data.append(crop)
            n -= interval_len
            i += 1
    targets = [2] * len(data)
    return data, targets

In [None]:
# Both counts should match: annotations and audios are paired by position below.
len(annotations), len(audios)

## Dataset

In [None]:
%%time
# Build the dataset as per-recording lists: X[i] / y[i] hold the labelled windows
# and targets of recording i, X_c[i] / y_c[i] the sub-sampled windows of class 'c'.

# Fixed seed so the random padding and the sub-sampling below are reproducible.
np.random.seed(0)

X, y = [], []
X_c, y_c = [], []
additions = []

labels_map = {
    'a': 0,
    'b': 1,
    'c': 2
}

# zip() silently drops unmatched items, which would hide a misaligned load.
assert len(annotations) == len(audios), 'annotations and audios must be paired one-to-one'

for ann, aud in zip(annotations, audios):
    data, targets, adds = get_label_data(ann, aud, ['a', 'b'])
    X.append(data)
    y.append(targets)
    additions.extend(adds)

# The second pass needs the pad lengths of ALL recordings, hence a separate loop.
for ann, aud in zip(annotations, audios):
    data, targets = get_unlabel_data(ann, aud, ['a', 'b'], additions)

    # Keep a random half of the class-'c' windows of this recording.
    c_indxs = np.random.choice(range(len(data)), len(data) // 2, replace=False)
    # Keep the windows as ndarrays, like the labelled ones, instead of converting
    # them to nested Python float lists (much larger in memory and in the pickle).
    X_c.append([data[j] for j in c_indxs])
    y_c.append([labels_map['c']] * len(c_indxs))

In [None]:
# Number of per-recording sample lists and target lists.
len(X), len(y)

In [None]:
# Same check for the class-'c' lists.
len(X_c), len(y_c)

In [None]:
# Sample/target counts for the first recordings (at most 3): labelled first,
# class 'c' second. Bounded by len(X) so the cell also runs on fewer recordings.
for i in range(min(3, len(X))):
    print(len(X[i]), len(y[i]))
    print(len(X_c[i]), len(y_c[i]))
    print()

In [None]:
# Append each recording's class-'c' windows and targets to its labelled ones, in place.
# NOTE: not idempotent - re-running this cell appends the class-'c' data again.
for i, samples in enumerate(X):
    samples.extend(X_c[i])
    y[i].extend(y_c[i])

In [None]:
# Sample/target counts after merging, for the first recordings (at most 3).
# Bounded by len(X) so the cell also runs on fewer recordings.
for i in range(min(3, len(X))):
    print(len(X[i]), len(y[i]))
    print()

In [None]:
# Class balance of the first recording. `y[0]` is a plain list, so wrap it in a
# Series instead of calling the unbound method on a non-Series object.
pd.Series(y[0]).value_counts()

In [None]:
# Pad lengths collected from the labelled windows of all recordings.
plt.hist(additions);

In [None]:
# The same pool resampled with np.random.choice - the way get_unlabel_data draws from it.
plt.hist(np.random.choice(additions, len(additions)));

In [None]:
# For comparison: an exponential distribution with the same mean as the pool.
plt.hist(np.random.exponential(np.mean(additions), len(additions)));

## Check

In [None]:
# Number of per-recording target lists.
len(y)

In [None]:
# Number of per-recording sample lists.
len(X)

In [None]:
# Number of windows in the first recording.
len(X[0])

In [None]:
# Flatten all class-0 windows of the last recording into one list of audio values.
xx = [
    value
    for windows, window_targets in zip(X[-1:], y[-1:])
    for window, window_target in zip(windows, window_targets)
    if window_target == 0
    for value in window
]

In [None]:
# Number of windows in the last recording.
len(X[-1])

In [None]:
# Total number of audio values gathered from the class-0 windows.
len(xx)

In [None]:
# Plot the first windows of the last recording, one figure each (at most 10).
# Slicing instead of range(10) keeps the cell working on recordings with fewer windows.
for window in X[-1][:10]:
    plt.plot(window)
    plt.show()

In [None]:
# Audio(np.concatenate(X[-1][:10]), rate=sr)

In [None]:
# Listen to the concatenated class-0 windows.
Audio(xx, rate=sr)

In [None]:
# `xx` is a plain Python list, while librosa's wav writer expects a NumPy array,
# so convert it before writing.
# NOTE: `librosa.output.write_wav` is only available in librosa < 0.8.
librosa.output.write_wav('../demo/eee.wav', np.asarray(xx), sr)

In [None]:
# import random

In [None]:
# temp = list(zip(X, y))
# random.shuffle(temp)
# X, y = zip(*temp)

In [None]:
# xs = [x_ for x_, y_ in zip(X[0], y[0]) if y_ == 2]

In [None]:
# len(X[0])

In [None]:
# len(xs[0])

In [None]:
# for i in range(20,30):
#     plt.plot(xs[i])
#     plt.show()

In [None]:
# xs = np.concatenate(xs[20:30])

In [None]:
# xs = np.concatenate(X[:100])

In [None]:
# Audio(xs, rate=sr)

In [None]:
# librosa.output.write_wav('../demo/eee.wav', xs, sr)

### Pickle

In [None]:
import pickle
import gzip

In [None]:
# Persist the dataset as a gzip-compressed pickle holding the two-element list [X, y].
with gzip.open('../cache/dataset.pkl.gz', 'wb') as cache_file:
    pickle.dump([X, y], cache_file)

In [None]:
# Sampling rate returned by scripts.get_audios, shown for reference.
sr