In [None]:
import pandas as pd
import numpy as np
import librosa
import librosa.display
from matplotlib.pyplot import specgram
import matplotlib.pyplot as plt
import pickle
import gzip
from tqdm import tqdm_notebook, tqdm
import tensorflow as tf
import keras.backend as K

%matplotlib inline

In [None]:
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.25)

sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

In [None]:
K.set_session(sess)

# Reading dataset

In [None]:
# Load the cached dataset: a gzip-compressed pickle that unpacks into the
# pair (X_raw, y_raw).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load caches produced by this project, never files from untrusted sources.
with gzip.open('../cache/dataset.pkl.gz', 'rb') as f:
    X_raw, y_raw = pickle.load(f)

In [None]:
len(X_raw), len(y_raw)

In [None]:
X_holdout, y_holdout = X_raw[10], y_raw[10] # lavina

In [None]:
X_raw = X_raw[:10] + X_raw[11:]
y_raw = y_raw[:10] + y_raw[11:]

In [None]:
len(X_holdout)

In [None]:
pd.Series.value_counts(y_holdout)

In [None]:
X = []
y = []

for x in X_raw:
    X.extend(x)

for y_ in y_raw:
    y.extend(y_)

In [None]:
len(X), len(y)

In [None]:
a = [x for x, y_ in zip(X, y) if y_ == 0]
b = [x for x, y_ in zip(X, y) if y_ == 1]
c = [x for x, y_ in zip(X, y) if y_ == 2]

In [None]:
c = [np.array(x) for x in c]

In [None]:
len(a), len(b), len(c)

In [None]:
from IPython.display import Audio, display

In [None]:
wids = [Audio(a[i], rate=22050) for i in np.random.randint(0, len(a), 3)]

for w in wids:
    display(w)

In [None]:
wids = [Audio(b[i], rate=22050) for i in np.random.randint(0, len(b), 3)]

for w in wids:
    display(w)

In [None]:
wids = [Audio(c[i], rate=22050) for i in np.random.randint(0, len(c), 3)]

for w in wids:
    display(w)

# New

In [None]:
from scipy.fftpack import fft
from scipy import signal

In [None]:
sample_rate = 22050

In [None]:
def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    """Compute a log-power spectrogram of an audio signal.

    Parameters
    ----------
    audio : array-like
        Audio samples, forwarded unchanged to ``scipy.signal.spectrogram``.
    sample_rate : int or float
        Sampling rate of ``audio`` in Hz.
    window_size : float
        Length of the Hann window in milliseconds.
    step_size : float
        Duration in milliseconds that is converted to samples and passed to
        scipy as ``noverlap`` (the overlap between consecutive windows).
        The hop between frames is therefore ``nperseg - noverlap``.
    eps : float
        Small constant added to the power values before taking the log so
        that silent bins do not produce ``-inf``.

    Returns
    -------
    freqs : ndarray
        Frequency of every bin, as returned by scipy.
    times : ndarray
        Centre time of every frame, as returned by scipy.
    log_spec : ndarray of float32
        Natural log of the transposed spectrogram, i.e. indexed as
        ``[frame, frequency_bin]``.
    """
    samples_per_window = int(round(window_size * sample_rate / 1e3))
    samples_overlap = int(round(step_size * sample_rate / 1e3))

    spectrogram_options = dict(
        fs=sample_rate,
        window='hann',
        nperseg=samples_per_window,
        noverlap=samples_overlap,
        detrend=False,
    )
    freqs, times, power = signal.spectrogram(audio, **spectrogram_options)

    # Transpose to time-major order and cast to float32 before the log.
    log_power = np.log(power.T.astype(np.float32) + eps)
    return freqs, times, log_power

In [None]:
samples = c[666]

In [None]:
Audio(samples, rate=sample_rate)

In [None]:
%%timeit
freqs, times, spectrogram = log_specgram(samples, sample_rate)

In [None]:
freqs, times, spectrogram = log_specgram(samples, sample_rate)

In [None]:
len(freqs)

In [None]:
fig = plt.figure(figsize=(14, 8))
ax1 = fig.add_subplot(211)
ax1.set_title('Raw wave')
ax1.set_ylabel('Amplitude')
ax1.plot(np.linspace(0, len(samples) / sample_rate, len(samples)), samples)

ax2 = fig.add_subplot(212)
ax2.imshow(spectrogram.T, aspect='auto', origin='lower', 
           extent=[times.min(), times.max(), freqs.min(), freqs.max()])
ax2.set_yticks(freqs[::16])
ax2.set_xticks(times[::16])
ax2.set_title('Spectrogram')
ax2.set_ylabel('Freqs in Hz')
ax2.set_xlabel('Seconds')

### Resample

https://librosa.github.io/librosa/generated/librosa.core.resample.html

In [None]:
new_sample_rate = 8000

In [None]:
%%time
samples_r = librosa.resample(samples, sample_rate, new_sample_rate)

In [None]:
%%timeit
freqs, times, spectrogram = log_specgram(samples_r, new_sample_rate)

In [None]:
freqs, times, spectrogram = log_specgram(samples_r, new_sample_rate)

In [None]:
len(freqs)

In [None]:
fig = plt.figure(figsize=(14, 8))

# Top panel: the resampled waveform against time in seconds.
ax1 = fig.add_subplot(211)
ax1.set_title('Raw wave')
ax1.set_ylabel('Amplitude')
# Use new_sample_rate rather than a literal 8000 so the time axis stays
# correct if the resampling rate is changed.
ax1.plot(np.linspace(0, len(samples_r) / new_sample_rate, len(samples_r)), samples_r)

# Bottom panel: log spectrogram of the resampled clip (frequency on the y axis).
ax2 = fig.add_subplot(212)
ax2.imshow(spectrogram.T, aspect='auto', origin='lower',
           extent=[times.min(), times.max(), freqs.min(), freqs.max()])
ax2.set_yticks(freqs[::16])
ax2.set_xticks(times[::16])
ax2.set_title('Spectrogram')
ax2.set_ylabel('Freqs in Hz')
ax2.set_xlabel('Seconds')

In [None]:
spectrogram.shape

In [None]:
Audio(samples, rate=sample_rate)

In [None]:
Audio(samples_r, rate=new_sample_rate)

In [None]:
# From this tutorial
# https://github.com/librosa/librosa/blob/master/examples/LibROSA%20demo.ipynb
# The signal is passed as y=...: newer librosa releases accept the audio only
# by keyword, and older releases accept the keyword form as well.
S = librosa.feature.melspectrogram(y=samples, sr=sample_rate, n_mels=128)

# Convert to log scale (dB). We'll use the peak power (max) as reference.
log_S = librosa.power_to_db(S, ref=np.max)

plt.figure(figsize=(12, 4))
librosa.display.specshow(log_S, sr=sample_rate, x_axis='time', y_axis='mel')
plt.title('Mel power spectrogram ')
plt.colorbar(format='%+02.0f dB')
plt.tight_layout()

In [None]:
log_S.shape

In [None]:
mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=13)

# Let's pad on the first and second deltas while we're at it
delta2_mfcc = librosa.feature.delta(mfcc, order=2)

plt.figure(figsize=(12, 4))
librosa.display.specshow(delta2_mfcc)
plt.ylabel('MFCC coeffs')
plt.xlabel('Time')
plt.title('MFCC')
plt.colorbar()
plt.tight_layout()

## FFT

In [None]:
plt.plot(np.abs(fft(samples))[:len(samples)//2])
# plt.ylim(0, 10)

In [None]:
def custom_fft(y, fs):
    """Return the magnitudes of the first half of the FFT of ``y``.

    Parameters
    ----------
    y : ndarray
        Signal to transform; its length is read from ``y.shape[0]``.
    fs : int or float
        Sampling rate. Accepted for call-site symmetry but not used.

    Returns
    -------
    ndarray
        ``np.abs`` of the first ``N // 2`` FFT bins, where ``N`` is the
        length of ``y``.
    """
    half = y.shape[0] // 2
    spectrum = fft(y)
    # FFT bins are complex; np.abs gives their magnitude (modulus).
    return np.abs(spectrum[:half])

### Before and after resample

In [None]:
Audio(samples, rate=sample_rate)

In [None]:
Audio(samples_r, rate=new_sample_rate)

In [None]:
vals = custom_fft(samples, sample_rate)
plt.figure(figsize=(12, 4))
plt.title('FFT of recording sampled with ' + str(sample_rate) + ' Hz')
plt.plot(range(len(vals)), vals)
plt.xlabel('Frequency')
plt.grid()
plt.show()

In [None]:
vals = custom_fft(samples_r, new_sample_rate)
plt.figure(figsize=(12, 4))
plt.title('FFT of recording sampled with ' + str(new_sample_rate) + ' Hz')
plt.plot(range(len(vals)), vals)
plt.xlabel('Frequency')
plt.grid()
plt.show()

## Different slices

In [None]:
vals = custom_fft(samples, sample_rate)
plt.figure(figsize=(12, 4))
plt.title('FFT of recording sampled with ' + str(sample_rate) + ' Hz')
plt.plot(range(len(vals)), vals)
plt.xlabel('Frequency')
plt.grid()
plt.show()

In [None]:
# Magnitude spectra of the first three class-0 clips.
for i, aa in enumerate(a[:3]):
    # These clips are not resampled (they are played back at sample_rate
    # elsewhere in this notebook), so pass sample_rate, not new_sample_rate.
    # custom_fft reads y.shape, and the clips are only converted to ndarrays
    # in a later cell, so coerce here.
    vals = custom_fft(np.asarray(aa), sample_rate)
    plt.figure(figsize=(12, 4))
    plt.title('FFT of {}'.format(i))
    plt.plot(range(len(vals)), vals)
    plt.xlabel('Frequency')
    plt.grid()
    plt.show()

In [None]:
Audio(a[0], rate=sample_rate)

In [None]:
Audio(a[1], rate=sample_rate)

In [None]:
Audio(a[2], rate=sample_rate)

# Feature extraction

In [None]:
# Make sure every clip is an ndarray; np.asarray returns existing arrays
# as-is and converts anything else (e.g. lists).
X = [np.asarray(x) for x in X]

In [None]:
# Make sure every hold-out clip is an ndarray; np.asarray returns existing
# arrays as-is and converts anything else (e.g. lists).
X_holdout = [np.asarray(x) for x in X_holdout]

In [None]:
len(X)

In [None]:
pd.Series.value_counts([len(x) for x in X])

In [None]:
bad_ind = (np.array([len(x) == 6318 for x in X])).nonzero()[0][0]

In [None]:
bad_ind

In [None]:
X = X[:bad_ind] + X[bad_ind + 1:]
y = y[:bad_ind] + y[bad_ind + 1:]

In [None]:
pd.Series.value_counts([len(x) for x in X])

In [None]:
_, _, s = log_specgram(a[1], sample_rate)

In [None]:
plt.imshow(s[:, :])

In [None]:
s.shape

In [None]:
def extract_features(x, lim=100):
    """Turn one audio clip into a log-spectrogram feature matrix.

    Parameters
    ----------
    x : array-like
        Audio samples, assumed to be sampled at the notebook-level
        ``sample_rate``.
    lim : int or None
        Number of leading frequency bins to keep. ``None`` keeps all bins.
        Previously this argument was ignored and 100 was hard-coded; the
        default keeps that behaviour.

    Returns
    -------
    ndarray
        Log spectrogram indexed as ``[frame, frequency_bin]`` (the layout
        returned by ``log_specgram``), truncated to ``lim`` bins.
    """
    _, _, spec = log_specgram(x, sample_rate)
    return spec[:, :lim]

In [None]:
len(X)

In [None]:
X_spec = []
for x in tqdm(X):
    spec = extract_features(x)
    X_spec.append(spec)
X_spec = np.array(X_spec)

In [None]:
X_spec.shape

In [None]:
X_holdout_spec = []
for x in tqdm(X_holdout):
    spec = extract_features(x)
    X_holdout_spec.append(spec)
X_holdout_spec = np.array(X_holdout_spec)

## Checking mean specs for different audios

In [None]:
# X_raw_specs = []
# for raw in tqdm(X_raw):
#     specs = []
#     for sample in raw:
#         spec = extract_features(np.array(sample))
#         if spec.shape == (28, 100):
#             specs.append(spec)
#     specs = np.array(specs)
#     X_raw_specs.append(specs)

In [None]:
# for i in range(len(X_raw_specs)):
# #     print(X_raw_specs[i].shape)
#     print(X_raw_specs[i].mean(), X_raw_specs[i].std())

# RNN model

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report

In [None]:
import keras
from keras.models import Sequential, Model
from keras.layers import Conv2D, MaxPool2D, Dense, Flatten, Dropout, AveragePooling2D
from keras.layers import GRU, BatchNormalization
from keras.utils import to_categorical
from keras.layers import Input, Conv1D, Activation, MaxPool1D
from keras.models import Model

In [None]:
X_spec.shape

In [None]:
X_spec.mean()

In [None]:
X_spec.std()

In [None]:
mean = X_spec.mean()
std = X_spec.std()

In [None]:
min_spec, max_spec = X_spec.min(), X_spec.max()

In [None]:
min_spec, max_spec

In [None]:
plt.hist(X_spec.ravel());

In [None]:
plt.imshow(X_spec.min(axis=0))

In [None]:
plt.imshow(X_spec.max(axis=0))

In [None]:
plt.imshow(X_spec.mean(axis=0))

In [None]:
# X_spec -= mean
# X_spec /= std

In [None]:
X_spec -= min_spec
X_spec /= (max_spec - min_spec)

In [None]:
X_spec.mean(), X_spec.std()

In [None]:
X_spec.min(), X_spec.max()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_spec, y, test_size=0.1,
                                                    stratify=y, random_state=10)

In [None]:
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1,
#                                                     stratify=y_train, random_state=10)

In [None]:
X_val, y_val = X_holdout_spec, y_holdout

In [None]:
# X_val -= mean
# X_val /= std

In [None]:
X_val -= min_spec
X_val /= (max_spec - min_spec)

In [None]:
X_val.mean(), X_val.std()

In [None]:
X_val.min(), X_val.max()

In [None]:
y_train, y_val, y_test = to_categorical(y_train), to_categorical(y_val),\
                         to_categorical(y_test)

In [None]:
vc = pd.Series.value_counts(np.argmax(y_train, 1), True)
vc

In [None]:
vc = pd.Series.value_counts(np.argmax(y_val, 1), True)
vc

In [None]:
vc = pd.Series.value_counts(np.argmax(y_test, 1), True)
vc

In [None]:
frac = 0.75

In [None]:
vc[2] / vc[0] * frac - 1, vc[2] / vc[1] * frac - 1

In [None]:
X_train = np.vstack((X_train, np.repeat(X_train[np.argmax(y_train, 1) == 0], 4, 0)))
y_train = np.vstack((y_train, np.repeat(y_train[np.argmax(y_train, 1) == 0], 4, 0)))

In [None]:
X_train = np.vstack((X_train, np.repeat(X_train[np.argmax(y_train, 1) == 1], 20, 0)))
y_train = np.vstack((y_train, np.repeat(y_train[np.argmax(y_train, 1) == 1], 20, 0)))

In [None]:
pd.Series.value_counts(np.argmax(y_train, 1), True)

---

### Best

In [None]:
def create_model(input_shape):
    """Build the (uncompiled) Conv1D + stacked-GRU classifier.

    Parameters
    ----------
    input_shape : tuple
        Shape of a single sample, without the batch dimension.

    Returns
    -------
    keras.models.Model
        Model that maps an input of ``input_shape`` to a 3-way softmax.
    """
    x_input = Input(input_shape)

    # Layers are listed in the order they are applied.
    stack = [
        # Convolutional front end.
        Conv1D(128, 3),
        BatchNormalization(),
        Activation('relu'),
        Dropout(0.5),
        # First recurrent layer emits the full sequence for the next GRU.
        GRU(128, return_sequences=True),
        Dropout(0.5),
        BatchNormalization(),
        # Second recurrent layer emits only its final state.
        GRU(192, return_sequences=False),
        Dropout(0.5),
        BatchNormalization(),
        # Classification head.
        Dense(3, activation='softmax'),
    ]

    output = x_input
    for layer in stack:
        output = layer(output)

    return Model(inputs=x_input, outputs=output)

In [None]:
model = create_model(X_train[0].shape)

In [None]:
X_train[0].shape

In [None]:
model.summary()

In [None]:
model.compile('adam', 'categorical_crossentropy')

In [None]:
model.get_weights()[0].shape

In [None]:
plt.imshow(X_train[0])

In [None]:
# plt.imshow(model.predict(X_train[:1])[0])

## Predict without training

In [None]:
X_val[0].shape

In [None]:
X_val.shape

In [None]:
pr = model.predict(np.array(X_val), batch_size=32, verbose=1)

In [None]:
pr[:10].shape

In [None]:
pr[:10].argmax(1)

In [None]:
pr = pr.argmax(1)

In [None]:
pd.Series.value_counts(pr)

In [None]:
accuracy_score(np.argmax(y_val, 1), pr)

In [None]:
print(classification_report(np.argmax(y_val, 1), pr))

## Training

In [None]:
pd.Series.value_counts(np.argmax(y_test, 1), True)

In [None]:
def schedule(i, lr):
    """Learning-rate schedule for ``LearningRateScheduler``.

    Parameters
    ----------
    i : int
        Zero-based epoch index.
    lr : float
        Learning rate currently in use.

    Returns
    -------
    float
        ``lr`` multiplied by 0.5 at epoch 0 and by 0.2 at epochs 5 and 10;
        ``lr`` unchanged at every other epoch.
    """
    decay_at_epoch = {0: 0.5, 5: 0.2, 10: 0.2}
    if i in decay_at_epoch:
        return lr * decay_at_epoch[i]
    return lr

In [None]:
# def schedule(i, lr):
#     if i == 0:
#         lr = 0.0005
#     if i == 5:
#         lr *= 0.2
#     if i == 10:
#         lr *= 0.2
#     return lr

In [None]:
hist = model.fit(X_train, y_train, batch_size=64, epochs=10,
#                  class_weight={0: 3, 1: 5, 2: 1},
                 validation_data=(X_test, y_test), # val
                 callbacks=[keras.callbacks.LearningRateScheduler(schedule, verbose=1)])

In [None]:
model.save('../cache/model2.h5')

In [None]:
model_json = model.to_json()

In [None]:
with open('../cache/model2.json', 'w') as f:
    f.write(model_json)

In [None]:
model.save_weights('../cache/model2_weights.h5')

In [None]:
pr = model.predict(X_val, batch_size=32, verbose=1)

In [None]:
# pr = pr.ravel()
pr = pr.argmax(1)

In [None]:
pd.Series.value_counts(pr)

In [None]:
accuracy_score(np.argmax(y_val, 1), pr)

In [None]:
plt.plot(hist.history['loss'], label='loss')
plt.plot(hist.history['val_loss'], label='val_loss')

In [None]:
cm = confusion_matrix(np.argmax(y_val, 1), pr)

In [None]:
cm

In [None]:
print(classification_report(np.argmax(y_val, 1), pr)) # :100 standart fcn

In [None]:
print(classification_report(np.argmax(y_val, 1), pr)) # :100 standart fcn

---

In [None]:
pr = model.predict(X_test, batch_size=32, verbose=1)

In [None]:
# pr = pr.ravel()
pr = pr.argmax(1)

In [None]:
pd.Series.value_counts(pr)

In [None]:
accuracy_score(np.argmax(y_test, 1), pr)

In [None]:
cm = confusion_matrix(np.argmax(y_test, 1), pr)

In [None]:
cm

In [None]:
print(classification_report(np.argmax(y_test, 1), pr)) # :100 standart fcn

In [None]:
print(classification_report(np.argmax(y_test, 1), pr)) # :100 standart fcn

---

In [None]:
# %time model.predict(X_val[:1, :, :, :])

In [None]:
%time model.predict(X_val[:1, :, :])

In [None]:
plt.hist(np.argmax(y_test, 1), bins=3);

In [None]:
plt.hist(pr, bins=3);

In [None]:
accuracy_score(np.argmax(y_test, 1), [2] * len(y_test))

In [None]:
model.get_weights()[0].shape

In [None]:
plt.imshow(model.get_weights()[0][:, :, 2])
plt.gray()

## Holdout test

In [None]:
pr = model.predict(X_test, batch_size=128, verbose=1)

In [None]:
# pr = pr.ravel()
pr = pr.argmax(1)

In [None]:
pd.Series.value_counts(pr)

In [None]:
accuracy_score(np.argmax(y_test, 1), pr)

In [None]:
cm = confusion_matrix(np.argmax(y_test, 1), pr)

In [None]:
cm

In [None]:
print(classification_report(np.argmax(y_test, 1), pr)) # :100 standart fcn

# Testing on new data

In [None]:
def get_praat_annotations(paths):
    """Read the first interval tier of each annotation file into a DataFrame.

    Each file is read line by line (whitespace-stripped). The tier starts at
    the first line equal to ``"IntervalTier"``; that line and the four lines
    after it are skipped, and the remaining lines, up to the next
    ``"IntervalTier"`` line or the end of the file, are consumed in groups of
    three: start, finish, label.
    NOTE(review): this layout presumably corresponds to Praat's short
    TextGrid text format; confirm against the annotation files.

    Parameters
    ----------
    paths : iterable of str
        Paths of the annotation files.

    Returns
    -------
    list of pandas.DataFrame
        One frame per path with columns ``start``, ``finish``, ``label``
        (double quotes removed) and ``length`` (``finish - start``).
    """
    frames = []
    for path in paths:
        with open(path) as f:
            lines = [line.strip() for line in f]

        tier_markers = [pos for pos, line in enumerate(lines)
                        if line == '"IntervalTier"']
        # Skip the marker line plus the four tier header lines.
        first_interval = tier_markers[0] + 5
        tier_end = tier_markers[1] if len(tier_markers) > 1 else len(lines)
        tier_lines = lines[first_interval:tier_end]

        records = []
        # Only complete (start, finish, label) triples are used.
        for k in range(0, len(tier_lines) - 2, 3):
            start = float(tier_lines[k])
            finish = float(tier_lines[k + 1])
            records.append({
                'start': start,
                'finish': finish,
                'label': tier_lines[k + 2].replace('"', ''),
                'length': finish - start
            })
        frames.append(pd.DataFrame(records))
    return frames

test_annotation = get_praat_annotations(['../annotations/lavina_class.TextGrid'])

In [None]:
test_audio, sr = librosa.load('../audio/lavina_class.m4a')

In [None]:
interval_time = 0.3
interval_step = 0.1

interval_len, step_len = librosa.time_to_samples(interval_time), \
                 librosa.time_to_samples(interval_step) #[0]

In [None]:
interval_len, step_len

In [None]:
test_annotation[0][test_annotation[0].label == 'a'].length.describe()

In [None]:
librosa.time_to_samples(0.18)

In [None]:
def get_intersection_of_intervals(a, b):
    """Return the length of the overlap between intervals ``a`` and ``b``.

    Parameters
    ----------
    a : sequence of two numbers
        Reference interval ``(start, end)``.
    b : sequence of two numbers
        Interval ``(start, end)`` compared against ``a``.

    Returns
    -------
    number
        Length of ``a`` minus the parts of ``a`` lying to the right and to
        the left of ``b``; 0 when the intervals do not overlap.
    """
    span = a[1] - a[0]

    # Part of `a` after the end of `b`; all of `a` if `b` ends before `a` starts.
    uncovered_right = max(a[1] - b[1], 0) if b[1] > a[0] else span
    # Part of `a` before the start of `b`; all of `a` if `b` starts after `a` ends.
    uncovered_left = max(b[0] - a[0], 0) if b[0] < a[1] else span

    return span - uncovered_right - uncovered_left

In [None]:
len(test_audio)

In [None]:
min_delta = interval_len // 3

In [None]:
min_delta

In [None]:
def match_intervals(annotation, audio, labels, sr=22050):
    """Assign a class to every sliding window of ``audio`` from annotations.

    Windows of ``interval_len`` samples are taken every ``step_len`` samples
    (the last four window starts are dropped). For each window the labels
    are tried in the given order; the first annotated interval overlapping
    the window by more than ``min_delta`` samples decides the class. Windows
    without such an overlap get class 2.

    Relies on the notebook-level names ``interval_len``, ``step_len``,
    ``min_delta`` and ``get_intersection_of_intervals``.

    Parameters
    ----------
    annotation : pandas.DataFrame
        Frame with ``label``, ``start`` and ``finish`` columns; times are
        converted to samples with ``librosa.time_to_samples``.
    audio : sequence
        Audio samples; only its length is used.
    labels : iterable of str
        Annotation labels to look for, in priority order. Each must be one
        of ``'a'``, ``'b'``, ``'c'``; an unknown label raises ``KeyError``
        up front, before any window is processed.
    sr : int
        Sampling rate used to convert annotation times to sample indices.

    Returns
    -------
    audio_intervals : list of (int, int)
        ``(start, end)`` sample indices of every window.
    targets : list of int
        Class index per window.
    values : list
        Overlap (in samples) that decided the class, 0 when nothing matched.
    """
    label_map = {'a': 0, 'b': 1, 'c': 2}
    default_class = 2

    audio_intervals = [(x, x + interval_len) for x in range(0, len(audio), step_len)][:-4]

    # The annotated spans do not depend on the window, so convert them to
    # sample indices once per label instead of once per window and label.
    # `sr` is passed by keyword because newer librosa releases require it.
    label_spans = []
    for label in labels:
        rows = annotation[annotation.label == label]
        starts = librosa.time_to_samples(rows.start.values, sr=sr)
        finishes = librosa.time_to_samples(rows.finish.values, sr=sr)
        label_spans.append((label_map[label], starts, finishes))

    targets = []
    values = []
    for interval in audio_intervals:
        target, value = default_class, 0
        for class_id, starts, finishes in label_spans:
            for s, f in zip(starts, finishes):
                overlap = get_intersection_of_intervals(interval, (s, f))
                if overlap > min_delta:
                    value = overlap
                    break
            if value != 0:
                # The first label with a sufficient overlap wins.
                target = class_id
                break
        targets.append(target)
        values.append(value)
    return audio_intervals, targets, values

Идем с шагом step_len и берем интервалы по interval_len, проверяем - с каким из размеченных интервалов пересекается наш интервал больше чем на min_delta и присваиваем его метку

Не учитываются случаи, когда идет несколько подряд "эээ" и "нуу", и так как метки отсортированы, пока интервал будет касаться левой стороной первого "эээ", а правой "нуу", то будет присвоена метка "эээ". **TODO**

In [None]:
%%time
test_intervals, test_targets, vals = match_intervals(test_annotation[0], test_audio,
                                                     ['a', 'b'])

In [None]:
len(test_intervals), len(test_targets)

In [None]:
pd.Series.value_counts(test_targets)

In [None]:
plt.plot(test_targets[:100])

In [None]:
plt.plot(vals[:100])

In [None]:
# def get_label_data(annotation, audio, labels, sr=22050):
#     # по дефолту все 2го класса
#     data = pd.Series(index=np.arange(0, len(audio) - interval_len, step_len), data=2)
#     label_map = {'a': 0, 'b': 1, 'c': 2}
#     for label in labels:
#         l = label_map[label]
#         start_samples_indxs = librosa.time_to_samples(annotation[annotation.label ==\
#                                                               label].start.values, sr)
#         finish_samples_indxs = librosa.time_to_samples(annotation[annotation.label ==\
#                                                                label].finish.values, sr)
#         for s, f in zip(start_samples_indxs, finish_samples_indxs):
#             s = int(np.round(s / step_len)) * step_len
#             f = (int(np.round(f / step_len)) - 2) * step_len
#             # вычитаем 2 потому что f - это конец интервала, в Series мы заносим метку для 
#             # начала  интервала, длина интервала 2 * step, поэтому нужно вычесть 2
#             data[(data.index >= s) & (data.index <= f)] = l
#     return data

In [None]:
len(test_audio)

In [None]:
test_samples = [test_audio[s:f] for s, f in test_intervals]

### Checking intervals

In [None]:
t1 = [(t1, t2) for t1, t2 in zip(test_samples, vals) if t2 !=0]

In [None]:
list(filter(lambda x: x!=0, vals))[:10]

In [None]:
Audio(t1[9][0], rate=sample_rate)

In [None]:
test_samples[-1].shape

In [None]:
len(test_samples)

In [None]:
test_features = np.stack([extract_features(x) for x in tqdm(test_samples)])

In [None]:
test_features.shape

TODO: нормальная нормализация

-20.350365, 2.89434

In [None]:
# test_features -= -20.350365
# test_features /= 2.89434

In [None]:
test_features -= min_spec
test_features /= (max_spec - min_spec)

In [None]:
test_features.min(), test_features.max()

In [None]:
test_predict = model.predict(test_features, batch_size=32).argmax(1)

In [None]:
test_predict_pr = model.predict(test_features, batch_size=32)

**TODO** сделать усреднение предикта с 1 влево 1 вправо интервалами

In [None]:
accuracy_score(test_targets, test_predict)

In [None]:
accuracy_score(test_targets, [2] * len(test_predict))

In [None]:
# print(classification_report(test_targets, test_predict))

In [None]:
print(classification_report(test_targets, test_predict))

---

In [None]:
plt.plot(test_predict[:2000], '--', label='pr', alpha=0.4)
plt.plot(test_targets[:2000], '-.', label='tr', alpha=0.4)
plt.legend()

## Sliding window

In [None]:
# Smooth the class probabilities over time: each row becomes the mean of the
# next, current and previous window. The first and last rows are repeated at
# the edges, so the output has the same shape as test_predict_pr.
# Padding the whole array once replaces the per-class copy-paste and works
# for any number of classes.
_padded_pr = np.pad(test_predict_pr, ((1, 1), (0, 0)), 'edge')
test_predict_mean = (_padded_pr[2:] + _padded_pr[1:-1] + _padded_pr[:-2]) / 3

In [None]:
test_predict_mean.shape

In [None]:
lim = 100

In [None]:
plt.plot(test_predict[:lim])

In [None]:
plt.plot(test_predict_pr[:lim, 0], label='0')
plt.plot(test_predict_pr[:lim, 1], label='1')
plt.plot(test_predict_pr[:lim, 2], label='2')
plt.legend()

In [None]:
plt.plot(test_predict_mean[:lim, 0], label='0')
plt.plot(test_predict_mean[:lim, 1], label='1')
plt.plot(test_predict_mean[:lim, 2], label='2')
plt.legend()

In [None]:
test_predict_mean[:, 2]

In [None]:
test_predict_pr[:, 2]

In [None]:
print(classification_report(test_targets, test_predict_pr.argmax(1)))

In [None]:
print(classification_report(test_targets, test_predict_mean.argmax(1)))

In [None]:
confusion_matrix(test_targets, test_predict_pr.argmax(1))

In [None]:
confusion_matrix(test_targets, test_predict_mean.argmax(1))

## Errors analysis

In [None]:
e = np.nonzero((test_targets != test_predict_pr.argmax(1)) & \
               (test_predict_pr.argmax(1) == 0))[0]

wids = [Audio(test_samples[i], rate=22050) for i in e[:5]]
for w in wids:
    display(w)

In [None]:
e = np.nonzero((test_targets != test_predict_pr.argmax(1)) & \
               (test_predict_pr.argmax(1) == 1))[0]

wids = [Audio(test_samples[i], rate=22050) for i in e[:5]]
for w in wids:
    display(w)

In [None]:
e = np.nonzero((test_targets != test_predict_pr.argmax(1)) & \
               (test_predict_pr.argmax(1) == 2))[0]

wids = [Audio(test_samples[i], rate=22050) for i in e[:5]]
for w in wids:
    display(w)

## Отображение результата

In [None]:
from PIL import Image
import os
import shutil

In [None]:
# Remove frames left over from a previous run. ignore_errors keeps this cell
# from failing when ../cache/imgs does not exist yet (e.g. on a fresh run).
shutil.rmtree('../cache/imgs/', ignore_errors=True)

In [None]:
# exist_ok makes the cell safe to re-run when the directory is already there.
os.makedirs('../cache/imgs', exist_ok=True)

In [None]:
1 / interval_step

In [None]:
interval_step

In [None]:
# Colour channel lit for each class: 0 -> red, 2 -> blue, anything else -> green.
_channel_for_class = {0: 0, 2: 2}


def _solid_frame(label):
    """Return a 32x64 RGB uint8 image filled with the colour of `label`."""
    frame = np.zeros((32, 64, 3), dtype='uint8')
    frame[..., _channel_for_class.get(label, 1)] = 255
    return frame


im = None
for i, (r, t) in enumerate(zip(test_predict_mean.argmax(1), test_targets)):
    # Smoothed prediction on top, annotated target underneath.
    im = Image.fromarray(np.vstack((_solid_frame(r), _solid_frame(t))))
    # Frames are numbered from 2; slots 0 and 1 are filled after the loop.
    im.save('../cache/imgs/test_{:04d}.png'.format(i + 2))

# Slots 0 and 1 receive copies of the last rendered frame, as before.
# NOTE(review): presumably this shifts the video by two steps relative to the
# audio - confirm the intent. Skipped when no frame was rendered, instead of
# raising NameError or reusing a stale image from an earlier run.
if im is not None:
    im.save('../cache/imgs/test_{:04d}.png'.format(0))
    im.save('../cache/imgs/test_{:04d}.png'.format(1))

In [None]:
os.system('ffmpeg -r 10 -i ../cache/imgs/test_%04d.png -i ../audio/lavina_class.m4a -vcodec mpeg4 -y ../cache/a.mp4')

## Ручная корректировка шума

In [None]:
# NOTE(review): `test_predict_new` is not assigned anywhere in the visible
# part of this notebook - presumably it came from a cell that was removed.
# On a fresh kernel this line would raise NameError; confirm and restore the
# definition.
test_predict_new = test_predict_new.ravel()

In [None]:
%%time
# Post-process the 0/1 predictions: a 0 is kept only when it belongs to a run
# of at least two consecutive 0s; an isolated 0 is treated as noise and
# replaced by 1. Each element is paired with its successor; a sentinel 1 is
# appended so the last element is compared against a non-zero "next" value.
test_predict_new2 = []

# True while we are inside a run of consecutive 0s.
start = False
for t, tn in zip(test_predict_new, np.append(test_predict_new[1:], [1])):
    if t == 0 and tn == 0 and not start:
        # 0 followed by 0 and no run in progress: start a run and keep the 0
        start = True
        test_predict_new2.append(0)
    elif t == 0 and tn == 0 and start:
        # 0 followed by 0 while in a run: keep the 0, we are inside a series of zeros
        test_predict_new2.append(0)
    elif t == 0 and tn != 0 and not start:
        # 0 followed by non-0 and no run in progress: isolated 0, i.e. noise, emit 1
        test_predict_new2.append(1)
    elif t == 0 and tn != 0 and start:
        # 0 followed by non-0 while in a run: last 0 of the run, keep it and end the run
        start = False
        test_predict_new2.append(0)
    elif t == 1:
        # a 1 is copied through as-is
        test_predict_new2.append(1)
        start = False
    else:
        # Any other value is only printed; nothing is appended for it.
        print(t, tn, start)

In [None]:
pd.Series.value_counts(test_predict_new)

In [None]:
pd.Series.value_counts(test_predict_new2)

In [None]:
print(classification_report(test_data.values, test_predict_new))

In [None]:
print(classification_report(test_data.values, test_predict_new2))

Precision поднялся на 0.07, а recall упал лишь на 0.01

## Отображение результата

In [None]:
# Remove frames from the previous rendering. ignore_errors keeps this cell
# from failing when ../cache/imgs does not exist.
shutil.rmtree('../cache/imgs/', ignore_errors=True)

In [None]:
# exist_ok makes the cell safe to re-run when the directory is already there.
os.makedirs('../cache/imgs', exist_ok=True)

In [None]:
for i, (r, t) in enumerate(zip(test_predict_new2, test_data.values)):
    im_r = np.zeros((32, 64, 3), dtype='uint8')
    if r == 0:
        im_r[:,:,0] = 255
    elif r == 1:
        im_r[...,2] = 255
    else:
        im_r[...,1] = 255
        
    im_t = np.zeros((32, 64, 3), dtype='uint8')
    if t == 0:
        im_t[:,:,0] = 255
    elif t == 1:
        im_t[...,2] = 255
    else:
        im_t[...,1] = 255
        
    im = np.vstack((im_r, im_t))
    im = Image.fromarray(im)
    im.save('../cache/imgs/test_{:04d}.png'.format(i + 2))
im.save('../cache/imgs/test_{:04d}.png'.format(0))
im.save('../cache/imgs/test_{:04d}.png'.format(1))

In [None]:
os.system('ffmpeg -r 20 -i ../cache/imgs/test_%04d.png -i ../audio/arhis1.mp3 -vcodec mpeg4 -y ../cache/aa.mp4')