## Libraies:

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn import model_selection, preprocessing, metrics
from keras.models import Model
from keras.layers import Input, LSTM, Dense, TimeDistributed, Bidirectional # Keras's layers
from tqdm import tqdm
from collections import Counter

## Common settings and params:

In [24]:
# Change working directory:
os.chdir('C:/Users/User/Desktop/VegaVAD project/')
train_file_1 = 'data/train_features1.csv'
train_file_2 = 'data/train_features2.csv'
test_file_1 = 'data/test_features1.csv'
test_file_2 = 'data/test_features2.csv'

# Common params:
fs = 16000
frame_length = 0.025
frame_step = 0.01

# Params for ViVoVAD model:
n_frames = 300 # Number frames in one batch for BILSTM
epochs_ViVoVAD = 25 # Number epochs for BILSTM
batch_size_ViVoVAD = 64 # Number batchs for BILSTM

# Params for Hybrid CNN-BiLSTM:

## Load data:

In [3]:
train_1 = pd.read_csv(train_file_1, header=None)
train_2 = pd.read_csv(train_file_2, header=None)
valid_1 = pd.read_csv(test_file_1, header=None)
valid_2 = pd.read_csv(test_file_2, header=None)

train_1 = np.array(train_1)
train_2 = np.array(train_2)
valid_1 = np.array(valid_1)
valid_2 = np.array(valid_2)

print('Train size', train_1.shape, train_2.shape)
print('Valid size', valid_1.shape, valid_2.shape)

Train size (897000, 34) (27000, 1025)
Valid size (299000, 34) (9000, 1025)


# ViVoVAD:a Voice Activity Detection Tool based on Recurrent Neural Networks

## Build and training model:

We will use BLSTM layers stacked with linear layer on output. In order to reduce the delay of the dependencies, training and evaluation is perfomed with limited length sequences of 300 frames (3 seconds).

![BILSTM_pic.PNG](attachment:BILSTM_pic.PNG)

## Preparation data:

We need to split into test and train sets and to scale data to input the model.

In [4]:
# Split dataset on train and test:
X = train_1[:, 1:]
y = train_1[:, 0]
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.33, shuffle=True, random_state=1)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Scale data:
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Reshape datasets to 1 x 300 x num_features:
# So, each group need to consist of 300 frames:
X_train_reshaped = X_train[:int(X_train.shape[0] / n_frames) * n_frames]
X_train_reshaped = X_train_reshaped.reshape(int(X_train_reshaped.shape[0] / n_frames), n_frames, X_train_reshaped.shape[1])
X_test_reshaped = X_test[:int(X_test.shape[0] / n_frames) * n_frames]
X_test_reshaped = X_test_reshaped.reshape(int(X_test_reshaped.shape[0] / n_frames), n_frames, X_test_reshaped.shape[1])

(600990, 33) (296010, 33) (600990,) (296010,)


In [5]:
# Encoding label:
y_train = pd.get_dummies(y_train)
y_test = pd.get_dummies(y_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

y_train_reshaped = y_train[:int(y_train.shape[0] / n_frames) * n_frames]
y_train_reshaped = y_train_reshaped.reshape(int(y_train_reshaped.shape[0] / n_frames), n_frames, y_train_reshaped.shape[1])
y_test_reshaped = y_test[:int(y_test.shape[0] / n_frames) * n_frames]
y_test_reshaped = y_test_reshaped.reshape(int(y_test_reshaped.shape[0] / n_frames), n_frames, y_test_reshaped.shape[1])

## Build and fit model:

![Results%20for%20BILSTM.PNG](attachment:Results%20for%20BILSTM.PNG)
According pics we need to use the next struct for model:
1 layer with 128 neurons to reduse miss.

In [6]:
input_layer = Input(shape=(n_frames, X_train.shape[1]))
BLSTM_1_layer = Bidirectional(LSTM(128, return_sequences=True))(input_layer)
output_layer = TimeDistributed(Dense(3, activation='softmax'))(BLSTM_1_layer)
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [7]:
model.fit(X_train_reshaped, y_train_reshaped, validation_data=(X_test_reshaped, y_test_reshaped), epochs=epochs_ViVoVAD, batch_size=batch_size_ViVoVAD)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25


Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25


Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25


Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25


Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x20b92e14088>

## Do the validation:

In [28]:
X_valid = valid_1[:, 1:]
order_record = valid_1[:, 0]
last_number = order_record[-1]
list_number = np.arange(0, last_number + 1, 1)
list_predicted_labels = list()

# Scaling:
X_valid = scaler.transform(X_valid)

for i in list_number:
    # Choose frames:
    indexes = [j for j in range(len(order_record)) if order_record[j] == i]
    
    # Reshape datasets to 1 x 300 x num_features:
    X_valid_tmp = X_valid[indexes[0]:indexes[-1] + 1]
    if X_valid_tmp.shape[0] < n_frames:
        added_frames = np.zeros((n_frames - X_valid_tmp.shape[0], X_valid_tmp.shape[1]))
        X_valid_tmp = np.vstack((X_valid_tmp, added_frames))
    X_valid_reshaped = X_valid_tmp[:int(X_valid_tmp.shape[0] / n_frames) * n_frames]
    X_valid_reshaped = X_valid_reshaped.reshape(int(X_valid_reshaped.shape[0] / n_frames), n_frames, X_valid_reshaped.shape[1])
    prediction = model.predict(X_valid_reshaped)
    
    predicted_label = np.zeros(prediction.shape[0] * prediction.shape[1])
    shift = prediction.shape[1]
    for j in range(prediction.shape[0]):
        for jj in range(prediction.shape[1]):
            predicted_label[jj + j * shift] = np.argmax(prediction[j][jj])
            
    list_predicted_labels.append(Counter(predicted_label).most_common()[0][0])

In [29]:
list_predicted_labels

[0.0,
 2.0,
 2.0,
 0.0,
 1.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 1.0,
 0.0,
 0.0,
 2.0,
 0.0,
 2.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 2.0,
 1.0,
 1.0,
 2.0,
 0.0,
 0.0,
 2.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 2.0,
 0.0,
 1.0,
 2.0,
 1.0,
 0.0,
 2.0,
 0.0,
 1.0,
 2.0,
 2.0,
 0.0,
 2.0,
 0.0,
 1.0,
 1.0,
 0.0,
 2.0,
 2.0,
 2.0,
 2.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 2.0,
 0.0,
 2.0,
 2.0,
 2.0,
 2.0,
 0.0,
 0.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 1.0,
 1.0,
 1.0,
 0.0,
 2.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 2.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 2.0,
 2.0,
 0.0,
 1.0,
 0.0,
 2.0,
 1.0,
 1.0,
 1.0,
 2.0,
 2.0,
 2.0,
 0.0,
 1.0,
 1.0,
 2.0,
 1.0,
 2.0,
 1.0,
 2.0,
 0.0,
 2.0,
 0.0,
 2.0,
 1.0,
 2.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 2.0,
 0.0,
 2.0,
 2.0,
 1.0,
 1.0,
 1.0,
 2.0,
 2.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 2.0,
 1.0,
 2.0,
 1.0,
 2.0,
 2.0,
 2.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 2.0,
 2.0,
 2.0,
 0.0,
 2.0,
 2.0,
 0.0,
 0.0