In [1]:
import os
import librosa
from matplotlib import pyplot
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile
import warnings
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from tensorflow import keras
from keras.layers import Dense, Dropout, Flatten, Conv1D, Input, MaxPooling1D
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
from pydub import AudioSegment


warnings.filterwarnings("ignore")

2021-11-03 19:36:18.413078: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-03 19:36:18.413132: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
labels=["eight", "four", "five", "three", "six"]

In [3]:
train_audio_path = '/home/wojtek/SG/speech_classification/'

all_wave = []
all_label = []
for label in labels:
    print(label)
    waves = [f for f in os.listdir(train_audio_path + '/'+ label) if f.endswith('.wav')]
    for wav in waves[:800]:
        samples, sample_rate = librosa.load(train_audio_path + '/' + label + '/' + wav, sr = 16000)
        samples = librosa.resample(samples, sample_rate, 8000)
        if(len(samples)== 8000): 
            all_wave.append(samples)
            all_label.append(label)

eight
four
five
three
six


In [5]:
le = LabelEncoder()
y=le.fit_transform(all_label)
classes= list(le.classes_)

In [6]:
y=np_utils.to_categorical(y, num_classes=len(labels))

In [7]:
all_wave = np.array(all_wave).reshape(-1,8000,1)

In [8]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10, min_delta=0.0001) 
mc = ModelCheckpoint('best_model.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='max')

In [87]:
def manipulate(sound, speed_factor):
    return librosa.effects.time_stretch(sound, speed_factor)

def manipulate2(sound, sound_rate, n_steps):
    return librosa.effects.pitch_shift(sound, sound_rate, n_steps)

@tf.function
def data_augmentation_sound(sound, p=0.5):
    if tf.random.uniform([])<p:
        sound = sound + 0.9*np.random.normal(0,1,8000)
        #manipulate(sound, 0.8)
        #manipulate2(sound, 8000, -3)
        print("dziala")
    else:
        sound
    #sound = tf.reshape(sound, (None,8000,1))
    #sound = tf.expand_dims(sound, 1)
    return sound

def data_augmentation(factor=0.5):
    return keras.layers.Lambda(lambda x: data_augmentation_sound(x, factor))

data_augmentation = data_augmentation()

class RandomAugmentation(keras.layers.Layer):
    def __init__(self, factor=0.5, **kwargs):
        super().__init__(**kwargs)
        self.factor = factor
        
    def call(self, x):
        return data_augmentation_sound(x)

In [10]:
def predict(audio, model):
    indices = np.empty(len(y_valid))
    e = 0
    for i in audio:
        prob=model.predict(i.reshape(-1,8000,1))
        index=np.argmax(prob[0])
        indices[e]=index
        e += 1
    return indices

In [11]:
import logging
import tensorflow as tf
tf.get_logger().setLevel('INFO')
tf.get_logger().setLevel(logging.ERROR)

x_train, x_valid, y_train, y_valid = train_test_split(np.array(all_wave),np.array(y),stratify=y,test_size = 0.1,random_state=777,shuffle=True)

precision_summary =[]
recall_summary = []
f1_summary = []
for i in range(3):
    K.clear_session()
    inputs = Input(shape=(8000,1))
    #First Conv1D layer
    conv = Conv1D(8,13, padding='valid', activation='relu', strides=1)(inputs)
    conv = MaxPooling1D(3)(conv)
    conv = Dropout(0.3)(conv)
    #Second Conv1D layer
    conv = Conv1D(16, 11, padding='valid', activation='relu', strides=1)(conv)
    conv = MaxPooling1D(3)(conv)
    conv = Dropout(0.3)(conv)
    #Third Conv1D layer
    conv = Conv1D(32, 9, padding='valid', activation='relu', strides=1)(conv)
    conv = MaxPooling1D(3)(conv)
    conv = Dropout(0.3)(conv)
    #Fourth Conv1D layer
    conv = Conv1D(64, 7, padding='valid', activation='relu', strides=1)(conv)
    conv = MaxPooling1D(3)(conv)
    conv = Dropout(0.3)(conv)
    #Flatten layer
    conv = Flatten()(conv)
    #Dense Layer 1
    conv = Dense(256, activation='relu')(conv)
    conv = Dropout(0.3)(conv)
    #Dense Layer 2
    conv = Dense(128, activation='relu')(conv)
    conv = Dropout(0.3)(conv)
    outputs = Dense(len(labels), activation='softmax')(conv)

    model = Model(inputs, outputs)
    #model.summary()
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

    history = model.fit(x_train, y_train ,epochs=3, verbose=0, callbacks=[es,mc], batch_size=32, validation_data=(x_valid,y_valid))
    y_pred = predict(x_valid, model)
    y_valid_max = np.argmax(y_valid, axis=1)
    confmat = confusion_matrix(y_true=y_valid_max, y_pred=y_pred)
    print("\n\nMacierz pomylek: \n\n{}".format(confmat))

    precision_macro = precision_score(y_true=y_valid_max, y_pred=y_pred, average='macro')
    precision_summary.append(precision_macro)
    recall_macro = recall_score(y_true=y_valid_max, y_pred=y_pred, average='macro')
    recall_summary.append(recall_macro)
    f1_macro = f1_score(y_true=y_valid_max, y_pred=y_pred, average='macro')
    f1_summary.append(f1_macro)
precision_avg = sum(precision_summary)/len(precision_summary)
recall_avg = sum(recall_summary)/len(recall_summary)
f1_avg = sum(f1_summary)/len(f1_summary)
print("\nPrecyzja: {}".format(precision_summary))
print("\nPelnosc: {}".format(recall_summary))
print("\nF1: {}".format(f1_summary))
print("\nSrednia precyzja dla 10 prob: {}".format(precision_avg))
print("\nSrednia pelnosc dla 10 prob: {}".format(recall_avg))
print("\nSrednie F1 dla 10 prob: {}".format(f1_avg))

2021-11-03 19:37:29.655312: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-11-03 19:37:29.655359: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-11-03 19:37:29.655387: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (wojtek): /proc/driver/nvidia/version does not exist
2021-11-03 19:37:29.655604: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-11-03 19:37:29.773725: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 104480000 exceeds 10



Macierz pomylek: 

[[ 3  0  0 35 33]
 [ 0 39 19  9  6]
 [ 1  3 13 35 21]
 [ 1  0  1 52 20]
 [ 2  1  0 40 29]]


2021-11-03 19:38:25.557571: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 104480000 exceeds 10% of free system memory.




Macierz pomylek: 

[[20  0  0 34 17]
 [ 0 55 13  2  3]
 [ 1  1 62  9  0]
 [ 8  0  1 52 13]
 [ 5  0  1 36 30]]


2021-11-03 19:39:21.307551: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 104480000 exceeds 10% of free system memory.




Macierz pomylek: 

[[19  0  0 46  6]
 [ 0 40 29  2  2]
 [ 0  4 58 10  1]
 [ 6  0  3 58  7]
 [ 4  1  1 43 23]]

Precyzja: [0.459927235963978, 0.6485481752509618, 0.6271894808004111]

Pelnosc: [0.37201255374610576, 0.6007598472483673, 0.5426599230911405]

F1: [0.3479103347526187, 0.6014384258590528, 0.5355103333066559]

Srednia precyzja dla 10 prob: 0.578554964005117

Srednia pelnosc dla 10 prob: 0.5051441080285378

Srednie F1 dla 10 prob: 0.4949530313061092


In [90]:
precision_summary =[]
recall_summary = []
f1_summary = []
for i in range(3):
    K.clear_session()
    inputs = Input(shape=(8000,1))
    conv = data_augmentation(inputs)
    #First Conv1D layer
    conv = Conv1D(8,13, padding='valid', activation='relu', strides=1)(conv)
    #conv = Conv1D(8,13, padding='valid', activation='relu', strides=1)(inputs)
    conv = MaxPooling1D(3)(conv)
    conv = Dropout(0.3)(conv)
    #Second Conv1D layer
    conv = Conv1D(16, 11, padding='valid', activation='relu', strides=1)(conv)
    conv = MaxPooling1D(3)(conv)
    conv = Dropout(0.3)(conv)
    #Third Conv1D layer
    conv = Conv1D(32, 9, padding='valid', activation='relu', strides=1)(conv)
    conv = MaxPooling1D(3)(conv)
    conv = Dropout(0.3)(conv)
    #Fourth Conv1D layer
    conv = Conv1D(64, 7, padding='valid', activation='relu', strides=1)(conv)
    conv = MaxPooling1D(3)(conv)
    conv = Dropout(0.3)(conv)
    #Flatten layer
    conv = Flatten()(conv)
    #Dense Layer 1
    conv = Dense(256, activation='relu')(conv)
    conv = Dropout(0.3)(conv)
    #Dense Layer 2
    conv = Dense(128, activation='relu')(conv)
    conv = Dropout(0.3)(conv)
    outputs = Dense(len(labels), activation='softmax')(conv)

    model = Model(inputs, outputs)
    model.summary()
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

    history = model.fit(x_train, y_train ,epochs=3, verbose=0, callbacks=[es,mc], batch_size=32, validation_data=(x_valid,y_valid))
    y_pred = predict(x_valid, model)
    y_valid_max = np.argmax(y_valid, axis=1)
    confmat = confusion_matrix(y_true=y_valid_max, y_pred=y_pred)
    print("\n\nMacierz pomylek: \n\n{}".format(confmat))

    precision_macro = precision_score(y_true=y_valid_max, y_pred=y_pred, average='macro')
    precision_summary.append(precision_macro)
    recall_macro = recall_score(y_true=y_valid_max, y_pred=y_pred, average='macro')
    recall_summary.append(recall_macro)
    f1_macro = f1_score(y_true=y_valid_max, y_pred=y_pred, average='macro')
    f1_summary.append(f1_macro)
precision_avg = sum(precision_summary)/len(precision_summary)
recall_avg = sum(recall_summary)/len(recall_summary)
f1_avg = sum(f1_summary)/len(f1_summary)
print("\nPrecyzja: {}".format(precision_summary))
print("\nPelnosc: {}".format(recall_summary))
print("\nF1: {}".format(f1_summary))
print("\nSrednia precyzja dla 10 prob: {}".format(precision_avg))
print("\nSrednia pelnosc dla 10 prob: {}".format(recall_avg))
print("\nSrednie F1 dla 10 prob: {}".format(f1_avg))

ValueError: The channel dimension of the inputs should be defined. Found `None`.