In [1]:
import collections
import librosa
import matplotlib as plt
import numpy as np
import os
import python_speech_features as psf
import random
import struct
import wave

In [30]:
import keras
from keras.models import Sequential,Input,Model
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D, Conv1D
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import LeakyReLU

In [3]:
%matplotlib inline

### Constants

In [4]:
LANGUAGES_FOLDER = '/home/kolegor/Study/Master/Work/repo/LangIS/data/wav/'
LANGUAGES_LIST = os.listdir(LANGUAGES_FOLDER)
LANGUAGE_FOLDER_TEMPLATE = '/home/kolegor/Study/Master/Work/repo/LangIS/data/wav/{}/'
TEST_LANGUAGE = LANGUAGES_LIST[0]
TEST_WAVFILE_PATH = LANGUAGE_FOLDER_TEMPLATE.format(TEST_LANGUAGE) + os.listdir(LANGUAGES_FOLDER + LANGUAGES_LIST[0])[0]

In [5]:
MAP_LABEL_TO_LANGUAGE = dict(enumerate(LANGUAGES_LIST))
MAP_LANGUAGE_TO_LABEL = dict(zip(LANGUAGES_LIST, range(len(LANGUAGES_LIST))))

In [6]:
ONE_LANGUAGE_FILES_COUNT = 100

### Wav file wrapper

In [7]:
class WavFile(object):
    SAMPLE_SIZE = {
        1: np.int8,
        2: np.int16,
        4: np.int32
    }
    
    def __init__(self, wav_path):
        self.wav = wave.open(wav_path, 'r')

        params = self.wav.getparams()

        self.n_channels = params[0]
        self.sample_size = params[1]
        self.sample_rate = params[2]
        self.n_frames = params[3]

        frames = self.wav.readframes(self.n_frames)
        self.bytes = np.fromstring(frames, dtype=WavFile.SAMPLE_SIZE[self.sample_size])
    
    def __str__(self):
        return 'Channels (count): {}\nSample size (bytes): {}\nFrame rate (Hz): {}\nFrames (count): {}'.format(
            self.n_channels, self.sample_size, self.sample_rate, self.n_frames
        )
    
    def __repr__(self):
        return self.__str__()

### Features

In [8]:
class Config(object):
    def __init__(self):
        self.n_mfcc = 13
        self.frame_length_sec = 0.025
        self.frame_shift_sec = 0.01
        self.min_freq = 0
        self.max_freq = 5000
        self.n_mels_spec = 39


cnf = Config()

In [9]:
def mfcc(values, sample_rate, frame_length=cnf.frame_length_sec, frame_shift=cnf.frame_shift_sec, n_mfcc=cnf.n_mfcc):
    return librosa.feature.mfcc(
        y=values,
        sr=sample_rate,
        n_mfcc=n_mfcc,
        n_fft=int(frame_length * sample_rate),
        hop_length=int(frame_shift * sample_rate),
        fmin=cnf.min_freq,
        fmax=cnf.max_freq
    ).T


def mfcc_delta(mfccs):
    return librosa.feature.delta(mfccs, axis=0)


def mfcc_delta_delta(mfccs_delta):
    return librosa.feature.delta(mfccs_delta, axis=0)

In [10]:
def spectrogram(values, sample_rate, frame_length=cnf.frame_length_sec, frame_shift=cnf.frame_shift_sec, n_mels=cnf.n_mels_spec):
    return librosa.feature.melspectrogram(
        y=values,
        sr=sample_rate,
        n_fft=int(frame_length * sample_rate),
        hop_length=int(frame_shift * sample_rate),
        fmin=MIN_FREQ,
        fmax=MAX_FREQ,
        n_mels=n_mels
    ).T

### Data loading

In [11]:
def load_language(language_id):
    language_folder = LANGUAGE_FOLDER_TEMPLATE.format(language_id)
    language_wavs = []

    for i, filename in enumerate(os.listdir(language_folder)):
        filepath = language_folder + filename
        wav_file = WavFile(TEST_WAVFILE_PATH)
        language_wavs.append(wav_file)

        if i + 1 == ONE_LANGUAGE_FILES_COUNT:
            break
    
    return language_wavs

In [12]:
def load(languages_list):
    languages_data = dict()
    for language_id in languages_list:
        cur_language_wavs = load_language(language_id)
        languages_data[language_id] = cur_language_wavs
    return languages_data

### Prepare data (load and extract features)

In [13]:
def get_one_wav_file_features(wav_file):
    mfccs = mfcc(wav_file.bytes, wav_file.sample_rate)
    mfccs_d = mfcc_delta(mfccs)
    mfccs_dd = mfcc_delta_delta(mfccs_d)
    return np.hstack((mfccs, mfccs_d, mfccs_dd))

In [14]:
def split_train_test(X, y, train_size=0.7):
    combined_by_label = collections.defaultdict(list)
    for features, label in zip(X, y):
        combined_by_label[label].append(features)
    
    train_feature_label_pairs = list()
    test_feature_label_pairs = list()
    
    for label, all_features in combined_by_label.iteritems():
        random.shuffle(all_features)
        border = int(train_size * len(all_features))
        
        train_feature_label_pairs.extend((f, label) for f in all_features[:border])
        test_feature_label_pairs.extend((f, label) for f in all_features[border + 1:])
    
    X_train, y_train, X_test, y_test = list(), list(), list(), list()

    # shuffle train and test
    random.shuffle(test_feature_label_pairs)
    for features, label in train_feature_label_pairs:
        X_train.append(features)
        y_train.append(label)
    
    random.shuffle(train_feature_label_pairs)
    for features, label in test_feature_label_pairs:
        X_test.append(features)
        y_test.append(label)

    return X_train, y_train, X_test, y_test

In [15]:
def prepare(languages_list):
    languages_wav_files = load(languages_list)

    X, y = [], []

    for language_id, wav_files in languages_wav_files.iteritems():
        y_label = MAP_LANGUAGE_TO_LABEL[language_id]
        for wav_file in wav_files:
            features = get_one_wav_file_features(wav_file)
            
            X.append(features)
            y.append(y_label)

    return X, y

In [16]:
X, y = prepare(['it', 'pt'])  # LANGUAGES_LIST[:2])
print len(X), X[0].shape

200 (638, 39)


In [17]:
X_train, y_train, X_test, y_test = split_train_test(X, y)
print len(X_train), len(X_test)

140 58


### ML models

In [91]:
class NN(object):
    def __init__(self, input_shape, num_classes):
        self.model = Sequential()
        self.model.add(Conv2D(32, kernel_size=(3, 3), activation='linear', input_shape=input_shape, padding='same'))
        self.model.add(LeakyReLU(alpha=0.1))
        self.model.add(MaxPooling2D((2, 2), padding='same'))
        self.model.add(Conv2D(64, (3, 3), activation='linear', padding='same'))
        self.model.add(LeakyReLU(alpha=0.1))
        self.model.add(MaxPooling2D(pool_size=(2, 2), padding='same'))
        self.model.add(Conv2D(128, (3, 3), activation='linear', padding='same'))
        self.model.add(LeakyReLU(alpha=0.1))                  
        self.model.add(MaxPooling2D(pool_size=(2, 2), padding='same'))
        self.model.add(Flatten())
        self.model.add(Dense(128, activation='linear'))
        self.model.add(LeakyReLU(alpha=0.1))                  
        self.model.add(Dense(num_classes, activation='softmax'))
        
        self.model.compile(
            loss=keras.losses.categorical_crossentropy,
            optimizer=keras.optimizers.Adam(),
            metrics=['accuracy']
        )
        # print self.model.summary()
    
    def fit(self, X_train, y_train, batch_size=64, epochs=10):
        self.model.fit(
            X_train, y_train,
            batch_size=batch_size,
            epochs=epochs,
            verbose=1,
            # validation_data=(valid_X, valid_label)
        )

    
    def evaluate(self, X_test, y_test):
        return self.model.evaluate(X_test, y_test, verbose=0)

In [97]:
def remake_X(X):
    qwe = np.array(X)
    return np.reshape(qwe, qwe.shape + (1,))

def remake_y(Y):
    mapper = dict(zip(set(y_train), range(len(set(Y)))))
    return np.array([(1, 0) if mapper[y] == 0 else (0, 1) for y in Y])

In [98]:
X_train_remake, y_train_remake = remake_X(X_train), remake_y(y_train)
X_test_remake, y_test_remake = remake_X(X_test), remake_y(y_test)

In [92]:
nnm = NN(input_shape=(638, 39, 1), num_classes=2)

In [100]:
nnm.fit(X_train_remade, y_train_remade, epochs=2)

Epoch 1/2
Epoch 2/2


In [102]:
test_eval = nnm.evaluate(X_test_remake, y_test_remake)
print('Test loss:', test_eval[0])
print('Test accuracy:', test_eval[1])

('Test loss:', 8.0590477976305728)
('Test accuracy:', 0.49999999794466743)


### Run tests

In [42]:
wav_file = WavFile(TEST_WAVFILE_PATH)
print wav_file, wav_file.bytes

Channels (count): 1
Sample size (bytes): 2
Frame rate (Hz): 48000
Frames (count): 306000 [110 122 120 ...,  45  52  59]


In [39]:
qwe = spectrogram(wav_file.bytes, wav_file.sample_rate)

In [123]:
mfccs = mfcc(wav_file.bytes, wav_file.framerate)
mfccs_d = mfcc_delta(mfccs)
mfccs_dd = mfcc_delta_delta(mfccs_d)
print mfccs.shape, mfccs_d.shape, mfccs_dd.shape

In [129]:
mfccs[0]

array([ 432.05712026,   52.29930026,   12.09182029,   11.49087919,
          5.39945342,   35.74448741,   29.28330591,    4.74748869,
         -2.49883588,   13.88058917,   13.12185477,   -4.84476764,
        -18.20271413])

In [130]:
mfccs_d[0]

array([ 3.42088285,  0.28187383, -0.11280747,  0.59703594,  0.8517748 ,
       -2.96188816, -4.99040002,  0.15025697,  2.26703505, -0.13830081,
       -1.23693085,  1.12152547,  4.60190175])

In [131]:
mfccs_dd[0]

array([-0.24010707,  0.05888751, -0.1872149 ,  0.12102598,  0.04666345,
        0.17749048,  0.18754179, -0.28923803, -0.10040412,  0.2115827 ,
        0.16532077,  0.17607055, -0.1879921 ])