# Speaker recognition
Based on the article https://m.habr.com/ru/post/144491/

Dataset: Mozilla Common Voice (ru), https://voice.mozilla.org/ru/datasets

In [3]:
import warnings
import os
import numpy as np
import librosa
import librosa.feature
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

# Turn every warning into an exception so that problems stop the run
# instead of scrolling past unnoticed ...
warnings.simplefilter('error')
# ... except FutureWarning, which is silenced. This filter is added after the
# one above, so it takes precedence for FutureWarning.
warnings.filterwarnings('ignore', category=FutureWarning)
%matplotlib inline

In [2]:
# Audio loading and preprocessing

# Number of speakers. The loading loop reads one directory per speaker index
# ('./0', './1', ...), and the cluster count is derived from this value.
num_voices = 55

def extract_mfcc(path, sampling_rate=24000, shift=32., L=128., mel_coefs=120, mfcc_coefs=12, alpha=0.9, eps=1e-9):
    '''
    Load an audio file and turn it into a sequence of per-frame feature vectors.

    Each frame is described by its MFCCs and its RMS energy, followed by the
    first- and second-order deltas of those values.

    :param path: path of the audio file, passed to ``librosa.load``
    :param sampling_rate: rate (Hz) the audio is resampled to on load
    :param shift: difference between starting points of consecutive frames, ms.
        Currently unused: librosa's default hop length is applied.
    :param L: window length, ms. Currently unused: librosa's default is applied.
    :param mel_coefs: currently unused, kept for interface compatibility
    :param mfcc_coefs: number of MFCCs computed per frame
    :param alpha: currently unused, kept for interface compatibility
    :param eps: currently unused, kept for interface compatibility
    Returns 2D array of size (frames x 3 * (mfcc_coefs + 1))
    '''
    # The original referenced an undefined name here; the parameter is
    # `sampling_rate`.
    sound, _ = librosa.load(path, sr=sampling_rate)

    # Audio and sample rate are passed by keyword: recent librosa releases
    # reject them as positional arguments.
    mfcc = librosa.feature.mfcc(y=sound, sr=sampling_rate, n_mfcc=mfcc_coefs)
    energy = librosa.feature.rms(y=sound)

    # Static features: one row per MFCC plus one row for the frame energy.
    static = np.vstack((mfcc, energy))
    delta1 = librosa.feature.delta(static)
    # Second-order dynamics are taken as the delta of the first-order delta.
    delta2 = librosa.feature.delta(delta1)

    # librosa lays features out as (feature, frame); callers want (frame, feature).
    return np.vstack((static, delta1, delta2)).T

X = []
y = []

# Every recording is cut to exactly this many frames so that all samples
# share one shape and can be stacked into a single array.
num_frames = 64

# NOTE: adjust the directory layout to match the actual dataset.
# Expected layout: one directory per speaker, named by the speaker index.
for voice in range(num_voices):
    voice_dir = os.path.join('.', str(voice))
    # Sorted so that the sample order does not depend on the filesystem.
    for name in sorted(os.listdir(voice_dir)):
        mfcc = extract_mfcc(os.path.join(voice_dir, name))
        print(voice, name, mfcc.shape)
        if len(mfcc) < num_frames:
            # Too short to provide num_frames frames; skip the recording.
            continue
        # Keep the already-computed (and truncated) features rather than
        # extracting them a second time at full length.
        X.append(mfcc[:num_frames])
        y.append(voice)

X = np.array(X)
y = np.array(y)

# train_test_split returns the splits grouped per input array:
# (X_train, X_test, y_train, y_test).
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
# Clustering

# Four clusters per speaker; this is reported as sufficient for speaker
# classification in "Speaker Identification Using Mel Frequency Cepstral
# Coefficients".
num_clusters = num_voices * 4
num_clasters = num_clusters  # original (misspelled) name, kept for code that still uses it

# KMeans expects one flat feature vector per sample, so any per-frame
# structure is flattened. Already-2D input is left unchanged by this reshape.
train_features = X_train.reshape(len(X_train), -1)
test_features = X_test.reshape(len(X_test), -1)

# KMeans is unsupervised: labels play no part in fitting.
clf = KMeans(num_clusters).fit(train_features)

# Cluster indices are arbitrary, so they cannot be compared with speaker ids
# directly. Assign each cluster the speaker that owns most of its training
# samples; -1 marks a cluster that received no training samples.
cluster_to_voice = np.full(num_clusters, -1, dtype=y_train.dtype)
for cluster in range(num_clusters):
    members = y_train[clf.labels_ == cluster]
    if members.size:
        voices, counts = np.unique(members, return_counts=True)
        cluster_to_voice[cluster] = voices[np.argmax(counts)]

# Predict a speaker for each test sample via its nearest cluster.
y_pred = cluster_to_voice[clf.predict(test_features)]

print(accuracy_score(y_test, y_pred))

In [None]:
# Visualization
# Adapted from https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html

# PCA needs one flat feature vector per sample; the flattened training
# features are projected onto two components so they can be drawn.
reduced_data = PCA(n_components=2).fit_transform(X_train.reshape(len(X_train), -1))
# Re-cluster in the 2-D space with as many clusters as the fitted model `clf`.
reduced_clf = KMeans(clf.n_clusters).fit(reduced_data)

# Step size of the mesh. Decrease to increase the quality of the VQ.
# NOTE: the mesh holds ((x_max - x_min) / h) * ((y_max - y_min) / h) points;
# increase h if the projected data spans a wide range.
h = .02

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max] x [y_min, y_max].
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain the cluster label of every mesh point from the 2-D model.
Z = reduced_clf.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect='auto', origin='lower')

plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
# Plot the centroids of the 2-D model as a white X
centroids = reduced_clf.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='w', zorder=10)
# The title inherited from the scikit-learn example named the digits dataset,
# which is not what is plotted here.
plt.title('K-means clustering on the MFCC features (PCA-reduced data)\n'
          'Centroids are marked with white cross')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()