In [11]:
import numpy as np
import soundfile as sf
import librosa

x, fs = sf.read('none.wav')
x = x.repeat(10)[:441000]
none_mfccs = librosa.feature.mfcc(x, sr=fs)
none_mfccs = np.vstack((np.ones((1,862))*0, none_mfccs)).T

x, fs = sf.read('up.wav')
x = x.repeat(10)[:441000]
up_mfccs = librosa.feature.mfcc(x, sr=fs)
up_mfccs = np.vstack((np.ones((1,862))*1, up_mfccs)).T

x, fs = sf.read('down.wav')
x = x.repeat(10)[:441000]
down_mfccs = librosa.feature.mfcc(x, sr=fs)
down_mfccs = np.vstack((np.ones((1,862))*2, down_mfccs)).T

data = np.vstack([none_mfccs, up_mfccs, down_mfccs])

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify = y)

In [13]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.svm import SVC
SKfold = StratifiedKFold(n_splits=4)

param_grid = {"C" : [1, 10, 100, 1000], "gamma" : [0.1, 0.01, 0.001]}

svm_gs = GridSearchCV(SVC(C=10, kernel="rbf"), param_grid, cv=SKfold, verbose=1)
svm_gs.fit(X_train,y_train)

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:    4.3s finished


GridSearchCV(cv=StratifiedKFold(n_splits=4, random_state=None, shuffle=False),
             error_score='raise-deprecating',
             estimator=SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 10, 100, 1000], 'gamma': [0.1, 0.01, 0.001]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [14]:
print(svm_gs.best_params_)
print(svm_gs.best_score_)
svm_clf=svm_gs.best_estimator_

{'C': 1, 'gamma': 0.001}
0.9408839779005524


In [15]:
svm_clf.fit(X_train, y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [16]:
print("Train accuracy",svm_clf.score(X_train, y_train))
print("Test accuracy",svm_clf.score(X_test, y_test))

Train accuracy 0.967403314917127
Test accuracy 0.9497422680412371


In [17]:
from sklearn.metrics import confusion_matrix
pred = svm_clf.predict(X_test)
print(X_test[0].dtype)
confusion_matrix(pred, y_test)

float64


array([[257,   0,   0],
       [  0, 244,  23],
       [  1,  15, 236]])

In [19]:
import pyaudio

last_data = 0

chunk = 1024  * 4    #処理の重さによって値を変える
sr = 44100 # 小さくしていくと音質が悪くなる
speaker = pyaudio.PyAudio()

stream = speaker.open(
    format=pyaudio.paInt16,
    channels=1,
    rate=sr,
    frames_per_buffer=chunk,
    input=True,
    output=True
)

while stream.is_active():
    I = stream.read(chunk)
    # I = 何かしらの処理(I) 処理を加えるときはchunkをいくらか大きくする
    I = np.frombuffer(I, dtype='int16')[:1024] / float(2**15)
    print(I[:10])
    mfccs = librosa.feature.mfcc(I, sr=44100)
    pred = svm_clf.predict(mfccs.T[0].reshape(1,-1))
    print(pred) 

stream.stop_stream()
stream.close()
speaker.terminate()

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[2.]
[0.00021362 0.00021362 0.00018311 0.00015259 0.00021362 0.00012207
 0.00012207 0.00012207 0.00012207 0.00024414]
[2.]
[-0.00015259 -0.00012207 -0.00018311 -0.00018311 -0.00018311 -0.00018311
 -0.00018311 -0.00015259 -0.00015259 -0.00018311]
[2.]
[-0.00073242 -0.00079346 -0.00082397 -0.00085449 -0.00085449 -0.00085449
 -0.00079346 -0.0007019  -0.00073242 -0.0007019 ]
[2.]
[0.00234985 0.00216675 0.00204468 0.00219727 0.00213623 0.00198364
 0.0020752  0.00213623 0.0020752  0.00216675]
[2.]
[0.00241089 0.00222778 0.00204468 0.00177002 0.00146484 0.0012207
 0.00097656 0.00073242 0.00054932 0.00030518]
[2.]
[-0.00982666 -0.00970459 -0.00952148 -0.00927734 -0.00894165 -0.0085144
 -0.00796509 -0.00741577 -0.00680542 -0.00619507]
[2.]
[-0.05245972 -0.05206299 -0.05111694 -0.04980469 -0.04827881 -0.04672241
 -0.04534912 -0.04412842 -0.04318237 -0.04251099]
[2.]
[0.02380371 0.02682495 0.02215576 0.01855469 0.0229187  0.02282715
 0.01565552 0.01580811 0.0214233

KeyboardInterrupt: 