In [1]:
import os
import glob
import librosa
import numpy as np
import pandas as pd
import pickle

import tensorflow as tf
from tensorflow import keras

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

2023-10-16 20:45:12.138820: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-16 20:45:12.140296: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-16 20:45:12.171468: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-16 20:45:12.172789: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
max_pad_len = 174

def extract_feature(file_name):
    print('file name :', file_name)
    try:
        # res_type='kaiser_fast' : resampy 더 빠른 방법
        # 여부에 따라 값이 다르게 나오긴 함
        # resampy
        # 다차원 리샘플링을 지원하며 오디오 애플리케이션에 매우 적합합니다.
        # 장기간 신호(예: 고품질 샘플링 속도에서 몇 분)의 경우 resampy는 scipy.signal.resample 보다
        # 상당히 빠르며 오디오 품질에서는 인지할 수 있는 차이가 거의 없습니다.
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
        # audio, sample_rate = librosa.load(file_name)
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        pad_width = max_pad_len - mfccs.shape[1]
        mfccs = np.pad(mfccs, pad_width=((0,0), (0, pad_width)), mode='constant')
        print(mfccs.shape)
        
    except Exception as e:
        print("Error encountered while parsing file: ", file_name)
        print(e)
        return None
    
#     return padded_mfccs
    return mfccs

In [3]:
# training dataset
root_path = "../data/wavs/train/"
wav_list = os.listdir(root_path)
wav_files = [os.path.join(root_path, file) for file in wav_list if file.endswith('.wav')]
print(len(wav_files))

# data는 우리가 리브로사로 추출한 mfccs라는 특성이고
# class_label은 그 음향의 종류를 나타낸다.

features = []
for wav_file in wav_files:
    data = extract_feature(wav_file)
    class_label = 0
    if 'Overhead' in wav_file:
        class_label = 1
    elif 'Snare' in wav_file:
        class_label = 2
    elif 'Tom' in wav_file:
        class_label = 3
    else:
        class_label = 0
    features.append([data, class_label])

# Convert into a Panda dataframe 
featuresdf = pd.DataFrame(features, columns=['feature','class_label'])

120
file name : ../data/wavs/train/Overhead Sample 1.wav
(40, 174)
file name : ../data/wavs/train/Overhead Sample 10.wav
(40, 174)
file name : ../data/wavs/train/Overhead Sample 11.wav
(40, 174)
file name : ../data/wavs/train/Overhead Sample 12.wav
(40, 174)
file name : ../data/wavs/train/Overhead Sample 13.wav
(40, 174)
file name : ../data/wavs/train/Overhead Sample 14.wav
(40, 174)
file name : ../data/wavs/train/Overhead Sample 15.wav
(40, 174)
file name : ../data/wavs/train/Overhead Sample 16.wav
(40, 174)
file name : ../data/wavs/train/Overhead Sample 17.wav
(40, 174)
file name : ../data/wavs/train/Overhead Sample 18.wav
(40, 174)
file name : ../data/wavs/train/Overhead Sample 19.wav
(40, 174)
file name : ../data/wavs/train/Overhead Sample 2.wav
(40, 174)
file name : ../data/wavs/train/Overhead Sample 20.wav
(40, 174)
file name : ../data/wavs/train/Overhead Sample 21.wav
(40, 174)
file name : ../data/wavs/train/Overhead Sample 22.wav
(40, 174)
file name : ../data/wavs/train/Overhea

In [4]:

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from keras.utils import to_categorical

# 불러온 featuresdf에서 feature는 X에 저장하였고 class_label은 y로 저장하였다.
# 헌데 y는 yy로 변환과정을 거쳐서 다시 저장되었다.
# 둘의 차이는 원-핫-인코딩의 여부이다.
# 원-핫-인코딩은 1,2,3 있을 때 1: [1.0.0] / 2:[0.1.0] / 3:[0.0.1] 로 변환해주는 거임.
# 이렇게 변환해서 사용하는 이유는 우리가 작성할 딥러닝 모델이 멀티 클래스(3~ 가지) 분류를 하기 때문이다.

X = np.array(featuresdf.feature.tolist())
y = np.array(featuresdf.class_label.tolist())

le = LabelEncoder()
yy = to_categorical(le.fit_transform(y))

In [5]:
x_train, x_test, y_train, y_test = train_test_split(X, yy, test_size=0.2, random_state = 42)

print("x_train >> ", x_train.shape)
print("x_test >> ", x_test.shape)
print("y >> ", y[:10])
print("yy >> ", yy[:10])
print("y_test >> ", y_test[:10])

x_train >>  (96, 40, 174)
x_test >>  (24, 40, 174)
y >>  [1 1 1 1 1 1 1 1 1 1]
yy >>  [[1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]
y_test >>  [[0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


In [6]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
pca = PCA(n_components=2, svd_solver='full')

def pca_scale(x):
    print(x.shape)
    scaled = scaler.transform(x)
    # scaled_x = pca.transform(scaled)
    
    # return scaled_x
    return scaled

In [7]:
# from sklearn.naive_bayes import CategoricalNB
# clf = CategoricalNB(force_alpha=True)
# clf.fit(x_train, y_train)

n_columns = 174    
n_row = 40       
n_channels = 1
n_classes = 3

# print(x_train[0][0])
print(x_train.shape)
x_train = x_train.reshape(-1, 3)
print(x_train.shape)
scaled = scaler.fit_transform(x_train)
# x_train = pca.fit_transform(scaled)
# x_train = pca_scale(scaled)

# # input shape 조정
# # cpu를 사용해서 수행한다
# with tf.device('/cpu:0'):
#     x_train = tf.reshape(x_train, [-1, n_row, n_columns, n_channels])
#     x_test = tf.reshape(x_test, [-1, n_row, n_columns, n_channels])


knr=KNeighborsRegressor(n_neighbors=2)
knr.fit(x_train, y_train)
print("결정 계수 >> ", knr.score(x_train, y_train))

# # importing module
# from sklearn.linear_model import LinearRegression
# # creating an object of LinearRegression class
# LR = LinearRegression()
# # fitting the training data
# LR.fit(x_train,y_train)

(96, 40, 174)
(222720, 3)


ValueError: Found input variables with inconsistent numbers of samples: [222720, 96]

In [None]:
root = "../data/wavs/test/"
test = root + "Overhead Sample 30.wav"
# test = root + "Snare Sample 30.wav"
# test = root + "Tom Sample 30.wav"

# input shape 조정
# cpu를 사용해서 수행한다

n_columns = 174    
n_row = 40       
n_channels = 1

# input shape 조정
# cpu를 사용해서 수행한다
input = np.array(extract_feature(test))
# with tf.device('/cpu:0'):
#     input = tf.reshape(input, [-1, n_row, n_columns, n_channels])
print(input)
input = np.array([input]).reshape(-1, 2)

input = pca_scale(input)
print(input)

# input = pca_scale(input)

# 예측
prediction = knr.predict(input)
print("prediction>>>", prediction)

In [None]:
x_train