In [70]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM

import librosa

In [71]:
root_path = "../data/tmp_wavs/" # 드럼 녹음본 path
endswith = {"m4a":".m4a", "wav":".wav"} # 확장자명
ORIGINAL_AUDIO_EXT = 'm4a'
AUDIO_EXT = 'wav'

# sr(sampling rate) : default 22050
# -> 오디오 처리의 일반적인 관행으로 특히 음악 제작 환경에서는 44.1kHz의 샘플 속도가 자주 사용됩니다. 그러나 일부 특정 작업의 경우 또는 제한된 리소스를 처리할 때는 22.05kHz 또는 16kHz와 같은 낮은 샘플링 속도로도 충분
SAMPLE_RATE = 22050

def get_audios(ext):
    file_list = os.listdir(root_path)
    files = [os.path.join(root_path, file) for file in file_list if file.endswith(endswith[ext])]
    return files

### ➡️ m4a to wav (필요시)

In [72]:
# EXTRACT_VIDEO_COMMAND = ('ffmpeg -i "{from_video_path}" '
#                          '-f {audio_ext} -ab 22500 '
#                          '-vn "{to_audio_path}" ')

# files = get_audios(ORIGINAL_AUDIO_EXT)
# for file in files:
#     audio_file_name = file.replace(ORIGINAL_AUDIO_EXT, AUDIO_EXT)
#     print(audio_file_name)
#     command = EXTRACT_VIDEO_COMMAND.format(
#         from_video_path=file, audio_ext=AUDIO_EXT, to_audio_path=audio_file_name,
#     )
#     os.system(command)

### ➡️ train data 가져오기

In [73]:
files = get_audios(AUDIO_EXT)

## 🎵 코드 사전 정의

In [74]:
code2idx = {'CC_04':0, 'CC_08':1,
             'HH_04':2, 'HH_08':3, 'HH_16':4, 
             'KK_04':5, 'KK_08':6,
            'SD_04':7, 'SD_08':8}

idx2code = {0:'CC_04', 1:'CC_08',
             2:'HH_04', 3:'HH_08', 4:'HH_16', 
             5:'KK_04', 6:'KK_08',
            7:'SD_04', 8:'SD_08'}

### data 전처리

In [75]:
max_pad_len = 800

def extract_feature(file):
    audio, sample_rate = librosa.load(file)
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    pad_width = max_pad_len - mfccs.shape[1]
    mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
    return mfccs

In [76]:
output = {
    "CC_08" : ["CC_08"],
    "HH_04" : ["HH_04", "HH_04", "HH_04", "HH_04"],
    "HH_08" : ["HH_08", "HH_08", "HH_08", "HH_08", "HH_08", "HH_08", "HH_08", "HH_08"],
    "HH_16" : ["HH_16", "HH_16", "HH_16", "HH_16", "HH_16", "HH_16", "HH_16", "HH_16", "HH_16", "HH_16", "HH_16", "HH_16", "HH_16", "HH_16", "HH_16", "HH_16"],
    "KK_04" : ["KK_04", "KK_04", "KK_04", "KK_04"],
    "KK_08" : ["KK_08", "KK_08", "KK_08", "KK_08", "KK_08", "KK_08", "KK_08", "KK_08"],
    "SD_04" : ["SD_04", "SD_04", "SD_04", "SD_04"],
    "SD_08" : ["SD_08", "SD_08", "SD_08", "SD_08", "SD_08", "SD_08", "SD_08", "SD_08"],
}

In [77]:
features = []

for file in files:
    data = extract_feature(file)
    file_name = file.replace(root_path, "")
    class_label = output[file_name[0:5]]
    features.append([data, class_label])

# Convert into a Panda dataframe 
featuresdf = pd.DataFrame(features, columns=['feature','class_label'])

In [78]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical


X_train = np.array(featuresdf.feature.tolist())
y_train_class = np.array(featuresdf.class_label)
y_train = []

for y in y_train_class:
    tmp = []
    for idx in y:
        tmp.append(code2idx[idx])
    y_train.append(tmp)

X_train = np.array(X_train)  # Convert to NumPy array
X_train_transposed = np.transpose(X_train, (0, 2, 1))

# Assuming y_train is a list of lists
y_train_padded = pad_sequences(y_train, padding='post', maxlen=max_pad_len, value=-1)

# Convert to NumPy array
y_train = np.array(y_train_padded)

# One-hot encode y_train
y_train_one_hot = to_categorical(y_train, num_classes=9)  # Adjust the number of classes as needed

# le = LabelEncoder()
# yy = to_categorical(le.fit_transform(y_train))
# print("X_train >>> ", X_train)
# print("y_train >>> ", len(y_train[1]))

### 🧠 Model

In [79]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Define the input shape based on your audio features
input_shape = (max_pad_len, 40)

input_layer = layers.Input(shape=input_shape)
conv1d_layer = layers.Conv1D(64, kernel_size=3, activation='relu')(input_layer)
lstm_layer = layers.LSTM(64, return_sequences=True)(conv1d_layer)
# output_layer = layers.Dense(1, activation='linear')(lstm_layer)
# output_layer = layers.Dense(len(code2idx), activation='softmax')(lstm_layer)
output_layer = Dense(9, activation='softmax')(lstm_layer)

model = Model(inputs=input_layer, outputs=output_layer)

model.summary()

Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_12 (InputLayer)       [(None, 800, 40)]         0         
                                                                 
 conv1d_6 (Conv1D)           (None, 798, 64)           7744      
                                                                 
 lstm_6 (LSTM)               (None, 798, 64)           33024     
                                                                 
 dense_5 (Dense)             (None, 798, 9)            585       
                                                                 
Total params: 41353 (161.54 KB)
Trainable params: 41353 (161.54 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


### 🏃‍♂️ train

In [80]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# X_train is your padded/truncated input data (WAV files), y_train is your output data (drum hits)
model.fit(X_train_transposed, y_train_one_hot, epochs=10, batch_size=32)

Epoch 1/10


ValueError: in user code:

    File "/home/jaeserrr/anaconda3/envs/drum/lib/python3.11/site-packages/keras/src/engine/training.py", line 1377, in train_function  *
        return step_function(self, iterator)
    File "/home/jaeserrr/anaconda3/envs/drum/lib/python3.11/site-packages/keras/src/engine/training.py", line 1360, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/jaeserrr/anaconda3/envs/drum/lib/python3.11/site-packages/keras/src/engine/training.py", line 1349, in run_step  **
        outputs = model.train_step(data)
    File "/home/jaeserrr/anaconda3/envs/drum/lib/python3.11/site-packages/keras/src/engine/training.py", line 1127, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/home/jaeserrr/anaconda3/envs/drum/lib/python3.11/site-packages/keras/src/engine/training.py", line 1185, in compute_loss
        return self.compiled_loss(
    File "/home/jaeserrr/anaconda3/envs/drum/lib/python3.11/site-packages/keras/src/engine/compile_utils.py", line 277, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/home/jaeserrr/anaconda3/envs/drum/lib/python3.11/site-packages/keras/src/losses.py", line 143, in __call__
        losses = call_fn(y_true, y_pred)
    File "/home/jaeserrr/anaconda3/envs/drum/lib/python3.11/site-packages/keras/src/losses.py", line 270, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/home/jaeserrr/anaconda3/envs/drum/lib/python3.11/site-packages/keras/src/losses.py", line 2221, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "/home/jaeserrr/anaconda3/envs/drum/lib/python3.11/site-packages/keras/src/backend.py", line 5575, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 800, 9) and (None, 798, 9) are incompatible
