# 목적

IRMAS 데이터셋을 CNN에 활용할 수 있도록 스펙트로그램 이미지로 변환한다.
이후 텐서로 변환해서 파일로 저장한다.

# 0.1. 구글 드라이브 연동

In [None]:
# Mount Google Drive at /gdrive so the rest of the notebook can read and
# write files under that path.
import os
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


# 0.2. 라이브러리 임포트 및 전역변수 설정

In [None]:
# Install pydub; AudioSegment is imported from it in the import cell.
pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
from pathlib import Path
from easydict import EasyDict as edict
from pydub import AudioSegment
from PIL import Image
from tqdm import tqdm
from torchvision import transforms
from torch.utils.data import DataLoader, TensorDataset, random_split
from collections import Counter
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import os
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
# Project folder on the mounted Google Drive. The processor classes prepend
# this string to every path they are given.
root = '/gdrive/My Drive/ctp431'
# Instrument folder names. ImgToTensorProcessor uses an entry's position in
# this list as its integer class label.
inst_pool = ['cel', 'cla', 'flu', 'gac', 'gel', 'org', 'pia', 'sax', 'tru', 'vio', 'voi']

# Step 1 : IRMAS 트레이닝 데이터로 모델 입력 사이즈의 Mel Spectrogram 이미지 생성하기

VGG19는 224x224x3 사이즈의 이미지 입력을 기준으로 설계되어 있다.
VGG19를 계속 사용하게 될지는 모르겠지만, 일단 사용가능한 형태로 가공한다.

(0) IRMAS 트레이닝 데이터셋을 다운받아 구글드라이브에 업로드한다.

(1) IRMAS 각 사운드 데이터별로 아래 과정을 처리한다.
(1-1) 각 사운드를 로딩한 뒤 mel spectrogram 이미지로 변환한 후 저장한다.
(1-2) mel spectrogram 이미지를 224x224 픽셀로 리사이징한 후 저장한다.

In [None]:
class WavToImgProcessor:
    """Turn a tree of WAV files into fixed-size mel-spectrogram JPEGs.

    The source folder is expected to contain one sub-folder per instrument,
    each holding WAV files.  ``process`` runs three stages; each stage mirrors
    the per-instrument layout under ``result_parent_path``:

    1. ``Normalize``            - WAVs gain-adjusted to ``TARGET_DBFS``.
    2. ``Spectrogram/Original`` - mel spectrogram rendered with matplotlib.
    3. ``Spectrogram/Resized``  - the rendered image resized to ``IMAGE_SIZE``.

    Every stage skips an output file that already exists, so ``process`` can
    be called again after an interrupted run.
    """

    # Loudness in dBFS that every clip is gain-adjusted to.
    TARGET_DBFS = -20
    # (width, height) in pixels of the final images.
    IMAGE_SIZE = (224, 224)

    def __init__(self, source_parent_path, result_parent_path):
        """Remember the source and result folders.

        Args:
            source_parent_path: Folder with one sub-folder of WAV files per
                instrument.  It is appended verbatim to the module-level
                ``root``, so it should start with ``/``.
            result_parent_path: Folder that receives the three output trees.
                Also appended verbatim to ``root``.
        """
        self.source_parent_path = root + source_parent_path
        self.result_parent_path = root + result_parent_path

    def process(self):
        """Run all stages in order: folders, normalize, render, resize."""
        self.create_folders()
        self.normalize_db()
        self.create_spectrogram()
        self.resize_spectrogram()

    def create_folders(self):
        """Define the three output folder attributes and create the folders.

        The other stages read the ``result_*_path`` attributes set here, so
        this method has to run before any of them.
        """
        self.result_normalize_path = self.result_parent_path + "/Normalize"
        self.result_spectrogram_original_path = self.result_parent_path + "/Spectrogram/Original"
        self.result_spectrogram_resized_path = self.result_parent_path + "/Spectrogram/Resized"

        for folder in (
            self.result_normalize_path,
            self.result_spectrogram_original_path,
            self.result_spectrogram_resized_path,
        ):
            Path(folder).mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _subfolders(parent_path):
        """Return the sorted names of the directories inside ``parent_path``.

        Plain files sitting next to the instrument folders are ignored so
        that listing them as if they were folders cannot raise.
        """
        return sorted(
            name
            for name in os.listdir(parent_path)
            if os.path.isdir(os.path.join(parent_path, name))
        )

    def normalize_db(self):
        """Write a copy of every source WAV gain-adjusted to ``TARGET_DBFS``."""
        for inst_name in self._subfolders(self.source_parent_path):
            source_parent = f"{self.source_parent_path}/{inst_name}"
            result_parent = f"{self.result_normalize_path}/{inst_name}"
            Path(result_parent).mkdir(parents=True, exist_ok=True)

            for file_name in sorted(os.listdir(source_parent)):
                source_path = f"{source_parent}/{file_name}"
                result_path = f"{result_parent}/{file_name}"

                if os.path.isfile(result_path):
                    print(f"Normalized file already exist: {result_path}")
                    continue
                print(f"Normalizing: {result_path}")

                original_sound = AudioSegment.from_file(source_path, 'wav')
                loudness = original_sound.dBFS
                if loudness == float("-inf"):
                    # pydub reports -inf dBFS for a completely silent clip;
                    # the gain needed to reach the target would be infinite,
                    # so the clip is exported unchanged instead.
                    normalized_sound = original_sound
                else:
                    normalized_sound = original_sound.apply_gain(self.TARGET_DBFS - loudness)
                normalized_sound.export(result_path, format="wav")

    def create_spectrogram(self):
        """Render every normalized WAV as an axis-free mel-spectrogram JPEG."""
        for inst_name in self._subfolders(self.result_normalize_path):
            source_parent = f"{self.result_normalize_path}/{inst_name}"
            result_parent = f"{self.result_spectrogram_original_path}/{inst_name}"
            Path(result_parent).mkdir(parents=True, exist_ok=True)

            for file_name in sorted(os.listdir(source_parent)):
                source_path = f"{source_parent}/{file_name}"
                # Swap the extension whatever its length ("x.wav" -> "x.jpg").
                result_path = f"{result_parent}/{os.path.splitext(file_name)[0]}.jpg"

                if os.path.isfile(result_path):
                    print(f"Original spectrogram file already exist: {result_path}")
                    continue
                print(f"Original spectrogram file processing: {result_path}")

                y, sr = librosa.load(source_path)
                S = librosa.feature.melspectrogram(y=y, sr=sr)
                S_dB = librosa.power_to_db(S, ref=np.max)

                fig = plt.figure(figsize=(6, 3), dpi=100)
                try:
                    plt.axis('off')
                    librosa.display.specshow(S_dB, sr=sr, x_axis=None, y_axis=None, fmax=sr/2)
                    plt.savefig(result_path, bbox_inches='tight', pad_inches=0)
                finally:
                    # Close the figure even when rendering fails, so figures
                    # do not pile up over thousands of files.
                    plt.close(fig)

    def resize_spectrogram(self):
        """Resize every rendered spectrogram to ``IMAGE_SIZE`` and save as JPEG."""
        # Walk the folder that is actually read from (Spectrogram/Original),
        # so an instrument folder missing there cannot make listdir raise.
        for inst_name in self._subfolders(self.result_spectrogram_original_path):
            source_parent = f"{self.result_spectrogram_original_path}/{inst_name}"
            result_parent = f"{self.result_spectrogram_resized_path}/{inst_name}"
            Path(result_parent).mkdir(parents=True, exist_ok=True)

            for file_name in sorted(os.listdir(source_parent)):
                source_path = f"{source_parent}/{file_name}"
                result_path = f"{result_parent}/{file_name}"

                if os.path.isfile(result_path):
                    print(f"Resized spectrogram file already exist: {result_path}")
                    continue
                print(f"Resized spectrogram file processing: {result_path}")

                with Image.open(source_path) as img:
                    resized_img = img.convert("RGB").resize(self.IMAGE_SIZE)
                    resized_img.save(result_path, format="JPEG")

In [None]:
# Run the whole WAV -> normalized WAV -> spectrogram -> resized image
# pipeline. Both arguments are paths relative to the global `root`.
wavToImg = WavToImgProcessor("/IRMAS-TrainingData", "/Processed/Training")
wavToImg.process()

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
Resized spectrogram file already exist: /gdrive/My Drive/ctp431/Processed/Training/Spectrogram/Resized/cel/[cel][pop_roc]0062__2.jpg
Resized spectrogram file already exist: /gdrive/My Drive/ctp431/Processed/Training/Spectrogram/Resized/cel/[cel][pop_roc]0063__2.jpg
Resized spectrogram file already exist: /gdrive/My Drive/ctp431/Processed/Training/Spectrogram/Resized/cel/[cel][pop_roc]0062__1.jpg
Resized spectrogram file already exist: /gdrive/My Drive/ctp431/Processed/Training/Spectrogram/Resized/cel/[cel][pop_roc]0063__1.jpg
Resized spectrogram file already exist: /gdrive/My Drive/ctp431/Processed/Training/Spectrogram/Resized/cel/[cel][pop_roc]0091__2.jpg
Resized spectrogram file already exist: /gdrive/My Drive/ctp431/Processed/Training/Spectrogram/Resized/cel/[cel][pop_roc]0088__2.jpg
Resized spectrogram file already exist: /gdrive/My Drive/ctp431/Processed/Training/Spectrogram/Resized/cel/[cel][pop_roc]0088__1.jpg
Resized spectrogram

# Step 2 : 스펙트로그램 이미지를 (N, 3, 224, 224) 형태의 텐서로 가공하고 파일로 저장하기

각 악기 종류별로 균일한 테스트 풀을 보장하기 위해서

- 각 악기별 데이터 중 1번째~70번째 이미지 = 테스트 풀

- 각 악기별 데이터 중 71번째~350번째 이미지  = 트레이닝 풀

로 사용한다.

FYI. 각 악기별 데이터 수 cel(388), cla(505), flu(451), gac(637), gel(760), org(682), pia(721), sax(626), tru(577), vio(580), voi(778)

In [None]:
class ImgToTensorProcessor:
    """Pack per-instrument spectrogram images into one ``(x, y)`` tensor file.

    For every label in the module-level ``inst_pool`` the images of the
    matching sub-folder are listed in sorted order and a fixed window of that
    list is used:

    * test pool     - positions ``0 .. TEST_COUNT - 1``
    * training pool - the next ``TRAIN_COUNT`` positions

    The listing is sorted because ``os.listdir`` returns names in arbitrary
    order.  Without a stable order the two pools, which are built in separate
    runs, are not guaranteed to be disjoint or reproducible.

    NOTE: ``image_to_tensor`` never overwrites an existing result file, so a
    ``.pt`` file written by an earlier run has to be deleted to rebuild it.
    """

    # Images per instrument reserved for the test pool.
    TEST_COUNT = 70
    # Images per instrument taken for the training pool (after the test pool).
    TRAIN_COUNT = 280

    def __init__(self, source_parent_path, result_file_path, result_file_name, is_training):
        """Configure the source folder, the output file and the pool window.

        Args:
            source_parent_path: Folder with one image sub-folder per
                instrument; appended verbatim to the module-level ``root``.
            result_file_path: Folder for the output file; appended verbatim
                to ``root``.
            result_file_name: File name of the saved ``(x, y)`` tuple.
            is_training: True selects the training window, False the test
                window.
        """
        self.source_parent_path = root + source_parent_path
        self.result_file_path = root + result_file_path
        self.result_file_name = result_file_name

        if is_training:
            self.skip_count = self.TEST_COUNT
            self.collect_count = self.TRAIN_COUNT
        else:
            self.skip_count = 0
            self.collect_count = self.TEST_COUNT

    def process(self):
        """Count the available images, build the tensor file, print a summary."""
        print("[TrainingDataImgToTensorProcessor] process start.")
        self.count_images()
        self.image_to_tensor()
        self.debug_print()

    def count_images(self):
        """Print and return the number of files in all instrument sub-folders.

        Entries of the source folder that are not directories are ignored.
        """
        file_count = 0
        for inst_name in os.listdir(self.source_parent_path):
            inst_dir = os.path.join(self.source_parent_path, inst_name)
            if os.path.isdir(inst_dir):
                file_count += len(os.listdir(inst_dir))

        print(f"[count_images] Total image count : {file_count}")
        return file_count

    def _select_image_names(self, label_dir):
        """Return this pool's window of the sorted file names in ``label_dir``."""
        names = sorted(os.listdir(label_dir))
        return names[self.skip_count:self.skip_count + self.collect_count]

    def image_to_tensor(self):
        """Load the selected images and save them as an ``(x, y)`` tuple.

        ``x`` stacks the normalized image tensors and ``y`` holds the index
        of each image's label in ``inst_pool``.  Nothing is done when the
        result file already exists.

        Raises:
            RuntimeError: If not a single image could be loaded.
        """
        Path(self.result_file_path).mkdir(parents=True, exist_ok=True)
        result_file_path = self.result_file_path + f"/{self.result_file_name}"

        if os.path.isfile(result_file_path):
            print("[image_to_tensor] file already exist")
            return

        # The mean/std values are the per-channel statistics commonly used
        # with ImageNet-pretrained torchvision models.
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

        labels_to_idx = {label: idx for idx, label in enumerate(inst_pool)}

        all_images = []
        all_labels = []

        for label in tqdm(inst_pool, desc="Processing folders"):
            label_dir = self.source_parent_path + f"/{label}"
            if not os.path.isdir(label_dir):
                continue

            for img_name in self._select_image_names(label_dir):
                img_path = os.path.join(label_dir, img_name)
                try:
                    # The context manager closes the file handle right away
                    # instead of leaving thousands of them to the GC.
                    with Image.open(img_path) as img:
                        img_tensor = transform(img.convert('RGB'))
                except Exception as e:
                    # Best effort: an unreadable image is reported and left
                    # out. Its slot is not refilled, so the window of the
                    # other pool is never touched.
                    print(f"Error loading image {img_path}: {e}")
                    continue
                all_images.append(img_tensor)
                all_labels.append(labels_to_idx[label])

        if not all_images:
            raise RuntimeError(
                f"[image_to_tensor] no images could be loaded from {self.source_parent_path}"
            )

        x = torch.stack(all_images)  # image tensor (N, 3, 224, 224)
        y = torch.tensor(all_labels)  # label tensor (N)

        torch.save((x, y), result_file_path)
        print(f"\nDataset saved to {self.result_file_name}")

    def debug_print(self):
        """Reload the saved file and print its shape, content and label counts."""
        result_file_path = self.result_file_path + f"/{self.result_file_name}"

        # The file only holds a tuple of tensors, so the restricted
        # weights-only unpickler is sufficient.
        x, y = torch.load(result_file_path, weights_only=True)

        print(f"x shape = {x.shape}")
        print(f"x : {x}")
        print(f"y : {y}")

        count = Counter(y.tolist())

        for number in range(len(inst_pool)):
            print(f"{number}: {count[number]}개")

        # Only the shuffled labels are printed, so the image tensor is not
        # permuted; indexing it would copy the whole dataset for nothing.
        indices = torch.randperm(x.size(0))
        print(f"randomized y : {y[indices]}")

In [None]:
# Build the training set file. is_training=True makes the processor skip 70
# images per instrument folder and collect up to the next 280.
img_to_tensor_training = ImgToTensorProcessor("/Processed/Training/Spectrogram/Resized", "/MK2/DataSet", 'training_data.pt', True)
img_to_tensor_training.process()

[TrainingDataImgToTensorProcessor] process start.
[count_images] Total image count : 6705


Processing folders: 100%|██████████| 11/11 [02:13<00:00, 12.14s/it]



Dataset saved to training_data.pt


  x, y = torch.load(result_file_path)


x shape = torch.Size([3080, 3, 224, 224])
x : tensor([[[[-1.3644, -1.3987, -1.4672,  ..., -1.8097, -1.9980, -2.0837],
          [-1.3815, -1.3815, -1.4329,  ..., -1.6898, -1.9124, -1.9980],
          [-0.7137, -0.6794, -0.6965,  ..., -1.1418, -1.3815, -1.5185],
          ...,
          [ 1.1529,  1.1872,  1.3070,  ...,  2.2489,  2.2489,  2.2489],
          [ 0.9646,  0.9474,  0.9474,  ...,  1.5810,  1.8722,  1.9920],
          [ 0.8961,  0.8618,  0.8789,  ...,  1.5639,  1.9920,  2.1633]],

         [[-1.8782, -1.8957, -2.0007,  ..., -2.0182, -2.0182, -2.0182],
          [-1.9657, -1.9657, -2.0357,  ..., -1.9657, -2.0007, -2.0182],
          [-1.4580, -1.4055, -1.4405,  ..., -1.5805, -1.6506, -1.7031],
          ...,
          [-1.0553, -0.9853, -0.8627,  ...,  0.4853,  0.6429,  0.6954],
          [-1.2129, -1.1779, -1.1429,  ..., -0.1975,  0.1001,  0.2402],
          [-1.2304, -1.2129, -1.1954,  ..., -0.1975,  0.2402,  0.4328]],

         [[-0.2532, -0.3230, -0.5147,  ..., -0.8458, -0.

In [None]:
# Build the test set file from the same image folder. is_training=False makes
# the processor collect up to 70 images per instrument folder without skipping.
img_to_tensor_test = ImgToTensorProcessor("/Processed/Training/Spectrogram/Resized", "/MK2/DataSet", 'test_data.pt', False)
img_to_tensor_test.process()

[TrainingDataImgToTensorProcessor] process start.
[count_images] Total image count : 6705


Processing folders: 100%|██████████| 11/11 [00:36<00:00,  3.36s/it]



Dataset saved to test_data.pt


  x, y = torch.load(result_file_path)


x shape = torch.Size([770, 3, 224, 224])
x : tensor([[[[-2.0665, -2.1008, -2.0837,  ..., -2.0323, -2.0494, -2.0665],
          [-2.1008, -2.1179, -2.0494,  ..., -1.8097, -1.8268, -1.8610],
          [-1.7583, -1.7412, -1.6384,  ..., -1.5699, -1.5870, -1.6042],
          ...,
          [ 1.0331,  1.0844,  1.1872,  ...,  1.4269,  1.5982,  1.7009],
          [ 0.9303,  0.9646,  0.9817,  ...,  1.1358,  1.2557,  1.3755],
          [ 0.9988,  0.9817,  0.9474,  ...,  1.1015,  1.2043,  1.2899]],

         [[-1.9657, -2.0007, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
          [-2.0182, -2.0357, -2.0182,  ..., -1.8782, -1.9307, -1.9657],
          [-1.7381, -1.7206, -1.6506,  ..., -1.7381, -1.7906, -1.8081],
          ...,
          [-1.0728, -1.0203, -0.9503,  ..., -0.7577, -0.6527, -0.5476],
          [-1.1078, -1.0728, -1.0378,  ..., -1.0378, -0.9678, -0.8627],
          [-0.9853, -0.9853, -1.0203,  ..., -1.0553, -1.0378, -0.9328]],

         [[-1.3687, -1.4036, -1.4210,  ..., -1.4036, -1.4