1. 입력층: 스펙토그램 데이터를 입력
2. CNN 또는 RNN(LSTM/GRU):
- 입력 특징 추출 및 시퀀스 길이 축소.

3. CTC 손실 적용:
- 최종 출력은 가능한 텍스트 시퀀스의 확률로 변환.


In [1]:
!pip install torch torchaudio

Collecting torch
  Using cached torch-2.5.1-cp310-none-macosx_11_0_arm64.whl (63.9 MB)
Collecting torchaudio
  Downloading torchaudio-2.5.1-cp310-cp310-macosx_11_0_arm64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting networkx
  Downloading networkx-3.4.2-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting filelock
  Using cached filelock-3.16.1-py3-none-any.whl (16 kB)
Collecting jinja2
  Downloading jinja2-3.1.5-py3-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.6/134.6 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sympy==1.13.1
  Using cached sympy-1.13.1-py3-none-any.whl (6.2 MB)
Collecting fsspec
  Downloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
[2K     [90m━━━━━━━━━━━━━━

In [18]:
import torch
import torchaudio
print(torch.__version__)
print(torchaudio.__version__)

2.5.1
2.5.1


In [12]:
# 한국어 문자 집합 (초성, 중성, 종성, 공백 포함)
CHAR_SET = "가나다라마바사아자차카타파하ㅏㅑㅓㅕㅗㅛㅜㅠㅡㅣ "  # 필요한 문자 추가 가능
CHAR_TO_INDEX = {ch: i for i, ch in enumerate(CHAR_SET)}
INDEX_TO_CHAR = {i: ch for ch, i in CHAR_TO_INDEX.items()}

# 텍스트를 정수 시퀀스로 변환
def text_to_sequence(text):
    return [CHAR_TO_INDEX[ch] for ch in text if ch in CHAR_TO_INDEX]

# 정수 시퀀스를 텍스트로 변환
def sequence_to_text(sequence):
    return ''.join([INDEX_TO_CHAR[i] for i in sequence])

In [13]:
import torch
import torch.nn as nn

class SpeechRecognitionModel(nn.Module):
    def __init__(self, num_classes):
        super(SpeechRecognitionModel, self).__init__()
        self.cnn = nn.Conv2d(1, 32, kernel_size=(3, 3), padding=(1, 1))
        self.rnn = nn.LSTM(32 * 128, 128, num_layers=2, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(128 * 2, num_classes)  # Bidirectional이므로 x2

    def forward(self, x):
        # CNN
        x = self.cnn(x)  # [B, 1, 128, T] -> [B, 32, 128, T]
        x = x.permute(0, 3, 1, 2).flatten(2)  # [B, T, 32 * 128]

        # RNN
        x, _ = self.rnn(x)

        # Fully Connected
        x = self.fc(x)  # [B, T, num_classes]
        return x.log_softmax(2)

In [7]:
ctc_loss = nn.CTCLoss()

# 입력: [T, B, C]
log_probs = torch.randn(100, 8, len(CHAR_SET)).log_softmax(2)  # 예제 입력
# 타겟: [sum(target_lengths)]
targets = torch.randint(0, len(CHAR_SET), (240,), dtype=torch.long)
# 입력 길이와 타겟 길이
input_lengths = torch.full((8,), 100, dtype=torch.long)
target_lengths = torch.full((8,), 30, dtype=torch.long)

loss = ctc_loss(log_probs, targets, input_lengths, target_lengths)
print("CTC Loss:", loss.item())

In [14]:
import torch.optim as optim

# 모델 및 옵티마이저 초기화
model = SpeechRecognitionModel(num_classes=len(CHAR_SET))
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 학습 루프
for epoch in range(10):
    model.train()
    optimizer.zero_grad()

    # 예제 입력 데이터 (스펙트로그램)와 라벨
    inputs = torch.randn(8, 1, 128, 100)  # [B, C, F, T]
    labels = torch.randint(0, len(CHAR_SET), (240,), dtype=torch.long)
    input_lengths = torch.full((8,), 100, dtype=torch.long)
    target_lengths = torch.full((8,), 30, dtype=torch.long)

    # 모델 예측
    outputs = model(inputs)
    outputs = outputs.permute(1, 0, 2)  # [T, B, C]

    # 손실 계산 및 역전파
    loss = ctc_loss(outputs, labels, input_lengths, target_lengths)
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

Epoch 1, Loss: 7.8162617683410645
Epoch 2, Loss: 7.033697605133057
Epoch 3, Loss: 5.620249271392822
Epoch 4, Loss: 3.9968440532684326
Epoch 5, Loss: 3.1942620277404785
Epoch 6, Loss: 3.031686544418335
Epoch 7, Loss: 3.3611464500427246
Epoch 8, Loss: 3.3846864700317383
Epoch 9, Loss: 3.3700976371765137
Epoch 10, Loss: 3.1600167751312256


In [15]:
def ctc_decode(log_probs):
    predicted_indices = log_probs.argmax(dim=2)
    decoded = []
    for seq in predicted_indices:
        text = []
        prev_char = None
        for char in seq:
            if char != prev_char and char != CHAR_TO_INDEX[' ']:  # 공백 제거
                text.append(char.item())
            prev_char = char
        decoded.append(sequence_to_text(text))
    return decoded

In [23]:
import os
import sys
print(sys.path)
print(os)

['/Users/kimdohun/.pyenv/versions/3.10.12/lib/python310.zip', '/Users/kimdohun/.pyenv/versions/3.10.12/lib/python3.10', '/Users/kimdohun/.pyenv/versions/3.10.12/lib/python3.10/lib-dynload', '', '/Users/kimdohun/kimdohoon/MF2M/venv/lib/python3.10/site-packages', '/var/folders/b7/xdzw8frs2vj5d72pmm8dc3dm0000gn/T/tmpqlv65hdz']
<module 'os' from '/Users/kimdohun/.pyenv/versions/3.10.12/lib/python3.10/os.py'>


In [30]:
import os
os.getcwd()

'/Users/kimdohun/kimdohoon/MF2M'

In [27]:
pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [33]:
!brew install ffmpeg

[34m==>[0m [1mAuto-updating Homebrew...[0m
Adjust how often this is run with HOMEBREW_AUTO_UPDATE_SECS or disable with
HOMEBREW_NO_AUTO_UPDATE. Hide these hints with HOMEBREW_NO_ENV_HINTS (see `man brew`).
[34m==>[0m [1mAuto-updated Homebrew![0m
Updated 3 taps (homebrew/services, homebrew/core and homebrew/cask).
[34m==>[0m [1mNew Formulae[0m
cargo-flamegraph           openapi-tui                tiny
cargo-msrv                 pinact                     vgt
gh-ost                     proxyfor                   vue-language-server
ghc@9.10                   ratify                     wcstools
go-parquet-tools           runitor                    xc
harper                     serie                      xmq
libgit2@1.8                showcert                   yozefu
lol-html                   streamrip
ltex-ls-plus               swc
[34m==>[0m [1mNew Casks[0m
aw-edid-editor             green-go-control           readest
font-brass-mono            helio                    

In [35]:
!ffmpeg -version
!ffprobe -version

ffmpeg version 7.1 Copyright (c) 2000-2024 the FFmpeg developers
built with Apple clang version 16.0.0 (clang-1600.0.26.4)
configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.1_4 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --enabl

In [55]:
from pydub import AudioSegment

# 파일 로드
audio = AudioSegment.from_file("/Users/kimdohun/kimdohoon/MF2M/converted_audio.wav", format="wav")

# numpy 배열로 변환
samples = audio.get_array_of_samples()
samples

array('h', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [73]:
from pydub import AudioSegment
import numpy as np
import torch

# 오디오 파일 로드
file_path = "/Users/kimdohun/kimdohoon/MF2M/converted_audio.wav"
def load_audio(file_path):
    audio = AudioSegment.from_file(file_path, format="wav")

    # numpy 배열로 변환
    samples = audio.get_array_of_samples()
    samples_np = np.array(samples)

    # 채널 분리
    channels = audio.channels
    if channels > 1:
        samples_np = samples_np.reshape(-1, channels).T  # [채널 수, 샘플 수]
    else:
        samples_np = samples_np[np.newaxis, :]  # 모노인 경우 [1, 샘플 수]

    # 샘플링 레이트 가져오기
    sample_rate = audio.frame_rate

    # PyTorch 텐서로 변환
    waveform = torch.tensor(samples_np, dtype=torch.float32)
    return sample_rate,waveform

In [46]:
file_path=os.getcwd()+"/converted_audio.wav"

In [76]:
import torchaudio.transforms as transforms
sample_rate,waveform=load_audio(file_path)
print(type(waveform))
mel_transform=transforms.MelSpectrogram(sample_rate=sample_rate,n_mels=128,n_fft=2048,hop_length=512)

mel_transform=mel_transform(waveform)
mel_transform.shape

<class 'torch.Tensor'>


torch.Size([1, 128, 605])

In [89]:
log_mel_spectrogram = mel_transform.clamp(min=1e-5).log2().unsqueeze(0)

In [92]:
with torch.no_grad():
    model.eval()
    output=model(log_mel_spectrogram)

In [93]:
ctc_decode(output)

['가']