In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import os
import torchaudio
import librosa
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [5]:
input_dim = 40
hidden_dim = 128
num_layers = 3
output_dim = 40
latent_dim = 16
seq_length = len(training_data[0])
batch_size = 5
learning_rate= 0.01
num_epochs = 100

In [2]:

class MFCCDataset(Dataset):
    def __init__(self, root_dir, sample_rate=16000, n_mfcc=40):
        self.root_dir = root_dir
        self.file_list = os.listdir(root_dir)
        self.sample_rate = sample_rate
        self.n_mfcc = n_mfcc
    
    def __len__(self):
        return len(self.file_list)
    
    def __getitem__(self, idx):
        file_path = os.path.join(self.root_dir, self.file_list[idx])
        waveform, _ = librosa.load(file_path, sr=self.sample_rate, mono=True)
        mfcc = librosa.feature.mfcc(y=waveform, sr=self.sample_rate, n_mfcc=self.n_mfcc)
        mfcc_tensor = torch.tensor(mfcc.T)  # Transpose for PyTorch compatibility (seq_length, n_mfcc)
        return mfcc_tensor


In [3]:
from torch.nn.utils.rnn import pad_sequence
training_data = MFCCDataset(r"C:\Users\rkdrn\Untitled Folder\dataset")
training_data_iterable = [training_data[i] for i in range(len(training_data))]
training_data = nn.utils.rnn.pad_sequence([torch.tensor(seq).clone().detach().requires_grad_(True) for seq in training_data], batch_first=True)
 

  training_data = nn.utils.rnn.pad_sequence([torch.tensor(seq).clone().detach().requires_grad_(True) for seq in training_data], batch_first=True)


In [6]:
training_data, validation_data = train_test_split(training_data, test_size=10)
train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
validation_dataloader= DataLoader(validation_data, batch_size=batch_size, shuffle=True)

In [7]:
del training_data, validation_data

In [8]:
class LSTMAutoEncoder(nn.Module):
    def __init__(self, num_layers, hidden_size, nb_feature, dropout=0, device=torch.device('cpu')):
        super(LSTMAutoEncoder, self).__init__()
        self.device = device
        self.encoder = Encoder(num_layers, hidden_size, nb_feature, dropout, device)
        self.decoder = Decoder(num_layers, hidden_size, nb_feature, dropout, device)

    def forward(self, input_seq):
        output = torch.zeros(size=input_seq.shape, dtype=torch.float)
        hidden_cell = self.encoder(input_seq)
        input_decoder = input_seq[:, -1, :].view(input_seq.shape[0], 1, input_seq.shape[2])
        for i in range(input_seq.shape[1] - 1, -1, -1):
            output_decoder, hidden_cell = self.decoder(input_decoder, hidden_cell)
            input_decoder = output_decoder
            output[:, i, :] = output_decoder[:, 0, :]
        return output


class Encoder(nn.Module):
    def __init__(self, num_layers, hidden_size, nb_feature, dropout=0, device=torch.device('cpu')):
        super(Encoder, self).__init__()

        self.input_size = nb_feature
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.device = device

        self.lstm = nn.LSTM(input_size=nb_feature, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True, dropout=dropout, bias=True)

    def initHidden(self, batch_size):
        self.hidden_cell = (
            torch.randn((self.num_layers, batch_size, self.hidden_size), dtype=torch.float).to(self.device),
            torch.randn((self.num_layers, batch_size, self.hidden_size), dtype=torch.float).to(self.device)
        )

    def forward(self, input_seq):
        self.initHidden(input_seq.shape[0])
        _, self.hidden_cell = self.lstm(input_seq, self.hidden_cell)
        return self.hidden_cell


class Decoder(nn.Module):
    def __init__(self, num_layers, hidden_size, nb_feature, dropout=0, device=torch.device('cpu')):
        super(Decoder, self).__init__()

        self.input_size = nb_feature
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.device = device

        self.lstm = nn.LSTM(input_size=nb_feature, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True, dropout=dropout, bias=True)
        self.linear = nn.Linear(in_features=hidden_size, out_features=nb_feature)

    def forward(self, input_seq, hidden_cell):
        output, hidden_cell = self.lstm(input_seq, hidden_cell)
        output = self.linear(output)
        return output, hidden_cell

In [None]:
class LSTMEncoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers):
        super(LSTMEncoder, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
    
    def forward(self, x,lengths):
        packed_x = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        _, (hidden, _) = self.lstm(x)
        return hidden[-1]

class LSTMDecoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super(LSTMDecoder, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x, seq_length):
        lstm_out, _ = self.lstm(x)
        return self.fc(lstm_out).reshape(-1, seq_length, output_dim)

class LSTMAutoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim, seq_length):
        super(LSTMAutoencoder, self).__init__()
        self.encoder = LSTMEncoder(input_dim, hidden_dim, num_layers)
        self.decoder = LSTMDecoder(hidden_dim, hidden_dim, num_layers, output_dim)
        self.seq_length = seq_length
    
    def forward(self, x):
        context = self.encoder(x)
        # Repeat the context vector to feed it to each time step of the decoder
        repeat_context = context.unsqueeze(1).repeat(1, self.seq_length, 1)
        output = self.decoder(repeat_context, self.seq_length)
        return output

In [None]:
class LSTMAutoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers=1):
        super(LSTMAutoencoder, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # Encoder LSTM
        self.lstm_encoder = nn.LSTM(input_dim, hidden_dim, 3, batch_first=True)
        # Decoder LSTM
        self.lstm_decoder = nn.LSTM(hidden_dim, input_dim, 3, batch_first=True)

    def forward(self, x):
        # Encoder
        _, (hidden, _) = self.lstm_encoder(x)
        
        # Replicate the hidden state across time steps
        repeated_hidden = hidden.repeat(1, x.size(1), 1)
        
        # Decoder
        output, _ = self.lstm_decoder(repeated_hidden)
        return output

In [None]:
input_dim = 40
hidden_dim = 128
num_layers = 3
output_dim = 40
seq_length = len(training_data[0])
batch_size = 10

In [18]:
model = RecurrentAutoencoder(seq_length, input_dim,  16)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [9]:
model = LSTMAutoEncoder(3, hidden_dim, output_dim)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [10]:
def train(dataloader, model, loss_fn, optimizer):
    model.train()  # 모델을 학습 모드로 설정
    running_loss = 0.0
    
    for batch, data in tqdm(enumerate(dataloader, 1), total=len(dataloader)):
        inputs = data

        # 순방향 전달 및 손실 계산
        recon = model(inputs)
        loss = loss_fn(recon, inputs)

        # 손실값을 역전파하여 모델을 갱신합니다.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # 통계 출력
        running_loss += loss.item()
    
    epoch_loss = running_loss / len(dataloader)
    return epoch_loss


def test(dataloader, model, loss_fn):
    model.eval()  # 모델을 평가 모드로 설정
    test_loss = 0.0
    
    with torch.no_grad(): 
        for data in dataloader:
            inputs = data  
            recon = model(inputs)
            test_loss += loss_fn(recon, inputs).item()
    
    epoch_loss = test_loss / len(dataloader)
    return epoch_loss

In [None]:
train_loss_list = []
val_loss_list = []

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    train_loss = test(train_dataloader, model, loss_fn)
    train_loss_list.append(train_loss)
    print(f"Train Error: {train_loss:.8f}")

    val_loss = test(validation_dataloader, model, loss_fn)
    val_loss_list.append(val_loss)
    print(f"Validation Error: {val_loss:.8f}")

print("Training complete")

Epoch 1
-------------------------------


  6%|████▍                                                                           | 1/18 [07:19<2:04:25, 439.12s/it]

In [None]:
n_steps = 50
n_features = 1
latent_dim = 10

ENCODER_1 = 512
ENCODER_2 = 256
ENCODER_3 = 128

LATENT_VECTOR = 64

DECODER_1 = 512
DECODER_2 = 256
DECODER_3 = 128

In [None]:
import torch.nn.init as init

class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()

        self.encoder = nn.Sequential(
            nn.Linear(784, ENCODER_1),
            nn.SELU(),
            nn.Linear(ENCODER_1, ENCODER_2),
            nn.SELU(),
            nn.Linear(ENCODER_2, ENCODER_3),
            nn.SELU(),
            nn.Linear(ENCODER_3, LATENT_VECTOR),
        )
        self.initialize_weights(self.encoder)

        self.decoder = nn.Sequential(
            nn.Linear(LATENT_VECTOR, DECODER_1),
            nn.SELU(),
            nn.Linear(DECODER_1, DECODER_2),
            nn.SELU(),
            nn.Linear(DECODER_2, DECODER_3),
            nn.SELU(),
            nn.Linear(DECODER_3, 784),
            nn.Sigmoid()
        )
        self.initialize_weights(self.decoder)

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

    def initialize_weights(self, module):
        for m in module:
            if isinstance(m, nn.Linear):
                init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    init.zeros_(m.bias)


model = Autoencoder().to(device)
print(model)

In [None]:
pip install torchaudio