# LSTM
Long Short-Term Memory

In [9]:
import glob
import os
from pathlib import Path
from scipy.io import wavfile

import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.nn import functional as F
# Dataset
from torch.utils.data import DataLoader, Dataset, random_split

import librosa
import librosa.display
import IPython.display as ipd

from tqdm.notebook import tqdm
# from google_drive_downloader import GoogleDriveDownloader as gdd

## AudioDataset Class

In [23]:
sliced_dataset = "short_audio_dataset"
sliced_dataset_lenght = 16050
# sliced_dataset = "shorter_audio_dataset"
# sliced_dataset_lenght = 4013
original_dataset = "audio_dataset"
original_dataset_lenght = 80249

class AudioDataset(Dataset):
    def __init__(self, root_path="./data/", drop_both=False, use_short=False, normalize=False, use_features=False):
        root_folder = root_path + original_dataset if not use_short else root_path + sliced_dataset
        self.use_features = use_features
        self.max_length = original_dataset_lenght if not use_short else sliced_dataset_lenght
        self.class_map = {"esben" : 0, "peter": 1, "both": 2}
        self.data = []
        self.wavs = []
        self.labels = []
        self.features_list = []
        self.min_val = 10e10
        self.max_val = 0
        print("Start reading files")
        for subdir, dirs, files in os.walk(root_folder):
            for file_name in files:
                if drop_both and "both" in subdir:
                   continue
        
                file_path = os.path.join(subdir, file_name)
                self.sample_rate, wav = wavfile.read(file_path)
                wav = wav.astype(np.float32)
                
                if wav.shape[0] > self.max_length:
                    self.max_length = wav.shape[0]
                    print("Found wav with more length than specified max one, new max is:", wav.shape[0])
                
                wav = np.pad(wav, (0, self.max_length-wav.shape[0]))
                features = np.transpose(self.feature_extraction(wav, self.sample_rate, normalize=normalize))
                label_str = file_path.split('/')[-3][2:]
                label = (np.int64(self.class_map[label_str]))
                
                self.max_val = np.max(wav) if np.max(wav) > self.max_val else self.max_val
                self.min_val = np.min(wav) if np.min(wav) < self.min_val else self.min_val
                
                self.wavs.append(wav)
                self.features_list.append(features)
                self.labels.append(label)
               
        self.wavs = np.array(self.wavs)
        self.mu  = self.wavs.mean()
        self.std = np.std(self.wavs)
        # self.wavs = torch.Tensor(self.wavs)
        if normalize:
            self.wavs = (self.wavs + np.abs(self.min_val)) / (np.abs(self.min_val) + self.max_val)
            # self.wavs = torch.nn.functional.normalize(self.wavs, dim=1)
        
        print("="*40)
        print("Loaded DATABASE from {}\n{} total file\nLongest file is {} long\nMean: {}\nStandard deviation: {}\nNormalization: {}".
              format(root_folder, len(self.wavs), self.max_length, self.mu, self.std, normalize))
        if use_features:
            print("Feature dimensions for the first few samples:")
            for i in range(5):  # Print dimensions for the first 5 samples as an example
                print("Sample {}: {}".format(i, self.features_list[i].shape))
        print("="*40)
    
    def feature_extraction(self, wav, sample_rate, n_mfcc=128, normalize=False):
        # extract features from the audio
        mfcc = np.mean(librosa.feature.mfcc(y=wav, sr=sample_rate, n_mfcc=n_mfcc).T, axis=0)
        if normalize:
            mfcc = (mfcc + np.abs(np.min(mfcc))) / (np.abs(np.min(mfcc)) + np.max(mfcc))
        return mfcc

    def __len__(self):
        return len(self.wavs)
    
    def __getitem__(self, idx):
        wav = self.wavs[idx]
        label = self.labels[idx]
        wav_tensor = torch.from_numpy(wav)
        label_tensor = torch.Tensor(label)
        if self.use_features:
            features = self.features_list[idx]
            features_tensor = torch.Tensor(features)
            return features_tensor, label_tensor
        return wav_tensor, label_tensor


Data Generation

In [24]:
# Load data

audio_dataset = AudioDataset(root_path="../data/", drop_both=True, use_short=True, normalize=True, use_features=True)
dataset_len = len(audio_dataset)
train_size, test_size, valid_size = int(dataset_len * 0.7), int(dataset_len * 0.2), int(dataset_len * 0.1)

dataset_train, dataset_test, dataset_valid = torch.utils.data.random_split(audio_dataset, (train_size, test_size, valid_size))

kwargs = {'batch_size': 1, 'num_workers': 2}
loader_train = torch.utils.data.DataLoader(dataset_train, **kwargs, shuffle=True)
loader_test = torch.utils.data.DataLoader(dataset_test, **kwargs, shuffle=True)
loader_valid = torch.utils.data.DataLoader(dataset_valid, **kwargs, shuffle=True)

# 33% should be considered like with the other methods 
# In the solution he plots the lengths of the data...but thats just for educational purpose so I didn't reuse that

Start reading files
Loaded DATABASE from ../data/short_audio_dataset
1000 total file
Longest file is 16050 long
Mean: -0.6988561153411865
Standard deviation: 2332.388427734375
Normalization: True
Feature dimensions for the first few samples:
Sample 0: (128,)
Sample 1: (128,)
Sample 2: (128,)
Sample 3: (128,)
Sample 4: (128,)


**Task**

* Create a class to load the dataset
    * To be able to batch emails, they have to have the same length. You can ensure this in different ways e.g. zero-padding
    * Make a training and validation set based on *emails_ascii* and *targets*  
* Create and train a recurrent model to classify whether the emails are spam or not
    * Have a look at the [LSTM](https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html) module and the [Embedding](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html) module

To be able to batch-process the mails even though some of the mails are shorter than the chosen text-length, the dataset zero-pads the text and returns an index for the last valid character.

In [26]:
#class voiceClassifier(nn.Module):

#    def __init__(self, d=512, n_lstm_layers=1, dropout=0.):
#        super().__init__()
#        self.emb = nn.Embedding(128, d)  # max(emails_ascii) < 128 -------- WHAT DOES IT NEED TO BE FOR US?
#        self.lstm = nn.LSTM(d, d, n_lstm_layers, batch_first=True, dropout=dropout)
#        self.lin = nn.Linear(d, 1)

#    def forward(self, x, end_idx):  # (B, nx)
#        x = self.emb(x)  # (B, nx, d)
#        y = self.lstm(x)[0][torch.arange(len(x)), end_idx]  # (B, d)
#        return self.lin(y).view(-1)  # (B,)
class voiceClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(voiceClassifier, self).__init__()  # Use voiceClassifier instead of AudioRNN
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()  # For binary classification

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)

        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))  # Output shape: (batch_size, seq_length, hidden_size)

        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        output = self.sigmoid(out)  # Sigmoid for binary classification
        return output

In [28]:
#device = torch.device('cuda')

#model = voiceClassifier().to(device)
input_size = 128  # I think this is the size of our features
hidden_size = 128  # Choose an appropriate size
num_layers = 1  # Number of LSTM layers
output_size = 1  # For binary classification
model = voiceClassifier(input_size, hidden_size, num_layers, output_size)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
sched = torch.optim.lr_scheduler.MultiStepLR(opt, (30,))

train_losses = []
valid_losses = []
train_accuracies = []
valid_accuracies = []
lrs = []

pbar = tqdm(range(40))
for epoch in pbar:
    # train
    model.train()
    losses = []
    correct = total = 0
    for x, end_idx, y in train_loader:
        x, end_idx, y = x.type(torch.LongTensor), end_idx.type(torch.LongTensor), y.type(torch.LongTensor)
        
        print("x:", x)
        
        print("end_idx:", end_idx)
        print("x:", x.size())
        print("end_idx:", end_idx.size())
        print("y:", y.size())
        logits = model(x)

        loss = F.binary_cross_entropy_with_logits(logits, y.float())
        opt.zero_grad()
        loss.backward()
        opt.step()

        losses.append(loss.item())
        correct += ((torch.sigmoid(logits) > 0.5) == y).sum().item()
        total += len(x)
    train_loss = np.mean(losses)
    train_acc = correct / total

    # valid
    model.eval()
    losses = []
    correct = total = 0
    for x, end_idx, y in loader_valid:
        x, end_idx, y = x.to(device), end_idx.to(device), y.to(device)
        with torch.no_grad():
            logits = model(x, end_idx)
        loss = F.binary_cross_entropy_with_logits(logits, y.float())
        losses.append(loss.item())
        correct += ((torch.sigmoid(logits) > 0.5) == y).sum().item()
        total += len(x)
    valid_loss = np.mean(losses)
    valid_acc = correct / total

    # sched
    sched.step()

    # history
    lrs.append(next(iter(opt.param_groups))['lr'])

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    train_accuracies.append(train_acc)
    valid_accuracies.append(valid_acc)

    pbar.set_description(f'loss: {train_loss:.3f}/{valid_loss:.3f}, acc: {train_acc:.2f}/{valid_acc:.2f}')


  0%|          | 0/40 [00:00<?, ?it/s]

x: tensor([[-3449, -3397, -3408,  ...,     0,     0,     0],
        [ 1871,  2118,  2216,  ...,     0,     0,     0],
        [  821,   442,   120,  ...,     0,     0,     0],
        [   -1,    -3,    -5,  ...,     0,     0,     0],
        [ -602,  -253,  -305,  ...,     0,     0,     0]])
end_idx: tensor([80248, 80248, 80248, 80248, 80248])
x: torch.Size([5, 80249])
end_idx: torch.Size([5])
y: torch.Size([5])


RuntimeError: For unbatched 2-D input, hx and cx should also be 2-D but got (3-D, 3-D) tensors

In [None]:
# plot history
fig, axs = plt.subplots(1, 3, figsize=(10, 3))
axs[0].plot(train_losses, label='train')
axs[0].plot(valid_losses, labeFile ~/Documents/3Semester/DNN/Identity-recognition-from-audio/dnn_venv/lib/python3.10/site-packages/torch/nn/functional.py:2233, in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
   2227     # Note [embedding_renorm set_grad_enabled]
   2228     # XXX: equivalent to
   2229     # with torch.no_grad():
   2230     #   torch.embedding_renorm_
   2231     # remove once script supports set_grad_enabled
   2232     _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2233 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)

IndexError: index out of range in selfl='valid')
axs[0].set_ylabel('loss')
axs[0].legend()
axs[1].plot(train_accuracies, label='train')
axs[1].plot(valid_accuracies, label='valid')
axs[1].set_ylabel('acc')
axs[1].set_ylim(0.8, 1)
axs[1].legend()
axs[2].plot(lrs)
axs[2].set_ylabel('lr')
axs[2].set_yscale('log')
plt.tight_layout()
plt.show()