# LSTM
Long Short-Term Memory

In [75]:
import glob
import os
from pathlib import Path
from scipy.io import wavfile

import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.nn import functional as F
# Dataset
from torch.utils.data import DataLoader, Dataset, random_split

from tqdm.notebook import tqdm
# from google_drive_downloader import GoogleDriveDownloader as gdd

## AudioDataset Class

In [76]:
sliced_dataset = "../data/short_audio_dataset"
sliced_dataset_lenght = 16050
original_dataset = "../data/audio_dataset"
original_dataset_lenght = 80249

class AudioDataset(Dataset):
    def __init__(self, drop_both=False):
        root_folder = original_dataset
        max_length = original_dataset_lenght
        self.class_map = {"esben" : 0, "peter": 1, "both": 2}
        self.data = []
        self.wavs = []
        self.labels = []
        for subdir, dirs, files in os.walk(root_folder):
            for file_name in files:
                if "both" in subdir and drop_both:
                   continue
                file_path = os.path.join(subdir, file_name)
                _, wav = wavfile.read(file_path)
                wav = wav.astype(np.float32)
                if wav.shape[0] > max_length:
                    max_length = wav.shape[0]
                    print("Found wav with more length than specified max one, new max is:", wav.shape[0])
                wav = np.pad(wav, (0, max_length-wav.shape[0]))
                label_str = file_path.split('/')[3][2:]
                label = (np.int64(self.class_map[label_str]))
                self.wavs.append(wav)
                self.labels.append(label)
                self.data.append([wav, label])
        print("Max length of wav files:", max_length)


    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        wav = self.wavs[idx]
        class_id = self.labels[idx]
        # class_id = self.class_map[label]
        #wav_tensor = torch.from_numpy(wav)
        #class_id = torch.tensor(class_id)
        return wav, (len(wav) - 1), class_id


Data Generation

In [77]:
# Load data

dataset = AudioDataset(drop_both=True)
print(len(dataset.data))
train_data, val_data, test_data = random_split (dataset, [150, 30, 20])
print(len(train_data), len(val_data), len(test_data))
train_loader = DataLoader(train_data, batch_size=5, shuffle=True)
valid_loader = DataLoader(val_data, batch_size=5, shuffle=True)
test_loader = DataLoader(test_data, batch_size=5, shuffle=True)

# 33% should be considered like with the other methods 
# In the solution he plots the lengths of the data...but thats just for educational purpose so I didn't reuse that

Max length of wav files: 80249
200
150 30 20


**Task**

* Create a class to load the dataset
    * To be able to batch emails, they have to have the same length. You can ensure this in different ways e.g. zero-padding
    * Make a training and validation set based on *emails_ascii* and *targets*  
* Create and train a recurrent model to classify whether the emails are spam or not
    * Have a look at the [LSTM](https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html) module and the [Embedding](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html) module

To be able to batch-process the mails even though some of the mails are shorter than the chosen text-length, the dataset zero-pads the text and returns an index for the last valid character.

In [78]:
class voiceClassifier(nn.Module):

    def __init__(self, d=512, n_lstm_layers=1, dropout=0.):
        super().__init__()
        self.emb = nn.Embedding(128, d)  # max(emails_ascii) < 128 -------- WHAT DOES IT NEED TO BE FOR US?
        self.lstm = nn.LSTM(d, d, n_lstm_layers, batch_first=True, dropout=dropout)
        self.lin = nn.Linear(d, 1)

    def forward(self, x, end_idx):  # (B, nx)
        x = self.emb(x)  # (B, nx, d)
        y = self.lstm(x)[0][torch.arange(len(x)), end_idx]  # (B, d)
        return self.lin(y).view(-1)  # (B,)

In [79]:
#device = torch.device('cuda')

#model = voiceClassifier().to(device)
model = voiceClassifier()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
sched = torch.optim.lr_scheduler.MultiStepLR(opt, (30,))

train_losses = []
valid_losses = []
train_accuracies = []
valid_accuracies = []
lrs = []

pbar = tqdm(range(40))
for epoch in pbar:
    # train
    model.train()
    losses = []
    correct = total = 0
    for x, end_idx, y in train_loader:
        x, end_idx, y = x.type(torch.LongTensor), end_idx.type(torch.LongTensor), y.type(torch.LongTensor)
        print("x:", x)
        print("end_idx:", end_idx)
        print("y:", y)
        logits = model(x, end_idx)

        loss = F.binary_cross_entropy_with_logits(logits, y.float())
        opt.zero_grad()
        loss.backward()
        opt.step()

        losses.append(loss.item())
        correct += ((torch.sigmoid(logits) > 0.5) == y).sum().item()
        total += len(x)
    train_loss = np.mean(losses)
    train_acc = correct / total

    # valid
    model.eval()
    losses = []
    correct = total = 0
    for x, end_idx, y in loader_valid:
        x, end_idx, y = x.to(device), end_idx.to(device), y.to(device)
        with torch.no_grad():
            logits = model(x, end_idx)
        loss = F.binary_cross_entropy_with_logits(logits, y.float())
        losses.append(loss.item())
        correct += ((torch.sigmoid(logits) > 0.5) == y).sum().item()
        total += len(x)
    valid_loss = np.mean(losses)
    valid_acc = correct / total

    # sched
    sched.step()

    # history
    lrs.append(next(iter(opt.param_groups))['lr'])

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    train_accuracies.append(train_acc)
    valid_accuracies.append(valid_acc)

    pbar.set_description(f'loss: {train_loss:.3f}/{valid_loss:.3f}, acc: {train_acc:.2f}/{valid_acc:.2f}')


  0%|          | 0/40 [00:00<?, ?it/s]

x: tensor([[  437,   459,   439,  ...,    56,    44,     0],
        [ 1690,   115, -2135,  ...,   134,   134,     0],
        [   81,   589,   744,  ...,     0,     0,     0],
        [-2589, -3260, -3546,  ..., -4841, -5530,     0],
        [ -941, -1327, -1920,  ...,    39,    43,     0]])
end_idx: tensor([80248, 80248, 80248, 80248, 80248])
y: tensor([1, 0, 0, 0, 0])


IndexError: index out of range in self

In [None]:
# plot history
fig, axs = plt.subplots(1, 3, figsize=(10, 3))
axs[0].plot(train_losses, label='train')
axs[0].plot(valid_losses, labeFile ~/Documents/3Semester/DNN/Identity-recognition-from-audio/dnn_venv/lib/python3.10/site-packages/torch/nn/functional.py:2233, in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
   2227     # Note [embedding_renorm set_grad_enabled]
   2228     # XXX: equivalent to
   2229     # with torch.no_grad():
   2230     #   torch.embedding_renorm_
   2231     # remove once script supports set_grad_enabled
   2232     _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2233 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)

IndexError: index out of range in selfl='valid')
axs[0].set_ylabel('loss')
axs[0].legend()
axs[1].plot(train_accuracies, label='train')
axs[1].plot(valid_accuracies, label='valid')
axs[1].set_ylabel('acc')
axs[1].set_ylim(0.8, 1)
axs[1].legend()
axs[2].plot(lrs)
axs[2].set_ylabel('lr')
axs[2].set_yscale('log')
plt.tight_layout()
plt.show()