## Recurrent Neural Network (incl. LSTM and GRU)

This notebook contains the code for building RNNs in PyTorch. It also includes the structures of LSTM and GRU cells, as well as how they work under the hood. It is recommended to run this in Google Colab, as training without a GPU will take a really long time.

In [None]:
# Prefix that is prepended to "data_speech_commands_v0.02/..." when the dataset
# objects are built further down. An empty string means "relative to the current
# working directory"; for a non-empty value a trailing "/" is the safe choice.
dataset_folder = "" # this should change depending on where the data files are stored

In [None]:
import math
import os
import random
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset
import numpy as np
from scipy.io.wavfile import read
import librosa
from matplotlib import pyplot as plt

# Whether a CUDA device can be used in this session.
cuda = torch.cuda.is_available()

# Float tensor constructor matching the device chosen above.
if cuda:
    Tensor = torch.cuda.FloatTensor
else:
    Tensor = torch.FloatTensor


In [None]:
# NOTE(review): `makedirs` is imported by name here but the code below calls
# `os.makedirs`, so this second import is currently unused.
from os.path import exists
from os import makedirs

# Create the extraction target directory if it is not there yet.
if not os.path.exists('/content/data_speech_commands_v0.02/'):
    os.makedirs('/content/data_speech_commands_v0.02/')

# Download the archive only when it is not already in the working directory.
# The "!" lines are IPython/Jupyter shell escapes, not plain Python.
if not exists('data_speech_commands_v0.02.zip'):
    !wget -O data_speech_commands_v0.02.zip https://www.doc.ic.ac.uk/~pam213/co460_files/data_speech_commands_v0.02.zip

# The archive is unpacked every time this cell runs, whether or not it was
# extracted before.
!unzip data_speech_commands_v0.02.zip -d "/content/data_speech_commands_v0.02/"

In [None]:
def set_seed(seed_value):
    """Seed every random number generator this notebook relies on.

    The same value is handed, in order, to Python's ``random`` module, NumPy's
    global generator, PyTorch's default generator and PyTorch's CUDA generators.

    Args:
        seed_value: Seed passed unchanged to each seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed_value)


set_seed(42)

In [None]:
class SpeechCommandsDataset(Dataset):
    """Google Speech Commands dataset restricted to the classes in ``get_classes()``.

    Files are assigned to the "valid" / "test" splits when they are listed in
    ``validation_list.txt`` / ``testing_list.txt`` inside the dataset folder;
    every other file of a selected class belongs to "train".

    Each item is a ``(features, label)`` pair: ``features`` is the transposed
    MFCC matrix of the clip as ``float32`` (presumably frames first, then
    coefficients -- confirm against the librosa documentation) and ``label`` is
    the index of the clip's class in ``get_classes()``.
    """

    # Folder, relative to ``root_dir``, that holds the extracted dataset.
    DATA_SUBDIR = "data_speech_commands_v0.02"
    # Every clip is zero-padded or truncated to exactly this many samples.
    NUM_SAMPLES = 16000
    # Sampling rate handed to the MFCC computation.
    SAMPLE_RATE = 16000
    VALID_SPLITS = ("train", "valid", "test")

    def __init__(self, root_dir, split):
        """
        Args:
            root_dir (string): Directory that contains the dataset folder.
                May be empty (current working directory) and may or may not
                end with a path separator.
            split    (string): In ["train", "valid", "test"].

        Raises:
            ValueError: If ``split`` is not one of the supported names.
        """
        # Fail before touching the file system instead of half-way through the scan.
        if split not in self.VALID_SPLITS:
            raise ValueError("Invalid split name.")

        self.root_dir = root_dir
        self.split = split

        classes = self.get_classes()
        self.number_of_classes = len(classes)

        self.class_to_file = defaultdict(list)

        self.valid_filenames = self.get_valid_filenames()
        self.test_filenames = self.get_test_filenames()

        for c in classes:
            for filename in sorted(os.listdir(self._data_path(c))):
                in_valid = filename in self.valid_filenames[c]
                in_test = filename in self.test_filenames[c]
                if split == "train":
                    keep = not in_valid and not in_test
                elif split == "valid":
                    keep = in_valid
                else:
                    keep = in_test
                if keep:
                    self.class_to_file[c].append(filename)

        # Flatten into parallel lists; the label is the class position in get_classes().
        self.filepath_list = []
        self.label_list = []
        for label, c in enumerate(classes):
            filenames = sorted(self.class_to_file[c])
            self.filepath_list.extend(self._data_path(c, name) for name in filenames)
            self.label_list.extend([label] * len(filenames))
        self.number_of_samples = len(self.filepath_list)

    def __len__(self):
        """Return the number of clips in this split."""
        return self.number_of_samples

    def __getitem__(self, idx):
        """Return ``(mfcc_features, label)`` for the clip at position ``idx``."""
        waveform = self._load_waveform(self.filepath_list[idx])

        features = librosa.feature.mfcc(y=waveform, sr=self.SAMPLE_RATE, hop_length=512, n_fft=2048)
        features = features.transpose().astype(np.float32)

        return features, self.label_list[idx]

    def get_classes(self):
        """Return the class names used by this dataset, in label order."""
        return ['one', 'two', 'three']

    def get_valid_filenames(self):
        """Return ``{class_name: set(filenames)}`` read from validation_list.txt."""
        return self._read_split_list("validation_list.txt")

    def get_test_filenames(self):
        """Return ``{class_name: set(filenames)}`` read from testing_list.txt."""
        return self._read_split_list("testing_list.txt")

    def _data_path(self, *parts):
        """Join ``parts`` onto the dataset folder below ``root_dir``."""
        return os.path.join(self.root_dir, self.DATA_SUBDIR, *parts)

    def _load_waveform(self, path):
        """Read a wav file into a float32 vector of exactly NUM_SAMPLES entries.

        Shorter clips are zero-padded at the end; longer clips are truncated
        (previously a longer clip raised a broadcasting error). The sample
        values are copied as stored in the file, without rescaling.
        NOTE(review): assumes single-channel audio -- confirm for other data.
        """
        sample = np.zeros((self.NUM_SAMPLES,), dtype=np.float32)
        data = read(path)[1]
        count = min(data.size, self.NUM_SAMPLES)
        sample[:count] = data[:count]
        return sample

    def _read_split_list(self, list_name):
        """Parse a "<class>/<filename>" list file, keeping only the selected classes."""
        class_names = self.get_classes()

        class_to_filename = defaultdict(set)
        with open(self._data_path(list_name), "r") as fp:
            for line in fp:
                parts = line.strip().split("/")
                # Skip blank or malformed lines instead of failing on parts[1].
                if len(parts) >= 2 and parts[0] in class_names:
                    class_to_filename[parts[0]].add(parts[1])

        return class_to_filename

In [None]:
# Hyper-parameters shared by the loaders and the training loop.
batch_size = 100
num_epochs = 5
valid_every_n_steps = 20

# One dataset object per split, all rooted at the same folder.
train_dataset = SpeechCommandsDataset(dataset_folder, "train")
valid_dataset = SpeechCommandsDataset(dataset_folder, "valid")
test_dataset = SpeechCommandsDataset(dataset_folder, "test")

# Only the training loader shuffles; validation and test keep the dataset order.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=batch_size, shuffle=True
)
valid_loader = torch.utils.data.DataLoader(
    dataset=valid_dataset, batch_size=batch_size, shuffle=False
)
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset, batch_size=batch_size, shuffle=False
)

In [None]:
# Define LSTM and GRU

class LSTMCell(nn.Module):
    """One LSTM time step built from two linear maps.

    ``x2h`` and ``h2h`` each produce ``4 * hidden_size`` pre-activations laid
    out as [forget | input | output | candidate].

    Args:
        input_size: Number of features of the input vector.
        hidden_size: Number of features of the hidden and cell states.
        bias: Whether the two linear maps carry a bias term.
    """

    def __init__(self, input_size, hidden_size, bias=True):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias

        self.x2h = nn.Linear(input_size, 4 * hidden_size, bias=bias)
        self.h2h = nn.Linear(hidden_size, 4 * hidden_size, bias=bias)

        self.reset_parameters()

    def reset_parameters(self):
        """Redraw every parameter uniformly from [-1/sqrt(hidden_size), 1/sqrt(hidden_size)]."""
        bound = 1.0 / math.sqrt(self.hidden_size)
        with torch.no_grad():
            for param in self.parameters():
                param.uniform_(-bound, bound)

    def forward(self, input, hx=None):
        """Advance the cell by one step.

        Args:
            input: Tensor whose first dimension is the batch and whose second
                dimension is fed to ``x2h``.
            hx: Optional ``(hidden, cell)`` pair; zeros are used when omitted.

        Returns:
            The new ``(hidden, cell)`` pair.
        """
        if hx is None:
            zeros = input.new_zeros(input.size(0), self.hidden_size, requires_grad=False)
            hx = (zeros, zeros)

        # hx packs both the hidden and the cell state.
        h_prev, c_prev = hx

        pre_act = self.x2h(input) + self.h2h(h_prev)

        size = self.hidden_size
        forget_gate = torch.sigmoid(pre_act[:, :size])
        input_gate = torch.sigmoid(pre_act[:, size:2 * size])
        output_gate = torch.sigmoid(pre_act[:, 2 * size:3 * size])
        candidate = torch.tanh(pre_act[:, 3 * size:])

        c_next = forget_gate * c_prev + input_gate * candidate
        h_next = output_gate * torch.tanh(c_next)

        return (h_next, c_next)

class BasicRNNCell(nn.Module):
    """One step of a plain (Elman-style) recurrent layer.

    Computes ``act(x2h(input) + h2h(hidden))`` where ``act`` is the function of
    ``torch.nn.functional`` named by ``nonlinearity``.

    Args:
        input_size: Number of features of the input vector.
        hidden_size: Number of features of the hidden state.
        bias: Whether the two linear maps carry a bias term.
        nonlinearity: Either "tanh" or "relu".

    Raises:
        ValueError: If ``nonlinearity`` is neither "tanh" nor "relu".
    """

    def __init__(self, input_size, hidden_size, bias=True, nonlinearity="tanh"):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        self.nonlinearity = nonlinearity
        if nonlinearity not in ("tanh", "relu"):
            raise ValueError("Invalid nonlinearity selected for RNN.")

        self.x2h = nn.Linear(input_size, hidden_size, bias=bias)
        self.h2h = nn.Linear(hidden_size, hidden_size, bias=bias)

        self.reset_parameters()

    def reset_parameters(self):
        """Redraw every parameter uniformly from [-1/sqrt(hidden_size), 1/sqrt(hidden_size)]."""
        bound = 1.0 / math.sqrt(self.hidden_size)
        for param in self.parameters():
            param.data.uniform_(-bound, bound)

    def forward(self, input, hx=None):
        """Return the next hidden state; a missing ``hx`` is treated as zeros."""
        if hx is None:
            hx = input.new_zeros(input.size(0), self.hidden_size, requires_grad=False)

        # Looked up on every call so it always follows the current attribute value.
        act_fn = getattr(nn.functional, self.nonlinearity)
        pre_act = self.x2h(input) + self.h2h(hx)
        return act_fn(pre_act)

    
    
class GRUCell(nn.Module):
    """One GRU time step.

    ``x2r`` / ``h2r`` produce the ``2 * hidden_size`` pre-activations of the
    reset and update gates (in that order); ``x2h`` / ``h2h`` produce the two
    halves of the candidate state.

    Args:
        input_size: Number of features of the input vector.
        hidden_size: Number of features of the hidden state.
        bias: Whether the linear maps carry a bias term.
    """

    def __init__(self, input_size, hidden_size, bias=True):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias

        # Candidate-state projections.
        self.x2h = nn.Linear(input_size, hidden_size, bias=bias)
        self.h2h = nn.Linear(hidden_size, hidden_size, bias=bias)

        # Gate projections (reset and update gates side by side).
        self.x2r = nn.Linear(input_size, 2 * hidden_size, bias=bias)
        self.h2r = nn.Linear(hidden_size, 2 * hidden_size, bias=bias)

        self.reset_parameters()

    def reset_parameters(self):
        """Redraw every parameter uniformly from [-1/sqrt(hidden_size), 1/sqrt(hidden_size)]."""
        bound = 1.0 / math.sqrt(self.hidden_size)
        for param in self.parameters():
            param.data.uniform_(-bound, bound)

    def forward(self, input, hx=None):
        """Return the next hidden state; a missing ``hx`` is treated as zeros."""
        if hx is None:
            hx = input.new_zeros(input.size(0), self.hidden_size, requires_grad=False)

        gate_pre = self.x2r(input) + self.h2r(hx)
        cand_from_x = self.x2h(input)
        cand_from_h = self.h2h(hx)

        size = self.hidden_size
        reset_gate = torch.sigmoid(gate_pre[:, :size])
        update_gate = torch.sigmoid(gate_pre[:, size:])

        # The reset gate scales only the hidden-state contribution to the candidate.
        candidate = torch.tanh(cand_from_x + reset_gate * cand_from_h)

        # The update gate blends the candidate with the previous state.
        return (1 - update_gate) * candidate + update_gate * hx

In [None]:
# Define RNN models
class RNNModel(nn.Module):
    """Stacked unidirectional recurrent classifier.

    The input sequence is run through ``num_layers`` recurrent cells, one layer
    after the other; the top layer's output at the last time step is mapped to
    ``output_size`` scores by a linear layer.

    Args:
        mode: One of "LSTM", "GRU", "RNN_TANH" or "RNN_RELU".
        input_size: Number of features per time step.
        hidden_size: Number of hidden features of every layer.
        num_layers: Number of stacked recurrent layers.
        bias: Whether the recurrent cells use bias terms.
        output_size: Number of output scores.

    Raises:
        ValueError: If ``mode`` is not one of the supported names.
    """

    def __init__(self, mode, input_size, hidden_size, num_layers, bias, output_size):
        super().__init__()
        self.mode = mode
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bias = bias
        self.output_size = output_size

        # The first layer reads the raw features; every further layer reads the
        # hidden state of the layer below it.
        layer_inputs = [self.input_size] + [self.hidden_size] * (self.num_layers - 1)

        self.rnn_cell_list = nn.ModuleList()
        for in_features in layer_inputs:
            self.rnn_cell_list.append(self._make_cell(in_features))

        self.fc = nn.Linear(self.hidden_size, self.output_size)

    def _make_cell(self, in_features):
        """Build one recurrent cell of the configured ``mode``."""
        if self.mode == 'LSTM':
            return LSTMCell(in_features, self.hidden_size, self.bias)
        if self.mode == 'GRU':
            return GRUCell(in_features, self.hidden_size, self.bias)
        if self.mode == 'RNN_TANH':
            return BasicRNNCell(in_features, self.hidden_size, self.bias, "tanh")
        if self.mode == 'RNN_RELU':
            return BasicRNNCell(in_features, self.hidden_size, self.bias, "relu")
        raise ValueError("Invalid RNN mode selected.")

    def forward(self, input, hx=None):
        """Score a batch of sequences.

        Args:
            input: Tensor indexed as (batch, time, feature).
            hx: Optional sequence with one initial state per layer; each cell
                starts from its own default state when omitted.

        Returns:
            Tensor of shape (batch, output_size).
        """
        states = [None] * self.num_layers if hx is None else list(hx)

        # One tensor per time step; overwritten in place with each layer's
        # output so that it becomes the input of the next layer.
        steps = list(input.permute(1, 0, 2))

        for layer, cell in enumerate(self.rnn_cell_list):
            state = states[layer]
            for t, x_t in enumerate(steps):
                state = cell(x_t, state)
                # LSTM cells return (hidden, cell); only the hidden part is passed on.
                steps[t] = state[0] if self.mode == "LSTM" else state

        # No squeeze() here: it used to drop the batch dimension for a batch of
        # one (and the feature dimension when hidden_size == 1).
        return self.fc(steps[-1])
    

class BidirRecurrentModel(nn.Module):
    """Stacked bidirectional recurrent classifier.

    Two independent stacks of ``num_layers`` cells are used: one reads the
    sequence in its given order, the other reads it reversed. One output of
    each stack is concatenated and mapped to ``output_size`` scores by a
    linear layer.

    Args:
        mode: One of "LSTM", "GRU", "RNN_TANH" or "RNN_RELU".
        input_size: Number of features per time step.
        hidden_size: Number of hidden features of every layer.
        num_layers: Number of stacked layers per direction.
        bias: Whether the recurrent cells use bias terms.
        output_size: Number of output scores.

    Raises:
        ValueError: If ``mode`` is not one of the supported names.
    """

    def __init__(self, mode, input_size, hidden_size, num_layers, bias, output_size):
        super().__init__()
        self.mode = mode
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bias = bias
        self.output_size = output_size

        # The first layer reads the raw features; every further layer reads the
        # hidden state of the layer below it (same direction only).
        layer_inputs = [self.input_size] + [self.hidden_size] * (self.num_layers - 1)

        # The whole forward stack is created before the reverse stack.
        self.rnn_cell_list = nn.ModuleList()
        for in_features in layer_inputs:
            self.rnn_cell_list.append(self._make_cell(in_features))

        self.rnn_cell_list_rev = nn.ModuleList()
        for in_features in layer_inputs:
            self.rnn_cell_list_rev.append(self._make_cell(in_features))

        self.fc = nn.Linear(self.hidden_size * 2, self.output_size)

    def _make_cell(self, in_features):
        """Build one recurrent cell of the configured ``mode``."""
        if self.mode == 'LSTM':
            return LSTMCell(in_features, self.hidden_size, self.bias)
        if self.mode == 'GRU':
            return GRUCell(in_features, self.hidden_size, self.bias)
        if self.mode == 'RNN_TANH':
            return BasicRNNCell(in_features, self.hidden_size, self.bias, "tanh")
        if self.mode == 'RNN_RELU':
            return BasicRNNCell(in_features, self.hidden_size, self.bias, "relu")
        raise ValueError("Invalid RNN mode selected.")

    def _run_stack(self, cells, steps, initial_states):
        """Run ``steps`` through ``cells`` layer by layer and return the top layer's outputs.

        ``steps`` is a list with one tensor per time step; it is overwritten in
        place so that each layer's output becomes the next layer's input.
        """
        for layer, cell in enumerate(cells):
            state = initial_states[layer]
            for t, x_t in enumerate(steps):
                state = cell(x_t, state)
                # LSTM cells return (hidden, cell); only the hidden part is passed on.
                steps[t] = state[0] if self.mode == "LSTM" else state
        return steps

    def forward(self, input, hx=None):
        """Score a batch of sequences.

        Args:
            input: Tensor indexed as (batch, time, feature).
            hx: Optional sequence of ``2 * num_layers`` initial states: first
                those of the forward stack, then those of the reverse stack.

        Returns:
            Tensor of shape (batch, output_size).
        """
        states = [None] * self.num_layers * 2 if hx is None else list(hx)

        # Forward processing
        outs = self._run_stack(
            self.rnn_cell_list,
            list(input.permute(1, 0, 2)),
            states[:self.num_layers],
        )

        # Reverse processing
        reversed_steps = list(input.permute(1, 0, 2))
        reversed_steps.reverse()
        outs_rev = self._run_stack(
            self.rnn_cell_list_rev,
            reversed_steps,
            states[self.num_layers:],
        )

        # NOTE(review): outs_rev is still in reversed order, so outs_rev[0] is the
        # reverse stack's output after reading only the final time step, not the
        # whole sequence. Kept as is; confirm this is the intended feature.
        # No squeeze() here: with a batch of one it produced 1-D tensors and the
        # concatenation along dim 1 failed.
        out = torch.cat((outs[-1], outs_rev[0]), 1)

        return self.fc(out)

In [None]:
# Train the model

# Sequence length and feature count are read off the first training item.
seq_dim, input_dim = train_dataset[0][0].shape
output_dim = 3

hidden_dim = 32
# hidden_dim = 48
layer_dim = 2
bias = True

### Change the code below to try running different models:
# model = RNNModel("RNN_RELU", input_dim, hidden_dim, layer_dim, bias, output_dim)
model = BidirRecurrentModel("LSTM", input_dim, hidden_dim, layer_dim, bias, output_dim)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.CrossEntropyLoss()

learning_rate = 0.01
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


def _accuracy(model, loader, device):
    """Return the floored percentage of samples in ``loader`` that ``model`` classifies correctly.

    Counters are local to each call, so results of different loaders never mix.
    Runs without gradient tracking; returns 0 for an empty loader.
    """
    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for audio, labels in loader:
            audio = audio.view(-1, seq_dim, input_dim).to(device)
            _, predicted = torch.max(model(audio), 1)
            correct += (predicted.cpu() == labels.cpu()).sum().item()
            total += labels.size(0)
    model.train()
    return 100 * correct // total if total else 0


loss_list = []
# Named `iteration` rather than `iter` so the builtin stays usable in later cells.
iteration = 0
max_v_accuracy = 0
reported_t_accuracy = 0
max_t_accuracy = 0
for epoch in range(num_epochs):
    for audio, labels in train_loader:
        audio = audio.view(-1, seq_dim, input_dim).to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(audio)
        loss = criterion(outputs, labels)
        loss.backward()

        optimizer.step()

        loss_list.append(loss.item())
        iteration += 1

        if iteration % valid_every_n_steps == 0:
            v_accuracy = _accuracy(model, valid_loader, device)

            # The test accuracy is refreshed only when validation accuracy
            # matches or beats the best value seen so far.
            if v_accuracy >= max_v_accuracy:
                max_v_accuracy = v_accuracy
                reported_t_accuracy = _accuracy(model, test_loader, device)

            print('Iteration: {}. Loss: {}. V-Accuracy: {}  T-Accuracy: {}'.format(iteration, loss.item(), v_accuracy, reported_t_accuracy))

