In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import time

import torch
from torch import nn, optim
from torch.autograd import Variable

# Seed both RNGs used by this notebook: NumPy drives the data shuffling,
# torch drives parameter initialisation and the dropout noise.
np.random.seed(42)
torch.manual_seed(42)

In [11]:
class LSTMWithCDropout(nn.Module):
    """Single-layer, batch-first LSTM whose input is masked by concrete dropout.

    The dropout probability is a learnable parameter (stored as a logit in
    ``p_logit``); ``regularisation()`` returns the penalty term that has to be
    added to the training loss for that probability to be learned.

    Args:
        input_size: feature size of the LSTM input.
        output_size: hidden size of the LSTM.
        gpu_id: kept for interface compatibility and stored on the instance;
            the noise is now created on the same device as the input.
        weight_regularizer: multiplier of the squared-weight penalty.
        dropout_regularizer: multiplier of the entropy penalty on ``p``.
        init_min, init_max: bounds (as probabilities, strictly inside (0, 1))
            of the uniform range from which the initial dropout probability
            logit is drawn.
    """

    def __init__(self, input_size, output_size, gpu_id, weight_regularizer,
            dropout_regularizer, init_min, init_max):

        super(LSTMWithCDropout, self).__init__()
        # Layer applied after the dropout mask
        self.layer = nn.LSTM(input_size, output_size, batch_first=True)
        # Input dim for regularisation scaling
        self.input_dim = input_size
        # Regularisation hyper-parameters
        self.weight_regularizer = weight_regularizer
        self.dropout_regularizer = dropout_regularizer
        # Initialise p_logit: convert the probability bounds to logits first
        logit_min = float(np.log(init_min) - np.log(1. - init_min))
        logit_max = float(np.log(init_max) - np.log(1. - init_max))
        self.p_logit = nn.Parameter(torch.empty(1))
        nn.init.uniform_(self.p_logit, a=logit_min, b=logit_max)

        self.gpu_id = gpu_id

        # Toggled from outside (see use in forward/regularisation) to switch
        # the dropout mask and its penalty off.
        self.use_dropout = True

    def forward(self, x):
        """Run the LSTM on ``x``, masking it first when ``use_dropout`` is set.

        Returns whatever ``nn.LSTM`` returns: ``(output, (h_n, c_n))``.
        """
        if self.use_dropout:
            return self.layer(self._concrete_dropout(x))

        return self.layer(x)

    def regularisation(self):
        """Return the penalty for this layer, or ``0.0`` when dropout is off."""
        if not self.use_dropout:
            return 0.0

        self.p = torch.sigmoid(self.p_logit)

        # Squared input weights/biases, scaled up by 1 / (1 - p)
        weights_regularizer = self.weight_regularizer * self.sum_n_square() / (1 - self.p)
        # p*log(p) + (1-p)*log(1-p), scaled by the input dimension
        dropout_regularizer = self.p * torch.log(self.p)
        dropout_regularizer += (1. - self.p) * torch.log(1. - self.p)
        dropout_regularizer *= self.dropout_regularizer*self.input_dim
        regularizer = weights_regularizer + dropout_regularizer
        return regularizer

    def _concrete_dropout(self, x):
        """Apply the relaxed (differentiable) dropout mask to ``x``.

        Every element of ``x`` gets its own soft mask value; the result is
        rescaled by ``1 / (1 - p)``.
        """
        eps = 1e-7
        temp = 0.1
        self.p = torch.sigmoid(self.p_logit)

        # Uniform [0, 1) noise with the shape, dtype and device of the input,
        # so the layer works on CPU and on any GPU.
        unif_noise = torch.rand_like(x)

        drop_prob = (torch.log(self.p + eps)
                    - torch.log(1 - self.p + eps)
                    + torch.log(unif_noise + eps)
                    - torch.log(1 - unif_noise + eps))
        drop_prob = torch.sigmoid(drop_prob / temp)

        random_tensor = 1 - drop_prob
        retain_prob = 1 - self.p
        return torch.mul(x, random_tensor) / retain_prob

    def sum_n_square(self):
        """Sum of squares of the LSTM's input-to-hidden weights and biases."""
        return torch.sum(torch.pow(self.layer.weight_ih_l0, 2)) + \
                torch.sum(torch.pow(self.layer.bias_ih_l0, 2))


class Model(nn.Module):
    """Grapheme-to-phoneme network: embedding, 3 encoder LSTMs, average
    pooling over time, 2 decoder LSTMs and a linear layer with log-softmax.

    Args:
        graphemes_vocab_size: number of rows of the input embedding.
        phonemes_vocab_size: size of the output distribution per position.
        seq_length: length of the (padded) sequences; used both as the
            pooling window and as the number of decoder time steps.
        dropout: configuration dict. ``dropout['mode'] == 'dropout'`` selects
            plain ``nn.Dropout`` (probability ``dropout['prob']``) in front of
            plain LSTMs; any other mode selects ``LSTMWithCDropout`` layers
            configured from the keys ``gpu_id``, ``wr``, ``dr``, ``init_min``
            and ``init_max``.
    """

    def __init__(self, graphemes_vocab_size, phonemes_vocab_size, seq_length,
            dropout):

        super(Model, self).__init__()

        embedding_size = 100
        lstm_thickness = 100

        self.mode = dropout['mode']
        # Remembered so that forward() does not depend on a global variable.
        self.seq_length = seq_length

        self.embedding = nn.Embedding(graphemes_vocab_size, embedding_size)

        if self.mode == 'dropout':
            self.encoder_dropout1 = nn.Dropout(dropout['prob'])
        self.encoder_lstm1 = self._make_lstm(embedding_size, lstm_thickness, dropout)

        self.encoder_batchnorm2 = nn.BatchNorm1d(lstm_thickness)

        if self.mode == 'dropout':
            self.encoder_dropout2 = nn.Dropout(dropout['prob'])
        self.encoder_lstm2 = self._make_lstm(lstm_thickness, lstm_thickness, dropout)

        self.encoder_batchnorm3 = nn.BatchNorm1d(lstm_thickness)

        if self.mode == 'dropout':
            self.encoder_dropout3 = nn.Dropout(dropout['prob'])
        self.encoder_lstm3 = self._make_lstm(lstm_thickness, lstm_thickness, dropout)

        self.average_pooling = nn.AvgPool1d(seq_length)

        self.decoder_batchnorm1 = nn.BatchNorm1d(lstm_thickness)

        if self.mode == 'dropout':
            self.decoder_dropout1 = nn.Dropout(dropout['prob'])
        self.decoder_lstm1 = self._make_lstm(lstm_thickness, lstm_thickness, dropout)

        self.decoder_batchnorm2 = nn.BatchNorm1d(lstm_thickness)

        if self.mode == 'dropout':
            self.decoder_dropout2 = nn.Dropout(dropout['prob'])
        self.decoder_lstm2 = self._make_lstm(lstm_thickness, lstm_thickness, dropout)

        self.decoder_linear = nn.Linear(lstm_thickness, phonemes_vocab_size)

    def _make_lstm(self, input_size, hidden_size, dropout):
        """Build one recurrent layer of the kind selected by ``self.mode``."""
        if self.mode == 'dropout':
            return nn.LSTM(input_size, hidden_size, batch_first=True)

        return LSTMWithCDropout(input_size, hidden_size,
                dropout['gpu_id'], dropout['wr'], dropout['dr'],
                dropout['init_min'], dropout['init_max'])

    def forward(self, x):
        """Map integer codes of shape (batch, seq) to log-probabilities of
        shape (batch, seq_length, phonemes_vocab_size)."""

        # x of shape (batch, seq)

        x = self.embedding(x)

        if self.mode == 'dropout':
            x = self.encoder_dropout1(x)
        x = self.encoder_lstm1(x)[0]

        # BatchNorm1d normalises over dim 1, so features are moved there
        # before each batch norm and moved back afterwards.
        x = x.transpose(1, 2)
        x = self.encoder_batchnorm2(x)
        x = x.transpose(1, 2)
        if self.mode == 'dropout':
            x = self.encoder_dropout2(x)
        x = self.encoder_lstm2(x)[0]

        x = x.transpose(1, 2)
        x = self.encoder_batchnorm3(x)
        x = x.transpose(1, 2)
        if self.mode == 'dropout':
            x = self.encoder_dropout3(x)
        x = self.encoder_lstm3(x)[0]

        # Average the encoder outputs over time, then repeat that summary
        # once per decoder time step.
        x = x.transpose(1, 2)
        x = self.average_pooling(x)
        x = torch.cat([x]*self.seq_length, 2)

        x = self.decoder_batchnorm1(x)
        x = x.transpose(1, 2)
        if self.mode == 'dropout':
            x = self.decoder_dropout1(x)
        x = self.decoder_lstm1(x)[0]

        x = x.transpose(1, 2)
        x = self.decoder_batchnorm2(x)
        x = x.transpose(1, 2)
        if self.mode == 'dropout':
            x = self.decoder_dropout2(x)
        x = self.decoder_lstm2(x)[0]

        x = self.decoder_linear(x)

        return nn.functional.log_softmax(x, 2)

def get_regularization_loss(model):
    """Add up ``regularisation()`` over every sub-module of ``model`` whose
    class name ends with ``'LSTMWithCDropout'``.

    Returns the float ``0.0`` when no such module exists; otherwise the sum
    of the individual penalties, accumulated in ``model.apply`` visiting order.
    """
    penalties = []

    def collect_penalty(module):
        if type(module).__name__.endswith('LSTMWithCDropout'):
            penalties.append(module.regularisation())

    model.apply(collect_penalty)

    # Left fold starting from 0.0, i.e. 0.0 + p1 + p2 + ...
    return sum(penalties, 0.0)

def set_dropout_state(model, value):
    """Set ``use_dropout = value`` on every sub-module of ``model`` (the model
    itself included) whose class name ends with ``'LSTMWithCDropout'``."""
    for module in model.modules():
        if type(module).__name__.endswith('LSTMWithCDropout'):
            module.use_dropout = value

In [3]:
def vectorize_data(data, seq_length):
    """Encode words and transcriptions as zero-padded integer matrices.

    Args:
        data: 2-D string array; column 0 holds the words (one grapheme per
            character), column 1 the transcriptions (phonemes separated by
            whitespace).
        seq_length: number of columns of the encoded matrices.

    Returns:
        ``(grapheme_codes, phoneme_codes, encoded_words,
        encoded_transcriptions)``: two ``symbol -> code`` dicts and two
        ``(len(data), seq_length)`` integer arrays. Codes start at 1 and are
        assigned in sorted symbol order, so they are identical across runs;
        0 is the padding value.

    Raises:
        ValueError: if a word or transcription is longer than ``seq_length``.
    """
    words = data[:, 0]
    transcriptions = [transcription.split() for transcription in data[:, 1]]

    unique_graphemes = set()
    unique_phonemes = set()

    for word, phonemes in zip(words, transcriptions):
        unique_graphemes.update(word)
        unique_phonemes.update(phonemes)

    # Sorting makes the numbering independent of set iteration order, which
    # for strings changes from one interpreter run to the next.
    grapheme_codes = {grapheme: i + 1 for i, grapheme in enumerate(sorted(unique_graphemes))}
    phoneme_codes = {phoneme: i + 1 for i, phoneme in enumerate(sorted(unique_phonemes))}

    encoded_words = np.zeros((len(data), seq_length), dtype=int)
    encoded_transcriptions = np.zeros((len(data), seq_length), dtype=int)

    for index, (word, phonemes) in enumerate(zip(words, transcriptions)):

        if len(word) > seq_length or len(phonemes) > seq_length:
            raise ValueError(
                'row {} does not fit into seq_length={}: {} graphemes, {} phonemes'.format(
                    index, seq_length, len(word), len(phonemes)))

        encoded_words[index, :len(word)] = [grapheme_codes[grapheme] for grapheme in word]
        encoded_transcriptions[index, :len(phonemes)] = [phoneme_codes[phoneme] for phoneme in phonemes]

    return grapheme_codes, phoneme_codes, encoded_words, encoded_transcriptions

# Every encoded word / transcription is padded to this many positions.
seq_length = 40

# Read the training table (indexed by its 'Id' column) and turn it into a
# 2-D array of strings, which is the format vectorize_data indexes into.
data = np.array(
    pd.read_csv('train.csv', delimiter=',', index_col='Id'),
    dtype=str,
)

# X: encoded words, y: encoded transcriptions; the dicts map symbols to codes.
grapheme_codes, phoneme_codes, X, y = vectorize_data(data, seq_length)

In [4]:
# Shuffle inputs and targets with one shared permutation so rows stay paired.
permutation = np.random.permutation(len(X))
X, y = X[permutation], y[permutation]

# Rows [0, 90000) -> train, [90000, 95000) -> validation, the rest -> test.
train_X, val_X, test_X = np.split(X, [90000, 95000])
train_y, val_y, test_y = np.split(y, [90000, 95000])

In [12]:
# Training hyper-parameters and the CUDA device index used throughout.
epochs_count, batch_size = 100, 500
gpu_id = 0

# Dropout configuration handed to Model. The plain-dropout alternative is:
#dropout_data = {'mode': 'dropout', 'prob': 0.2}
dropout_data = {
    'mode': 'concrete',
    'gpu_id': gpu_id,
    'wr': 1e-4,
    'dr': 1e-4,
    'init_min': 0.05,
    'init_max': 0.5,
}

# Symbol codes start at 1 and 0 is used for padding, hence the "+ 1" on both
# vocabulary sizes.
model = Model(
    len(grapheme_codes) + 1,
    len(phoneme_codes) + 1,
    seq_length,
    dropout_data,
)
model = model.cuda(gpu_id)

optimizer = optim.RMSprop(model.parameters(), lr=1e-2)

# The model outputs log-probabilities, which is what NLLLoss expects.
criterion = nn.NLLLoss()



In [13]:
# Train for epochs_count epochs; after every epoch evaluate on the validation
# split and print the losses plus the average number of mismatching positions
# per sequence (reported as "score"; lower is better).
start = time.time()

for epoch in range(epochs_count):

    # Fresh batch order every epoch.
    permutation = np.random.permutation(len(train_X))

    model.train()
    set_dropout_state(model, True)

    total_train_loss = 0.0
    train_mismatches_count = 0
    train_batches_count = 0

    for batch_start in range(0, len(train_X), batch_size):

        batch_end = min(len(train_X), batch_start + batch_size)

        batch_indices = permutation[batch_start:batch_end]

        batch_X = train_X[batch_indices]
        numpy_batch_y = train_y[batch_indices]

        batch_X = Variable(torch.LongTensor(batch_X), requires_grad=False).cuda(gpu_id)
        batch_y = Variable(torch.LongTensor(numpy_batch_y), requires_grad=False).cuda(gpu_id)

        output = model(batch_X)

        # NLLLoss wants the class dimension second: (batch, classes, seq).
        train_loss = criterion(output.transpose(1, 2), batch_y)

        output = output.detach().cpu().numpy().argmax(axis=2)

        train_mismatches_count += (output != numpy_batch_y).sum()
        total_train_loss += float(train_loss)
        train_batches_count += 1

        # The reported loss excludes the regularisation term; the optimised
        # loss includes it.
        optimizer.zero_grad()
        (train_loss + get_regularization_loss(model)).backward()
        optimizer.step()

    total_train_loss /= train_batches_count
    average_train_mismatches = train_mismatches_count/len(train_X)

    model.eval()
    set_dropout_state(model, False)

    val_train_loss = 0.0
    val_mismatches_count = 0
    val_batches_count = 0

    # No gradients are needed for evaluation; skipping graph construction
    # saves memory and time without changing the computed values.
    with torch.no_grad():

        for batch_start in range(0, len(val_X), batch_size):

            batch_end = min(len(val_X), batch_start + batch_size)

            batch_X = val_X[batch_start:batch_end]
            numpy_batch_y = val_y[batch_start:batch_end]

            batch_X = Variable(torch.LongTensor(batch_X), requires_grad=False).cuda(gpu_id)
            batch_y = Variable(torch.LongTensor(numpy_batch_y), requires_grad=False).cuda(gpu_id)

            output = model(batch_X)

            val_loss = criterion(output.transpose(1, 2), batch_y)

            output = output.cpu().numpy().argmax(axis=2)

            val_mismatches_count += (output != numpy_batch_y).sum()
            val_train_loss += float(val_loss)
            val_batches_count += 1

    val_train_loss /= val_batches_count
    average_val_mismatches = val_mismatches_count/len(val_X)

    print('Epoch {}'.format(epoch))
    print('{:.3f} minutes passed'.format((time.time() - start)/60))
    print('train-loss={0:.3f} train-score={1:.3f} val-loss={2:.3f} val-score={3:.3f}'.format(
        total_train_loss, average_train_mismatches, val_train_loss, average_val_mismatches
    ))



Epoch 0
1.218 minutes passed
train-loss=0.598 train-score=5.997 val-loss=0.417 val-score=4.547
Epoch 1
2.417 minutes passed
train-loss=0.386 train-score=4.354 val-loss=0.419 val-score=4.621
Epoch 2
3.586 minutes passed
train-loss=0.309 train-score=3.529 val-loss=0.254 val-score=2.916
Epoch 3
4.784 minutes passed
train-loss=0.251 train-score=2.923 val-loss=0.318 val-score=3.240
Epoch 4
5.975 minutes passed
train-loss=0.214 train-score=2.514 val-loss=0.204 val-score=2.377
Epoch 5
7.139 minutes passed
train-loss=0.187 train-score=2.254 val-loss=0.191 val-score=2.348
Epoch 6
8.285 minutes passed
train-loss=0.191 train-score=2.242 val-loss=3.451 val-score=35.303
Epoch 7
9.409 minutes passed
train-loss=0.205 train-score=2.451 val-loss=0.163 val-score=1.999
Epoch 8
10.597 minutes passed
train-loss=0.164 train-score=2.009 val-loss=0.180 val-score=2.174
Epoch 9
11.728 minutes passed
train-loss=0.155 train-score=1.903 val-loss=0.182 val-score=2.090
Epoch 10
12.890 minutes passed
train-loss=0.146

KeyboardInterrupt: 