# Regularisation in NNs

## 1. Set up the environment

In [1]:
# Import statements
from tensorflow import keras as kr 
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [3]:
# Set my plotting style
plt.style.use(('dark_background', 'bmh'))
plt.rc('axes', facecolor='none')
plt.rc('figure', figsize=(16, 4))

In [4]:
# Set random seed for reproducibility
np.random.seed(0)
torch.manual_seed(1)

<torch._C.Generator at 0x113bd0df0>

In [5]:
# Shortcuts
imdb = kr.datasets.imdb
Tokeniser = kr.preprocessing.text.Tokenizer

## 2. Loading the data set

In [6]:
# Set the number of features we want
features_nb = 1000

# Load data and target vector from movie review data
(train_data, train_target), (test_data, test_target) = imdb.load_data(num_words=features_nb)

# Convert movie review data to a one-hot encoded feature matrix
tokeniser = Tokeniser(num_words=features_nb)
train_features = tokeniser.sequences_to_matrix(train_data, mode='binary')
test_features = tokeniser.sequences_to_matrix(test_data, mode='binary')

### 2.1 Exploring the data set

In [7]:
# Check data set sizes
print('train_data.shape:', train_data.shape)
print('train_target.shape:', train_target.shape)
print('test_data.shape:', test_data.shape)
print('test_target.shape:', test_target.shape)

train_data.shape: (25000,)
train_target.shape: (25000,)
test_data.shape: (25000,)
test_target.shape: (25000,)


In [8]:
# Check format of first training sample
print('type(train_data[0]):', type(train_data[0]))
print('type(train_target[0]):', type(train_target[0]))

type(train_data[0]): <class 'list'>
type(train_target[0]): <class 'numpy.int64'>


In [9]:
# Check size of first 10 training samples and corresponding target
print('Reviews length:', [len(sample) for sample in train_data[:10]])
print('Review sentiment (bad/good):', train_target[:10])

Reviews length: [218, 189, 141, 550, 147, 43, 123, 562, 233, 130]
Review sentiment (bad/good): [1 0 0 1 0 0 1 0 1 0]


In [10]:
# Show first review - machine format
print(train_data[0])

[1, 14, 22, 16, 43, 530, 973, 2, 2, 65, 458, 2, 66, 2, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 2, 2, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2, 19, 14, 22, 4, 2, 2, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 2, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2, 2, 16, 480, 66, 2, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 2, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 2, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 2, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 2, 88, 12, 16, 283, 5, 16, 2, 113, 103, 32, 15, 16, 2, 19, 178, 32]


In [11]:
# Data set text visualisation helper function
def show_text(sample):
    word_to_id = imdb.get_word_index()
    word_to_id = {k:(v+3) for k,v in word_to_id.items()}
    word_to_id["<PAD>"] = 0
    word_to_id["<START>"] = 1
    word_to_id["<UNK>"] = 2

    id_to_word = {value:key for key,value in word_to_id.items()}
    print(' '.join(id_to_word[id_] for id_ in sample))

In [12]:
# Show first review - human format
show_text(train_data[0])

<START> this film was just brilliant casting <UNK> <UNK> story direction <UNK> really <UNK> the part they played and you could just imagine being there robert <UNK> is an amazing actor and now the same being director <UNK> father came from the same <UNK> <UNK> as myself so i loved the fact there was a real <UNK> with this film the <UNK> <UNK> throughout the film were great it was just brilliant so much that i <UNK> the film as soon as it was released for <UNK> and would recommend it to everyone to watch and the <UNK> <UNK> was amazing really <UNK> at the end it was so sad and you know what they say if you <UNK> at a film it must have been good and this definitely was also <UNK> to the two little <UNK> that played the <UNK> of <UNK> and paul they were just brilliant children are often left out of the <UNK> <UNK> i think because the stars that play them all <UNK> up are such a big <UNK> for the whole film but these children are amazing and should be <UNK> for what they have done don't yo

In [13]:
# Show first review - neural net format
print(train_features[0])

[0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0.
 0. 1. 1. 0. 1. 0. 1. 0. 1. 1. 0. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 0. 1. 0.
 1. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 1. 0. 1. 0. 0. 1. 1. 0. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0.
 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [14]:
# Show first review - neural net format - explanation
print(train_features[0] * np.arange(len(train_features[0])))

[  0.   1.   2.   0.   4.   5.   6.   7.   8.   9.   0.   0.  12.  13.
  14.  15.  16.  17.  18.  19.   0.  21.  22.   0.   0.  25.  26.   0.
  28.   0.  30.   0.  32.  33.   0.  35.  36.   0.  38.  39.   0.   0.
   0.  43.   0.   0.  46.   0.  48.   0.  50.  51.  52.   0.   0.   0.
  56.   0.   0.   0.   0.   0.  62.   0.   0.  65.  66.   0.   0.   0.
   0.  71.   0.   0.   0.   0.  76.  77.   0.   0.   0.   0.  82.   0.
   0.   0.   0.  87.  88.   0.   0.   0.  92.   0.   0.   0.   0.   0.
  98.   0. 100.   0.   0. 103. 104.   0. 106. 107.   0.   0.   0.   0.
 112. 113.   0.   0.   0. 117.   0.   0.   0.   0.   0.   0. 124.   0.
   0.   0.   0.   0. 130.   0.   0.   0. 134. 135.   0.   0.   0.   0.
   0. 141.   0.   0. 144.   0.   0. 147.   0.   0. 150.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0. 167.
   0.   0.   0.   0. 172. 173.   0.   0.   0.   0. 178.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0. 192.   0. 194.   0.
   0. 

## 3. Exploring regularisation of NN

Play with the code, especially the one marked `# toggle`.  
Start from `# toggle 0`, and then, one at the time, `# toggle 1` to `5`.

In [15]:
class ThreeLayerDense(nn.Module):

    def __init__(self, input_size, units_size):

        super(ThreeLayerDense, self).__init__()
        self.linear1 = torch.nn.Linear(input_size, units_size) #features_nb, 16
        self.linear2 = torch.nn.Linear(units_size, units_size)
        #self.dropout
        self.linear3 = torch.nn.Linear(units_size, 1)

    def forward(self, x):
        x = self.linear1(x)
        x = F.relu(x) 
        x = self.linear2(x) 
        x = F.relu(x)
        #Add dropout regularization
        #x = F.dropout(x, training=self.training)   
        return nn.Sigmoid()(self.linear3(x))

In [16]:
epochs = 25
log_interval = 10
batch_size = 100

model = ThreeLayerDense(features_nb, 16)

criterion = torch.nn.BCELoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001)

In [17]:
epochs = 25
log_interval = 10
batch_size = 100

model = ThreeLayerDense(features_nb, 16)

criterion = torch.nn.BCELoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001)

#l2 regularization
#l2_regularization_factor = 0.0005
#optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001, weight_decay = l2_regularization_factor)

In [18]:
epoch = 0
train_data_gen = zip(train_features, train_target)
train_size = len(train_target)

while epoch < epochs:
    predictions = []
    truth_values = []

    for batch_idx, (xs, y) in enumerate(train_data_gen):
        xs, y = torch.from_numpy(xs).float(), torch.FloatTensor([y])

        y_pred = model(xs)
        loss = criterion(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        #nn.utils.clip_grad_norm(model.parameters(), 0.5)
        optimizer.step()

        predictions.append(y_pred.cpu().data.numpy().ravel())
        truth_values.append(y)

        if batch_idx % log_interval == 0:
            print('Train Epoch: {}, mini-batch {} of {}, training loss: {:.6f}'.format(
                epoch, batch_idx, train_size, loss.item()))

    epoch += 1

Train Epoch: 0, mini-batch 0 of 25000, training loss: 0.613348
Train Epoch: 0, mini-batch 10 of 25000, training loss: 0.656209
Train Epoch: 0, mini-batch 20 of 25000, training loss: 0.729808
Train Epoch: 0, mini-batch 30 of 25000, training loss: 0.687957
Train Epoch: 0, mini-batch 40 of 25000, training loss: 0.726682
Train Epoch: 0, mini-batch 50 of 25000, training loss: 0.679756
Train Epoch: 0, mini-batch 60 of 25000, training loss: 0.630801
Train Epoch: 0, mini-batch 70 of 25000, training loss: 0.644291
Train Epoch: 0, mini-batch 80 of 25000, training loss: 0.671881
Train Epoch: 0, mini-batch 90 of 25000, training loss: 0.689763
Train Epoch: 0, mini-batch 100 of 25000, training loss: 0.543776
Train Epoch: 0, mini-batch 110 of 25000, training loss: 0.526624
Train Epoch: 0, mini-batch 120 of 25000, training loss: 0.737881
Train Epoch: 0, mini-batch 130 of 25000, training loss: 0.428849
Train Epoch: 0, mini-batch 140 of 25000, training loss: 0.891402
Train Epoch: 0, mini-batch 150 of 25

Train Epoch: 0, mini-batch 1350 of 25000, training loss: 0.213357
Train Epoch: 0, mini-batch 1360 of 25000, training loss: 0.549979
Train Epoch: 0, mini-batch 1370 of 25000, training loss: 0.807515
Train Epoch: 0, mini-batch 1380 of 25000, training loss: 0.105247
Train Epoch: 0, mini-batch 1390 of 25000, training loss: 0.098139
Train Epoch: 0, mini-batch 1400 of 25000, training loss: 0.690399
Train Epoch: 0, mini-batch 1410 of 25000, training loss: 0.327312
Train Epoch: 0, mini-batch 1420 of 25000, training loss: 0.348850
Train Epoch: 0, mini-batch 1430 of 25000, training loss: 0.106565
Train Epoch: 0, mini-batch 1440 of 25000, training loss: 0.616981
Train Epoch: 0, mini-batch 1450 of 25000, training loss: 0.138042
Train Epoch: 0, mini-batch 1460 of 25000, training loss: 0.238999
Train Epoch: 0, mini-batch 1470 of 25000, training loss: 0.087767
Train Epoch: 0, mini-batch 1480 of 25000, training loss: 0.026745
Train Epoch: 0, mini-batch 1490 of 25000, training loss: 0.141960
Train Epoc

Train Epoch: 0, mini-batch 2890 of 25000, training loss: 0.030071
Train Epoch: 0, mini-batch 2900 of 25000, training loss: 0.415149
Train Epoch: 0, mini-batch 2910 of 25000, training loss: 0.049818
Train Epoch: 0, mini-batch 2920 of 25000, training loss: 0.036722
Train Epoch: 0, mini-batch 2930 of 25000, training loss: 1.542135
Train Epoch: 0, mini-batch 2940 of 25000, training loss: 0.329811
Train Epoch: 0, mini-batch 2950 of 25000, training loss: 0.027226
Train Epoch: 0, mini-batch 2960 of 25000, training loss: 0.102297
Train Epoch: 0, mini-batch 2970 of 25000, training loss: 0.032228
Train Epoch: 0, mini-batch 2980 of 25000, training loss: 1.619554
Train Epoch: 0, mini-batch 2990 of 25000, training loss: 0.867965
Train Epoch: 0, mini-batch 3000 of 25000, training loss: 0.913068
Train Epoch: 0, mini-batch 3010 of 25000, training loss: 0.186353
Train Epoch: 0, mini-batch 3020 of 25000, training loss: 0.084611
Train Epoch: 0, mini-batch 3030 of 25000, training loss: 0.016289
Train Epoc

Train Epoch: 0, mini-batch 4390 of 25000, training loss: 0.137279
Train Epoch: 0, mini-batch 4400 of 25000, training loss: 0.536962
Train Epoch: 0, mini-batch 4410 of 25000, training loss: 0.019898
Train Epoch: 0, mini-batch 4420 of 25000, training loss: 0.006939
Train Epoch: 0, mini-batch 4430 of 25000, training loss: 0.003244
Train Epoch: 0, mini-batch 4440 of 25000, training loss: 0.069077
Train Epoch: 0, mini-batch 4450 of 25000, training loss: 0.007658
Train Epoch: 0, mini-batch 4460 of 25000, training loss: 1.245637
Train Epoch: 0, mini-batch 4470 of 25000, training loss: 0.005125
Train Epoch: 0, mini-batch 4480 of 25000, training loss: 0.412028
Train Epoch: 0, mini-batch 4490 of 25000, training loss: 0.014296
Train Epoch: 0, mini-batch 4500 of 25000, training loss: 0.000758
Train Epoch: 0, mini-batch 4510 of 25000, training loss: 0.485406
Train Epoch: 0, mini-batch 4520 of 25000, training loss: 0.237051
Train Epoch: 0, mini-batch 4530 of 25000, training loss: 0.241406
Train Epoc

Train Epoch: 0, mini-batch 5810 of 25000, training loss: 0.044221
Train Epoch: 0, mini-batch 5820 of 25000, training loss: 0.259877
Train Epoch: 0, mini-batch 5830 of 25000, training loss: 0.141588
Train Epoch: 0, mini-batch 5840 of 25000, training loss: 0.139451
Train Epoch: 0, mini-batch 5850 of 25000, training loss: 0.239557
Train Epoch: 0, mini-batch 5860 of 25000, training loss: 0.247746
Train Epoch: 0, mini-batch 5870 of 25000, training loss: 0.408890
Train Epoch: 0, mini-batch 5880 of 25000, training loss: 0.565888
Train Epoch: 0, mini-batch 5890 of 25000, training loss: 0.037637
Train Epoch: 0, mini-batch 5900 of 25000, training loss: 0.006358
Train Epoch: 0, mini-batch 5910 of 25000, training loss: 0.208833
Train Epoch: 0, mini-batch 5920 of 25000, training loss: 0.038720
Train Epoch: 0, mini-batch 5930 of 25000, training loss: 0.166565
Train Epoch: 0, mini-batch 5940 of 25000, training loss: 1.087361
Train Epoch: 0, mini-batch 5950 of 25000, training loss: 0.000593
Train Epoc

Train Epoch: 0, mini-batch 7380 of 25000, training loss: 0.001470
Train Epoch: 0, mini-batch 7390 of 25000, training loss: 0.047409
Train Epoch: 0, mini-batch 7400 of 25000, training loss: 0.088814
Train Epoch: 0, mini-batch 7410 of 25000, training loss: 0.496978
Train Epoch: 0, mini-batch 7420 of 25000, training loss: 0.114330
Train Epoch: 0, mini-batch 7430 of 25000, training loss: 0.114552
Train Epoch: 0, mini-batch 7440 of 25000, training loss: 0.055324
Train Epoch: 0, mini-batch 7450 of 25000, training loss: 0.617035
Train Epoch: 0, mini-batch 7460 of 25000, training loss: 0.139774
Train Epoch: 0, mini-batch 7470 of 25000, training loss: 1.972763
Train Epoch: 0, mini-batch 7480 of 25000, training loss: 0.408044
Train Epoch: 0, mini-batch 7490 of 25000, training loss: 0.405534
Train Epoch: 0, mini-batch 7500 of 25000, training loss: 0.456997
Train Epoch: 0, mini-batch 7510 of 25000, training loss: 0.254052
Train Epoch: 0, mini-batch 7520 of 25000, training loss: 0.064884
Train Epoc

KeyboardInterrupt: 

In [None]:
# ! ls  # toggle 5

In [None]:
# Get training and test accuracy histories
train_loss = history.history['loss']
test_loss = history.history['val_loss']

# Create count of the number of epochs
epoch = range(1, len(train_loss) + 1)

# Visualize accuracy history
plt.figure()

plt.plot(epoch, train_loss)
plt.plot(epoch, test_loss)
# plt.plot(no_reg['epoch'], no_reg['train_loss'])  # toggle 0
# plt.plot(no_reg['epoch'], no_reg['test_loss'])  # toggle 0

plt.legend(['Train loss', 'Test loss', 'Train no-reg', 'Test no-reg'])
plt.xlabel('Epoch')
plt.ylabel('Loss score')

# Get training and test accuracy histories
train_accuracy = history.history['acc']
test_accuracy = history.history['val_acc']

# Visualize accuracy history
plt.figure()

plt.plot(epoch, train_accuracy)
plt.plot(epoch, test_accuracy)
# plt.plot(no_reg['epoch'], no_reg['train_accuracy'])  # toggle 0
# plt.plot(no_reg['epoch'], no_reg['test_accuracy'])  # toggle 0

plt.legend(['Train accuracy', 'Test accuracy', 'Train no-reg', 'Test no-reg'])
plt.xlabel('Epoch')
plt.ylabel('Accuracy Score')

no_reg = {                             # toggle 0
    'epoch': epoch,                    # toggle 0
    'train_loss': train_loss,          # toggle 0
    'test_loss': test_loss,            # toggle 0
    'train_accuracy': train_accuracy,  # toggle 0
    'test_accuracy': test_accuracy,    # toggle 0
}

In [None]:
# Backup weights
weights = network.layers[0].get_weights()[0]  # toggle 0
# weights_L1 = network.layers[0].get_weights()[0]  # toggle 1
# weights_L2 = network.layers[0].get_weights()[0]  # toggle 2
# weights_max = network.layers[0].get_weights()[0]  # toggle 3

After you got to toggle `# toggle 3`, execute the following code.

In [None]:
# Show weight distribution
plt.hist((
    weights.reshape(-1),
    weights_L1.reshape(-1),
    weights_L2.reshape(-1),
    weights_max.reshape(-1),
), 49, range=(-.5, .5), label=(
    'No-reg',
    'L1',
    'L2',
    'Max',
))
plt.legend();