*Made by Artem Vazhentsev (AIRI)*

# Special Layers


In this seminar, we will use the same neural network and task as in the previous seminar. 

First of all, train the model and add one linear layer, which we will modify further.

In [1]:
# %load nn_imports.py

from IPython.display import display, clear_output

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from copy import deepcopy

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

import plotly.express as px

In [2]:
!wget  -O 'housing_data.csv' -q 'https://www.dropbox.com/s/6dxq90t0prn2vaw/_train_sem2.csv?dl=0'

In [3]:
# Load the downloaded CSV, print its (rows, columns) shape and preview the first rows
df = pd.read_csv('housing_data.csv')
print(df.shape)
df.head()

(1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Keep only the columns whose dtype is int64 or float64, then preview the result
data = df.select_dtypes(['int64', 'float64'])
data.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,61,0,0,0,0,0,2,2008,208500
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,298,0,0,0,0,0,0,5,2007,181500
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,42,0,0,0,0,0,9,2008,223500
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,35,272,0,0,0,0,2,2006,140000
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,192,84,0,0,0,0,0,12,2008,250000


In [5]:
# Features: every numeric column except the identifier and the target.
# Missing values are replaced by the column means, which are computed here over
# all rows of `data` (i.e. before any train/test split).
feature_frame = data.drop(columns=['Id', 'SalePrice'])
X = feature_frame.fillna(data.mean()).to_numpy().astype(np.float32)
# Target: the sale price as a float32 vector
y = data['SalePrice'].to_numpy().astype(np.float32)

In [6]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [7]:
# Standardize the features: the scaler is fitted on the training rows only and
# the same fitted transform is then applied to the test rows
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [8]:
# Standardize the target with a separate scaler fitted on the training targets.
# `[:, None]` adds a column axis before scaling and `reshape(-1)` flattens the
# result back to a 1-D vector.
ss2 = StandardScaler()
y_train = ss2.fit_transform(y_train[:, None]).reshape(-1)
y_test = ss2.transform(y_test[:, None]).reshape(-1)

In [9]:
# Constants
SEED = 42 # random seed for reproducibility
LR = 3e-2 # learning rate, controls the speed of the training
WEIGHT_DECAY = 1e-3 # weight-decay coefficient (regularization strength) passed to the AdamW optimizer
NUM_EPOCHS = 5 # num training epochs (how many times each instance will be processed)
GAMMA = 0.9995 # multiplicative decay factor passed to the ExponentialLR scheduler
BATCH_SIZE = 32 # training batch size
EVAL_BATCH_SIZE = 300 # evaluation batch size.
DEVICE = 'cpu' #'cuda' # device to make the calculations on

In [10]:
# Split 20% of the (already scaled) training rows off as a validation set, seeded with SEED
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state = SEED)

In [11]:
# Dataset object: returns one element (feature vector x and target value y)
# for a given index and exposes its length via `len()` and the `len` attribute.
class MyDataset(Dataset):
    """Map-style dataset pairing each feature vector with its target value.

    Args:
        X: indexable, sized collection of feature vectors.
        y: indexable, sized collection of targets, one per element of ``X``.

    Raises:
        ValueError: if ``X`` and ``y`` do not contain the same number of items.
    """

    def __init__(self, X, y):
        super().__init__() # to initialize the parent class
        # Mismatched lengths would otherwise only surface later as an
        # IndexError (or as silently ignored samples) during iteration.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y
        self.len = len(X)

    def __len__(self):
        """Return the number of (x, y) pairs."""
        return self.len

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        return self.X[index], self.y[index]

In [12]:
# DataLoaders draw batches from the dataset objects. Only the training loader
# shuffles; validation and test use the larger evaluation batch size.
dls = {
    'train': DataLoader(MyDataset(X_train, y_train), batch_size=BATCH_SIZE, shuffle=True),
    'val': DataLoader(MyDataset(X_val, y_val), batch_size=EVAL_BATCH_SIZE, shuffle=False),
    'test': DataLoader(MyDataset(X_test, y_test), batch_size=EVAL_BATCH_SIZE, shuffle=False),
}

# Keep the individual loaders available under their own names as well
train_dl, val_dl, test_dl = (dls[split] for split in ('train', 'val', 'test'))

In [13]:
# Baseline network with an additional hidden linear layer, which is modified further below
class Model(nn.Module):
    """Fully connected regressor: Linear -> ReLU -> Linear -> ReLU -> Linear.

    Args:
        in_features: size of each input feature vector.
        out_features: size of the model output.
        hidden_size_2: width of the second hidden layer.
    """

    def __init__(self, in_features = 36, out_features = 1, hidden_size_2 = 128):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        # Width of the first hidden layer: int(sqrt(in_features + out_features))
        self.hidden_size = int(np.sqrt(in_features + out_features))
        self.hidden_size_2 = hidden_size_2

        # The modules are applied to the data one after another, in list order
        layers = [
            nn.Linear(self.in_features, self.hidden_size),
            nn.ReLU(),
            nn.Linear(self.hidden_size, self.hidden_size_2),
            nn.ReLU(),
            nn.Linear(self.hidden_size_2, self.out_features),
        ]
        self.sequential = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the result."""
        return self.sequential(x)

In [14]:
# Seed torch's RNG first so that the layer weights drawn in Model() are reproducible
torch.manual_seed(SEED)

model = Model().to(DEVICE)

# Objective that the model is trained to minimize: mean squared error
loss_fn = nn.MSELoss()

# AdamW performs the gradient-based updates of the model weights
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LR,
    weight_decay=WEIGHT_DECAY,
)

# ExponentialLR multiplies the learning rate by GAMMA each time it is stepped
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=GAMMA)

In [None]:
# Per-epoch RMSE history for the training and validation splits
metrics_dict = {"Epoch": [], "Train RMSE": [], "Val RMSE": []}

# Every epoch runs a training pass followed by a validation pass
for epoch in tqdm(range(NUM_EPOCHS)):
    metrics_dict["Epoch"].append(epoch)

    for stage in ('train', 'val'):
        is_train = stage == 'train'
        loader = dls[stage]

        # The autograd graph is only built during the training pass
        with torch.set_grad_enabled(is_train):
            # train(True) == train(), train(False) == eval(): toggles the "special" layers
            model.train(is_train)

            loss_at_stage = 0 # running sum of squared errors over the whole split
            for x_batch, y_batch in loader:
                x_batch = x_batch.to(DEVICE)
                y_batch = y_batch.to(DEVICE)

                y_pred = model(x_batch).view(-1) # forward pass, flattened to match y_batch
                loss = loss_fn(y_pred, y_batch) # prediction first, target second

                if is_train:
                    loss.backward() # gradients of the loss wrt all parameters
                    optimizer.step() # parameter update
                    # NOTE: the LR scheduler is not stepped in this loop
                    optimizer.zero_grad() # clear gradients before the next batch

                # Accumulate squared errors outside of the autograd graph
                batch_sq_err = torch.square(y_pred.detach() - y_batch).sum()
                loss_at_stage += batch_sq_err.item()

            rmse_at_stage = (loss_at_stage / len(loader.dataset)) ** 0.5
            metrics_dict[f"{stage.title()} RMSE"].append(rmse_at_stage)

    # Replace the previously displayed table with the updated one
    clear_output(wait=True)
    display(pd.DataFrame(metrics_dict))

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

## HuggingFace Trainer



Instead of implementing the full training pipeline ourselves, we can use the [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) class from the [HuggingFace Transformers](https://huggingface.co/docs/transformers/index) library. For this purpose, we need to define a [TrainingArguments](https://huggingface.co/docs/transformers/v4.21.1/en/main_classes/trainer#transformers.TrainingArguments) object with the hyperparameters and override the `compute_loss` method of the Trainer.

In [None]:
import inspect

from transformers import TrainingArguments, Trainer, set_seed
from sklearn.metrics import mean_squared_error

# The `evaluation_strategy` argument was renamed to `eval_strategy` in newer
# transformers releases. Pick whichever name the installed version accepts so
# that this cell runs on both old and new versions.
_ta_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = "eval_strategy" if "eval_strategy" in _ta_params else "evaluation_strategy"

#define all hyperparameters in one object TrainingArguments
training_args = TrainingArguments(
    output_dir="test_trainer",
    logging_strategy="epoch",  # log the training loss once per epoch
    seed=SEED,
    learning_rate=LR,
    weight_decay=WEIGHT_DECAY,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    label_names=["labels"],  # name of the input column that holds the targets
    report_to="none",  # no external experiment trackers
    **{_eval_strategy_key: "epoch"},  # evaluate once per epoch
)

#define custom trainer for computing MSE loss
class CustomTrainer(Trainer):
    """Trainer that feeds the ``x`` column to the model and minimizes ``loss_fn``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the loss for one batch.

        Args:
            model: the model being trained or evaluated.
            inputs: batch mapping that holds the features under ``"x"`` and the
                targets under the first configured label name.
            return_outputs: when true, also return the predictions.
            num_items_in_batch: accepted for compatibility with newer Trainer
                versions that pass it to ``compute_loss``; not used here.

        Returns:
            The loss, or ``(loss, {"y_preds": predictions})`` when
            ``return_outputs`` is true.
        """
        y_true = inputs.get(self.label_names[0])
        x = inputs.get("x")
        # forward pass, flattened so that predictions line up with the targets
        y_preds = model(x).view(-1)
        loss = loss_fn(y_preds, y_true)
        return (loss, {"y_preds": y_preds}) if return_outputs else loss

def compute_metrics(eval_preds):
    """Return the RMSE between predictions and targets on the original target scale.

    Args:
        eval_preds: pair ``(predictions, targets)`` of standardized values.

    Returns:
        Dict with the single key ``"RMSE"``.
    """
    y_preds, y_true = eval_preds
    # Undo the standardization applied by `ss2` so the error is expressed in
    # the original target units
    y_preds_rescaled = y_preds * ss2.scale_ + ss2.mean_
    y_true_rescaled = y_true * ss2.scale_ + ss2.mean_
    # Take the square root explicitly: the `squared=False` flag of
    # `mean_squared_error` is deprecated and absent from newer scikit-learn
    rmse = float(np.sqrt(mean_squared_error(y_true_rescaled, y_preds_rescaled)))
    return {"RMSE": rmse}

In [None]:
# Imported under an alias so that it does not shadow `torch.utils.data.Dataset`,
# which `MyDataset` subclasses
from datasets import Dataset as HFDataset

# One dataset per split: features in the 'x' column, targets in the 'labels' column
train_ds = HFDataset.from_dict({'x': X_train, 'labels': y_train})
eval_ds = HFDataset.from_dict({'x': X_val, 'labels': y_val})
test_ds = HFDataset.from_dict({'x': X_test, 'labels': y_test})

In [None]:
# Seed before building the model so the baseline starts from reproducible
# weights, like the seeded model variants it is compared with later
set_seed(SEED)
model = Model()
model.to(DEVICE)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=GAMMA)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)

trainer.train()

In [None]:
metrics = trainer.evaluate(test_ds)
metrics

In [None]:
test_preds, test_labels, metrics_ = trainer.predict(test_ds)

## Dropout

![picture](https://drive.google.com/uc?export=view&id=19MVBGn0oVwlvyHoAh3cb4fqhYsrATOy5)

source: http://primo.ai/index.php?title=Dropout

**Note**: Using `Dropout` for regression problems often does not help to prevent overfitting, and can even decrease the performance of the model.

When using dropout during training, the activations are scaled in order to preserve their mean value after the dropout layer. The variance, however, is not preserved. 

This figure shows $R^2$ values for 8 regression datasets.

<img src="https://drive.google.com/uc?export=view&id=1PrB6fAbw4CLyAVa_idGLkyjlgorTBCVA" width="400">

source: https://www.researchgate.net/publication/344274687_Effect_of_Dropout_Layer_on_Classical_Regression_Problems

real-life example from kaggle: https://www.kaggle.com/competitions/commonlitreadabilityprize/discussion/260729

In [None]:
class Model(nn.Module):
    """Fully connected regressor with dropout after the second linear layer.

    Layer order: Linear -> ReLU -> Linear -> Dropout(p=0.1) -> ReLU -> Linear.

    Args:
        in_features: size of each input feature vector.
        out_features: size of the model output.
        hidden_size_2: width of the second hidden layer.
    """

    def __init__(self, in_features = 36, out_features = 1, hidden_size_2 = 128):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        # Width of the first hidden layer: int(sqrt(in_features + out_features))
        self.hidden_size = int(np.sqrt(in_features + out_features))
        self.hidden_size_2 = hidden_size_2

        # The modules are applied to the data one after another, in list order
        layers = [
            nn.Linear(self.in_features, self.hidden_size),
            nn.ReLU(),
            nn.Linear(self.hidden_size, self.hidden_size_2),
            nn.Dropout(p=0.1),  # dropout on the second linear layer's output
            nn.ReLU(),
            nn.Linear(self.hidden_size_2, self.out_features),
        ]
        self.sequential = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the result."""
        return self.sequential(x)

In [None]:
def model_init():
    """Re-seed with SEED, build a fresh ``Model`` and return it placed on DEVICE."""
    set_seed(SEED)
    fresh_model = Model()
    return fresh_model.to(DEVICE)

In [None]:
# Build the new model first and give it its own optimizer and scheduler.
# The pair created for the earlier model holds that model's parameters, so
# reusing it would never update the new network, and `Trainer` does not accept
# `model_init` together with an explicit `optimizers` argument.
model = model_init()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=GAMMA)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)

trainer.train()

In [None]:
metrics_d = trainer.evaluate(test_ds)
metrics_d

## BatchNorm


<img src="https://drive.google.com/uc?export=view&id=19FeOZRhIMcEhIlkvd4cAEegyGUgJFNyq" width="400">

<img src="https://drive.google.com/uc?export=view&id=1vzA1JVW5RiS7OBDBXaN1BcCURM6jmWLW" width="400">


source: https://arxiv.org/pdf/1502.03167.pdf

In [None]:
class Model(nn.Module):
    """Fully connected regressor with batch normalization in the second hidden layer.

    Layer order: Linear -> ReLU -> Linear -> BatchNorm1d -> ReLU -> Linear.

    Args:
        in_features: size of each input feature vector.
        out_features: size of the model output.
        hidden_size_2: width of the second hidden layer.
    """

    def __init__(self, in_features = 36, out_features = 1, hidden_size_2 = 128):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        # Width of the first hidden layer: int(sqrt(in_features + out_features))
        self.hidden_size = int(np.sqrt(in_features + out_features))
        self.hidden_size_2 = hidden_size_2

        # The modules are applied to the data one after another, in list order
        layers = [
            nn.Linear(self.in_features, self.hidden_size),
            nn.ReLU(),
            nn.Linear(self.hidden_size, self.hidden_size_2),
            nn.BatchNorm1d(self.hidden_size_2),  # batch normalization for 1D data
            nn.ReLU(),
            nn.Linear(self.hidden_size_2, self.out_features),
        ]
        self.sequential = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the result."""
        return self.sequential(x)

In [None]:
# Build the new model first and give it its own optimizer and scheduler.
# The pair created for the earlier model holds that model's parameters, so
# reusing it would never update the new network, and `Trainer` does not accept
# `model_init` together with an explicit `optimizers` argument.
model = model_init()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=GAMMA)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)

trainer.train()

In [None]:
metrics_bn = trainer.evaluate(test_ds)
metrics_bn

## LayerNorm

<img src="https://drive.google.com/uc?export=view&id=1CwobTrNTx5B2JgOhbwpw9c3clKaf5y-x" width="600">

source: https://arxiv.org/pdf/1803.08494.pdf

In [None]:
class Model(nn.Module):
    """Fully connected regressor with layer normalization in the second hidden layer.

    Layer order: Linear -> ReLU -> Linear -> LayerNorm -> ReLU -> Linear.

    Args:
        in_features: size of each input feature vector.
        out_features: size of the model output.
        hidden_size_2: width of the second hidden layer.
    """

    def __init__(self, in_features = 36, out_features = 1, hidden_size_2 = 128):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        # Width of the first hidden layer: int(sqrt(in_features + out_features))
        self.hidden_size = int(np.sqrt(in_features + out_features))
        self.hidden_size_2 = hidden_size_2

        # The modules are applied to the data one after another, in list order
        layers = [
            nn.Linear(self.in_features, self.hidden_size),
            nn.ReLU(),
            nn.Linear(self.hidden_size, self.hidden_size_2),
            nn.LayerNorm(self.hidden_size_2),  # layer normalization
            nn.ReLU(),
            nn.Linear(self.hidden_size_2, self.out_features),
        ]
        self.sequential = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the result."""
        return self.sequential(x)

In [None]:
# Build the new model first and give it its own optimizer and scheduler.
# The pair created for the earlier model holds that model's parameters, so
# reusing it would never update the new network, and `Trainer` does not accept
# `model_init` together with an explicit `optimizers` argument.
model = model_init()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=GAMMA)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)

trainer.train()

In [None]:
metrics_ln = trainer.evaluate(test_ds)
metrics_ln

## Combine Layers

Let's combine batch normalization and dropout and compare all results

In [None]:
class Model(nn.Module):
    """Fully connected regressor combining batch normalization and dropout.

    Layer order: Linear -> ReLU -> Linear -> BatchNorm1d -> ReLU -> Dropout(p=0.1) -> Linear.

    Args:
        in_features: size of each input feature vector.
        out_features: size of the model output.
        hidden_size_2: width of the second hidden layer.
    """

    def __init__(self, in_features = 36, out_features = 1, hidden_size_2 = 128):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        # Width of the first hidden layer: int(sqrt(in_features + out_features))
        self.hidden_size = int(np.sqrt(in_features + out_features))
        self.hidden_size_2 = hidden_size_2

        # The modules are applied to the data one after another, in list order
        layers = [
            nn.Linear(self.in_features, self.hidden_size),
            nn.ReLU(),
            nn.Linear(self.hidden_size, self.hidden_size_2),
            nn.BatchNorm1d(self.hidden_size_2),  # batch normalization
            nn.ReLU(),
            nn.Dropout(p=0.1),  # dropout on the activated features
            nn.Linear(self.hidden_size_2, self.out_features),
        ]
        self.sequential = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the result."""
        return self.sequential(x)

In [None]:
# Build the new model first and give it its own optimizer and scheduler.
# The pair created for the earlier model holds that model's parameters, so
# reusing it would never update the new network, and `Trainer` does not accept
# `model_init` together with an explicit `optimizers` argument.
model = model_init()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=GAMMA)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)

trainer.train()

In [None]:
metrics_final = trainer.evaluate(test_ds)
metrics_final

In [None]:
# Evaluation metrics of every model variant, keyed by the column label used in the table
_test_metrics = {
    'Raw model': metrics,
    'Model with BatchNorm': metrics_bn,
    'Model with LayerNorm': metrics_ln,
    'Model with Dropout': metrics_d,
    'Final model': metrics_final,
}

# Single-row table: one column per model, holding its 'eval_RMSE' value
res = pd.DataFrame(
    {label: [scores['eval_RMSE']] for label, scores in _test_metrics.items()},
    index=['Test RMSE'],
)

In [None]:
res

We can see that our final model, which combines BatchNorm and Dropout, performs better on the test dataset than the others.

# Initialization

This section will be devoted to different initialization techniques.

In [None]:
#base function for weight matrix initialization
def init_weights(m, init_func=torch.nn.init.zeros_):
    """Initialize a linear layer in place; any other module is left untouched.

    The weight matrix is filled by ``init_func``. The bias is filled by
    ``init_func`` as well when it is the zeros or ones initializer, and with
    the constant 0.01 otherwise. Layers built without a bias are supported.

    Args:
        m: module to initialize; only ``nn.Linear`` instances are modified.
        init_func: in-place initializer that takes a tensor.
    """
    if not isinstance(m, nn.Linear):
        return

    init_func(m.weight)

    # nn.Linear(..., bias=False) stores None as its bias: nothing to initialize
    if m.bias is None:
        return

    if init_func in (torch.nn.init.zeros_, torch.nn.init.ones_):
        init_func(m.bias)
    else:
        # The other initializers are applied to the weight matrix only;
        # the bias gets a small constant instead
        nn.init.constant_(m.bias, 0.01)

In [None]:
def train_pipeline(train_ds, eval_ds, init_func):
    """Train a freshly built ``Model`` whose linear layers are set up by ``init_func``.

    Args:
        train_ds: dataset used for training.
        eval_ds: dataset used for evaluation during training.
        init_func: in-place initializer forwarded to ``init_weights``.

    Returns:
        The ``CustomTrainer`` instance after ``train()`` has finished.
    """
    set_seed(SEED)
    net = Model()
    net.to(DEVICE)

    def _apply_init(module):
        # Called once for every submodule of the sequential stack
        init_weights(module, init_func=init_func)

    net.sequential.apply(_apply_init)

    # Fresh optimizer and LR scheduler bound to this model's parameters
    adamw = torch.optim.AdamW(net.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    lr_schedule = torch.optim.lr_scheduler.ExponentialLR(adamw, gamma=GAMMA)

    fitted = CustomTrainer(
        model=net,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        compute_metrics=compute_metrics,
        optimizers=(adamw, lr_schedule),
    )
    fitted.train()
    return fitted
  
def get_eval_loss(trainer):
    """Collect the evaluation loss of every logged evaluation, in log order.

    Entries are selected by the presence of the ``'eval_loss'`` key rather
    than by their position, so the result does not depend on how training and
    evaluation records are interleaved in the log history.

    Args:
        trainer: object exposing ``state.log_history``, a list of dict records.

    Returns:
        1-D NumPy array with one ``eval_loss`` value per evaluation record.
    """
    return np.array(
        [log['eval_loss'] for log in trainer.state.log_history if 'eval_loss' in log]
    )

## Constant Initialization

If all the weights are initialized to zeros, every neuron in a layer computes the same output and receives the same gradient. As a result, the neurons keep learning identical features at every iteration. This problem is known as the network failing to break symmetry. It is not specific to zero: any constant initialization has the same problem and therefore produces poor results.

### Zeros initialization

Set all weights in the linear layers to zero: $w_i = 0$

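In our case every pre-activation is zero and ReLU(0) = 0, so the hidden activations and the gradients of all weight matrices are exactly zero; only the bias of the output layer receives a non-zero gradient. This means that the network cannot learn anything beyond a constant prediction.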

In [None]:
trainer = train_pipeline(train_ds, eval_ds, torch.nn.init.zeros_)
zero_init_loss = get_eval_loss(trainer)

### Constant initialization

Set all weights in the linear layers to a constant: $w_i = 1$

In [None]:
trainer = train_pipeline(train_ds, eval_ds, torch.nn.init.ones_)
ones_init_loss = get_eval_loss(trainer)

## Random Initialization

A too-large initialization leads to **exploding** gradients 

A too-small initialization leads to **vanishing** gradients

### Initialization from a normal distribution

Generate weights from a normal distribution:
$$w_i \sim \mathcal{N}(0,\,1)$$

In [None]:
trainer = train_pipeline(train_ds, eval_ds, torch.nn.init.normal_)
norm_init_loss = get_eval_loss(trainer)

### Initialization from a standard uniform distribution

Generate weights from a standard uniform distribution:

$$w_i \sim \mathcal{U}(0,\,1)$$

In [None]:
trainer = train_pipeline(train_ds, eval_ds, torch.nn.init.uniform_)
un_init_loss = get_eval_loss(trainer)

## Modern Initializations

### Initialization from a xavier normal distribution

Generate weights from a xavier normal distribution:
$$w_i \sim \mathcal{N}(0,\,\sigma^2)$$
$$\sigma = gain \sqrt{\frac{2}{fan_{in}+fan_{out}}}$$

source: https://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf

Xavier initialization was derived theoretically for symmetric, saturating activations, so it is expected to perform best with tanh or the logistic sigmoid.

In [None]:
# Use the in-place initializer `xavier_normal_`; the name without the trailing
# underscore is a deprecated alias
trainer = train_pipeline(train_ds, eval_ds, torch.nn.init.xavier_normal_)
xn_init_loss = get_eval_loss(trainer)

### Initialization from a xavier uniform distribution

Generate weights from a xavier uniform distribution:
$$w_i \sim \mathcal{U}(-a,\,a)$$
$$a = gain \sqrt{\frac{6}{fan_{in}+fan_{out}}}$$

source: https://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf

In [None]:
# Use the in-place initializer `xavier_uniform_`; the name without the trailing
# underscore is a deprecated alias
trainer = train_pipeline(train_ds, eval_ds, torch.nn.init.xavier_uniform_)
xu_init_loss = get_eval_loss(trainer)

### Initialization from a kaiming normal distribution

Generate weights from a kaiming normal distribution:
$$w_i \sim \mathcal{N}(0,\,\sigma^2)$$
$$\sigma = \frac{gain}{\sqrt{fan_{in}}}$$

source: https://arxiv.org/pdf/1502.01852.pdf

Kaiming initialization was developed for networks that use ReLU and its variants as activations. Since we use ReLU, we expect it to be the best choice for us.

In [None]:
trainer = train_pipeline(train_ds, eval_ds, torch.nn.init.kaiming_normal_)
hen_init_loss = get_eval_loss(trainer)

### Initialization from a kaiming uniform distribution

Generate weights from a kaiming uniform distribution:
$$w_i \sim \mathcal{U}(-a,\,a)$$
$$a = gain \sqrt{\frac{3}{fan_{in}}}$$

source:  https://arxiv.org/pdf/1502.01852.pdf

In [None]:
trainer = train_pipeline(train_ds, eval_ds, torch.nn.init.kaiming_uniform_)
heu_init_loss = get_eval_loss(trainer)

## Compare different initialization techniques

Let's compare the results obtained with the different initializations. As expected, Kaiming initialization works best, but Xavier is not far behind.

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 7))

# The first epoch is left out of the plot: the x axis starts at epoch 2 and
# every loss curve drops its first value
epochs = list(range(2, NUM_EPOCHS + 1))

# (evaluation-loss history, legend label) for every initialization, in plotting order
curves = [
    (zero_init_loss, 'Zeros init.'),
    (ones_init_loss, 'Ones init.'),
    (norm_init_loss, 'Normal init.'),
    (un_init_loss, 'Uniform init.'),
    (xn_init_loss, 'Xavier Normal init.'),
    (xu_init_loss, 'Xavier Uniform init.'),
    (hen_init_loss, 'Kaiming Normal init.'),
    (heu_init_loss, 'Kaiming Uniform init.'),
]
for losses, label in curves:
    plt.plot(epochs, losses[1:], label=label)

plt.xlabel('Epoch', fontsize=18)
plt.ylabel('Eval Loss', fontsize=18)

# One tick per plotted epoch, then enlarge the tick labels on both axes
plt.xticks(epochs)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

plt.legend(fontsize=14)