# **BERT**

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m86.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m67.8 MB/s[0m eta [36m0:00:0

# **Libraries**

In [2]:
import sys
import copy
import math
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import Dataset, DataLoader

import tqdm

from transformers import BertTokenizer

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# **Working with data**

## **Load data**

In [4]:
MAX_SEQ_LEN = 64

In [5]:
train_path = "/content/sample_data/train_data.csv"
test_path = "/content/sample_data/test_data.csv"

In [6]:
train_df = pd.read_csv(train_path)
train_df

Unnamed: 0,spam,original_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [7]:
test_df = pd.read_csv(test_path)
test_df

Unnamed: 0,spam,original_message
0,0,Ok i msg u b4 i leave my house.
1,0,"\Gimme a few\"" was &lt;#&gt; minutes ago"""
2,1,Last Chance! Claim ur å£150 worth of discount ...
3,0,Appt is at &lt
4,1,FREE for 1st week! No1 Nokia tone 4 ur mobile ...
...,...,...
995,0,You bad girl. I can still remember them
996,0,How much i gave to you. Morning.
997,0,I hope your alright babe? I worry that you mig...
998,0,"Hey, can you tell me blake's address? Carlos w..."


## **Creating loader**

In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
MASK_TOKEN = tokenizer.vocab["[MASK]"]

In [10]:
class MSGDataset(Dataset):
    def __init__(self, msgs, labels, tokenizer):
        self.msgs = msgs
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.msgs)

    def __getitem__(self, idx):
        msg = self.msgs[idx]
        label = torch.LongTensor([self.labels[idx]])

        encoded_msg = tokenizer.encode_plus(text=msg,
                                            max_length=MAX_SEQ_LEN,
                                            add_special_tokens=True,
                                            truncation=True,
                                            padding="max_length",
                                            return_attention_mask=True,
                                            return_tensors="pt")

        msg_tokens = encoded_msg["input_ids"].view(-1)
        attn_mask = encoded_msg["attention_mask"]
        msg_len = torch.LongTensor([torch.count_nonzero(msg_tokens)])

        out = {"msg": msg,
               "msg_tokens": msg_tokens,
               "attn_mask": attn_mask,
               "msg_len": msg_len,
               "label": label}

        return out

In [11]:
train_data = MSGDataset(train_df["original_message"].values,
                        train_df["spam"].values,
                        tokenizer)

In [12]:
test_data = MSGDataset(test_df["original_message"].values,
                       test_df["spam"].values,
                       tokenizer)

In [13]:
batch_size = 32

In [14]:
train_loader = DataLoader(dataset=train_data,
                          batch_size=batch_size,
                          shuffle=True,
                          drop_last=True)

In [15]:
test_loader = DataLoader(dataset=test_data, batch_size=1)

# **Model components**

In [16]:
class Sublayer(nn.Module):
    def __init__(self, d_model, dropout_p):
        super(Sublayer, self).__init__()

        self.layer_norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.layer_norm(x)))

## **Embedding**

In [17]:
class Embedding(nn.Module):
    def __init__(self, vocab_size, d_model):
        super(Embedding, self).__init__()

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.embedding(x) * np.sqrt(self.d_model)

## **Positional encoding**

Thanks [ARUNMOHAN_003](https://www.kaggle.com/arunmohan003)

In [18]:
class Positional_encoding(nn.Module):
    def __init__(self, d_model):
        super(Positional_encoding, self).__init__()

        self.d_model = d_model

        pos_enc = torch.zeros(MAX_SEQ_LEN, self.d_model).to(device)

        for position in range(MAX_SEQ_LEN):
            for i in range(0, self.d_model, 2):
                pos_enc[position, i] = np.sin(
                    position / (10000 ** ((2 * i) / self.d_model))
                )
                pos_enc[position, i + 1] = np.cos(
                    position / (10000 ** ((2 * (i + 1)) / self.d_model))
                )

        pos_enc = pos_enc.unsqueeze(0)

        self.register_buffer('positional_encoding', pos_enc)

    def forward(self, x):
        x = x + np.sqrt(self.d_model)

        len_x = x.size(1)

        x += torch.autograd.Variable(
            self.positional_encoding[:, :len_x],
            requires_grad=False
        ).to(device)

        return x

## **Positionwise FFN**

In [19]:
class PositionwiseFFN(nn.Module):
    def __init__(self, d_model, d_ffn, dropout_p):
        super(PositionwiseFFN, self).__init__()

        self.FFN = nn.Sequential(
            nn.Linear(d_model, d_ffn),
            nn.ReLU(inplace=True),
            nn.Linear(d_ffn, d_model)
        )

        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        return self.dropout(self.FFN(x))

## **Multihead attention**




### **Scaled dot product attention**

Scaled dot product attention it is calculated according to the formula (see the article [Attention Is All You Need](https://arxiv.org/pdf/1706.03762.pdf))

$$\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^{T}}{\sqrt{d_k}}\right)V,$$

where $\sqrt{d_k}$ is a square root of queries and keys dimension.

![Scaled Dot-Product Attention.svg](https://svgshare.com/i/u4z.svg)

In [20]:
def scaled_dot_product_attn(Q, K, V, mask=None):
    d_k = Q.size(-1)

    attn_weights = torch.matmul(Q, K.transpose(-2, -1)) / np.sqrt(d_k)

    if mask is not None:
        attn_weights = attn_weights.masked_fill(mask==0, -np.inf)

    attn_weights = attn_weights.softmax(dim=-1)

    attn_applied = torch.matmul(attn_weights, V)

    return attn_applied, attn_weights

### **Multihead attention**

Following the original article [Attention Is All You Need](https://arxiv.org/pdf/1706.03762.pdf), multihead attention is defined as:

$$\text{MultiHead}(Q, K, V) = \text{Concat}(\text{head}_{1}, \text{...}, \text{head}_{h})W^O,$$

where

$$\text{head}_i = \text{Attention}(QW_{i}^{Q}, KW_{i}^{K}, VW_{i}^{V}).$$

![Multi-Head Attention.svg](https://svgshare.com/i/u6C.svg)

In [21]:
class MultiheadAttention(nn.Module):
    def __init__(self, d_model, n_heads, dropout_p):
        super(MultiheadAttention, self).__init__()

        assert (
            d_model % n_heads == 0
        ), "d_head * n_heads must be equal to d_model!"

        self.n_heads = n_heads
        self.d_head = d_model // n_heads

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

        self.attn_weights = None

    def forward(self, Q, K, V, mask=None):
        if mask is not None:
            mask = mask.unsqueeze(1)

        batch_size = Q.size(0)

        Q = self.W_q(Q)
        K = self.W_k(K)
        V = self.W_v(V)

        Q = Q.view(batch_size, -1, self.n_heads, self.d_head).transpose(1, 2)
        K = K.view(batch_size, -1, self.n_heads, self.d_head).transpose(1, 2)
        V = V.view(batch_size, -1, self.n_heads, self.d_head).transpose(1, 2)

        attention, self.attn_weights = scaled_dot_product_attn(Q, K, V, mask)

        attention = attention.transpose(1, 2).contiguous()
        attention = attention.view(batch_size, -1, self.n_heads * self.d_head)

        attention = self.W_o(attention)

        return attention

## **Encoder**

### **Encoder block**

![Encoder block.svg](https://svgshare.com/i/u5x.svg)

In [22]:
class EncoderLayer(nn.Module):
    def __init__(self, self_attn, FFN, d_model, dropout_p):
        super(EncoderLayer, self).__init__()

        self.self_attn = self_attn
        self.FFN = FFN

        self.sublayer = Sublayer(d_model, dropout_p)

    def forward(self, x, src_mask):
        x = self.sublayer(x, lambda x: self.self_attn(x, x, x, src_mask))
        x = self.sublayer(x, self.FFN)
        return x

### **Full encoder**

![Encoder.svg](https://svgshare.com/i/u55.svg)

In [23]:
class Encoder(nn.Module):
    def __init__(self, encoder_layer, emb, pos_enc, n_layers):
        super(Encoder, self).__init__()

        self.emb = emb
        self.pos_enc = pos_enc

        self.encoder_layers = nn.ModuleList([
            copy.deepcopy(encoder_layer)
            for _ in range(n_layers)
        ])

    def forward(self, x, src_mask=None):
        x = self.pos_enc(self.emb(x))

        for encoder_layer in self.encoder_layers:
            x = encoder_layer(x, src_mask)

        return x

## **BERT**

![BERT.svg](https://svgshare.com/i/unb.svg)

In [24]:
class BERT(nn.Module):
    def __init__(self, encoder, d_model, vocab_size, n_labels):
        super(BERT, self).__init__()

        self.encoder = encoder

        self.generator = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.GELU(),
            nn.Linear(d_model, vocab_size),
            nn.LogSoftmax(dim=-1)
        )

        self.classifier = nn.Sequential(
            nn.Linear(d_model, n_labels),
            nn.LogSoftmax(dim=-1)
        )

    def forward(self, input, attn_mask):
        encoder_out = self.encoder(input, attn_mask)

        lm_out = self.generator(encoder_out)
        classifier_out = self.classifier(encoder_out[:, 0, :])

        return lm_out, classifier_out

# **Creating model**

In [25]:
d_model = 512
n_heads = 8
n_layers = 6
d_ffn = 4*d_model
dropout_p = 0.1

In [26]:
c = copy.deepcopy

src_emb = Embedding(tokenizer.vocab_size, d_model)
pos_enc = Positional_encoding(d_model)

mhsa = MultiheadAttention(d_model, n_heads, dropout_p)
FFN = PositionwiseFFN(d_model, d_ffn, dropout_p)

encoder = Encoder(
    EncoderLayer(c(mhsa), c(FFN), d_model, dropout_p),
    src_emb,
    pos_enc,
    n_layers
)

model = BERT(encoder, d_model, tokenizer.vocab_size, 2).to(device)

In [27]:
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

model

BERT(
  (encoder): Encoder(
    (emb): Embedding(
      (embedding): Embedding(28996, 512)
    )
    (pos_enc): Positional_encoding()
    (encoder_layers): ModuleList(
      (0-5): 6 x EncoderLayer(
        (self_attn): MultiheadAttention(
          (W_q): Linear(in_features=512, out_features=512, bias=True)
          (W_k): Linear(in_features=512, out_features=512, bias=True)
          (W_v): Linear(in_features=512, out_features=512, bias=True)
          (W_o): Linear(in_features=512, out_features=512, bias=True)
        )
        (FFN): PositionwiseFFN(
          (FFN): Sequential(
            (0): Linear(in_features=512, out_features=2048, bias=True)
            (1): ReLU(inplace=True)
            (2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (sublayer): Sublayer(
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)

In [28]:
pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Total model parameters:", pytorch_total_params)

Total model parameters: 48892742


# **Train**

In [29]:
def mask_tokens(batch):
    n_msgs = batch["msg_tokens"].size(0)
    msg_lens = batch["msg_len"]

    n_mask_tokens = [math.ceil(0.15*msg_len) for msg_len in msg_lens]
    masked_pos = [
        random.sample(range(1, msg_lens[i]-1), n_mask_tokens[i])
        if msg_lens[i] > 4 else []
        for i in range(n_msgs)
    ]
    masked_tokens = torch.zeros_like(batch["msg_tokens"])

    for i in range(n_msgs):
        batch["attn_mask"][i, 0, masked_pos[i]] = 0
        masked_tokens[i, masked_pos[i]] = batch["msg_tokens"][i, masked_pos[i]]
        batch["msg_tokens"][i, masked_pos[i]] = MASK_TOKEN

    return batch, masked_tokens

In [30]:
epochs = 5
lr = 1e-5

In [31]:
optimizer = optim.Adam(model.parameters(), lr=lr)
lm_criterion = nn.NLLLoss(ignore_index=0)
cls_criterion = nn.NLLLoss()

In [32]:
train_data_size = len(train_loader.dataset)

for epoch in range(epochs):
    epoch_lm_loss = 0
    epoch_cls_loss = 0

    model.train()

    for batch in tqdm.tqdm(train_loader, file=sys.stdout):
        batch, masked_tokens = mask_tokens(c(batch))

        lm_out, cls_out = model(c(batch["msg_tokens"]).to(device),
                                c(batch["attn_mask"]).to(device))

        lm_loss = lm_criterion(lm_out.to(device).view(-1, tokenizer.vocab_size),
                               masked_tokens.to(device).view(-1))
        cls_loss = cls_criterion(cls_out.to(device),
                                 batch["label"].to(device).view(-1))

        epoch_lm_loss += lm_loss.item()
        epoch_cls_loss += cls_loss.item()

        loss = lm_loss + cls_loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)

    epoch_lm_loss /= (train_data_size // batch_size)
    epoch_cls_loss /= (train_data_size // batch_size)

    print(f"Epoch: {epoch};")
    print(f"train_lm_loss: {epoch_lm_loss}")
    print(f"train_cls_loss: {epoch_cls_loss}\n")

100%|██████████| 174/174 [00:25<00:00,  6.96it/s]
Epoch: 0;
train_lm_loss: 6.051724532554889
train_cls_loss: 0.002158830226750622

100%|██████████| 174/174 [00:25<00:00,  6.94it/s]
Epoch: 1;
train_lm_loss: 6.0156698939444
train_cls_loss: 0.00015455254031083887

100%|██████████| 174/174 [00:24<00:00,  7.01it/s]
Epoch: 2;
train_lm_loss: 6.039944599414694
train_cls_loss: 0.020297530207085242

100%|██████████| 174/174 [00:24<00:00,  7.00it/s]
Epoch: 3;
train_lm_loss: 5.997798610007626
train_cls_loss: 0.0008346653473078186

100%|██████████| 174/174 [00:24<00:00,  6.97it/s]
Epoch: 4;
train_lm_loss: 5.954488008871846
train_cls_loss: 0.001974409608545524



## **Lets test our model**

In [33]:
model.eval()

with torch.no_grad():
    test_data_size = len(test_loader.dataset)

    test_lm_loss = 0
    test_cls_loss = 0

    for batch in tqdm.tqdm(test_loader, file=sys.stdout):
        batch, masked_tokens = mask_tokens(c(batch))

        lm_out, cls_out = model(c(batch["msg_tokens"]).to(device),
                                c(batch["attn_mask"]).to(device))

        lm_loss = lm_criterion(lm_out.to(device).view(-1, tokenizer.vocab_size),
                                masked_tokens.to(device).view(-1))
        cls_loss = cls_criterion(cls_out.to(device),
                                    batch["label"].to(device).view(-1))

        test_lm_loss += lm_loss.item()
        test_cls_loss += cls_loss.item()

    test_lm_loss /= (test_data_size // batch_size)
    test_cls_loss /= (test_data_size // batch_size)

print(f"test_lm_loss: {lm_loss}")
print(f"test_cls_loss: {cls_loss}\n")

100%|██████████| 1000/1000 [00:08<00:00, 114.82it/s]
test_lm_loss: 6.096308708190918
test_cls_loss: 0.0



## **Classifier test**

In [34]:
model.eval()

with torch.no_grad():
    msg = "Someone has contacted our dating service and entered your phone because they fancy you!"
    encoded_msg = tokenizer.encode_plus(text=msg,
                                        max_length=MAX_SEQ_LEN,
                                        add_special_tokens=True,
                                        truncation=True,
                                        padding="max_length",
                                        return_attention_mask=True,
                                        return_tensors="pt")
    _, cls_out = model(encoded_msg["input_ids"].to(device),
                       encoded_msg["attention_mask"].to(device))

    cls_out = cls_out.argmax(dim=-1).item()

print(f"msg: {msg}")
print(f"label: 1")
print(f"predict: {cls_out}")

msg: Someone has contacted our dating service and entered your phone because they fancy you!
label: 1
predict: 1
