# Download, install and import

In [1]:
# Download the multilingual cased BERT checkpoint archive (L-12, H-768, A-12) from Google Cloud Storage.
!wget https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip
# Extract it into the working directory; this creates the multi_cased_L-12_H-768_A-12/ folder.
!unzip multi_cased_L-12_H-768_A-12.zip
# Delete the archive after extraction. NOTE: re-running this cell downloads it again.
!rm multi_cased_L-12_H-768_A-12.zip

--2022-07-18 16:56:38--  https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.188.48, 172.217.13.80, 142.251.45.16, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.188.48|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 662903077 (632M) [application/zip]
Saving to: ‘multi_cased_L-12_H-768_A-12.zip’


2022-07-18 16:56:41 (247 MB/s) - ‘multi_cased_L-12_H-768_A-12.zip’ saved [662903077/662903077]

Archive:  multi_cased_L-12_H-768_A-12.zip
   creating: multi_cased_L-12_H-768_A-12/
  inflating: multi_cased_L-12_H-768_A-12/bert_model.ckpt.meta  
  inflating: multi_cased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001  
  inflating: multi_cased_L-12_H-768_A-12/vocab.txt  
  inflating: multi_cased_L-12_H-768_A-12/bert_model.ckpt.index  
  inflating: multi_cased_L-12_H-768_A-12/bert_config.json  


In [2]:
import torch

# Support vars

In [3]:
# Directory (relative to the working dir) holding the extracted BERT checkpoint files.
MODEL_BERT_DIR = 'multi_cased_L-12_H-768_A-12/'

# Model (architecture of BertForSequenceClassification from [transformers/bert](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py))

## Classes

In [4]:
class Embeddings(torch.nn.Module):
  """BERT input embeddings: word + position + token-type, then LayerNorm and dropout.

  The three lookups are summed element-wise. Feeding the output of one
  embedding table into the next cannot work, because an embedding table
  expects integer indices and the previous lookup returns floats.

  Args:
    vocab_size: number of rows in the word embedding table.
    hidden_size: embedding dimension.
    max_position_embeddings: longest supported sequence length.
    type_vocab_size: number of segment (token-type) ids.
    dropout_prob: dropout probability applied to the normalised sum.
  """

  def __init__(self, vocab_size=119547, hidden_size=768,
               max_position_embeddings=512, type_vocab_size=2,
               dropout_prob=0.1):
    super().__init__()
    self.word_embeddings = torch.nn.Embedding(vocab_size, hidden_size, padding_idx=0)
    self.position_embeddings = torch.nn.Embedding(max_position_embeddings, hidden_size)
    self.token_type_embeddings = torch.nn.Embedding(type_vocab_size, hidden_size)
    self.LayerNorm = torch.nn.LayerNorm((hidden_size,), eps=1e-12, elementwise_affine=True)
    self.dropout = torch.nn.Dropout(p=dropout_prob, inplace=False)

  def forward(self, x, token_type_ids=None, position_ids=None):
    """Embed a batch of token ids.

    Args:
      x: integer tensor of token ids; the last dimension is the sequence.
      token_type_ids: optional integer tensor shaped like ``x``; defaults to
        all zeros (single segment).
      position_ids: optional integer tensor broadcastable to ``x``; defaults
        to ``0 .. seq_len - 1``.

    Returns:
      Float tensor of shape ``x.shape + (hidden_size,)``.

    Raises:
      ValueError: if the sequence is longer than the position table.
    """
    seq_len = x.size(-1)
    max_len = self.position_embeddings.num_embeddings
    if seq_len > max_len:
      raise ValueError(
          f"sequence length {seq_len} exceeds the maximum of {max_len} positions")
    if position_ids is None:
      # Shape (seq_len,), broadcast over the batch dimension in the sum below.
      position_ids = torch.arange(seq_len, dtype=torch.long, device=x.device)
    if token_type_ids is None:
      token_type_ids = torch.zeros_like(x)
    embedded = (self.word_embeddings(x)
                + self.position_embeddings(position_ids)
                + self.token_type_embeddings(token_type_ids))
    embedded = self.LayerNorm(embedded)
    return self.dropout(embedded)

In [5]:
class SelfAttention(torch.nn.Module):
  """Multi-head scaled dot-product self-attention.

  Query, key and value are three parallel projections of the same input;
  they are not applied one after another.

  Args:
    hidden_size: model dimension; must be divisible by ``num_attention_heads``.
    num_attention_heads: number of attention heads.
    dropout_prob: dropout probability applied to the attention weights.

  Raises:
    ValueError: if ``hidden_size`` is not a multiple of ``num_attention_heads``.
  """

  def __init__(self, hidden_size=768, num_attention_heads=12, dropout_prob=0.1):
    super().__init__()
    if hidden_size % num_attention_heads != 0:
      raise ValueError(
          f"hidden size {hidden_size} is not a multiple of the number of "
          f"attention heads {num_attention_heads}")
    self.num_attention_heads = num_attention_heads
    self.attention_head_size = hidden_size // num_attention_heads
    self.query = torch.nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=True)
    self.key = torch.nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=True)
    self.value = torch.nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=True)
    self.dropout = torch.nn.Dropout(p=dropout_prob, inplace=False)

  def _split_heads(self, t):
    """Reshape (batch, seq, hidden) -> (batch, heads, seq, head_size)."""
    batch, seq_len, _ = t.shape
    t = t.view(batch, seq_len, self.num_attention_heads, self.attention_head_size)
    return t.permute(0, 2, 1, 3)

  def forward(self, x, attention_mask=None):
    """Attend over the sequence dimension of ``x``.

    Args:
      x: float tensor of shape (batch, seq, hidden).
      attention_mask: optional additive mask broadcastable to
        (batch, heads, seq, seq); use large negative values at positions
        that must not be attended to.

    Returns:
      Context tensor of shape (batch, seq, hidden).
    """
    q = self._split_heads(self.query(x))
    k = self._split_heads(self.key(x))
    v = self._split_heads(self.value(x))

    # Scale by sqrt(head_size) so the logits do not grow with the head size.
    scores = torch.matmul(q, k.transpose(-1, -2)) / (self.attention_head_size ** 0.5)
    if attention_mask is not None:
      scores = scores + attention_mask

    probs = torch.nn.functional.softmax(scores, dim=-1)
    probs = self.dropout(probs)

    context = torch.matmul(probs, v)  # (batch, heads, seq, head_size)
    batch, _, seq_len, _ = context.shape
    # Merge the heads back into a single hidden dimension.
    return context.permute(0, 2, 1, 3).reshape(batch, seq_len, -1)

In [6]:
class SelfOutput(torch.nn.Module):
  """Output projection of the attention sub-layer.

  Applies dense -> dropout -> (optional residual add) -> LayerNorm. Dropout
  comes before the normalisation so that the normalised activations leaving
  this module are not zeroed out during training.
  """

  def __init__(self):
    super().__init__()
    self.dense = torch.nn.Linear(in_features=768, out_features=768, bias=True)
    self.LayerNorm = torch.nn.LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    self.dropout = torch.nn.Dropout(p=0.1, inplace=False)

  def forward(self, x, input_tensor=None):
    """Project ``x`` and normalise it.

    Args:
      x: float tensor whose last dimension is 768.
      input_tensor: optional residual, same shape as the projected ``x``;
        when given it is added before the LayerNorm.

    Returns:
      Tensor with the same shape as ``x``.
    """
    x = self.dense(x)
    x = self.dropout(x)
    if input_tensor is not None:
      x = x + input_tensor
    return self.LayerNorm(x)

In [7]:
class Attention(torch.nn.Module):
  """Attention sub-layer: self-attention followed by a residual output block."""

  def __init__(self):
    super().__init__()
    self.self = SelfAttention()
    self.output = SelfOutput()

  def forward(self, x):
    """Run self-attention on ``x`` and add the residual connection.

    Args:
      x: float tensor of hidden states; the attention result must have the
        same shape so that it can be added back to ``x``.

    Returns:
      ``LayerNorm(dropout(dense(attention(x))) + x)``.
    """
    context = self.self(x)
    # The residual sum has to happen between dropout and LayerNorm, so the
    # sub-modules of the output block are applied step by step here.
    out = self.output
    projected = out.dropout(out.dense(context))
    return out.LayerNorm(projected + x)

In [8]:
class Intermediate(torch.nn.Module):
  """Feed-forward expansion: linear projection 768 -> 3072 followed by GELU."""

  def __init__(self):
    super().__init__()
    self.intermediate_act_fn = torch.nn.functional.gelu
    self.dense = torch.nn.Linear(768, 3072)

  def forward(self, x):
    """Return ``gelu(dense(x))`` for an input whose last dimension is 768."""
    return self.intermediate_act_fn(self.dense(x))

In [9]:
class Output(torch.nn.Module):
  """Output projection of the feed-forward sub-layer (3072 -> 768).

  Applies dense -> dropout -> (optional residual add) -> LayerNorm. Dropout
  comes before the normalisation so that the normalised activations leaving
  this module are not zeroed out during training.
  """

  def __init__(self):
    super().__init__()
    self.dense = torch.nn.Linear(in_features=3072, out_features=768, bias=True)
    self.LayerNorm = torch.nn.LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    self.dropout = torch.nn.Dropout(p=0.1, inplace=False)

  def forward(self, x, input_tensor=None):
    """Project ``x`` back to the model dimension and normalise it.

    Args:
      x: float tensor whose last dimension is 3072.
      input_tensor: optional residual whose last dimension is 768; when
        given it is added before the LayerNorm.

    Returns:
      Tensor whose last dimension is 768.
    """
    x = self.dense(x)
    x = self.dropout(x)
    if input_tensor is not None:
      x = x + input_tensor
    return self.LayerNorm(x)

In [10]:
class Layer(torch.nn.Module):
  """One transformer encoder layer: attention block, then feed-forward block."""

  def __init__(self):
    super().__init__()
    self.attention = Attention()
    self.intermediate = Intermediate()
    self.output = Output()

  def forward(self, x):
    """Apply the layer to a tensor of hidden states.

    Args:
      x: float tensor of hidden states.

    Returns:
      ``LayerNorm(dropout(dense(intermediate(a))) + a)`` where
      ``a = attention(x)``.
    """
    attention_output = self.attention(x)
    intermediate_output = self.intermediate(attention_output)
    # The feed-forward residual must be added between dropout and LayerNorm,
    # so the sub-modules of the output block are applied step by step here.
    out = self.output
    projected = out.dropout(out.dense(intermediate_output))
    return out.LayerNorm(projected + attention_output)

In [11]:
class Encoder(torch.nn.Module):
  """Stack of transformer layers applied one after another.

  Args:
    num_hidden_layers: number of ``Layer`` modules in the stack.
  """

  def __init__(self, num_hidden_layers=12):
    super().__init__()
    self.layer = torch.nn.ModuleList(Layer() for _ in range(num_hidden_layers))

  def forward(self, x):
    """Feed ``x`` through every layer in order and return the final hidden states."""
    # Each layer consumes the previous layer's output and runs exactly once.
    for layer in self.layer:
      x = layer(x)
    return x

In [12]:
class Pooler(torch.nn.Module):
  """Pool a sequence into one vector: dense + Tanh on the first token.

  The first position is used as the summary of the whole sequence, which
  gives one vector per example for the classification head.
  """

  def __init__(self):
    super().__init__()
    self.dense = torch.nn.Linear(in_features=768, out_features=768, bias=True)
    self.activation = torch.nn.Tanh()

  def forward(self, x):
    """Return the pooled representation.

    Args:
      x: float tensor of shape (batch, seq, 768).

    Returns:
      Tensor of shape (batch, 768) with values in (-1, 1).
    """
    first_token = x[:, 0]
    return self.activation(self.dense(first_token))

In [13]:
class Bert(torch.nn.Module):
  """BERT backbone: embeddings, then encoder, then pooler."""

  def __init__(self):
    super().__init__()
    self.embeddings = Embeddings()
    self.encoder = Encoder()
    self.pooler = Pooler()

  def forward(self, x):
    """Pass ``x`` through the three stages in order and return the pooler output."""
    hidden = x
    for stage in (self.embeddings, self.encoder, self.pooler):
      hidden = stage(hidden)
    return hidden

In [14]:
class IntentClassifier(torch.nn.Module):
  """BERT backbone with dropout and a linear head producing 6 outputs."""

  def __init__(self):
    super().__init__()
    self.bert = Bert()
    self.dropout = torch.nn.Dropout(p=0.1, inplace=False)
    self.classifier = torch.nn.Linear(768, 6)

  def forward(self, x):
    """Return the classifier output for ``x`` (bert -> dropout -> linear)."""
    features = self.dropout(self.bert(x))
    return self.classifier(features)

## Init

In [15]:
# Build the classifier; no checkpoint weights are loaded in this cell.
model = IntentClassifier()

In [16]:
# Display the module tree to inspect the layer layout.
model

IntentClassifier(
  (bert): Bert(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): Encoder(
      (layer): ModuleList(
        (0): Layer(
          (attention): Attention(
            (self): SelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout)