# Download, install and import

In [None]:
# download newest pretrained model BERT Multilingual Base Cased
!wget https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip
!unzip multi_cased_L-12_H-768_A-12.zip
!rm multi_cased_L-12_H-768_A-12.zip

In [None]:
!pip install transformers

In [33]:
import os
import json

import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from transformers.models.bert.convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
from transformers.models.bert.tokenization_bert import BertTokenizer

# Support vars and funcs

In [34]:
BERT_MODEL_DIR = 'multi_cased_L-12_H-768_A-12/'
DATASET_PATH = 'data/small_dataset.csv'

In [35]:
def get_formal_encode(seq: list, config: dict) -> dict:
  encode = bert_tokenizer(seq, padding=True)

  encode['position_ids'] = [1]*len(encode['input_ids']) + [0]*(config['max_position_embeddings'] - len(encode['input_ids'])) # init necessary data for model
  encode['input_ids'] += [0]*(config['max_position_embeddings'] - len(encode['input_ids'])) # expand tokens data to max_position_embeddings
  encode['token_type_ids'] += [0]*(config['max_position_embeddings'] - len(encode['token_type_ids'])) # expand necessary data for model
  del encode['attention_mask'] # delete unnecessary data for model
  
  return encode

In [36]:
def get_formal_label(label: int, config: dict) -> list:
  formal_label = [0]*config['num_labels']
  formal_label[label] = 1
  return formal_label

# Model

## Config

In [37]:
with open(BERT_MODEL_DIR + 'bert_config.json') as fp:
  config = json.load(fp)

In [38]:
config['num_labels'] = 6

## Classes

In [39]:
class Embeddings(torch.nn.Module):
  def __init__(self, config):
    super().__init__()
    self.word_embeddings = torch.nn.Embedding(config['vocab_size'], 768, padding_idx=0)
    self.position_embeddings = torch.nn.Embedding(config['max_position_embeddings'], 768)
    self.token_type_embeddings = torch.nn.Embedding(2, 768)
    self.LayerNorm = torch.nn.LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    self.dropout = torch.nn.Dropout(p=0.1, inplace=False)
  
  def forward(self, encodes):
    word_embeddings_output = self.word_embeddings(encodes['input_ids'])
    position_embeddings_output = self.position_embeddings(encodes['position_ids'])
    token_type_embeddings = self.token_type_embeddings(encodes['token_type_ids'])

    embeddings = word_embeddings_output + position_embeddings_output + token_type_embeddings
    
    embeddings = self.LayerNorm(embeddings)
    embeddings = self.dropout(embeddings)
    return embeddings

In [40]:
class SelfAttention(torch.nn.Module):
  def __init__(self, config):
    super().__init__()
    self.query = torch.nn.Linear(in_features=768, out_features=768, bias=True)
    self.key = torch.nn.Linear(in_features=768, out_features=768, bias=True)
    self.value = torch.nn.Linear(in_features=768, out_features=768, bias=True)
    self.dropout = torch.nn.Dropout(p=config['attention_probs_dropout_prob'], inplace=False)
  
  def forward(self, x):
    x = self.query(x)
    x = self.key(x)
    x = self.value(x)
    x = self.dropout(x)
    return x

In [41]:
class SelfOutput(torch.nn.Module):
  def __init__(self, config):
    super().__init__()
    self.dense = torch.nn.Linear(in_features=768, out_features=768, bias=True)
    self.LayerNorm = torch.nn.LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    self.dropout = torch.nn.Dropout(p=config['hidden_dropout_prob'], inplace=False)
  
  def forward(self, x):
    x = self.dense(x)
    x = self.LayerNorm(x)
    x = self.dropout(x)
    return x

In [42]:
class Attention(torch.nn.Module):
  def __init__(self, config):
    super().__init__()
    self.self = SelfAttention(config)
    self.output = SelfOutput(config)

  def forward(self, x):
    x = self.self(x)
    x = self.output(x)
    return x

In [43]:
class Intermediate(torch.nn.Module):
  def __init__(self, config):
    super().__init__()
    self.dense = torch.nn.Linear(in_features=768, out_features=config['intermediate_size'], bias=True)
    if config['hidden_act'] == 'gelu':
      self.intermediate_act_fn = torch.nn.functional.gelu
  
  def forward(self, x):
    x = self.dense(x)
    x = self.intermediate_act_fn(x)
    return x

In [44]:
class Output(torch.nn.Module):
  def __init__(self, config):
    super().__init__()
    self.dense = torch.nn.Linear(in_features=config['intermediate_size'], out_features=768, bias=True)
    self.LayerNorm = torch.nn.LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    self.dropout = torch.nn.Dropout(p=config['hidden_dropout_prob'], inplace=False)
  
  def forward(self, x):
    x = self.dense(x)
    x = self.LayerNorm(x)
    x = self.dropout(x)
    return x

In [45]:
class Layer(torch.nn.Module):
  def __init__(self, config):
    super().__init__()
    self.attention = Attention(config)
    self.intermediate = Intermediate(config)
    self.output = Output(config)
  
  def forward(self, x):
    x = self.attention(x)
    x = self.intermediate(x)
    x = self.output(x)
    return x

In [46]:
class Encoder(torch.nn.Module):
  def __init__(self, config):
    super().__init__()
    self.layer = torch.nn.ModuleList(Layer(config) for i in range(config['num_hidden_layers']))
  
  def forward(self, x):
    for i, layer in enumerate(self.layer):
      x = x + layer(x)
    return x

In [47]:
class Pooler(torch.nn.Module):
  def __init__(self, config):
    super().__init__()
    self.dense = torch.nn.Linear(in_features=768, out_features=768, bias=True)
    self.activation = torch.nn.Softmax(dim=1) #need define dim
  
  def forward(self, x):
    x = self.dense(x)
    x = self.activation(x)
    return x

In [48]:
class Bert(torch.nn.Module):
  def __init__(self, config):
    super().__init__()
    self.embeddings = Embeddings(config)
    self.encoder = Encoder(config)
    self.pooler = Pooler(config)
  
  def forward(self, encodes):
    x = self.embeddings(encodes)
    x = self.encoder(x)
    x = self.pooler(x)
    return x

In [49]:
class IntentClassifier(torch.nn.Module):
  def __init__(self, config):
    super().__init__()
    self.bert = Bert(config)
    self.dropout = torch.nn.Dropout(p=0.1, inplace=False)
    self.classifier = torch.nn.Linear(in_features=768, out_features=config['num_labels'], bias=True)

  def forward(self, encodes):
    x = self.bert(encodes)
    x = self.dropout(x)
    x = self.classifier(x)
    return x
  
  def predict(self, seq: str) -> torch.tensor:
    encode = get_formal_encode(seq, config)
    tensor_encode = {k: torch.tensor(v) for k, v in encode.items()}
    model_output = self.forward(tensor_encode)
    return model_output

## Init

In [50]:
model = IntentClassifier(config)

In [51]:
bert_tokenizer = BertTokenizer(BERT_MODEL_DIR + 'vocab.txt')

## Load state dict (weights and biases model)

In [None]:
convert_tf_checkpoint_to_pytorch(BERT_MODEL_DIR + 'bert_model.ckpt', BERT_MODEL_DIR + 'bert_config.json',
                                 BERT_MODEL_DIR + 'bert_model.pt')

In [53]:
state_dict = torch.load(BERT_MODEL_DIR + 'bert_model.pt')

In [54]:
# init missing weights and biases (missing in original BERT)
state_dict['classifier.weight'] = torch.Tensor(np.zeros((6, 768)))
state_dict['classifier.bias'] = torch.Tensor(np.zeros(config['num_labels']))

# load only need weights and biases (with skip extra in orginal BERT)
model_state_dict = model.state_dict()
model.load_state_dict({k: v for k, v in state_dict.items() if k in model_state_dict})

<All keys matched successfully>

# Dataset

In [55]:
df = pd.read_csv(DATASET_PATH)
df.tail()

Unnamed: 0,sequence,intent,label
789,"есть ли такая вещь, как хорошая смерть",philosophical_talk,5
790,"разум или мудрость, что важнее для лучшего мира",philosophical_talk,5
791,являются ли убеждения и суеверия одинаковыми,philosophical_talk,5
792,"почему мы делаем то, что нам не нравится",philosophical_talk,5
793,у атеистов есть собственные боги,philosophical_talk,5


In [56]:
train_seqs, test_seqs, train_labels, test_labels = train_test_split(df['sequence'].tolist(), df['label'].tolist(), test_size=0.1)

In [57]:
class Dataset(torch.utils.data.Dataset):
  def __init__(self, seqs, labels):
    self.encodes = [get_formal_encode(seq, config) for seq in seqs]
    self.labels = [get_formal_label(label, config) for label in labels]
  
  def __getitem__(self, i):
    tensor_encode = {k: torch.tensor(v) for k, v in self.encodes[i].items()}
    tensor_label = torch.tensor(self.labels[i])
    return tensor_encode, tensor_label
  
  def __len__(self):
    return len(self.encodes)

In [58]:
train_dataset = Dataset(train_seqs, train_labels)
test_dataset = Dataset(test_seqs, test_labels)

# Test

In [59]:
model.predict("Быть или не быть?")

tensor([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        ...,
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]], grad_fn=<AddmmBackward0>)