# Install and import libs

In [276]:
%%capture
!pip install transformers

In [277]:
import torch
from transformers import BertModel, BertConfig, BertTokenizer, get_linear_schedule_with_warmup

import pandas as pd
from sklearn.model_selection import train_test_split

from typing import Any, Union, Tuple, Dict

# Support funcs

In [278]:
def get_formal_label(label: int, num_labels: int) -> list:
  ''' Ex. with label=3 and num_labels=6 func return [0, 0, 1, 0, 0, 0] '''
  formal_label = [0]*num_labels
  formal_label[label] = 1
  return formal_label

# Classes

In [279]:
class Dataset(torch.utils.data.Dataset):
  def __init__(self, seqs: list, labels: list, tokenizer: object, num_labels: int):
    self.tokenized = [tokenizer(seq, max_length=512, padding='max_length') for seq in seqs]
    self.formal_labels = [get_formal_label(label, num_labels=num_labels) for label in labels]

  def __getitem__(self, i) -> Dict[str, torch.tensor]:
    tokenized = {k: torch.tensor(v) for k, v in self.tokenized[i].items()}
    formal_label = torch.tensor(self.formal_labels[i]).float() # to float() need for model.train()
    return tokenized, formal_label
  
  def __len__(self) -> int:
    return len(self.tokenized)

In [282]:
class IntentClassifier(torch.nn.Module):
  def __init__(self, pretrained_bert_model: str, num_labels: int, load_bert_model_state_dict: bool = True):
    super(IntentClassifier, self).__init__()
    self.pretrained_bert_model = pretrained_bert_model
    self.num_labels = num_labels

    # layers
    if load_bert_model_state_dict:
      self.bert = BertModel.from_pretrained(self.pretrained_bert_model)
    else:
      self.bert = BertModel(BertConfig.from_pretrained(pretrained_bert_model))
    ## set layers bert untrained
    for param in self.bert.parameters():
      param.requires_grad = False
    ##classification layers need define
    self.dropout = torch.nn.Dropout(p=0.1, inplace=False)
    self.classifier = torch.nn.Linear(in_features=768, out_features=self.num_labels, bias=True)

    self.tokenizer = BertTokenizer.from_pretrained(self.pretrained_bert_model)

    if torch.cuda.is_available():
      self.device = torch.device('cuda:0')
    else:
      self.device = torch.device('cpu')
  
  
  def forward(self, input_ids: torch.tensor, attention_mask: torch.tensor, token_type_ids: torch.tensor) -> torch.tensor:
    bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
    pooled_output = self.dropout(bert_outputs[1])
    logits = self.classifier(pooled_output)
    return logits

  
  def train(self, dataloader: torch.utils.data.DataLoader, epochs: int):

    criterion = torch.nn.BCEWithLogitsLoss().to(self.device)
    optimizer = torch.optim.AdamW(self.parameters())

    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)

    # for statistics
    train_loss, train_loss_history = 0.0, list()

    for epoch in range(epochs):
      for i, batch in enumerate(train_dataloader):
        input_ids = batch[0]['input_ids'].to(self.device)
        attention_mask = batch[0]['attention_mask'].to(self.device)
        token_type_ids = batch[0]['token_type_ids'].to(self.device)
        formal_labels = batch[1].to(self.device)

        self.zero_grad()

        logits = self.forward(input_ids, attention_mask, token_type_ids)

        loss = criterion(logits, formal_labels)
        loss.backward()

        # for statistic
        train_loss += loss.item()
        if i % 4 == 3:
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {train_loss / 4:.3f}')
            train_loss_history.append(train_loss)
            train_loss = 0.0
        
        torch.nn.utils.clip_grad_norm_(self.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
    
    print("Training is finish.")
    return train_loss_history
  

  def load(self, fp: str):
  	self.load_state_dict(torch.load(fp))

  
  def predict(self, text: str) -> Dict[str, Union[int, float]]:
    tokenized = self.tokenizer(text, return_tensors='pt').to(self.device)
    logits = self.forward(tokenized['input_ids'], tokenized['attention_mask'], tokenized['token_type_ids'])

    max_prob_label, max_prob = -1, -1
    for label, prob in enumerate(logits[0], 0):
      prob = prob.item()
      if prob > max_prob:
        max_prob_label, max_prob = label, prob
    
    if max_prob < 0.51:
      max_prob_label, max_prob = 6, 1.0 - torch.mean(logits[0]).item()

    print(logits) # for debug
    return {'label': max_prob_label, 'prob': round(max_prob, 2)}

# Model init

In [283]:
%%capture
model = IntentClassifier('bert-base-multilingual-uncased', num_labels=6)
model.to(model.device)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Datasets init

In [None]:
df = pd.read_csv('data/small_dataset.csv')
df.tail()

Unnamed: 0,sequence,intent,label
789,"есть ли такая вещь, как хорошая смерть",philosophical_talk,5
790,"разум или мудрость, что важнее для лучшего мира",philosophical_talk,5
791,являются ли убеждения и суеверия одинаковыми,philosophical_talk,5
792,"почему мы делаем то, что нам не нравится",philosophical_talk,5
793,у атеистов есть собственные боги,philosophical_talk,5


In [None]:
train_seqs, test_seqs, train_labels, test_labels = train_test_split(df['sequence'].tolist(), df['label'].tolist(), test_size=0.1)

In [None]:
train_dataset = Dataset(train_seqs, train_labels, tokenizer=model.tokenizer, num_labels=6)
test_dataset = Dataset(test_seqs, test_labels, tokenizer=model.tokenizer, num_labels=6)

In [286]:
# batch_size=16 recommended by bert authors (https://arxiv.org/pdf/1810.04805.pdf)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=16)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=16)

In [None]:
model.train(train_dataloader, epochs=3)