# Install and import libs

In [1]:
%%capture
!pip install transformers

In [2]:
import torch
from transformers import BertModel, BertConfig, BertTokenizer

from typing import Any, Union, Tuple, Dict

import pandas as pd
from sklearn.model_selection import train_test_split

# Support funcs

In [3]:
def get_formal_label(label: int, num_labels: int, dtype: Any = int) -> torch.tensor:
  formal_label = [0]*num_labels
  formal_label[label] = 1
  return torch.tensor([formal_label], dtype=dtype)

# Classes

In [4]:
class Dataset(torch.utils.data.Dataset):
  def __init__(self, seqs: list, labels: list, tokenizer: object, num_labels: int):
    self.tokenized = [tokenizer.tokenize(seq) for seq in seqs]
    self.formal_labels = [get_formal_label(label, num_labels=num_labels) for label in labels]

  def __getitem__(self, i) -> Tuple[Dict[str, torch.tensor], torch.tensor]:
    return self.tokenized[i], self.formal_labels[i]
  
  def __len__(self) -> int:
    return len(self.tokenized)

In [5]:
class IntentClassifier(torch.nn.Module):
  def __init__(self, pretrained_bert_model: str, num_labels: int, load_bert_model_state_dict: bool = True):
    super(IntentClassifier, self).__init__()
    self.pretrained_bert_model = pretrained_bert_model
    self.num_labels = num_labels

    # layers
    if load_bert_model_state_dict:
      self.bert = BertModel.from_pretrained(self.pretrained_bert_model)
    else:
      self.bert = BertModel(BertConfig.from_pretrained(pretrained_bert_model))
    #classification layers need define
    self.dropout = torch.nn.Dropout(p=0.1, inplace=False)
    self.classifier = torch.nn.Linear(in_features=768, out_features=self.num_labels, bias=True)

    self.tokenizer = BertTokenizer.from_pretrained(self.pretrained_bert_model)

    if torch.cuda.is_available():
      self.device = torch.device('cuda:0')
    else:
      self.device = torch.device('cpu')
  
  
  def forward(self, input_ids: torch.tensor, token_type_ids: torch.tensor, attention_mask: torch.tensor) -> torch.tensor:
    pooled_output = self.bert(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
    pooled_output = self.dropout(pooled_output[1])
    logits = self.classifier(pooled_output)
    return logits

  
  def train(self, dataset: Dataset, epochs: int):
    criterion = torch.nn.CrossEntropyLoss().to(self.device) #need define
    optimizer = torch.optim.AdamW(self.parameters()) #need define

    for epoch in range(epochs):

      for i, data in enumerate(dataset, 0):
        tokenized, formal_label = data
        
        optimizer.zero_grad()

        logits = self.forward(tokenized['input_ids'], tokenized['token_type_ids'], tokenized['attention_mask'])
        loss = criterion(logits, formal_label.float().to(self.device))
        loss.backward()
        optimizer.step()
      
    print("Training if finish")
  

  def load(self, fp: str):
  	self.load_state_dict(torch.load(fp))

  
  def predict(self, text: str) -> Dict[str, Union[int, float]]:
    tokenized = self.tokenizer(text, return_tensors='pt')
    logits = self.forward(tokenized['input_ids'], tokenized['token_type_ids'], tokenized['attention_mask'])

    max_prob_label, max_prob = -1, -1
    for label, prob in enumerate(logits[0], 0):
      prob = prob.item()
      if prob > max_prob:
        max_prob_label, max_prob = label, prob
    
    if max_prob < 0.51:
      max_prob_label, max_prob = 6, 1.0 - torch.mean(logits[0]).item()

    return {'label': max_prob_label, 'prob': round(max_prob, 2)}

# Model init

In [6]:
%%capture
model = IntentClassifier('bert-base-multilingual-cased', num_labels=6)
model.to(model.device)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Datasets init

In [7]:
df = pd.read_csv('data/small_dataset.csv')
df.tail()

Unnamed: 0,sequence,intent,label
789,"есть ли такая вещь, как хорошая смерть",philosophical_talk,5
790,"разум или мудрость, что важнее для лучшего мира",philosophical_talk,5
791,являются ли убеждения и суеверия одинаковыми,philosophical_talk,5
792,"почему мы делаем то, что нам не нравится",philosophical_talk,5
793,у атеистов есть собственные боги,philosophical_talk,5


In [8]:
train_seqs, test_seqs, train_labels, test_labels = train_test_split(df['sequence'].tolist(), df['label'].tolist(), test_size=0.1)

In [9]:
train_dataset = Dataset(train_seqs, train_labels, tokenizer=model.tokenizer, num_labels=6)
test_dataset = Dataset(test_seqs, test_labels, tokenizer=model.tokenizer, num_labels=6)

# Test

In [11]:
model.predict("Быть или не быть?")

{'label': 6, 'prob': 0.95}