# Install and import libs

In [None]:
%%capture
!pip install transformers

In [None]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertModel, BertTokenizer, Trainer

# Support funcs

In [None]:
def get_formal_label(label: int, num_classes: int = 6, device=None) -> torch.Tensor:
  """Encode a class index as a one-hot float row vector.

  Args:
    label: Zero-based class index; must satisfy 0 <= label < num_classes.
    num_classes: Length of the one-hot vector. Defaults to 6, the value that
      was previously hard-coded.
    device: Device on which to create the tensor. When omitted, the device of
      the notebook-level `model` is used, so `model` must already exist.

  Returns:
    A float32 tensor of shape (1, num_classes) with 1.0 at position `label`
    and 0.0 everywhere else.

  Raises:
    IndexError: If `label` is outside [0, num_classes). Negative values are
      rejected explicitly; plain list indexing would silently wrap them
      around to the last classes.
  """
  if not 0 <= label < num_classes:
    raise IndexError(f"label {label} is out of range for {num_classes} classes")

  if device is None:
    # Backward-compatible fallback: rely on the notebook-level classifier.
    device = model.device

  one_hot = torch.zeros(1, num_classes, dtype=torch.float32, device=device)
  one_hot[0, label] = 1.0
  return one_hot

# Model

## Class

In [None]:
class IntentClassifier(torch.nn.Module):
  """BERT encoder followed by dropout and a linear head that scores intents.

  The instance also owns the matching tokenizer and remembers the device it
  should live on (`cuda:0` when CUDA is available, otherwise `cpu`); moving
  the weights there is left to the caller (`model.to(model.device)`).

  Args:
    pretrained_name: Checkpoint name passed to both `BertModel.from_pretrained`
      and `BertTokenizer.from_pretrained`.
    num_labels: Number of output classes of the linear head.
  """

  def __init__(self, pretrained_name: str = 'bert-base-multilingual-cased', num_labels: int = 6):
    super().__init__()
    self.bert = BertModel.from_pretrained(pretrained_name)

    # Classification head applied to the encoder's pooled output. The input
    # width is read from the encoder config instead of hard-coding 768, so a
    # different checkpoint size keeps working.
    self.dropout = torch.nn.Dropout(p=0.1, inplace=False)
    self.classifier = torch.nn.Linear(
        in_features=self.bert.config.hidden_size,
        out_features=num_labels,
        bias=True,
    )

    self.tokenizer = BertTokenizer.from_pretrained(pretrained_name)

    self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

  def forward(self, input_ids: torch.Tensor, token_type_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Return unnormalised class scores (logits) for already-tokenized input."""
    encoder_output = self.bert(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
    # Element 1 of the encoder output is the pooled sequence representation.
    pooled_output = self.dropout(encoder_output[1])
    return self.classifier(pooled_output)

  def train(self, dataset=True, epochs: int = 3, lr: float = 1e-3):
    """Switch training mode, or run the training loop over a dataset.

    `torch.nn.Module.train(mode)` is part of the module protocol and is what
    `eval()` calls internally (`self.train(False)`). This method keeps the
    notebook's `model.train(dataset)` usage working while honouring that
    protocol.

    Args:
      dataset: Either a bool (delegated to `torch.nn.Module.train`) or an
        iterable of `(tokenized, label)` pairs to fit on.
      epochs: Number of passes over `dataset`; ignored for a bool.
      lr: Adam learning rate; ignored for a bool.

    Returns:
      `self` when called with a bool (as `torch.nn.Module.train` does),
      otherwise `None`.
    """
    if isinstance(dataset, bool):
      return super().train(dataset)
    return self.fit(dataset, epochs=epochs, lr=lr)

  def fit(self, dataset: torch.utils.data.Dataset, epochs: int = 3, lr: float = 1e-3) -> None:
    """Optimise all parameters with Adam and cross-entropy, one sample per step.

    Args:
      dataset: Iterable yielding `(tokenized, label)` pairs, where `tokenized`
        is a dict with `input_ids`, `token_type_ids` and `attention_mask`.
        Assumes `label` is a float class-probability row shaped like the
        logits (e.g. a one-hot vector) on the same device - TODO confirm
        against the dataset builder.
      epochs: Number of passes over `dataset`.
      lr: Adam learning rate.
    """
    # Make sure dropout is active even if the model was put in eval mode before.
    super().train(True)

    criterion = torch.nn.CrossEntropyLoss().to(self.device)
    # NOTE(review): 1e-3 is Adam's default and matches the previous behaviour,
    # but fine-tuning a full BERT encoder usually calls for a much smaller
    # rate (roughly 2e-5 to 5e-5) - consider passing `lr` explicitly.
    optimizer = torch.optim.Adam(self.parameters(), lr=lr)

    for _ in range(epochs):
      for tokenized, label in dataset:
        optimizer.zero_grad()

        # Call the module (not .forward) so registered hooks are honoured.
        logits = self(tokenized['input_ids'], tokenized['token_type_ids'], tokenized['attention_mask'])
        loss = criterion(logits, label)
        loss.backward()
        optimizer.step()

    print("Training if finish")

  def tokenize(self, text: str) -> dict:
    """Tokenize one text into a dict of batch-of-one tensors on `self.device`.

    Over-long inputs are truncated to the tokenizer's maximum model length so
    they cannot exceed what the encoder accepts.
    """
    encoded = self.tokenizer(text, truncation=True)
    # Wrapping each id list in another list adds the leading batch dimension.
    return {key: torch.tensor([values], device=self.device) for key, values in encoded.items()}

  def predict(self, text: str) -> torch.Tensor:
    """Return logits of shape (1, num_labels) for a single raw text.

    Inference runs in eval mode (dropout disabled) and without gradient
    tracking; the previous training/eval mode is restored afterwards.
    """
    tokenized = self.tokenize(text)
    was_training = self.training
    self.eval()
    try:
      with torch.no_grad():
        return self(tokenized['input_ids'], tokenized['token_type_ids'], tokenized['attention_mask'])
    finally:
      super().train(was_training)

## Init

In [None]:
%%capture
model = IntentClassifier()
model.to(model.device)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Dataset

In [None]:
# Read the labelled utterances from disk and preview the last five rows.
df = pd.read_csv(filepath_or_buffer='data/small_dataset.csv')
df.tail(5)

Unnamed: 0,sequence,intent,label
789,"есть ли такая вещь, как хорошая смерть",philosophical_talk,5
790,"разум или мудрость, что важнее для лучшего мира",philosophical_talk,5
791,являются ли убеждения и суеверия одинаковыми,philosophical_talk,5
792,"почему мы делаем то, что нам не нравится",philosophical_talk,5
793,у атеистов есть собственные боги,philosophical_talk,5


In [None]:
class Dataset(torch.utils.data.Dataset):
  """In-memory dataset of pre-tokenized sequences paired with encoded labels.

  Tokenization and label encoding happen once, at construction time, using
  the notebook-level `model` and `get_formal_label`; both must already be
  defined when a non-empty dataset is built.

  Args:
    seqs: Raw text sequences.
    labels: Integer class labels, one per entry of `seqs`.

  Raises:
    ValueError: If `seqs` and `labels` differ in length.
  """

  def __init__(self, seqs: list, labels: list):
    # Fail early: a mismatch would otherwise surface later as an IndexError
    # while iterating, or leave surplus labels silently unused.
    if len(seqs) != len(labels):
      raise ValueError(
          f"seqs and labels must have the same length, got {len(seqs)} and {len(labels)}"
      )
    self.tokenized = [model.tokenize(seq) for seq in seqs]
    self.labels = [get_formal_label(label) for label in labels]

  def __getitem__(self, i):
    """Return the `(tokenized, label)` pair stored at position `i`."""
    return self.tokenized[i], self.labels[i]

  def __len__(self):
    """Return the number of stored examples."""
    return len(self.tokenized)

In [None]:
# Hold out 10% of the examples for evaluation. The fixed random_state makes the
# split identical on every Restart & Run All, so results are comparable.
train_seqs, test_seqs, train_labels, test_labels = train_test_split(
    df['sequence'].tolist(),
    df['label'].tolist(),
    test_size=0.1,
    random_state=42,
)

In [None]:
# Wrap each split in a Dataset; sequences are tokenized and labels encoded up front.
train_dataset = Dataset(seqs=train_seqs, labels=train_labels)
test_dataset = Dataset(seqs=test_seqs, labels=test_labels)

# Train

In [None]:
# Run the classifier's training loop over the training split.
model.train(dataset=train_dataset)