# Import libraries

In [2]:
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from transformers import BertForSequenceClassification, BertTokenizer, Trainer

# Init device

In [3]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the custom dataset

In [4]:
# Path of the CSV file with the samples (columns used later: sequence, intent, label).
DATASET_PATH = 'data/small_dataset.csv'

# Parse the CSV into a DataFrame and preview its last five rows.
df = pd.read_csv(DATASET_PATH)
df.tail(n=5)

Unnamed: 0,sequence,intent,label
789,"есть ли такая вещь, как хорошая смерть",philosophical_talk,5
790,"разум или мудрость, что важнее для лучшего мира",philosophical_talk,5
791,являются ли убеждения и суеверия одинаковыми,philosophical_talk,5
792,"почему мы делаем то, что нам не нравится",philosophical_talk,5
793,у атеистов есть собственные боги,philosophical_talk,5


In [5]:
# Keep the first occurrence of every distinct intent name / label id.
# The original row index is preserved, which is why the output below
# shows the row at which each intent first appears.
intent_column = df['intent']
label_column = df['label']

intents_uniq = intent_column[~intent_column.duplicated()]
labels_uniq = label_column[~label_column.duplicated()]

intents_uniq

0                manage_media
140                  shutdown
308         change_lamp_color
436             open_programm
565    tell_definite_anecdote
693        philosophical_talk
Name: intent, dtype: object

# Try to load a previously saved model

In [6]:
MODEL_PATH = 'model.pt'

# Reuse a checkpoint written by an earlier run, if one exists.
# NOTE(review): torch.load unpickles arbitrary Python objects - only load
# checkpoint files that you created yourself.
try:
  # map_location='cpu' lets a checkpoint that was saved on a GPU machine be
  # loaded on a CPU-only machine as well.
  model = torch.load(MODEL_PATH, map_location='cpu')
except FileNotFoundError:
  # No checkpoint yet: bind the name explicitly instead of leaving `model`
  # undefined, so later cells can rely on it existing.
  model = None

# Init model (from pretrained BERT Base Multilingual Uncased)

In [None]:
# Hub checkpoint name shared by the model weights and the tokenizer.
PRETRAINED_NAME = 'bert-base-multilingual-uncased'

# Start from the pretrained weights only when the loading cell above did not
# produce a model; otherwise the loaded checkpoint would be silently discarded.
if globals().get('model') is None:
  model = BertForSequenceClassification.from_pretrained(PRETRAINED_NAME, num_labels=len(labels_uniq))

# The tokenizer is not part of the saved model.pt, so it is always created here.
bert_tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME, do_lower_case=True)

In [8]:
# Display the configuration of the current model (rendered as the cell output).
model.config

BertConfig {
  "_name_or_path": "bert-base-multilingual-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_versi

# Prepare the custom dataset

In [9]:
# Fixed seed so the train/test partition - and therefore the reported
# accuracy - is identical on every Restart & Run All.
SPLIT_SEED = 42

# The columns are converted to plain Python lists because the tokenizer call
# and the Dataset class below consume lists.
train_seqs, test_seqs, train_labels, test_labels = train_test_split(
    df['sequence'].tolist(),
    df['label'].tolist(),
    test_size=0.1,
    random_state=SPLIT_SEED,
)

In [10]:
# Same tokenizer settings for both splits: truncate at 512 tokens
# (max_length = max_position_embeddings in model) and pad each batch of texts.
tokenizer_options = dict(truncation=True, padding=True, max_length=512)

train_encodes = bert_tokenizer(train_seqs, **tokenizer_options)
test_encodes = bert_tokenizer(test_seqs, **tokenizer_options)

In [11]:
class Dataset(torch.utils.data.Dataset):
  """Map-style dataset that pairs encoded inputs with their labels.

  `encodes` is a mapping from field name to a per-sample sequence of values
  and `labels` holds one label per sample.
  """

  def __init__(self, encodes, labels):
    """Store the encoded inputs and the labels without copying them."""
    self.encodes = encodes
    self.labels = labels

  def __len__(self):
    """Number of samples, defined by the number of labels."""
    return len(self.labels)

  def __getitem__(self, i):
    """Return sample `i` as a dict of tensors, with the label under 'labels'."""
    sample = {}
    for field, rows in self.encodes.items():
      sample[field] = torch.tensor(rows[i])
    # The label is wrapped in a list, so the resulting tensor has shape (1,).
    sample['labels'] = torch.tensor([self.labels[i]])
    return sample

In [12]:
# Wrap the tokenized splits together with their labels.
train_dataset = Dataset(encodes=train_encodes, labels=train_labels)
test_dataset = Dataset(encodes=test_encodes, labels=test_labels)

# Train

In [13]:
def compute_metrics(predict):
  """Compute the accuracy of a prediction result.

  Args:
    predict: object exposing `label_ids` (true labels) and `predictions`
      (per-class scores along the last axis).

  Returns:
    A dict with the single key 'accuracy'.
  """
  true_ids = predict.label_ids
  # The predicted class is the index of the highest score on the last axis.
  predicted_ids = predict.predictions.argmax(-1)
  return {'accuracy': accuracy_score(true_ids, predicted_ids)}

In [14]:
# Build the Trainer that fine-tunes `model` on train_dataset and evaluates it
# on test_dataset, reporting accuracy through compute_metrics.
# No TrainingArguments are passed, so the Trainer's default settings apply.
trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [15]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

***** Running training *****
  Num examples = 714
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 270


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=270, training_loss=0.1801908846254702, metrics={'train_runtime': 34.661, 'train_samples_per_second': 61.799, 'train_steps_per_second': 7.79, 'total_flos': 44031571980480.0, 'train_loss': 0.1801908846254702, 'epoch': 3.0})

# Evaluate

In [16]:
# Evaluate on the eval_dataset given to the Trainer (test_dataset); the
# metrics dict is displayed as the cell result.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 80
  Batch size = 8


{'epoch': 3.0,
 'eval_accuracy': 1.0,
 'eval_loss': 0.002405602252110839,
 'eval_runtime': 0.1922,
 'eval_samples_per_second': 416.267,
 'eval_steps_per_second': 52.033}

In [17]:
def print_pred(text):
  """Print the model's raw logit for every known intent of one input text.

  One line is printed per intent: label id, logit with two decimals,
  intent name.

  Args:
    text: the sentence to classify.
  """
  # Send the inputs to the device the model actually lives on, and truncate
  # over-long text to the same 512-token limit used for the training data.
  input_tensors = bert_tokenizer(
      text, padding=True, truncation=True, max_length=512, return_tensors='pt'
  ).to(model.device)

  # Inference only: switch off dropout and gradient tracking, then restore
  # whatever mode the model was in before the call.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      output_tensor = model(**input_tensors)
  finally:
    model.train(was_training)

  logits = output_tensor.logits[0].tolist()

  # Look the logit up by label id rather than by the position at which the
  # label first appears in the dataset; the two only coincide when the CSV
  # happens to be ordered by label.
  for label, intent in zip(labels_uniq, intents_uniq):
    print(label, "{:.2f}".format(logits[label]), intent)

In [18]:
# Sanity check on a sentence written by hand for this cell.
print_pred("В чем смысл твоего существования?")

0 -1.72 manage_media
1 -1.67 shutdown
2 -1.23 change_lamp_color
3 -1.46 open_programm
4 -1.70 tell_definite_anecdote
5 5.56 philosophical_talk


# Save model

In [19]:
# Persist the whole model object to MODEL_PATH, the same path the loading
# cell reads from, so the two cannot drift apart.
torch.save(model, MODEL_PATH)