# NLP Projects: 02 - Work with tokenizers and BERT model

In [None]:
!pip install transformers datasets bertviz -q

In [6]:
# ! pip install tokenizers

In [121]:
# !mkdir data
# !wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip -P data
# !unzip data/wikitext-103-raw-v1.zip -d data

## 1.Токенайзеры

![alt text](https://alexanderdyakonov.files.wordpress.com/2019/11/bpe.jpg)

Лекция Дьяконова по токенайзерам: [Subword Tokenization](https://dyakonov.org/2019/11/29/%D1%82%D0%BE%D0%BA%D0%B5%D0%BD%D0%B8%D0%B7%D0%B0%D1%86%D0%B8%D1%8F-%D0%BD%D0%B0-%D0%BF%D0%BE%D0%B4%D1%81%D0%BB%D0%BE%D0%B2%D0%B0-subword-tokenization/) <br>
Rico Sennrich, Barry Haddow, Alexandra Birch Neural Machine Translation of Rare Words with Subword Units https://arxiv.org/abs/1508.07909

### 1.1.Byte-pair-encoding: BPE simple version

In [13]:
import re, collections

def get_stats(vocab):
    """collect charcters pairs frequency"""
    pairs = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols)-1):
            pairs[symbols[i],symbols[i+1]] += freq
    return pairs

def merge_vocab(pair, v_in):
    v_out = {}
    bigram = re.escape(' '.join(pair)) 
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    for word in v_in:
        # print("orig_word", word)
        w_out = p.sub(''.join(pair), word)
        # print("w_out", w_out)
        v_out[w_out] = v_in[word]
    return v_out

vocab = {'l o w </w>' : 5, 'l o w e r </w>' : 2,'n e w e s t </w>':6, 'w i d e s t </w>':3}

num_merges = 10
for i in range(num_merges):
    pairs = get_stats(vocab)
    print("pairs_loop", pairs)
    best = max(pairs, key=pairs.get)
    print("best", best)
    vocab = merge_vocab(best, vocab)
    print("vocab", vocab)
    print("="*100)

pairs_loop defaultdict(<class 'int'>, {('l', 'o'): 7, ('o', 'w'): 7, ('w', '</w>'): 5, ('w', 'e'): 8, ('e', 'r'): 2, ('r', '</w>'): 2, ('n', 'e'): 6, ('e', 'w'): 6, ('e', 's'): 9, ('s', 't'): 9, ('t', '</w>'): 9, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'e'): 3})
best ('e', 's')
vocab {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w es t </w>': 6, 'w i d es t </w>': 3}
pairs_loop defaultdict(<class 'int'>, {('l', 'o'): 7, ('o', 'w'): 7, ('w', '</w>'): 5, ('w', 'e'): 2, ('e', 'r'): 2, ('r', '</w>'): 2, ('n', 'e'): 6, ('e', 'w'): 6, ('w', 'es'): 6, ('es', 't'): 9, ('t', '</w>'): 9, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'es'): 3})
best ('es', 't')
vocab {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w est </w>': 6, 'w i d est </w>': 3}
pairs_loop defaultdict(<class 'int'>, {('l', 'o'): 7, ('o', 'w'): 7, ('w', '</w>'): 5, ('w', 'e'): 2, ('e', 'r'): 2, ('r', '</w>'): 2, ('n', 'e'): 6, ('e', 'w'): 6, ('w', 'est'): 6, ('est', '</w>'): 9, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'est'): 3})
best ('est', '</w>')

### 1.2.Word-piece

In [99]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece

bert_tokenizer = Tokenizer(WordPiece())

In [100]:
from tokenizers import normalizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents

bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])

In [101]:
from tokenizers.pre_tokenizers import Whitespace

bert_tokenizer.pre_tokenizer = Whitespace()

In [102]:
from tokenizers.processors import TemplateProcessing

bert_tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", 1),
        ("[SEP]", 2),
    ],
)

In [117]:
from tokenizers.trainers import WordPieceTrainer

trainer = WordPieceTrainer(
    vocab_size=30522, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
)
files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
bert_tokenizer.train(files, trainer)

model_files = bert_tokenizer.model.save("data", "bert-wiki")
bert_tokenizer.model = WordPiece.from_file(*model_files, unk_token="[UNK]")

bert_tokenizer.save("data/bert-wiki.json")






In [118]:
output = bert_tokenizer.encode("Hello, brooooooo! How are you 😁 ?")
print(output.ids)

bert_tokenizer.decode([1, 27462, 16, 67, 11, 7323, 5, 7510, 7268, 7989, 0, 35, 2])

[1, 27462, 16, 7560, 10284, 10284, 10284, 5, 7510, 7268, 7989, 0, 35, 2]


"hello, y ' all! how are you?"

In [122]:
output = bert_tokenizer.encode("Welcome to the club bro !")
print(output.tokens)

bert_tokenizer.decode(output.ids)

['[CLS]', 'welcome', 'to', 'the', 'club', 'bro', '!', '[SEP]']


'welcome to the club bro!'

In [123]:
from tokenizers import decoders

bert_tokenizer.decoder = decoders.WordPiece()
bert_tokenizer.decode(output.ids)

'welcome to the club bro!'

### 1.3.Transformers tokenizers

In [35]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

print(tokenizer.tokenize("I have a new GPU!"))

['i', 'have', 'a', 'new', 'gp', '##u', '!']


In [36]:
print(tokenizer.tokenize("dsfgshdlfkjgs"))

['ds', '##f', '##gs', '##hd', '##lf', '##k', '##j', '##gs']


In [37]:
tokenizer.encode("wkedwiemc I have a new RTX4090! JWIENFO")

[101,
 1059,
 8126,
 9148,
 6633,
 2278,
 1045,
 2031,
 1037,
 2047,
 19387,
 2595,
 12740,
 21057,
 999,
 1046,
 9148,
 2368,
 14876,
 102]

In [38]:
tokenizer.convert_ids_to_tokens(tokenizer.encode("wkedwiemc I have a new RTX4090! JWIENFO"))

['[CLS]',
 'w',
 '##ked',
 '##wi',
 '##em',
 '##c',
 'i',
 'have',
 'a',
 'new',
 'rt',
 '##x',
 '##40',
 '##90',
 '!',
 'j',
 '##wi',
 '##en',
 '##fo',
 '[SEP]']

In [39]:
tokenizer.decode(tokenizer.encode("wkedwiemc I have a new RTX4090! JWIENFO"))

'[CLS] wkedwiemc i have a new rtx4090! jwienfo [SEP]'

In [40]:
tokenizer.decode(tokenizer.encode("wkedwiemc I have a new RTX4090! JWIENFO"), skip_special_tokens=True)

'wkedwiemc i have a new rtx4090! jwienfo'

In [41]:
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [42]:
tokenizer.eos_token

Using eos_token, but it is not set yet.


In [43]:
tokenizer("Hello world!")

{'input_ids': [101, 7592, 2088, 999, 102], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}

In [44]:
encoding = tokenizer("Hello world!", add_special_tokens = True,
                                 truncation = True, padding = "max_length", return_attention_mask = True, return_tensors = "pt")

In [45]:
encoding.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [46]:
encoding['input_ids'].shape

torch.Size([1, 512])

## 2.Визуализация отношения токенов внутри BERT моделей с помощью bertviz

In [47]:
from bertviz import head_view, model_view
from transformers import BertTokenizer, BertModel, BertForQuestionAnswering

model_version = 'bert-base-uncased'
do_lower_case = True
model = BertModel.from_pretrained(model_version, output_attentions=True)
tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=do_lower_case)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [48]:
sentence_a = "I love studying math"
sentence_b = "Artificial intelligence is based on complex mathematical models"
inputs = tokenizer.encode_plus(sentence_a, sentence_b, return_tensors='pt', add_special_tokens=True)
input_ids = inputs['input_ids']
token_type_ids = inputs['token_type_ids']
attention = model(input_ids, token_type_ids=token_type_ids)[-1]
sentence_b_start = token_type_ids[0].tolist().index(1)
input_id_list = input_ids[0].tolist()
tokens = tokenizer.convert_ids_to_tokens(input_id_list)

### 2.1.Визуализация 12 аттенш-голов

In [49]:
model_view(attention, tokens, sentence_b_start)

<IPython.core.display.Javascript object>

### 2.2.Визуализация одной аттеншн-головы с выбором

In [50]:
head_view(attention, tokens, sentence_b_start)

<IPython.core.display.Javascript object>

## 2.Text classification with BERT

### 3.1.Transformers

In [147]:
import transformers

#### 3.1.1.Masked Language Modeling - MLM

In [148]:
from transformers import BertTokenizer, BertForMaskedLM
from torch.nn import functional as F
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased', return_dict=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [41]:
model.eval();

In [149]:
text = "The capital of Russia, " + tokenizer.mask_token + ", contains the Red Square."
input = tokenizer.encode_plus(text, return_tensors = "pt")
mask_index = torch.where(input["input_ids"][0] == tokenizer.mask_token_id)

with torch.no_grad():
    output = model(**input)
    logits = output.logits
    softmax = F.softmax(logits, dim = -1)
    mask_word = softmax[0, mask_index, :]
    top_10 = torch.topk(mask_word, 10, dim = 1)[1][0]
for token in top_10:
    word = tokenizer.decode([token])
    new_sentence = text.replace(tokenizer.mask_token, word)
    print(new_sentence)

The capital of Russia, moscow, contains the Red Square.
The capital of Russia, kazan, contains the Red Square.
The capital of Russia, helsinki, contains the Red Square.
The capital of Russia, kiev, contains the Red Square.
The capital of Russia, vladimir, contains the Red Square.
The capital of Russia, russia, contains the Red Square.
The capital of Russia, petersburg, contains the Red Square.
The capital of Russia, sofia, contains the Red Square.
The capital of Russia, tokyo, contains the Red Square.
The capital of Russia, leningrad, contains the Red Square.


In [150]:
text

'The capital of Russia, [MASK], contains the Red Square.'

In [151]:
tokenizer

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

#### 3.1.2.Pipelines

In [152]:
from transformers import pipeline

# Pipeline для question-answering
question_answerer = pipeline('question-answering')
question_answerer({
     'question': 'What is the name of the repository ?',
     'context': 'Pipeline have been included in the huggingface/transformers repository'
})

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

{'score': 0.5135955810546875,
 'start': 35,
 'end': 59,
 'answer': 'huggingface/transformers'}

In [155]:
from transformers import pipeline

# Pipeline для sentiment-analysis
classifier = pipeline('sentiment-analysis')
classifier('We are very not happy to use transformers repository.')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'NEGATIVE', 'score': 0.9992297887802124}]

## 4.Будем классифицировать отзывы о ресторанах с помощью файнтюна BERT-based модели

In [9]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm, trange
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader
from datasets import Dataset
from torch.optim import Adam

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

2023-11-06 16:07:13.344250: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [10]:
df = pd.read_json('https://huggingface.co/datasets/blinoff/restaurants_reviews/resolve/main/restaurants_reviews.jsonl', lines=True)

In [11]:
print(df.shape)
df.sample(5)

(47139, 6)


Unnamed: 0,review_id,general,food,interior,service,text
46238,46238,0,0,0,0,второй раз за неделю в Шотладской клетке - и с...
716,716,0,10,10,10,Хочу дополнить плеяду приятных и хороших отзыв...
19882,19882,0,9,9,10,отмечали вчера День Рождения в Этом прекрасном...
38189,38189,0,0,0,0,Сидели недавно у вас в деревянном зале на балк...
42853,42853,0,0,0,0,"Хороший ресторан , отличное обслуживание , веж..."


In [12]:
df['general'].unique()

array([0, 5, 3, 4, 1, 2])

In [13]:
df.groupby('general').sample(1)

Unnamed: 0,review_id,general,food,interior,service,text
38566,38566,0,0,0,0,"26 июня дружной компанией , чисто случайно поп..."
35439,35439,1,0,0,0,"Мы сегодня ужинали в этом заведении , причем к..."
36014,36014,2,0,0,0,Отмечали юбилей в этом ресторане 19.06.2013г ....
36240,36240,3,0,0,0,Забронировал столик на нашу годовщину в этом р...
34892,34892,4,0,0,0,Всё СУПЕР ! готовят хорошо . цены не высокие ....
36896,36896,5,0,0,0,В сентябре отмечали корпоратив в вашем рестора...


In [14]:
g = df[df.general>0]

data = Dataset.from_dict({'text': g.text, 'label': g.general-1}).train_test_split(test_size=0.2, seed=1)
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 2559
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 640
    })
})

In [15]:
# https://huggingface.co/ai-forever/ruBert-base
base_model = 'ai-forever/ruBert-base'

In [16]:
tokenizer = AutoTokenizer.from_pretrained(base_model)

In [17]:
data_tokenized = data.map(lambda x: tokenizer(x['text'], truncation=True, max_length=512), batched=True, remove_columns=['text'])

Map:   0%|          | 0/2559 [00:00<?, ? examples/s]

Map:   0%|          | 0/640 [00:00<?, ? examples/s]

In [18]:
data_tokenized

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2559
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 640
    })
})

In [19]:
collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [20]:
train_dataloader = DataLoader(data_tokenized['train'], shuffle=True, batch_size=4, collate_fn=collator)
val_dataloader = DataLoader(data_tokenized['test'], shuffle=False, batch_size=4, collate_fn=collator)

In [21]:
model = AutoModelForSequenceClassification.from_pretrained(base_model, num_labels=5)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Some weights of the model checkpoint at ai-forever/ruBert-base were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not ini

In [23]:
import torch

class BertClassifierSimple(torch.nn.Module):
    def __init__(self, num_labels):
        super(BertClassifierSimple, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = torch.nn.Dropout(self.bert.config.dropout)
        self.out = torch.nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        bert_output = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        output = self.out(self.dropout(bert_output[1])) 
        return output

In [24]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(120138, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [25]:
optimizer = Adam(model.parameters(), lr=1e-6)

In [26]:
losses = []
for epoch in trange(3):
    pbar = tqdm(train_dataloader)
    model.train()
    for i, batch in enumerate(pbar):
        out = model(**batch.to(model.device))
        out.loss.backward()
        if i % 1 == 0:
            optimizer.step()
            optimizer.zero_grad()
        losses.append(out.loss.item())
        pbar.set_description(f'loss: {np.mean(losses[-100:]):2.2f}')
    model.eval()
    eval_losses = []
    eval_preds = []
    eval_targets = []
    for batch in tqdm(val_dataloader):
        with torch.no_grad():
                out = model(**batch.to(model.device))
        eval_losses.append(out.loss.item())
        eval_preds.extend(out.logits.argmax(1).tolist())
        eval_targets.extend(batch['labels'].tolist())
    print('recent train loss', np.mean(losses[-100:]), 'eval loss', np.mean(eval_losses), 'accuracy', np.mean(np.array(eval_targets) == eval_preds))

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/640 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/160 [00:00<?, ?it/s]

recent train loss 0.8777561241388321 eval loss 0.9118369391188026 accuracy 0.6765625


  0%|          | 0/640 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

recent train loss 0.7488809429109097 eval loss 0.7597084344364703 accuracy 0.7609375


  0%|          | 0/640 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

recent train loss 0.6648740524053574 eval loss 0.7182020721957088 accuracy 0.7625


In [27]:
model.eval()
eval_losses = []
eval_preds = []
eval_targets = []
for batch in tqdm(val_dataloader):
    with torch.no_grad():
            out = model(**batch.to(model.device))
    eval_losses.append(out.loss.item())
    eval_preds.extend(out.logits.argmax(1).tolist())
    eval_targets.extend(batch['labels'].tolist())
print('recent train loss', np.mean(losses[-100:]), 'eval loss', np.mean(eval_losses), 'accuracy', np.mean(np.array(eval_targets) == eval_preds))

  0%|          | 0/160 [00:00<?, ?it/s]

recent train loss 0.6648740524053574 eval loss 0.7182020721957088 accuracy 0.7625


In [28]:
model.save_pretrained('sentiment_classifier')
tokenizer.save_pretrained('sentiment_classifier')

('sentiment_classifier/tokenizer_config.json',
 'sentiment_classifier/special_tokens_map.json',
 'sentiment_classifier/vocab.txt',
 'sentiment_classifier/added_tokens.json',
 'sentiment_classifier/tokenizer.json')

In [30]:
# Загрузим файнтюн модель и будем делать инференс
model = AutoModelForSequenceClassification.from_pretrained('sentiment_classifier')
tokenizer = AutoTokenizer.from_pretrained('sentiment_classifier')

In [31]:
def classify(text):
    with torch.no_grad():
        proba = torch.softmax(model(**tokenizer(text, return_tensors='pt', truncation=True, max_length=512).to(model.device)).logits, -1)
    return proba.cpu().numpy()[0]

In [32]:
classify('Мне было скучно')

array([0.15575767, 0.09338856, 0.06957538, 0.1250468 , 0.55623156],
      dtype=float32)

In [33]:
classify('Мне было весело')

array([0.07820132, 0.05843171, 0.04636016, 0.09669976, 0.72030705],
      dtype=float32)