Прежде чем начать, добавьте файл model_bert.bin 
(https://drive.google.com/file/d/1MHIM1TrJ4sDMuj_VrNL0pO2zTwDo9rRh/view?usp=sharing) на Google Диск. Далее запустите все ячейки по очереди. Если вы добавили файл в конкретную папку, укажите её в переменной folder в ячейке «Load model and tokenizer». Если функция predict выполняется долго (более 1–2 минут) или возникла ошибка, перезапустите среду выполнения (Среда выполнения -> Перезапустить среду выполнения) и запустите все ячейки заново.

# Import libraries

In [1]:
# Install the Hugging Face `transformers` package with pip output silenced (-qq).
# NOTE(review): the version is not pinned, so each fresh runtime gets whatever
# release is current at install time.
!pip install -qq transformers

In [2]:
import transformers
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
import nltk
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation
import re
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

# Run on the first CUDA GPU when one is visible to PyTorch; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [3]:
# authorization for Google Colab
# Mounts the user's Google Drive at /content/drive; the model-loading cell
# reads the checkpoint from under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load model and tokenizer

In [4]:
# Label names in class-index order; predict() treats class index 1 as positive.
class_names = ['negative', 'positive']
# Hugging Face model id used for both the tokenizer and the BERT encoder.
NAME_BERT = 'DeepPavlov/rubert-base-cased'
# Tokenizer loaded from the same pretrained model id as the encoder.
tokenizer = BertTokenizer.from_pretrained(NAME_BERT)

class SentimentClassifier(nn.Module):
  """Pretrained BERT encoder followed by dropout and a linear classification head.

  Args:
    n_classes: Number of output classes, i.e. the width of the linear head.
  """

  def __init__(self, n_classes):
    super().__init__()
    self.bert = BertModel.from_pretrained(NAME_BERT)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Compute class logits for a batch of tokenized inputs.

    Args:
      input_ids: Token id tensor passed straight to the BERT encoder.
      attention_mask: Attention mask tensor passed straight to the BERT encoder.

    Returns:
      The output of the linear head applied to the (dropout-regularized)
      pooled BERT output; no softmax is applied.
    """
    bert_outputs = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    # Select the pooled output by position instead of tuple-unpacking.
    # Older transformers releases return a plain tuple, newer ones return a
    # dict-like ModelOutput whose iteration yields key names (strings), which
    # made `_, pooled_output = self.bert(...)` hand a string to dropout.
    # Integer indexing works for both return types.
    pooled_output = bert_outputs[1]
    output = self.drop(pooled_output)
    return self.out(output)

In [5]:
# enter your folder if you need
folder = ''

# Build the classifier and restore the fine-tuned weights from Google Drive.
model = SentimentClassifier(len(class_names))
name_model = '/model_bert.bin'
PATH = '/content/drive/My Drive/' + folder + name_model
# map_location remaps the stored tensors onto the device chosen above, so a
# checkpoint saved on a GPU still loads on a CPU-only runtime.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# source you trust.
model.load_state_dict(torch.load(PATH, map_location=device))
# Inference mode: disables dropout inside the classifier.
model.eval()
model.to(device)
print()
print("Model loaded")


Model loaded


# Preprocess text

In [6]:
# install pymystem3 library for google colab
# Downloads the mystem-3.0 Linux 64-bit archive, unpacks it in the working
# directory and copies the `mystem` binary into /bin
# (presumably so pymystem3 can locate it - confirm).
# NOTE(review): the archive is fetched over plain HTTP, and re-running the cell
# downloads it again under a numbered suffix (the recorded output shows
# "...tar.gz.6") while tar keeps extracting the first copy.
!wget http://download.cdn.yandex.net/mystem/mystem-3.0-linux3.1-64bit.tar.gz
!tar -xvf mystem-3.0-linux3.1-64bit.tar.gz
!cp mystem /bin

--2020-07-31 07:20:14--  http://download.cdn.yandex.net/mystem/mystem-3.0-linux3.1-64bit.tar.gz
Resolving download.cdn.yandex.net (download.cdn.yandex.net)... 5.45.205.241, 5.45.205.242, 5.45.205.243, ...
Connecting to download.cdn.yandex.net (download.cdn.yandex.net)|5.45.205.241|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: http://cache-mskm906.cdn.yandex.net/download.cdn.yandex.net/mystem/mystem-3.0-linux3.1-64bit.tar.gz [following]
--2020-07-31 07:20:14--  http://cache-mskm906.cdn.yandex.net/download.cdn.yandex.net/mystem/mystem-3.0-linux3.1-64bit.tar.gz
Resolving cache-mskm906.cdn.yandex.net (cache-mskm906.cdn.yandex.net)... 5.45.220.16, 2a02:6b8:0:2002::17
Connecting to cache-mskm906.cdn.yandex.net (cache-mskm906.cdn.yandex.net)|5.45.220.16|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16457938 (16M) [application/octet-stream]
Saving to: ‘mystem-3.0-linux3.1-64bit.tar.gz.6’


2020-07-31 07:20:14 (27.2 MB/s) - ‘mystem-3.0-

In [7]:
# Fetch the NLTK stop-word corpus and create the shared Mystem lemmatizer and
# Russian stop-word list used by preprocess_text().
nltk.download("stopwords")
mystem = Mystem()
russian_stopwords = stopwords.words("russian")

# Pattern of everything replaced by a space during cleaning:
#   @\S+            - "@" followed by non-whitespace (user mentions)
#   https?:\S+      - "http:" / "https:" followed by non-whitespace (links)
#   http?:\S        - "htt:" / "http:" followed by one non-whitespace character
#   [^А-Яа-я0-9]+   - any run of characters outside А-Я, а-я and 0-9
# Written as a raw string so that "\S" is not treated as an (invalid) string
# escape sequence; the pattern itself is unchanged.
# NOTE(review): "ё"/"Ё" lie outside the А-Я/а-я ranges and are therefore
# stripped too. Left as is because the checkpoint was presumably trained on
# text cleaned with this exact pattern - confirm before changing.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^А-Яа-я0-9]+"

def preprocess_text(text):
    """Clean, stop-word-filter and lemmatize ``text``.

    The input is converted with ``str()``, lowercased, and every match of
    ``TEXT_CLEANING_RE`` is replaced by a space. Stop words are removed,
    the remainder is lemmatized with ``mystem``, and the lemmas are filtered
    once more before being joined with single spaces.

    Args:
        text: Value to preprocess; anything accepted by ``str()``.

    Returns:
        The space-joined string of remaining lemmas.
    """
    # Strip mentions, links and unsupported characters.
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    kept_words = [word for word in cleaned.split() if word not in russian_stopwords]

    # Lemmatize the stop-word-free text.
    lemmas = mystem.lemmatize(" ".join(kept_words).lower())

    result = []
    for lemma in lemmas:
        if lemma in russian_stopwords:
            continue
        if lemma == " ":
            continue
        # Also drops whitespace-only lemmas: their stripped form is the empty
        # string, which is contained in any string.
        if lemma.strip() in punctuation:
            continue
        result.append(lemma)

    return " ".join(result)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Predict

In [8]:
import logging
# Set the root logger to ERROR so lower-severity log records are not shown
# (presumably to hide library warnings during prediction - confirm).
logging.basicConfig(level=logging.ERROR)

def predict(text):
  """Classify the sentiment of ``text`` and print ``POSITIVE`` or ``NEGATIVE``.

  The text is run through ``preprocess_text``, tokenized to a fixed length of
  300 tokens (padded, and truncated if longer), and passed to the
  module-level ``model`` on ``device``.

  Args:
    text: Text to classify.

  Returns:
    None. The predicted label is printed to stdout.
  """
  text = preprocess_text(text)
  encoding = tokenizer.encode_plus(
    text,
    add_special_tokens=True,
    max_length=300,
    return_token_type_ids=False,
    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
    padding='max_length',
    # Truncate explicitly rather than relying on the tokenizer's fallback.
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
  )
  input_ids = encoding['input_ids'].to(device)
  attention_mask = encoding['attention_mask'].to(device)
  # Inference only: skip building the autograd graph to save time and memory.
  with torch.no_grad():
    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
  _, preds = torch.max(outputs, dim=1)
  # Class index 1 is the positive class.
  if preds[0].item() == 1:
    print("POSITIVE")
  else:
    print('NEGATIVE')

# Example

In [9]:
# Example input ("I love listening to music"); the recorded run printed POSITIVE.
predict('я люблю слушать музыку')

POSITIVE


In [10]:
# Example input ("I will not watch this boring film"); the recorded run printed NEGATIVE.
predict('я не стану смотреть этот скучный фильм')

NEGATIVE


# Your text

In [None]:
# Replace the placeholder string (Russian for "enter your text") with the text
# to classify, then run the cell.
text = 'введите ваш текст'
predict(text)