In [49]:
!pip install transformers stop_words pymorphy2 tqdm nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stop_words
  Downloading stop-words-2018.7.23.tar.gz (31 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 KB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting docopt>=0.6
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dawg-python>=0.7.1
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m60.9 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: stop_words, docopt
  Building wheel for stop_words (setup.py) ... [?25l[?25hdone
  Creat

In [50]:
import numpy as np
import pandas as pd
from pathlib import Path

import torch
import torch.nn as nn
# import torch.nn.functional as F
from torch.optim import Adam
from tqdm import tqdm

# from transformers import pipeline
# from transformers import BertTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import BertForSequenceClassification
from transformers import BertTokenizerFast

from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
from string import punctuation
import nltk

In [51]:
nltk.download("punkt")
stop_words = set(get_stop_words("ru"))
punctuations = set(punctuation)
lemmatizer = MorphAnalyzer()

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


'cuda'

In [52]:
tok = BertTokenizerFast.from_pretrained('blanchefort/rubert-base-cased-sentiment')
b_model = AutoModelForSequenceClassification.from_pretrained('blanchefort/rubert-base-cased-sentiment', return_dict=True)

Downloading:   0%|          | 0.00/499 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/943 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/712M [00:00<?, ?B/s]

In [53]:
class TwitterDataset(torch.utils.data.Dataset):

    def __init__(self, txts, labels, tokenizer):
        self._labels = labels

        # self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.tokenizer = tokenizer
        self._txts = [self.tokenizer(text, padding='max_length', max_length=10,
                                     truncation=True, return_tensors="pt")
                      for text in txts]

    def __len__(self):
        return len(self._txts)

    def __getitem__(self, index):
        return self._txts[index], self._labels[index]

In [54]:
class BertClassifier(nn.Module):

    def __init__(self, bert_model, dropout=0.5):
        super().__init__()
        # self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
        self.Xrobert = bert_model
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(3, 2)
        self.sigm = nn.Sigmoid()

    def forward(self, x, mask):

        pooled_output = self.Xrobert(input_ids=x, attention_mask=mask, return_dict=False)[0]
        # _, pooled_output - набор эмбеддинигов слов, эмбеддинг предложения
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.sigm(linear_output)

        return final_layer

In [55]:
def preprocess_text(txt, sw, lem, punct):

    txt = str(txt)
    txt = "".join(c for c in txt if c not in punct)
    txt = txt.lower()
    txt = [lem.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

In [56]:
df_train = pd.read_csv("/content/drive/MyDrive/data/train.csv")
df_val = pd.read_csv("/content/drive/MyDrive/data/val.csv")

df_train.shape, df_val.shape

((181467, 3), (22683, 3))

In [57]:
df_train.head()

Unnamed: 0,id,text,class
0,0,@alisachachka не уезжаааааааай. :(❤ я тоже не ...,0
1,1,RT @GalyginVadim: Ребята и девчата!\nВсе в кин...,1
2,2,RT @ARTEM_KLYUSHIN: Кто ненавидит пробки ретви...,0
3,3,RT @epupybobv: Хочется котлету по-киевски. Зап...,1
4,4,@KarineKurganova @Yess__Boss босапопа есбоса н...,1


In [58]:
df_train['class'].value_counts()

1    92063
0    89404
Name: class, dtype: int64

In [59]:
tqdm.pandas()

df_train['text'] = df_train['text'].progress_apply(lambda t: preprocess_text(t, stop_words, lemmatizer, punctuations))
df_val['text'] = df_val['text'].progress_apply(lambda t: preprocess_text(t, stop_words, lemmatizer, punctuations))

100%|██████████| 181467/181467 [03:52<00:00, 779.76it/s]
100%|██████████| 22683/22683 [00:24<00:00, 910.75it/s]


In [60]:
%%time
y_train = df_train['class'].values
y_val = df_val['class'].values

train_dataset = TwitterDataset(df_train['text'], y_train, tok)
valid_dataset = TwitterDataset(df_val['text'], y_val, tok)

train_loader = torch.utils.data.DataLoader(train_dataset,
                          batch_size=64,
                          shuffle=True,
                          num_workers=2)
valid_loader = torch.utils.data.DataLoader(valid_dataset,
                          batch_size=64,
                          shuffle=False,
                          num_workers=1)

CPU times: user 47.6 s, sys: 1min 13s, total: 2min 1s
Wall time: 1min 58s


In [61]:
for txt, lbl in train_loader:
    print(txt.keys())
    print(txt['input_ids'].shape)
    break

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
torch.Size([64, 1, 10])


In [62]:
model = BertClassifier(b_model).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.Xrobert.classifier.parameters(), lr=0.0016)  # неполное обучение

In [63]:
print(model)
print("Parameters full train:", sum([param.nelement() for param in model.parameters()]))
print("Parameters transfer learning:", sum([param.nelement() for param in model.Xrobert.classifier.parameters()]))

BertClassifier(
  (Xrobert): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(119547, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias

In [64]:
for epoch_num in range(2):
    total_acc_train = 0
    total_loss_train = 0
    model.train()

    for train_input, train_label in tqdm(train_loader):
        mask = train_input['attention_mask'].to(device)
        input_id = train_input['input_ids'].squeeze(1).to(device)
        train_label = train_label.to(device)

        output = model(input_id, mask)

        batch_loss = criterion(output, train_label)
        total_loss_train += batch_loss.item()
        acc = (output.argmax(dim=1) == train_label).sum().item()
        total_acc_train += acc

        model.zero_grad()
        batch_loss.backward()
        optimizer.step()

    model.eval()
    total_loss_val, total_acc_val = 0.0, 0.0
    for val_input, val_label in valid_loader:
        val_label = val_label.to(device)
        mask = val_input['attention_mask'].to(device)
        input_id = val_input['input_ids'].squeeze(1).to(device)

        output = model(input_id, mask)

        batch_loss = criterion(output, val_label)
        total_loss_val += batch_loss.item()

        acc = (output.argmax(dim=1) == val_label).sum().item()
        total_acc_val += acc

    print(
        f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_dataset): .3f} \
        | Train Accuracy: {total_acc_train / len(train_dataset): .3f} \
        | Val Loss: {total_loss_val / len(valid_dataset): .3f} \
        | Val Accuracy: {total_acc_val / len(valid_dataset): .3f}')

100%|██████████| 2836/2836 [05:22<00:00,  8.80it/s]


Epochs: 1 | Train Loss:  0.011         | Train Accuracy:  0.552         | Val Loss:  0.011         | Val Accuracy:  0.529


100%|██████████| 2836/2836 [05:19<00:00,  8.86it/s]


Epochs: 2 | Train Loss:  0.011         | Train Accuracy:  0.534         | Val Loss:  0.011         | Val Accuracy:  0.529
