In [None]:
%pip install -U transformers bitsandbytes accelerate datasets pymorphy3
# %pip install -U

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Downloading huggingface_hub-0.26.2-py3-none-any.whl (447 kB)
Installing collected packages: huggingface-hub
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.2.1
    Uninstalling huggingface-hub-0.2.1:
      Successfully uninstalled huggingface-hub-0.2.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ruclip 0.0.2 requires huggingface-hub==0.2.1, but you have huggingface-hub 0.26.2 which is incompatible.[0m[31m
[0mSuccessfully installed huggingface-hub-0.26.2
Note: you may need to restart the kernel to use updated packages.


In [None]:
import gc
import os
import random
import warnings
from glob import glob

import numpy as np
import pandas as pd
import pymorphy3
import sklearn
# `import sklearn` alone does not guarantee that submodules are loaded;
# import the ones this notebook uses explicitly.
import sklearn.metrics
import sklearn.model_selection
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
# import bitsandbytes as bnb

from transformers import (
    AutoModel,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BertTokenizerFast,
    Trainer,
    pipeline,
)
from datasets import DatasetDict

2024-11-10 21:41:52.886723: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-10 21:41:52.886770: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-10 21:41:52.887935: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-10 21:41:52.963201: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
def seed_everything(seed: int):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU and
    all CUDA devices), and configures cuDNN for deterministic execution.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # processes); the hash seed of the running kernel is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The auto-tuner may pick different kernels from run to run, which defeats
    # the deterministic flag above, so it has to be switched off.
    torch.backends.cudnn.benchmark = False

seed_everything(42)
# NOTE: this silences *all* warnings for the rest of the notebook, including
# deprecation notices from torch / transformers.
warnings.filterwarnings("ignore")
# Use the first GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
# torch.cuda.get_device_name(0)
!nvidia-smi

Sun Nov 10 20:21:43 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.216.01             Driver Version: 535.216.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L40S                    On  | 00000000:00:06.0 Off |                    0 |
| N/A   33C    P8              34W / 350W |     34MiB / 46068MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

### Sentiment analysis

#### Hugging Face pipeline

In [None]:
def fast_sentiment(texts, device):
    """Classify each text with the RuBERT sentiment pipeline.

    A fresh ``text-classification`` pipeline is created for the call and is
    released again before returning, even when inference raises.

    Args:
        texts: Iterable of strings; each one is passed to the pipeline on its own.
        device: Device argument forwarded to ``transformers.pipeline``.

    Returns:
        A list with one entry per input text, each entry being whatever the
        pipeline returned for that single text.
    """
    model = pipeline(
        "text-classification",
        model="blanchefort/rubert-base-cased-sentiment-rusentiment",
        device=device,
    )
    try:
        return [model(text) for text in texts]
    finally:
        # In a notebook a stored traceback keeps the failing frame (and its
        # locals) alive, so drop the pipeline explicitly on every exit path.
        del model
        gc.collect()
        torch.cuda.empty_cache()

In [None]:
# Smoke-test the pipeline helper on three short Russian sentences.
results = fast_sentiment(
    ["Знаешь, ты мне очень не нравишься", "Как же это всё надоело", "Ура, завтра я встречусь с друзьями"], device=device
)

In [None]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Sun Nov 10 18:23:15 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.216.01             Driver Version: 535.216.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L40S                    On  | 00000000:00:06.0 Off |                    0 |
| N/A   42C    P0              88W / 350W |    240MiB / 46068MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

#### Custom inference loop

In [None]:
def collate(data):
    """Pad variable-length samples to a common length and stack them into a batch.

    Args:
        data: Non-empty list of dicts, each holding 1-D tensors under
            ``'input_ids'`` and ``'attention_mask'`` and optionally a
            ``'targets'`` entry.

    Returns:
        Dict with ``'input_ids'`` and ``'attention_mask'`` tensors of shape
        ``(len(data), longest_sequence)``, right-padded with zeros. When the
        first sample carries a ``'targets'`` key, the result also contains
        ``'targets'``: a list with the per-sample targets in input order.
    """
    pad = torch.nn.utils.rnn.pad_sequence

    # 0 doubles as the padding token id and as the "ignore" value of the mask.
    # NOTE(review): assumes the tokenizer's pad token id is 0 — confirm when
    # switching to a different tokenizer.
    input_ids = pad([sample['input_ids'] for sample in data], batch_first=True, padding_value=0)
    attention_mask = pad([sample['attention_mask'] for sample in data], batch_first=True, padding_value=0)

    to_return = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
    }

    # Test for the key itself: comparing the value against a -1 sentinel drops
    # genuine -1 labels and raises for multi-element tensor targets.
    if 'targets' in data[0]:
        to_return["targets"] = [sample['targets'] for sample in data]

    return to_return

In [None]:
class BERTDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Sequences are truncated to ``max_len`` but left unpadded; padding is
    expected to happen later, when samples are combined into a batch.
    """

    def __init__(self, text, tokenizer, max_len, targets=None,):
        # text: indexable collection of strings
        # tokenizer: object exposing ``encode_plus``
        # max_len: truncation length handed to the tokenizer
        # targets: optional labels, indexable with the same positions as ``text``
        self.text = text
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        """Return the sample at ``index`` as a dict of ``torch.long`` tensors."""
        encoded = self.tokenizer.encode_plus(
            self.text[index],
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            padding=False,
            return_token_type_ids=False,
        )

        sample = {
            key: torch.tensor(encoded[key], dtype=torch.long)
            for key in ('input_ids', 'attention_mask')
        }

        # Unlabeled datasets (inference) yield only the model inputs.
        if self.targets is None:
            return sample

        sample['targets'] = torch.tensor(self.targets[index], dtype=torch.long)
        return sample

In [None]:
def predict(dl, model, device):
    """Run ``model`` over every batch of ``dl`` and return the predicted class ids.

    Args:
        dl: Iterable of batches; each batch is a dict of tensors that is moved
            to ``device`` and passed to the model as keyword arguments.
        model: Module whose output exposes a ``logits`` attribute.
        device: Device used for the model and for the inputs.

    Returns:
        1-D NumPy array with one class index per sample, in iteration order.
        An empty ``int64`` array is returned when ``dl`` yields no batches.
    """
    model.eval()
    model = model.to(device)
    preds = []
    # Inference only: without no_grad every forward pass would record an
    # autograd graph and hold on to activations for no reason.
    with torch.no_grad():
        for batch in dl:
            outputs = model(**{k: v.to(device) for k, v in batch.items()})
            # Softmax is monotonic, so the argmax of the raw logits is the
            # same class as the argmax of the probabilities.
            preds.append(torch.argmax(outputs.logits, dim=-1).cpu())
    if not preds:
        # torch.cat refuses an empty list.
        return np.empty(0, dtype=np.int64)
    return torch.cat(preds).numpy()

In [None]:
#predicting fast
def predict_test(model, tokenizer, texts, id2label, device=None,
                 max_len=512, batch_size=64, num_workers=2):
    """Classify raw texts with a sequence-classification model.

    Args:
        model: Classifier whose output exposes ``logits``.
        tokenizer: Tokenizer handed to ``BERTDataset``.
        texts: Indexable collection of strings.
        id2label: Mapping from predicted class index to label name.
        device: Device to run on. When omitted, the notebook-level ``device``
            variable is used if it exists; otherwise CUDA when available,
            else CPU.
        max_len: Truncation length for tokenization.
        batch_size: Number of texts per forward pass.
        num_workers: DataLoader worker processes. Worker processes are what
            trigger the "tokenizers ... process just got forked" notices; pass
            0 to tokenize in the main process.

    Returns:
        List of label names, one per input text, in input order.
    """
    if device is None:
        # Keep the historical behaviour of reading the notebook-global device.
        device = globals().get("device")
    if device is None:
        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    test_dataset = BERTDataset(text=texts, tokenizer=tokenizer, max_len=max_len)
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        # Pinned host memory only helps host-to-GPU copies.
        pin_memory=(device.type == 'cuda'),
        num_workers=num_workers,
        collate_fn=collate,
    )

    class_ids = predict(test_dataloader, model, device)
    # Convert NumPy integers to plain ints so the lookup does not depend on
    # how the mapping's keys were created.
    return [id2label[int(i)] for i in class_ids]

#### Zero-shot prediction with the custom loop

In [None]:
# Candidate models: https://huggingface.co/models?language=ru&sort=trending&search=sentiment
# blanchefort/rubert-base-cased-sentiment-rusentiment has worked best so far.
model_name = 'blanchefort/rubert-base-cased-sentiment-rusentiment'
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=True)

# Hand-written class-index -> label mapping for this checkpoint.
# NOTE(review): presumably mirrors model.config.id2label — verify against the model card.
id2class = {0: "NEUTRAL",
    1: "POSITIVE",
    2: "NEGATIVE"}

# Run the pretrained classifier as-is (no fine-tuning) on three sample sentences.
predict_test(model, tokenizer,  ["Знаешь, ты мне очень не нравишься", "Как же это всё надоело", "Ура, завтра я встречусь с друзьями"], id2class)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


['NEGATIVE', 'NEGATIVE', 'POSITIVE']

In [None]:
# Drop the zero-shot model and tokenizer names, then collect garbage and
# release cached CUDA memory before the training section.
del model, tokenizer
gc.collect()
torch.cuda.empty_cache()

Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/multiprocessing/queues.py", line 239, in _feed
    reader_close()
  File "/opt/conda/lib/python3.11/multiprocessing/connection.py", line 178, in close
    self._close()
  File "/opt/conda/lib/python3.11/multiprocessing/connection.py", line 377, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor


In [None]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Sun Nov 10 18:23:43 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.216.01             Driver Version: 535.216.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L40S                    On  | 00000000:00:06.0 Off |                    0 |
| N/A   42C    P0              88W / 350W |    240MiB / 46068MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

### Training

In [None]:
class BertClassifierModel(nn.Module):
    """BERT encoder followed by a two-layer classification head.

    The head consumes the encoder's ``pooler_output`` and is registered under
    ``linear`` with sub-module names ``ln1``, ``act`` and ``ln2``.

    Args:
        hidden_dim: Width of the hidden layer in the head.
        bert_model: Encoder whose output exposes ``pooler_output``.
        n_classes: Number of output classes.
    """

    def __init__(self, hidden_dim, bert_model, n_classes):
        from collections import OrderedDict
        super().__init__()
        self.bert = bert_model
        # Take the encoder width from its config when it has one, instead of
        # assuming the 768 of BERT-base; 768 stays the fallback.
        encoder_dim = getattr(getattr(bert_model, "config", None), "hidden_size", 768)
        self.linear = nn.Sequential(OrderedDict([
            ("ln1", nn.Linear(encoder_dim, hidden_dim)),
            ("act", nn.LeakyReLU()),
            ("ln2", nn.Linear(hidden_dim, n_classes)),
        ]))

    def forward(self, input_ids, attention_mask, labels=None, return_hidden=False):
        """Encode the batch and classify it.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            labels: Optional class indices; when given, the cross-entropy loss
                is computed.
            return_hidden: When true, return the pooled encoder output
                instead of the loss.

        Returns:
            ``(logits, pooled_output)`` if ``return_hidden`` is true, otherwise
            ``(loss, logits)`` where ``loss`` is ``None`` without ``labels``.
        """
        x = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).pooler_output
        logits = self.linear(x)
        if return_hidden:
            return logits, x
        loss = None
        if labels is not None:
            # CrossEntropyLoss applies log-softmax itself and must be fed raw
            # logits; feeding probabilities squashes the loss range and
            # weakens the gradients.
            loss = nn.CrossEntropyLoss()(logits, labels)
        return loss, logits


In [None]:
def train_one_epoch(model, loader, loss_fn, optimizer, device):
    """Train ``model`` for one pass over ``loader``.

    Args:
        model: Callable as ``model(input_ids, attention_mask, labels=...)``
            returning ``(loss, logits)``.
        loader: Iterable of batches with ``'input_ids'``, ``'attention_mask'``
            and ``'targets'`` entries.
        loss_fn: Fallback criterion, called as ``loss_fn(logits, targets)``
            only when the model returns ``None`` as its loss.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch is moved to.

    Returns:
        Mean of the per-batch micro-F1 scores (``0.0`` for an empty loader).
    """
    model.train()

    lossi = []
    f1_i = []

    stream = tqdm(loader)

    for batch in stream:
        # as_tensor passes existing tensors through untouched and still
        # converts a list of scalar targets, so no tensor is re-wrapped.
        input_ids = torch.as_tensor(batch['input_ids']).to(device)
        attention_mask = torch.as_tensor(batch['attention_mask']).to(device)
        y_batch = torch.as_tensor(batch['targets']).to(device)

        loss, logits = model(input_ids, attention_mask, labels=y_batch)
        if loss is None:
            loss = loss_fn(logits, y_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        lossi.append(loss.item())

        # argmax of the logits equals argmax of the softmax probabilities;
        # detach so the metric never touches the autograd graph.
        predicted = logits.detach().argmax(dim=-1)
        f1_i.append(sklearn.metrics.f1_score(y_batch.cpu(), predicted.cpu(), average='micro'))

        stream.set_postfix(
            loss=np.mean(lossi),
            f1=np.mean(f1_i),
        )

    return float(np.mean(f1_i)) if f1_i else 0.0


def validate_one_epoch(model, loader, loss_fn, device):
    """Evaluate ``model`` on ``loader`` without updating its weights.

    Args:
        model: Callable as ``model(input_ids, attention_mask, labels=...)``
            returning ``(loss, logits)``.
        loader: Iterable of batches with ``'input_ids'``, ``'attention_mask'``
            and ``'targets'`` entries.
        loss_fn: Fallback criterion, called as ``loss_fn(logits, targets)``
            only when the model returns ``None`` as its loss.
        device: Device the batch is moved to.

    Returns:
        Micro-averaged F1 over all samples seen (``0.0`` for an empty loader).
        The progress bar shows the running mean of per-batch scores, which
        can differ slightly when the last batch is smaller.
    """
    model.eval()

    lossi = []
    f1_i = []
    seen_targets = []
    seen_preds = []

    stream = tqdm(loader)

    with torch.no_grad():

        for batch in stream:

            input_ids = torch.as_tensor(batch['input_ids']).to(device)
            attention_mask = torch.as_tensor(batch['attention_mask']).to(device)
            y_batch = torch.as_tensor(batch['targets']).to(device)

            loss, logits = model(input_ids, attention_mask, labels=y_batch)
            if loss is None:
                loss = loss_fn(logits, y_batch)

            lossi.append(loss.item())

            targets_cpu = y_batch.cpu()
            preds_cpu = logits.argmax(dim=-1).cpu()
            seen_targets.append(targets_cpu)
            seen_preds.append(preds_cpu)

            f1_i.append(sklearn.metrics.f1_score(targets_cpu, preds_cpu, average='micro'))

            stream.set_postfix(
                loss=np.mean(lossi),
                f1=np.mean(f1_i),
            )

    # Release cached blocks once per epoch; doing it after every batch forces
    # the allocator to request the same memory again and slows validation.
    torch.cuda.empty_cache()

    if not seen_targets:
        return 0.0

    # Score all samples together so every sample counts equally, regardless
    # of the size of the batch it arrived in.
    return sklearn.metrics.f1_score(
        torch.cat(seen_targets), torch.cat(seen_preds), average='micro'
    )


def train(
      train_loader,
      val_loader,
      model,
      num_epochs,
      lr,
      gamma,
      best_f1=0,
      device=None,
      checkpoint=None
  ):
    """Fine-tune ``model`` and checkpoint it whenever validation F1 improves.

    Args:
        train_loader: Batches used for optimisation.
        val_loader: Batches used for model selection.
        model: Module to train; updated in place.
        num_epochs: Number of train/validate cycles.
        lr: Initial AdamW learning rate.
        gamma: Per-epoch multiplicative decay of the learning rate.
        best_f1: Score a validation result has to exceed to be saved.
        device: Target device; defaults to the device of the model's first
            parameter.
        checkpoint: Directory for ``model_f1m_<score>.pt`` state dicts, or
            ``None`` to disable saving. Created if missing.

    Returns:
        The best validation micro-F1 observed (or the initial ``best_f1`` if
        it was never exceeded).
    """
    if device is None:
        device = next(model.parameters()).device

    loss_fn = nn.CrossEntropyLoss()
    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma)

    if checkpoint is not None:
        os.makedirs(checkpoint, exist_ok=True)

    for epoch in range(num_epochs):
        print(f"{'-' * 100}")
        print(f"EPOCH {epoch} STARTED")
        print("TRAINING")
        train_one_epoch(model, train_loader, loss_fn, optimizer, device)
        print("VALIDATION")
        val_f1 = validate_one_epoch(model, val_loader, loss_fn, device)
        print(f"{'-' * 100}")
        if best_f1 < val_f1:
            best_f1 = val_f1
            # The weights on disk are the only snapshot of the best epoch;
            # the in-memory model keeps training after this point.
            if checkpoint is not None:
                torch.save(
                    model.state_dict(),
                    os.path.join(checkpoint, f"model_f1m_{val_f1:.3f}.pt"),
                )
        scheduler.step()
    print(f"train finished with best f1 micro={best_f1}")
    return best_f1

In [None]:
import datasets
# Download (or load from the local cache) the Kinopoisk review sentiment
# dataset from the Hugging Face Hub; the 'train' and 'validation' splits are
# used below.
dataset = datasets.load_dataset('ai-forever/kinopoisk-sentiment-classification')

In [None]:
# Load the checkpoint through AutoModel (rather than the sequence-classification
# class) and attach a freshly initialised 3-class head to it.
model_name = 'blanchefort/rubert-base-cased-sentiment-rusentiment'
tokenizer = BertTokenizerFast.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name, return_dict=True)

model = BertClassifierModel(bert_model=bert_model, hidden_dim=128, n_classes=3)

In [None]:
# Removes only the notebook-level name: the encoder itself stays alive because
# `model` holds it as `model.bert`, so no weights are freed here.
del bert_model
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Convert the two dataset splits to pandas DataFrames; the 'text' and 'label'
# columns are read from them below.
train_df = dataset['train'].to_pandas()
val_df = dataset['validation'].to_pandas()

In [None]:
MAX_LEN = 512

# Optional re-split of the training frame, disabled by default so the
# dataset's own train/validation split is used.
# NOTE(review): the previous version of this branch referenced an undefined
# `df` and bound its result to `train`, which would have overwritten the
# train() function; it is assumed the intent was to carve a validation set
# out of the training data — confirm before enabling.
split = False
if split:
    train_df, val_df = sklearn.model_selection.train_test_split(
        train_df, test_size=0.15, random_state=42, stratify=train_df['label']
    )

# Pass plain lists so samples are looked up by position, not by the pandas
# index label (the two differ once a frame has been filtered or re-split).
train_dataset = BERTDataset(
    text=train_df['text'].tolist(),
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    targets=train_df['label'].tolist(),
)
val_dataset = BERTDataset(
    text=val_df['text'].tolist(),
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    targets=val_df['label'].tolist(),
)

In [None]:
def save_embeds(embeds, path):
    """Serialize ``embeds`` with pickle and write it to ``path``, replacing any existing file."""
    from pickle import dump

    with open(path, mode='wb') as sink:
        dump(embeds, sink)

def load_embeds(path):
    """Read the pickle file at ``path`` and return the object stored in it.

    Unpickling can execute arbitrary code, so only use this on files this
    notebook wrote itself.
    """
    from pickle import load

    with open(path, mode='rb') as source:
        return load(source)

# Persist both dataset objects (the BERTDataset instances, not embeddings) so
# they can be reloaded later; the commented lines show the matching reload.
embeds_path = './train_dataset.pickle'
save_embeds(train_dataset, embeds_path)
# train_dataset = load_embeds(embeds_path)
embeds_path = './val_dataset.pickle'
save_embeds(val_dataset, embeds_path)
# val_dataset = load_embeds(embeds_path)

In [None]:
# Both loaders use the same batch size, worker count, memory pinning and
# padding collate function; only the training loader reshuffles every epoch.
train_dataloader = DataLoader(
    dataset=train_dataset,
    collate_fn=collate,
    batch_size=16,
    num_workers=4,
    pin_memory=True,
    shuffle=True,
)

val_dataloader = DataLoader(
    dataset=val_dataset,
    collate_fn=collate,
    batch_size=16,
    num_workers=4,
    pin_memory=True,
    shuffle=False,
)

In [None]:
# NOTE(review): the two names are swapped relative to their contents —
# `label2id` maps class index -> label name, and `id2label` (its inverse) maps
# label name -> class index. Left unchanged because other cells may rely on
# the current names; rename with care.
# NOTE(review): the index -> name assignment is assumed to match the dataset's
# label encoding — confirm against the dataset card.
label2id = {0 : "Bad", 1 : 'Neutral', 2: 'Good'}
id2label = {v : k for k, v in label2id.items()}

In [None]:
# Fine-tune for 10 epochs; the learning rate starts at 3e-5 and is multiplied
# by 0.75 after each epoch. Improved checkpoints are written to the current
# directory.
train(
  train_dataloader,
  val_dataloader,
  model,
  num_epochs=10,
  lr=3e-5,
  gamma=0.75,
  best_f1=0,
  device=device,
  checkpoint='.'
)

----------------------------------------------------------------------------------------------------
EPOCH 0 STARTED
TRAINING


100%|██████████| 657/657 [04:43<00:00,  2.32it/s, f1=0.6, loss=0.94]   


VALIDATION


100%|██████████| 94/94 [00:22<00:00,  4.26it/s, f1=0.594, loss=0.945]


----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
EPOCH 1 STARTED
TRAINING


100%|██████████| 657/657 [04:39<00:00,  2.35it/s, f1=0.688, loss=0.856]


VALIDATION


100%|██████████| 94/94 [00:21<00:00,  4.31it/s, f1=0.652, loss=0.88] 


----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
EPOCH 2 STARTED
TRAINING


100%|██████████| 657/657 [04:40<00:00,  2.34it/s, f1=0.764, loss=0.785]


VALIDATION


100%|██████████| 94/94 [00:22<00:00,  4.09it/s, f1=0.699, loss=0.846]


----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
EPOCH 3 STARTED
TRAINING


100%|██████████| 657/657 [04:41<00:00,  2.34it/s, f1=0.813, loss=0.735]


VALIDATION


100%|██████████| 94/94 [00:22<00:00,  4.16it/s, f1=0.699, loss=0.839]


----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
EPOCH 4 STARTED
TRAINING


100%|██████████| 657/657 [04:42<00:00,  2.33it/s, f1=0.85, loss=0.7]   


VALIDATION


100%|██████████| 94/94 [00:22<00:00,  4.14it/s, f1=0.689, loss=0.856]


----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
EPOCH 5 STARTED
TRAINING


100%|██████████| 657/657 [04:41<00:00,  2.33it/s, f1=0.869, loss=0.681]


VALIDATION


100%|██████████| 94/94 [00:23<00:00,  4.03it/s, f1=0.69, loss=0.857] 


----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
EPOCH 6 STARTED
TRAINING


100%|██████████| 657/657 [04:42<00:00,  2.33it/s, f1=0.881, loss=0.67] 


VALIDATION


100%|██████████| 94/94 [00:23<00:00,  3.92it/s, f1=0.698, loss=0.849]


----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
EPOCH 7 STARTED
TRAINING


100%|██████████| 657/657 [04:40<00:00,  2.34it/s, f1=0.894, loss=0.658]


VALIDATION


100%|██████████| 94/94 [00:23<00:00,  4.09it/s, f1=0.695, loss=0.85] 


----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
EPOCH 8 STARTED
TRAINING


100%|██████████| 657/657 [04:42<00:00,  2.32it/s, f1=0.898, loss=0.653]


VALIDATION


100%|██████████| 94/94 [00:23<00:00,  3.98it/s, f1=0.697, loss=0.851]


----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
EPOCH 9 STARTED
TRAINING


100%|██████████| 657/657 [04:41<00:00,  2.33it/s, f1=0.902, loss=0.649]


VALIDATION


100%|██████████| 94/94 [00:22<00:00,  4.09it/s, f1=0.703, loss=0.844]


----------------------------------------------------------------------------------------------------
train finished with best f1 micro=0.7034574468085106
