# How to generate text with ruGPTs models?

### Install environment

In [1]:
!wget https://raw.githubusercontent.com/sberbank-ai/ru-gpts/master/generate_transformers.py
!pip install -U catalyst
!pip3 install urllib3==1.25.4
!pip3 install transformers==2.8.0

--2020-11-01 13:26:47--  https://raw.githubusercontent.com/sberbank-ai/ru-gpts/master/generate_transformers.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10474 (10K) [text/plain]
Saving to: ‘generate_transformers.py.9’


2020-11-01 13:26:47 (98.3 MB/s) - ‘generate_transformers.py.9’ saved [10474/10474]

Requirement already up-to-date: catalyst in /usr/local/lib/python3.6/dist-packages (20.10.1)


You may need to restart the Colab runtime at this point so that the newly installed package versions are loaded.

In [2]:
# Mount Google Drive (Colab only) so the Drive-relative paths used for the
# dataset CSVs and the prediction folder can be resolved.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os 

import torch
from transformers import AutoTokenizer, AutoModel
import catalyst 
from catalyst import dl 
import pandas as pd 
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from dataset import CustomDataSet
from models_factory import PoolingModel, ClsTokenModel
from inferece import make_prediction
from optimize_threshold import get_optimal_threshold_skf
from utils import get_tokenized_texts

In [4]:
# Experiment setup: every value a reader might want to tune lives in this cell.

# --- run identity and hardware ---
name = "ruGPT3_cosine_v0"
device = "cuda:0"

# --- pretrained checkpoint (tokenizer and weights come from the same hub repo) ---
model_name = "sberbank-ai/rugpt3large_based_on_gpt2"
tokenizer_name = "sberbank-ai/rugpt3large_based_on_gpt2"

# --- train / validation split ---
validation_fraction = 0.1
random_state = 0

# --- keyword arguments forwarded to the tokenizer for every text ---
tokenizer_encode_kwargs = dict(
    max_length=50,
    pad_to_max_length=True,
    return_tensors="pt",
    add_space_before_punct_symbol=True,
)

# --- data loading and optimisation ---
n_workers = 4
batch_size = 4
accumulation_steps = 16
steps_per_epoch = 2_500
initial_lr = 3e-5

# --- snapshot schedule: number of runs, epochs per run, LR decay between runs ---
n_snapshots = 3
snapshot_num_epochs = 25
lr_reduce_coef = 0.9

# --- Google Drive locations (relative to the working directory) ---
path_to_train = "drive/My Drive/vk2020/trainset.csv"
path_to_test = "drive/My Drive/vk2020/test_processed.csv"
path_to_save_preds = "drive/My Drive/vk2020/new_test_preds/"

In [5]:
# get model
# Load the pretrained tokenizer and backbone named in the experiment setup.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
model = AutoModel.from_pretrained(model_name)
# Read the hidden size from the loaded config instead of hard-coding it, so switching
# `model_name` to another checkpoint cannot leave a stale embedding dimension behind
# (it is 1536 for the rugpt3large checkpoint used here).
embeddings_dim = model.config.hidden_size

In [6]:
# add special tokens to tokenizer
# The tokenizer needs a pad token because texts are padded to a fixed length.
special_tokens_dict = {"pad_token": "<pad>"}
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)

# If "<pad>" was not already in the vocabulary, the tokenizer just grew and the backbone's
# embedding matrix must grow with it, otherwise the new id would index out of range.
# When nothing was added (the recorded outputs pad with id 0, which suggests the token
# already exists for this checkpoint) this is a no-op.
if num_added_tokens > 0:
    model.resize_token_embeddings(len(tokenizer))

In [7]:
# Classification model: the pretrained backbone wrapped by the project's PoolingModel.
cls_model = PoolingModel(
    backbone=model,
    embeddings_dim=embeddings_dim,
)

PoolingModel(
  (backbone): GPT2Model(
    (wte): Embedding(50257, 1536)
    (wpe): Embedding(2048, 1536)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Layer

In [8]:
# Read the test split; missing texts are replaced by a single space so every row
# still carries a string for the tokenizer.
test = pd.read_csv(path_to_test)
test = test.assign(text=test["text"].fillna(" "))
test.shape

(8482, 2)

In [9]:
# Read the training data and replace missing texts by a single space.
train = pd.read_csv(path_to_train)
train = train.assign(text=train["text"].fillna(" "))

# Hold out a validation part, stratified on the label so both parts keep the
# same class balance; the seed makes the split repeatable.
split_kwargs = dict(
    test_size=validation_fraction,
    random_state=random_state,
    stratify=train["label"],
)
texts_train, texts_val, y_train, y_val = train_test_split(
    train["text"], train["label"], **split_kwargs
)

In [10]:
def _as_column_tensor(labels):
    """Convert a 1-D collection of labels into a torch.Tensor with one label per row."""
    return torch.Tensor(np.array(labels).reshape(-1, 1))


# Targets become (n_samples, 1) tensors for both the training and the validation part.
y_train = _as_column_tensor(y_train)
y_val = _as_column_tensor(y_val)

In [11]:
# tokenize data
# All test texts are tokenized. The earlier `[:max_samples]` cap referenced a name that is
# never defined in this notebook (NameError on a fresh kernel), and the recorded progress
# bar shows the full test set being processed.
tokens_test = get_tokenized_texts(
    texts=test["text"].values,
    tokenizer=tokenizer,
    tokenizer_encode_kwargs=tokenizer_encode_kwargs,
)
tokens_train = get_tokenized_texts(
    texts=texts_train,
    tokenizer=tokenizer,
    tokenizer_encode_kwargs=tokenizer_encode_kwargs,
)
tokens_val = get_tokenized_texts(
    texts=texts_val,
    tokenizer=tokenizer,
    tokenizer_encode_kwargs=tokenizer_encode_kwargs,
)

100%|██████████| 8482/8482 [00:03<00:00, 2324.03it/s]
100%|██████████| 63000/63000 [00:17<00:00, 3524.42it/s]
100%|██████████| 7000/7000 [00:01<00:00, 4262.20it/s]


In [12]:
# Sanity check: show the first three tokenized test texts (still a list at this point).
tokens_test[:3]

[tensor([[26648, 35915,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]),
 tensor([[  272,  3778,   382,   519,  4442, 17801,   309, 10717,   360,  1131,
            319,   986,   334,  4234,   360, 30269,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]),
 tensor([[  476,   844,  3685,   309,   783,   558,   503,   694, 13856,   809,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,    

In [13]:
# Sanity check: show the first three tokenized training texts (still a list at this point).
tokens_train[:3]

[tensor([[  404,  1957,   382, 11279,   417,   334,   263,   807,   506,  3027,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]),
 tensor([[  300, 21181, 17571,  1003,   469,   650,   292,   335,  2596,   293,
          28976,  3916,  5123,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]),
 tensor([[ 960,  577,  830,  375,  469, 4970,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,   

In [14]:
# Stack the per-text token tensors of each split into one tensor along dim 0.
tokens_train, tokens_val, tokens_test = (
    torch.cat(per_text_tokens, dim=0)
    for per_text_tokens in (tokens_train, tokens_val, tokens_test)
)
tokens_train.size()

torch.Size([63000, 50])

In [15]:
# create datasets
# NOTE(review): this expression originally used `num_steps_train`, a name that is never
# defined in this notebook (NameError on a fresh kernel). `steps_per_epoch` from the
# experiment setup is used instead — confirm this is the intended epoch size.
train_dataset = CustomDataSet(
    tokens_train,
    y_train,
    n_steps=steps_per_epoch // batch_size,
    batch_size=batch_size,
    mode="train",
)
# NOTE(review): `RandomSampler` is not imported in the imports cell. The keyword-only call
# below does not match torch.utils.data.RandomSampler (which requires `data_source`), so it
# is presumably a project-level sampler — add the matching import.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    sampler=RandomSampler(num_samples=train_dataset.num_samples, replacement=False),
    batch_size=batch_size,
    num_workers=n_workers,
)

# NOTE(review): stray link kept from the original cell, presumably the dataset source:
# https://www.kaggle.com/alexandersemiletov/toxic-russian-comments

# Validation and test loaders share the same settings: fixed order, no shuffling.
eval_loader_kwargs = dict(batch_size=batch_size, num_workers=n_workers, shuffle=False)

val_dataset = CustomDataSet(tokens_val, y_val, mode="validation")
val_loader = torch.utils.data.DataLoader(val_dataset, **eval_loader_kwargs)

# The test dataset is built without targets (None is passed in their place).
test_dataset = CustomDataSet(tokens_test, None, mode="validation")
test_loader = torch.utils.data.DataLoader(test_dataset, **eval_loader_kwargs)

len(train_dataset), len(val_dataset), len(test_dataset)

(2500, 7000, 8482)

In [16]:
# Catalyst callbacks used by every snapshot run.
# The optimizer callback receives the configured number of accumulation steps.
optimizer_callback = dl.OptimizerCallback(accumulation_steps=accumulation_steps)
auc_callback = dl.AUCCallback()

callbacks = [optimizer_callback, auc_callback]
# Optional extras, currently disabled:
#   dl.EarlyStoppingCallback(patience=10, minimize=True)
#   WandbLogger(project="Project Name", name= 'Run Name')

In [17]:
# Catalyst runner that drives the training loop; the same instance is reused for every snapshot.
runner = dl.SupervisedRunner()

In [None]:
# Snapshot training: `n_snapshots` consecutive runs on the same `cls_model`, each with a
# fresh optimizer/scheduler and a peak learning rate scaled by `lr_reduce_coef`.
# After every run, validation and test predictions are written to `path_to_save_preds`.

# Create the output folder up front so a missing directory cannot make `to_csv` fail
# after a snapshot has already been trained.
os.makedirs(path_to_save_preds, exist_ok=True)

for ind in range(n_snapshots):
    current_lr = initial_lr * lr_reduce_coef**ind
    print(f"LR: {current_lr}")

    criterion = torch.nn.BCEWithLogitsLoss()
    # Optimise the parameters of the model that is actually trained. Using only the
    # backbone's parameters would leave every parameter that PoolingModel adds on top of
    # the backbone at its initial value.
    optimizer = torch.optim.Adam(cls_model.parameters(), lr=current_lr)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=snapshot_num_epochs
    )

    runner.train(
        model=cls_model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders={"train": train_loader, "valid": val_loader},
        num_epochs=snapshot_num_epochs,
        fp16=True,
        callbacks=callbacks,
        verbose=True,
    )

    # make pred
    df_val = make_prediction(cls_model, val_loader, device, mode="validation")
    df_test = make_prediction(cls_model, test_loader, device, mode="test")

    # calc score
    score = roc_auc_score(df_val["y_true"], df_val["y_hat"])
    print("\n\n", "-"*60)
    print(f"Val roc auc score: {score:.6f}")
    print("-"*60, "\n\n")

    # opt threshold
    opt_threshold = get_optimal_threshold_skf(df_val["y_true"], df_val["y_hat"])
    print(opt_threshold)

    # save results; the validation score is embedded in both file names
    file_save_val_preds = os.path.join(path_to_save_preds, f"{name}_{score:.6f}_val_snap{ind}.csv")
    file_save_test_preds = os.path.join(path_to_save_preds, f"{name}_{score:.6f}_test_snap{ind}.csv")
    df_test.to_csv(file_save_test_preds, index=False)
    df_val.to_csv(file_save_val_preds, index=False)

    print("Snapshot is trained!")


LR: 3e-05
1/25 * Epoch (train): 100% 625/625 [01:15<00:00,  8.33it/s, loss=0.042]
1/25 * Epoch (valid):  34% 595/1750 [00:22<00:43, 26.63it/s, loss=0.273]

In [None]:
# show biggest error 