In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import os
import time
from tqdm import tqdm
import gc
import random
import copy
from collections import defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold

import torch
import transformers
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch import cuda
from torch.cuda.amp import GradScaler, autocast

import warnings
warnings.simplefilter('ignore')

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

In [4]:
class Config:
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    EPOCHS = 5
    LR = 1e-5
    MAX_LEN = 125
    N_SPLITS = 2
    TRAIN_BS = 16
    VALID_BS = 32
    BERT_MODEL = transformers.AutoModel.from_pretrained('sberbank-ai/ruRoberta-large')
    MODEL_NAME = 'sberbank-ai/ruRoberta-large'
    TRAIN_FILE = '/content/gdrive/My Drive/RBK/train_dataset_train.csv'
    TEST_FILE = '/content/gdrive/My Drive/RBK/test_dataset_test.csv'
    TOKENIZER = transformers.AutoTokenizer.from_pretrained('sberbank-ai/ruRoberta-large', do_lower_case=True)
    scaler = GradScaler()

Some weights of the model checkpoint at sberbank-ai/ruRoberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at sberbank-ai/ruRoberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to b

In [5]:
train = pd.read_csv(Config.TRAIN_FILE)
test = pd.read_csv(Config.TEST_FILE)

train.head()

Unnamed: 0,document_id,title,publish_date,session,authors,ctr,category,tags,views,depth,full_reads_percent
0,624ac09c9a7947db3d80c98eIDE7mtH4RBqGn-8MXfGffQ,Европейский банк развития приостановил доступ ...,2022-04-04 10:29:44,IDE7mtH4RBqGn-8MXfGffQ,[],1.58,5409f11ce063da9c8b588a18,"['55928d339a794751dc8303d6', '542d1e28cbb20f86...",20460,1.134,35.85
1,620f6b899a7947701cf489e1KtVJsteHStO5oditt3Uvzw,Кремль назвал регулярным процессом учебные зап...,2022-02-18 10:00:39,KtVJsteHStO5oditt3Uvzw,"['54244e01cbb20f03076b236d','5878a2ec9a7947e53...",1.853,5409f11ce063da9c8b588a12,"['549d25df9a794775979561d2', '58abcf539a7947f1...",19038,1.142,38.355
2,620730cf9a7947ab96a44e27hk7puWJwSziw0m3sfTkKWA,Госсекретарь Швеции заявила о нежелании вступа...,2022-02-12 04:24:02,hk7puWJwSziw0m3sfTkKWA,[],0.0,5409f11ce063da9c8b588a12,"['5430f451cbb20f73931ecd05', '5409f15de063daa0...",51151,1.185,36.424
3,6262a5889a79470b78c9ca307UKY2SSZTjCcjhwBzxw37w,Песков назвал прагматичной выдачу лицензии Газ...,2022-04-22 13:24:55,7UKY2SSZTjCcjhwBzxw37w,[],0.0,5409f11ce063da9c8b588a12,"['5409f297e063daa0f408b11c', '545caa9ecbb20f36...",3782,1.053,30.169
4,626678929a79477ca0101568wuMYES90REuV5YhrN75IXg,В Хабаровске задержали главу филиала РАНХиГС п...,2022-04-25 10:42:23,wuMYES90REuV5YhrN75IXg,[],0.0,5433e5decbb20f277b20eca9,"['5409f42ae063daa0f408b5d7', '585c20e19a79470e...",3065,1.063,34.617


In [6]:
train['title'][99]

'Bild узнала о подготовке Германии к высылке 100 российских дипломатов'

In [7]:
class RBKDataset(Dataset):
    def __init__(self, title, target=None):
        self.title = title
        self.target = target
        self.tokenizer = Config.TOKENIZER
        self.max_len = Config.MAX_LEN
    
    def __len__(self):
        return len(self.title)
    
    def __getitem__(self, idx):
        title = str(self.title[idx])
        title = ' '.join(title.split())
        global inputs
        
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True
        )        
        ids = torch.tensor(inputs['input_ids'], dtype=torch.long)
        mask = torch.tensor(inputs['attention_mask'], dtype=torch.long)
        #token_type_ids = torch.tensor(inputs['token_type_ids'], dtype=torch.long)
        
        targets = torch.tensor(self.target[idx], dtype=torch.float)
        return {
                'ids': ids,
                'mask': mask,
                #'token_type_ids': token_type_ids,
                'targets': targets
            }

In [8]:
class RBKModel(nn.Module):
    def __init__(self, model_name):
        super(RBKModel, self).__init__()
        self.model = transformers.AutoModel.from_pretrained(Config.MODEL_NAME)
        self.config = transformers.AutoConfig.from_pretrained(Config.MODEL_NAME)
        self.drop = nn.Dropout(p=0.2)
        self.fc = nn.Linear(self.config.hidden_size, 1)
        
    def forward(self, ids, mask):        
        _, out = self.model(input_ids=ids,attention_mask=mask, return_dict=False) #token_type_ids=token_type_ids
        out = self.drop(out)
        output = self.fc(out)
        return output

In [9]:
class Trainer:
    def __init__(
        self, 
        model, 
        optimizer, 
        scheduler, 
        train_dataloader, 
        valid_dataloader,
        device
    ):
        self.model = model
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.train_data = train_dataloader
        self.valid_data = valid_dataloader
        self.loss_fn = self.yield_loss
        self.device = device
        
    def yield_loss(self, outputs, targets):
        """
        This is the loss function for this task
        """
        target_mean = torch.mean(targets)
        ss_tot = torch.sum((targets - target_mean) ** 2)
        ss_res = torch.sum((targets - outputs) ** 2)
        r2 = 1 - ss_res / ss_tot
        return r2
    
    def train_one_epoch(self):
        """
        This function trains the model for 1 epoch through all batches
        """
        prog_bar = tqdm(enumerate(self.train_data), total=len(self.train_data))
        self.model.train()
        with autocast():
            for idx, inputs in prog_bar:
                ids = inputs['ids'].to(self.device, dtype=torch.long)
                mask = inputs['mask'].to(self.device, dtype=torch.long)
                #ttis = inputs['token_type_ids'].to(self.device, dtype=torch.long)
                targets = inputs['targets'].to(self.device, dtype=torch.float)

                outputs = self.model(ids=ids, mask=mask).view(-1) #token_type_ids=ttis

                loss = self.loss_fn(outputs, targets)
                prog_bar.set_description('loss: {:.2f}'.format(loss.item()))

                Config.scaler.scale(loss).backward()
                Config.scaler.step(self.optimizer)
                Config.scaler.update()
                self.optimizer.zero_grad()
                self.scheduler.step()
    
    def valid_one_epoch(self):
        """
        This function validates the model for one epoch through all batches of the valid dataset
        It also returns the validation Root mean squared error for assesing model performance.
        """
        prog_bar = tqdm(enumerate(self.valid_data), total=len(self.valid_data))
        self.model.eval()
        all_targets = []
        all_predictions = []
        with torch.no_grad():
            for idx, inputs in prog_bar:
                ids = inputs['ids'].to(self.device, dtype=torch.long)
                mask = inputs['mask'].to(self.device, dtype=torch.long)
               # ttis = inputs['token_type_ids'].to(self.device, dtype=torch.long)
                targets = inputs['targets'].to(self.device, dtype=torch.float)

                outputs = self.model(ids=ids, mask=mask).view(-1)
                all_targets.extend(targets.cpu().detach().numpy().tolist())
                all_predictions.extend(outputs.cpu().detach().numpy().tolist())

        val_r2_loss = r2_score(all_targets, all_predictions)
        print('Validation R2: {:.2f}'.format(val_r2_loss))
        
        return val_r2_loss
    
    def get_model(self):
        return self.model

In [10]:
# Training Code
if __name__ == '__main__':
    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))
        DEVICE = torch.device('cuda:0')
    else:
        print("\n[INFO] GPU not found. Using CPU: {}\n".format(platform.processor()))
        DEVICE = torch.device('cpu')

    data = train[['title', 'depth']]

    
    # Do Kfolds training and cross validation
    kf = KFold(n_splits=Config.N_SPLITS)
    #nb_bins = int(np.floor(1 + np.log2(len(data))))
    #data.loc[:, 'bins'] = pd.cut(data[['views', 'depth', 'full_reads_percent']], bins=nb_bins, labels=False)
    
    for fold, (train_idx, valid_idx) in enumerate(kf.split(X=data, y=data['depth'].values)):
        # Train for only 1 fold, you can train it for more.
        if fold != 0:
            continue
        print(f"\nFold: {fold}")
        print(f"{'-'*20}\n")
        
        train_data = data.loc[train_idx]
        valid_data = data.loc[valid_idx]
        
        train_set = RBKDataset(
            title = train_data['title'].values,
            target = train_data['depth'].values
        )

        valid_set = RBKDataset(
            title = valid_data['title'].values,
            target = valid_data['depth'].values
        )

        train = DataLoader(
            train_set,
            batch_size = Config.TRAIN_BS,
            shuffle = True,
            num_workers=8
        )

        valid = DataLoader(
            valid_set,
            batch_size = Config.VALID_BS,
            shuffle = False,
            num_workers=8
        )

        model = RBKModel(Config.MODEL_NAME).to(DEVICE)
        nb_train_steps = int(len(train_data) / Config.TRAIN_BS * Config.EPOCHS)
        optimizer = optim.AdamW(model.parameters(), lr=Config.LR, weight_decay=1e-6)
        scheduler = transformers.get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=nb_train_steps
        )

        trainer = Trainer(model, optimizer, scheduler, train, valid, DEVICE)

        best_loss = 100
        for epoch in range(1, Config.EPOCHS+1):
            print(f"\n{'--'*5} EPOCH: {epoch} {'--'*5}\n")

            # Train for 1 epoch
            trainer.train_one_epoch()

            # Validate for 1 epoch
            current_loss = trainer.valid_one_epoch()

            if current_loss < best_loss:
                print(f"Saving best model in this fold: {current_loss:.4f}")
                #torch.save(trainer.get_model().state_dict(), f"{Config.MODEL_NAME}_fold_{fold}.pt")
                best_loss = current_loss
        
        print(f"Best R2 in fold: {fold} was: {best_loss:.4f}")
        print(f"Final R2 in fold: {fold} was: {current_loss:.4f}")

[INFO] Using GPU: Tesla P100-PCIE-16GB


Fold: 0
--------------------



Some weights of the model checkpoint at sberbank-ai/ruRoberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at sberbank-ai/ruRoberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to b


---------- EPOCH: 1 ----------



loss: -1314.30: 100%|██████████| 219/219 [02:27<00:00,  1.48it/s]
 22%|██▏       | 24/110 [00:09<00:34,  2.47it/s]


KeyboardInterrupt: ignored