In [1]:
from google.colab import drive
drive.mount('/content/drive')
DATA_PATH = "/content/drive/My Drive/alms/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
lines_train = list(open(DATA_PATH + 'data/news_train.txt', 'r', encoding='utf-8'))
lines_test = list(open(DATA_PATH + 'data/news_test.txt', 'r', encoding='utf-8'))

In [3]:
from tqdm.notebook import trange, tqdm
def parse(lines):
    targets = []
    titles = []
    texts = []
    for i in trange(len(lines)):
        target, title, text = lines[i].split('\t')
        targets.append(target)
        if text[-1] == '\n':
            text = text[:-1]
        titles.append(title)
        texts.append(text)

    return targets, titles, texts

In [4]:
targets, titles, texts = parse(lines_train)
data_test = parse(lines_test)

HBox(children=(FloatProgress(value=0.0, max=15000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))




In [5]:
from collections import Counter

target_counter = Counter(targets)
target_dict = dict(zip(target_counter, list(range(10))))
targets_train = [target_dict[key] for key in targets]
targets_test = [target_dict[key] for key in data_test[0]]
target_counter

Counter({'business': 554,
         'culture': 2053,
         'economics': 2080,
         'forces': 1225,
         'life': 2033,
         'media': 2111,
         'science': 2156,
         'sport': 2215,
         'style': 284,
         'travel': 289})

In [6]:
# объединение заголовка и текста статьи (необязательно)
texts = [title + text for title, text in zip(titles, texts)]
data_test[1][:] = [title + text for title, text in zip(data_test[1], data_test[2])]

In [None]:
!pip install transformers

In [8]:
import torch
from transformers import BertTokenizer

In [9]:
tokenizer = BertTokenizer.from_pretrained("DeepPavlov/rubert-base-cased", 
                                          do_lower_case=True)

In [10]:
data_train = tokenizer.batch_encode_plus(texts, add_special_tokens=True, 
                                         padding='max_length', truncation=True, max_length=256, 
                                         return_tensors='pt', return_attention_mask=True)
data_test = tokenizer.batch_encode_plus(data_test[1], add_special_tokens=True, 
                                        padding='max_length', truncation=True, max_length=256, 
                                        return_tensors='pt', return_attention_mask=True)

In [11]:
from sklearn.model_selection import train_test_split

input_ids_train, input_ids_val = train_test_split(data_train['input_ids'], test_size=0.1, random_state=42)
attention_masks_train, attention_masks_val = train_test_split(data_train['attention_mask'], test_size=0.1, random_state=42)
targets_train, targets_val = train_test_split(targets_train, test_size=0.1, random_state=42)

In [12]:
targets_train = torch.LongTensor(targets_train)
targets_val = torch.LongTensor(targets_val)

targets_test = torch.LongTensor(targets_test)

In [13]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 16

train_data = TensorDataset(input_ids_train, attention_masks_train, targets_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(input_ids_val, attention_masks_val, targets_val)
val_sampler = SequentialSampler(input_ids_val)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

test_data = TensorDataset(data_test['input_ids'], data_test['attention_mask'])
test_sampler = SequentialSampler(data_test['input_ids'])
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [14]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

model = BertForSequenceClassification.from_pretrained("DeepPavlov/rubert-base-cased", 
                                                      num_labels = 10, 
                                                      output_attentions = False,
                                                      output_hidden_states = False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)

In [16]:
from transformers import get_linear_schedule_with_warmup

n_epochs = 4

n_steps = len(train_dataloader) * n_epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = n_steps)

In [19]:
import random
from datetime import timedelta
import time
from sklearn.metrics import classification_report
import numpy as np

device = torch.device("cuda")
seed = 42
random.seed = (seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
model.cuda()

losses = []

for epoch in range(n_epochs):
    print("Training:")
    start = time.time()
    mean_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        
        torch.cuda.empty_cache()
        if (step + 1) % 100 == 0:
            duration = timedelta(seconds=int(time.time() - start))
            print('Batch {:>5,}  of  {:>5,}.    Time: {:}.'.format(step + 1, len(train_dataloader), duration))
        b_input_ids = batch[0].to(device)
        b_attention_masks = batch[1].to(device)
        b_targets = batch[2].to(device)
        model.zero_grad()
        
        torch.cuda.empty_cache()
        outputs = model(b_input_ids, attention_mask=b_attention_masks, labels = b_targets)
        loss = outputs[0]
        
        mean_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        
    mean_loss = mean_loss / len(train_dataloader)
    
    losses.append(mean_loss)
    print("Mean loss: " , mean_loss)
    print("Training epoch took:" , timedelta(seconds=int(time.time() - start)))
    
    print()
    print("Validation:")
    model.eval()
    
    start = time.time()
    predictions = torch.Tensor().to(dtype=torch.int8)
    
    for batch in val_dataloader:
        
        b_input_ids = batch[0].to(device)
        b_attention_masks = batch[1].to(device)
        b_targets = batch[2].to(device)
        
        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_attention_masks, output_hidden_states=False, output_attentions=False, return_dict=True)
        
        
        predictions = torch.cat((predictions, outputs.logits.cpu().argmax(axis=1)))
        torch.cuda.empty_cache()
        
    
    print(classification_report(targets_val, predictions, target_names=target_dict.keys()))
    print("Validation took: {:}".format(timedelta(seconds = int(time.time() - start))))
    print()
            
            

Training:
Batch   100  of    844.    Time: 0:01:27.
Batch   200  of    844.    Time: 0:02:55.
Batch   300  of    844.    Time: 0:04:24.
Batch   400  of    844.    Time: 0:05:52.
Batch   500  of    844.    Time: 0:07:20.
Batch   600  of    844.    Time: 0:08:49.
Batch   700  of    844.    Time: 0:10:17.
Batch   800  of    844.    Time: 0:11:45.
Mean loss:  0.4097202039672441
Training epoch took: 0:12:25

Validation:
              precision    recall  f1-score   support

       sport       0.97      0.99      0.98       231
     culture       0.95      0.94      0.94       186
     science       0.91      0.86      0.89       212
       media       0.86      0.86      0.86       215
   economics       0.89      0.82      0.85       206
        life       0.91      0.84      0.87       191
      forces       0.87      0.97      0.92       134
      travel       1.00      0.84      0.91        31
       style       0.84      0.94      0.89        34
    business       0.56      0.80      0

In [20]:
print("Testing:")
model.eval()

t0 = time.time()
predictions = torch.Tensor().to(dtype=torch.int8)

for batch in test_dataloader:

    b_input_ids = batch[0].to(device)
    b_attention_masks = batch[1].to(device)

    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_attention_masks, output_hidden_states=False, output_attentions=False, return_dict=True)

    predictions = torch.cat((predictions, outputs.logits.cpu().argmax(axis=1)))

print(classification_report(targets_test, predictions, target_names=target_dict.keys()))

Testing:
              precision    recall  f1-score   support

       sport       0.98      0.99      0.98       423
     culture       0.94      0.95      0.94       426
     science       0.93      0.91      0.92       466
       media       0.87      0.93      0.89       403
   economics       0.93      0.89      0.91       426
        life       0.93      0.90      0.92       415
      forces       0.94      0.93      0.94       245
      travel       0.79      0.83      0.81        54
       style       0.90      0.88      0.89        52
    business       0.61      0.66      0.63        90

    accuracy                           0.92      3000
   macro avg       0.88      0.89      0.88      3000
weighted avg       0.92      0.92      0.92      3000

