In [395]:
%load_ext autoreload
%autoreload 2

from pathlib import Path

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler

from transformers import BertTokenizer, BertForSequenceClassification

from utils import (init_random_seed, count_parameters, EmotionDataset, 
                   train_eval_loop, predict_with_model, calculate_metrics)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [396]:
# --- Run configuration ---
# Seed handed to init_random_seed and to both train_test_split calls.
SEED = 1

# Checkpoint location: written when TRAIN_MODEL is True, read when it is False.
PATH_MODEL = "./Pre-trained Models/BertSentimentAnalysis.pth"

# True  -> train the model and save it to PATH_MODEL.
# False -> skip training and just load the weights stored at PATH_MODEL.
TRAIN_MODEL = False

# Semicolon-separated tweet dumps, one file per sentiment class.
path_negative_tweets = "./Data/Negative_tweets.csv"
path_positive_tweets = "./Data/Positive_tweets.csv"

In [397]:
# Seed the random number generators through the project helper so the run is
# repeatable.  NOTE(review): which libraries it seeds (random / numpy / torch)
# is defined in utils and not visible here — confirm.
init_random_seed(SEED)

In [398]:
# Hugging Face Hub id of the checkpoint; the same id is used later to load
# the classification model.
model_name = "cointegrated/rubert-tiny2"

# Tokenizer built from the checkpoint's own vocabulary files.
# NOTE(review): do_lower_case=True lower-cases the input before tokenizing —
# confirm this matches how the checkpoint's vocabulary was built.
tokenizer = BertTokenizer.from_pretrained(
    model_name,
    do_lower_case=True,
)

In [399]:
# Both dumps share one layout: UTF-8, ';'-separated and without a header row,
# so the resulting columns are addressed by integer position.
pos_texts, neg_texts = (
    pd.read_csv(csv_path, encoding='utf8', sep=';', header=None)
    for csv_path in (path_positive_tweets, path_negative_tweets)
)

In [400]:
# Peek at three randomly drawn rows to eyeball the raw column layout
# (no explicit random_state is passed, so the rows depend on the global RNG state).
pos_texts.sample(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
42347,409941054688092160,1386572538,ArxangelPlay,"@karin_karin_b вспомнил, что ты любишь котиков...",1,0,0,0,953,26,21,0
67507,410393622744276992,1386680439,albinchik_zh,"@Ksu_Lushnikova вдруг потом посмотрят, а запис...",1,0,0,0,1905,45,34,1
77802,410713862720466944,1386756790,Arinorik,"RT @Shady_Kate: @Arinorik Ооо, выздоравливай, ...",1,0,1,0,7702,262,132,3


In [401]:
# Column 3 is the tweet text; report how many rows each class contributes.
print(
    f'Len of positive texts: {len(pos_texts[3])}',
    f'Len of negative texts: {len(neg_texts[3])}',
    sep='\n',
)

Len of positive texts: 114911
Len of negative texts: 111923


In [402]:
# Tweet text is column 3; stack the positive tweets first, then the negatives.
sentences = np.concatenate((pos_texts[3].values, neg_texts[3].values))

# One single-element label per tweet, in the same order as `sentences`:
# [1] for every positive row followed by [0] for every negative row.
labels = [
    [class_id]
    for class_id, frame in ((1, pos_texts), (0, neg_texts))
    for _ in range(len(frame))
]

# Texts and labels must stay aligned one-to-one.
assert len(sentences) == len(labels) == len(pos_texts) + len(neg_texts)

In [403]:
# Spot-check a single raw tweet from the concatenated array.
print(sentences[666])

Медведев: "Партия Единая Россия - это кусок нашей страны". Ага-ага, говна кусок!)


In [404]:
# Step 1: hold out 20% of all tweets as the test set.
(train_sentences, test_sentences,
 train_labels, test_labels) = train_test_split(
    sentences, labels,
    test_size=0.2, shuffle=True, random_state=SEED,
)

# Step 2: carve the validation set out of the remaining 80%.
# 25% of that remainder is 20% of the full corpus, so validation and test
# end up the same size.
(train_sentences, validation_sentences,
 train_labels, validation_labels) = train_test_split(
    train_sentences, train_labels,
    test_size=0.25, shuffle=True, random_state=SEED,
)

# Show the resulting (train, validation, test) sizes.
tuple(map(len, (train_sentences, validation_sentences, test_sentences)))

(136100, 45367, 45367)

In [405]:
# Convert the three label lists into tensors in one pass; each name is
# rebound to the tensor built from its own list.
train_labels, validation_labels, test_labels = map(
    torch.tensor,
    (train_labels, validation_labels, test_labels),
)

In [406]:
batch_size = 256  # batch size shared by the train / validation / test loaders
max_length = 50   # forwarded to EmotionDataset as max_length

train_dataset = EmotionDataset(
    train_sentences,
    tokenizer,
    labels=train_labels,
    max_length=max_length,
)

# Training batches are drawn in random order through an explicit RandomSampler
# (DataLoader's own shuffle flag stays at its default of off).
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=RandomSampler(train_dataset),
)

In [407]:
validation_dataset = EmotionDataset(
    validation_sentences,
    tokenizer,
    labels=validation_labels,
    max_length=max_length,
)

# Validation batches are read in dataset order (SequentialSampler, no shuffling).
validation_dataloader = DataLoader(
    validation_dataset,
    batch_size=batch_size,
    sampler=SequentialSampler(validation_dataset),
)

In [408]:
test_dataset = EmotionDataset(
    test_sentences,
    tokenizer,
    labels=test_labels,
    max_length=max_length,
)

# Test batches are read in dataset order (SequentialSampler, no shuffling).
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    sampler=SequentialSampler(test_dataset),
)

In [409]:
# Prefer the GPU when CUDA is usable, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Training hyper-parameters.
num_labels = 2               # output width of the replacement classifier head
learning_rate = 5e-4         # lr handed to AdamW
num_epoch = 5                # passed to train_eval_loop as epoch_n
scheduler_patience = 2       # patience of ReduceLROnPlateau
early_stopping_patience = 3  # passed to train_eval_loop as early_stopping_patience

In [410]:
# Load the pretrained encoder together with a sequence-classification head.
# num_labels is passed explicitly so that model.config.num_labels and
# model.num_labels follow the notebook's setting instead of silently relying
# on the library default; the head weights themselves are not part of the
# checkpoint and are initialised randomly.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [411]:
print(f'Num of teachable parameters before freezing: {count_parameters(model)}')

# Switch off gradients for every existing parameter of the model in one call.
model.requires_grad_(False)

# Swap in a fresh linear head.  It is created after the freeze, so its weight
# and bias keep the default requires_grad=True and are the only parameters
# left for the optimizer to update.
model.classifier = nn.Linear(
    in_features=model.classifier.in_features,
    out_features=num_labels,
    bias=True,
)

print(f'Num of teachable parameters after freezing: {count_parameters(model)}')

Num of teachable parameters before freezing: 29194394
Num of teachable parameters after freezing: 626


In [412]:
# Ask PyTorch's caching allocator to release unused cached GPU memory before
# the model is moved to the device.
torch.cuda.empty_cache()

In [413]:
# Put the model on the selected device before the optimizer is built from
# its parameters.
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Reduce the learning rate when the monitored value stops improving for
# `scheduler_patience` scheduler steps (the value is supplied by the training
# loop).
# NOTE(review): `verbose` is deprecated in recent PyTorch releases — confirm
# against the installed version.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    patience=scheduler_patience,
    verbose=True,
)

In [414]:
if TRAIN_MODEL:
    # Train the classifier; the loop hands back the best model it kept plus
    # the train / validation loss histories.
    best_model, history_loss_train, history_loss_valid = train_eval_loop(
        model,
        train_dataloader,
        validation_dataloader,
        optimizer,
        epoch_n=num_epoch,
        device=device,
        early_stopping_patience=early_stopping_patience,
        scheduler=scheduler
    )

    # The evaluation cell scores `model`, while the checkpoint is written from
    # `best_model`.  Copy the best weights into `model` so the reported metrics
    # describe exactly the weights that get saved (and that a later run with
    # TRAIN_MODEL = False will load).  This is a no-op if both names already
    # refer to the same module.
    model.load_state_dict(best_model.state_dict())

    plt.plot(history_loss_train, label='Train loss')
    plt.plot(history_loss_valid, label='Valid loss')
    plt.legend(loc='upper right')
    plt.show()
else:
    # Reuse the previously saved weights instead of training.
    # NOTE(security): torch.load unpickles the file — only load checkpoints
    # that come from a trusted source.
    model.load_state_dict(torch.load(PATH_MODEL, map_location=device))

In [415]:
# Run the model over the held-out test loader; return_labels=True makes the
# helper hand back the reference labels alongside the predictions.
y_pred, y_true = predict_with_model(model, test_dataloader, device=device,
                                    use_sigmoid=True, return_labels=True)

# NOTE(review): the labels printed below assume calculate_metrics returns
# (recall, precision, accuracy) in that order — confirm against utils.
metrics_score = calculate_metrics(y_true, y_pred)
print(
    f'Recall: {metrics_score[0]:1.4f}\n'
    f'Precision: {metrics_score[1]:1.4f}\n'
    f'Accuracy: {metrics_score[2]:1.4f}\n'
)

100%|[32m██████████[0m| 178/178 [00:38<00:00,  4.68it/s]

Recall: 0.8881
Precision: 0.8546
Accuracy: 0.8661





In [416]:
if TRAIN_MODEL:
    # torch.save does not create missing directories; make sure the target
    # folder exists so a finished training run is not lost at the last step.
    Path(PATH_MODEL).parent.mkdir(parents=True, exist_ok=True)
    # Persist only the weights (state_dict) of the best model.
    torch.save(best_model.state_dict(), PATH_MODEL)