In [None]:
!wget -c https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar xvzf aclImdb_v1.tar.gz

In [None]:
!pip install transformers

In [4]:
%load_ext autoreload
%autoreload 2

import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.utils.data import random_split, RandomSampler, SequentialSampler 
from torch.utils.data.sampler import SubsetRandomSampler

from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertTokenizer, BertConfig, BertForSequenceClassification

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import time
import datetime
import os
import re
import sklearn.metrics
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_predict

Датасет для удобного обращения к данным

In [5]:
class ReviewsDataset(Dataset):
  """Reviews stored as text files under ``<root_dir>/pos`` and ``<root_dir>/neg``.

  The rating of a review is taken from its file name: it is the second
  integer found in the name (the first one is the review id).

  Even indices map to positive reviews and odd indices to negative ones, so
  the two classes alternate. This indexing scheme assumes that both folders
  contain the same number of files.
  """

  def __init__(self, root_dir):
    """List the review files of both classes located under ``root_dir``."""
    self.pos_reviews_dir = root_dir + "/pos"
    # Sorted, because os.listdir returns entries in arbitrary order and the
    # index -> file mapping should be reproducible between runs.
    self.pos_reviews_filenames = sorted(os.listdir(self.pos_reviews_dir))
    self.neg_reviews_dir = root_dir + "/neg"
    self.neg_reviews_filenames = sorted(os.listdir(self.neg_reviews_dir))

  def __len__(self):
    """Total number of reviews (positive + negative)."""
    return len(self.pos_reviews_filenames) + len(self.neg_reviews_filenames)

  def __getitem__(self, idx):
    """Return ``{"Text": str, "Rating": int}`` for the review at ``idx``."""
    if torch.is_tensor(idx):
      idx = idx.tolist()

    # Even index -> positive review, odd index -> negative review.
    if idx % 2 == 0:
      reviews_dir = self.pos_reviews_dir
      filenames = self.pos_reviews_filenames
    else:
      reviews_dir = self.neg_reviews_dir
      filenames = self.neg_reviews_filenames

    # For odd idx, idx // 2 equals (idx - 1) // 2, so one formula serves both.
    review_idx_filename = filenames[idx // 2]
    id_and_rating = re.findall(r"\d+", review_idx_filename)
    rating = int(id_and_rating[1])

    # Read the raw text. Converting the result of readlines() with str()
    # would yield the repr of a list ("['...']") with brackets, quotes and
    # escape sequences mixed into the review text.
    with open(os.path.join(reviews_dir, review_idx_filename), "r", encoding="utf-8") as review:
      data = review.read()

    return {"Text": data, "Rating": rating}

In [6]:
# Root folder of the extracted archive; change it here when running outside Colab.
DATA_ROOT = "/content/aclImdb"

data_train = ReviewsDataset(os.path.join(DATA_ROOT, "train"))
data_test = ReviewsDataset(os.path.join(DATA_ROOT, "test"))

In [7]:
# Materialise texts and ratings of both splits as plain Python lists.
# Every dataset access re-reads the review file from disk, so each sample is
# fetched exactly once, and each split is iterated over its own length
# instead of assuming that train and test have the same size.
reviews_texts_train = []
reviews_rating_train = []
for i in range(len(data_train)):
  sample = data_train[i]
  reviews_texts_train.append(sample['Text'])
  reviews_rating_train.append(sample['Rating'])

reviews_texts_test = []
reviews_rating_test = []
for i in range(len(data_test)):
  sample = data_test[i]
  reviews_texts_test.append(sample['Text'])
  reviews_rating_test.append(sample['Rating'])

In [None]:
# Convert ratings to binary sentiment labels: [1] for rating >= 7, else [0].
# Entries that are already lists are kept as is, so re-running this cell is
# harmless (comparing a list with 7 would otherwise raise TypeError).
reviews_rating_train = [
    rating if isinstance(rating, list) else ([1] if rating >= 7 else [0])
    for rating in reviews_rating_train
]

Загрузка предтренированного токенизатора

In [None]:
# Pretrained WordPiece tokenizer of the uncased BERT base checkpoint
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

Предобработка текстов из тренировочной выборки для BERT

In [None]:
# Preprocessing of the training texts for BERT:
#   * every text is tokenized
#   * the [CLS] and [SEP] special tokens are added
#   * every sequence is truncated / padded to exactly 500 tokens
#   * an attention mask is built (it separates real tokens from padding)

input_ids = []
attention_masks = []

for text in reviews_texts_train:

    encoded_dict = tokenizer.encode_plus(
                        text,
                        add_special_tokens = True,
                        max_length = 500,
                        # Explicit padding/truncation replace the deprecated
                        # pad_to_max_length flag and the implicit truncation.
                        padding = 'max_length',
                        truncation = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Each encoded item has a leading batch dimension of 1; stack them along it.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(reviews_rating_train)

Разделение данных на тренировочный и валидационный датасеты в соотношении 90 к 10

In [None]:
dataset = TensorDataset(input_ids, attention_masks, labels)

# 90 % of the samples go to training, the remainder to validation
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# A seeded generator makes the split identical on every run, so validation
# results stay comparable between runs.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

Создание даталоадеров для тренировочного и валидационного датасетов

In [None]:
batch_size = 8

# Training batches are drawn in random order
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

# Validation batches are read sequentially
validation_sampler = SequentialSampler(val_dataset)
validation_dataloader = DataLoader(val_dataset, sampler=validation_sampler, batch_size=batch_size)

В качестве вычислительного устройства выбирается ГПУ; если его нет, то выбирается ЦПУ

In [9]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Загрузка предтренированного BERTа для классификации входных последовательностей

In [None]:
# Pretrained BERT with a sequence-classification head for two classes
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)
# Move the model to the selected device; an unconditional .cuda() call
# would fail on a machine without a GPU.
model.to(device)

Настройка количества эпох тренировки BERTа, оптимизатора и планировщика 

In [None]:
epochs = 2

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# The scheduler is stepped once per batch, so the schedule length is
# (number of batches per epoch) * (number of epochs).
total_steps = len(train_dataloader) * epochs

# Linear decay of the learning rate over the whole training, without warmup
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

Функция для красивого вывода времени

In [None]:
def format_time(elapsed):
    """Format a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to whole seconds before formatting.
    """
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)

Цикл тренировки модели

In [None]:
total_t0 = time.time()

for epoch_i in range(0, epochs):

    # ---------------------------- Training ----------------------------
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):

        # Progress report every 40 batches
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Gradients accumulate by default, so clear them before every step
        model.zero_grad()

        result = model(b_input_ids,
                       token_type_ids=None,
                       attention_mask=b_input_mask,
                       labels=b_labels,
                       return_dict=True)

        loss = result.loss
        total_train_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer step
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # --------------------------- Validation ---------------------------
    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()
    total_eval_loss = 0
    correct_samples = 0

    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            result = model(b_input_ids,
                           token_type_ids=None,
                           attention_mask=b_input_mask,
                           labels=b_labels,
                           return_dict=True)

        total_eval_loss += result.loss.item()

        logits = result.logits.detach().cpu().numpy()
        # The predicted class is the index of the largest logit (argmax),
        # not the value of the largest logit.
        label_pred = np.argmax(logits, axis=1)
        # Labels are stored with shape (batch, 1); flatten them so that the
        # comparison is element-wise rather than a (batch, batch) broadcast.
        label_ids = b_labels.to('cpu').numpy().reshape(-1)
        correct_samples += int((label_pred == label_ids).sum())

    accuracy = correct_samples / len(val_dataset)
    print("  Accuracy: {0:.2f}".format(accuracy))

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    validation_time = format_time(time.time() - t0)
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

Сохранение модели и токенизатора

In [None]:
output_dir = 'content/model_save/'

os.makedirs(output_dir, exist_ok=True)

print("Saving model to %s" % output_dir)

# If the model is wrapped (exposes a .module attribute), save the wrapped model
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# No `args` object exists in this notebook, so the hyperparameters actually
# used for training are collected explicitly and stored next to the model.
training_args = {
    'epochs': epochs,
    'batch_size': batch_size,
    'learning_rate': optimizer.defaults['lr'],
    'adam_epsilon': optimizer.defaults['eps'],
}
torch.save(training_args, os.path.join(output_dir, 'training_args.bin'))

In [None]:
# Mount Google Drive (available only inside Google Colab)
from google.colab import drive
drive.mount('/content/drive')

# Copy the saved model directory to the mounted Drive
!cp -r content/model_save/ "/content/drive/MyDrive/BERT"

Преобразование оценок тестовых данных в сентименты

In [11]:
# Convert test ratings to binary sentiment labels: [1] for rating >= 7, else [0].
# Entries that are already lists are kept as is, so re-running this cell is
# harmless (comparing a list with 7 would otherwise raise TypeError).
reviews_rating_test = [
    rating if isinstance(rating, list) else ([1] if rating >= 7 else [0])
    for rating in reviews_rating_test
]

Предобработка тестовых данных

In [None]:
# Encode the test texts the same way as the training texts:
# special tokens added, every sequence truncated / padded to 500 tokens.
input_ids = []
attention_masks = []

for text in reviews_texts_test:

    encoded_dict = tokenizer.encode_plus(
                        text,
                        add_special_tokens = True,
                        max_length = 500,
                        # Explicit padding/truncation replace the deprecated
                        # pad_to_max_length flag and the implicit truncation.
                        padding = 'max_length',
                        truncation = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Each encoded item has a leading batch dimension of 1; stack them along it.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(reviews_rating_test)

batch_size = 8

# The test set is read sequentially
test_dataset = TensorDataset(input_ids, attention_masks, labels)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size)

Проверка работы модели на тестовых данных

In [None]:
model.eval()
predictions , true_labels = [], []
correct_samples = 0

for step, batch in enumerate(test_dataloader):

  # Progress report every 40 batches
  if step % 40 == 0 and not step == 0:
    print('  Batch {:>5,}  of  {:>5,}..'.format(step, len(test_dataloader)))

  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask, b_labels = batch

  with torch.no_grad():
      result = model(b_input_ids,
                     token_type_ids=None,
                     attention_mask=b_input_mask,
                     return_dict=True)

  logits = result.logits.detach().cpu().numpy()
  # Labels are stored with shape (batch, 1); flatten them so that the
  # comparison is element-wise rather than a (batch, batch) broadcast.
  label_ids = b_labels.to('cpu').numpy().reshape(-1)
  # The predicted class is the index of the largest logit (argmax),
  # not the value of the largest logit.
  label_pred = np.argmax(logits, axis=1)
  correct_samples += int((label_pred == label_ids).sum())

  # Keep per-batch predictions and labels for later inspection
  predictions.append(label_pred)
  true_labels.append(label_ids)

accuracy = correct_samples / len(test_dataset)
print("  Accuracy: {0:.2f}".format(accuracy))

print('    DONE.')

Функция для получения эмбеддинга отзыва

In [21]:
def get_BERT_embedding(text):
    """Return the last-layer hidden state of the [CLS] token for ``text``.

    The text is encoded with the notebook-level ``tokenizer`` (truncated /
    padded to 500 tokens) and passed through the notebook-level ``model``
    in evaluation mode without gradient tracking.

    Returns:
        A tensor whose first dimension is 1 (a single text), located on
        ``device``.
    """
    model.eval()
    encoded_dict = tokenizer.encode_plus(
                        text,
                        add_special_tokens = True,
                        max_length = 500,
                        # Explicit padding/truncation replace the deprecated
                        # pad_to_max_length flag and the implicit truncation.
                        padding = 'max_length',
                        truncation = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    b_input_ids = encoded_dict['input_ids'].to(device)
    b_input_mask = encoded_dict['attention_mask'].to(device)
    with torch.no_grad():
        # Hidden states must be requested for this call explicitly: the model
        # is loaded with output_hidden_states=False, in which case
        # result.hidden_states would be None.
        result = model(b_input_ids,
                       token_type_ids=None,
                       attention_mask=b_input_mask,
                       output_hidden_states=True,
                       return_dict=True)
    hidden_states = result.hidden_states
    last_hidden_state = hidden_states[-1]
    # Position 0 of every sequence holds the [CLS] token
    CLS_token = last_hidden_state[:, 0, :]
    return CLS_token

Получение эмбеддингов тренировочных данных

In [None]:
embedded_texts_train = []

for i, text in enumerate(reviews_texts_train, start=1):
    # Move every embedding to the CPU right away to keep GPU memory free
    embedded_texts_train.append(get_BERT_embedding(text).to('cpu'))
    # Report progress occasionally instead of printing one line per review
    if i % 1000 == 0:
        print('  Embedded {:>6,} of {:>6,} train reviews'.format(i, len(reviews_texts_train)))

# Each embedding has a leading dimension of 1, so concatenation yields one
# row per review without hard-coding the dataset size.
tensor_embedded_texts_train = torch.cat(embedded_texts_train, dim=0)

# Regression targets are the original ratings. They are re-read from the
# dataset because reviews_rating_train has been converted to binary labels
# by this point.
pointed_rating_train = [data_train[i]['Rating'] for i in range(len(data_train))]
assert len(pointed_rating_train) == tensor_embedded_texts_train.shape[0]
tensor_embedded_ratings_train = torch.tensor(pointed_rating_train, dtype=torch.float32)

Получение эмбеддингов тестовых данных

In [None]:
embedded_texts_test = []

for i, text in enumerate(reviews_texts_test, start=1):
    # Move every embedding to the CPU right away to keep GPU memory free
    embedded_texts_test.append(get_BERT_embedding(text).to('cpu'))
    # Report progress occasionally instead of printing one line per review
    if i % 1000 == 0:
        print('  Embedded {:>6,} of {:>6,} test reviews'.format(i, len(reviews_texts_test)))

# Each embedding has a leading dimension of 1, so concatenation yields one
# row per review without hard-coding the dataset size.
tensor_embedded_texts_test = torch.cat(embedded_texts_test, dim=0)

# Regression targets are the original ratings. They are re-read from the
# dataset because reviews_rating_test has been converted to binary labels
# by this point.
pointed_rating_test = [data_test[i]['Rating'] for i in range(len(data_test))]
assert len(pointed_rating_test) == tensor_embedded_texts_test.shape[0]
tensor_embedded_ratings_test = torch.tensor(pointed_rating_test, dtype=torch.float32)

Создание и фиттинг модели линейной регрессии из sklearn

In [None]:
# Linear regression from the [CLS] embedding of a review to its rating
LR = LinearRegression()
# Fit on the train embeddings, passed to sklearn as NumPy arrays
LR.fit(tensor_embedded_texts_train.numpy(), tensor_embedded_ratings_train.numpy())

# Coefficient of determination on the held-out test embeddings
r2_test = LR.score(tensor_embedded_texts_test.numpy(), tensor_embedded_ratings_test.numpy())
print("R2 metrics: {:.4f}".format(r2_test))

Функции софтмакса, классификатора сентимента и выставления оценки

In [None]:
def softmaxes(prediction):
    """Numerically stable softmax over the last axis of ``prediction``.

    Args:
        prediction: array-like of scores; a 1-D vector of logits or a batch
            with one row of logits per sample.

    Returns:
        ``numpy.ndarray`` of the same shape whose values along the last axis
        are non-negative and sum to 1.
    """
    scores = np.asarray(prediction, dtype=float)
    # Subtracting the maximum does not change the result but prevents
    # overflow in exp for large scores.
    shifted_pred = scores - np.max(scores, axis=-1, keepdims=True)
    exp_pred = np.exp(shifted_pred)
    return exp_pred / np.sum(exp_pred, axis=-1, keepdims=True)

In [None]:
def predict_sentiment(probs):
    """Map class probabilities to a sentiment name.

    The index of the largest probability selects the class: 1 is
    "positive", 0 is "negative". Any other index is returned unchanged.
    """
    names_by_class = {1: "positive", 0: "negative"}
    best_class = np.argmax(probs)
    return names_by_class.get(best_class, best_class)

In [None]:
def predict_rating(CLS_token, LR_model):
    """Predict an integer rating in the range 1..10 from a [CLS] embedding.

    Args:
        CLS_token: embedding of a single review with shape (1, n_features);
            either a NumPy array or a torch tensor (a tensor is moved to the
            CPU and converted to NumPy before prediction).
        LR_model: fitted regression model exposing ``predict``.

    Returns:
        ``int``: the prediction rounded to the nearest integer and clamped
        to the interval [1, 10].
    """
    if torch.is_tensor(CLS_token):
        CLS_token = CLS_token.detach().cpu().numpy()
    # predict returns an array with one value per sample; take the scalar so
    # that every branch returns a plain int rather than an array.
    raw_rating = float(np.ravel(LR_model.predict(CLS_token))[0])
    return int(min(10, max(1, round(raw_rating))))

Функция для выставления рейтинга и сентимента отзыва

In [None]:
def predict_rating_and_sentiment_with_BERT(text, LR_model=None):
    """Predict the rating and the sentiment of a single review text.

    The text is encoded with the notebook-level ``tokenizer`` and passed
    through the notebook-level ``model``. The sentiment comes from the
    classification logits; the rating comes from a regression model applied
    to the last-layer [CLS] embedding.

    Args:
        text: review text.
        LR_model: fitted regression model exposing ``predict``; when omitted,
            the notebook-level ``LR`` model is used.

    Returns:
        Tuple ``(rating, sentiment)``.
    """
    if LR_model is None:
        LR_model = LR

    model.eval()
    encoded_dict = tokenizer.encode_plus(
                        text,
                        add_special_tokens = True,
                        max_length = 500,
                        # Explicit padding/truncation replace the deprecated
                        # pad_to_max_length flag and the implicit truncation.
                        padding = 'max_length',
                        truncation = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    b_input_ids = encoded_dict['input_ids'].to(device)
    b_input_mask = encoded_dict['attention_mask'].to(device)
    with torch.no_grad():
        # Hidden states must be requested for this call explicitly: the model
        # is loaded with output_hidden_states=False, in which case
        # result.hidden_states would be None.
        result = model(b_input_ids,
                       token_type_ids=None,
                       attention_mask=b_input_mask,
                       output_hidden_states=True,
                       return_dict=True)

    # Logits of the single input text -> class probabilities
    logits = result.logits.detach().cpu().numpy()[0]
    probs = softmaxes(logits)

    # Last-layer [CLS] embedding, moved to the CPU as a NumPy array because
    # the regression model cannot consume a GPU tensor.
    last_hidden_state = result.hidden_states[-1]
    CLS_token = last_hidden_state[:, 0, :].detach().cpu().numpy()

    sentiment = predict_sentiment(probs)
    rating = predict_rating(CLS_token, LR_model)

    return rating, sentiment