In [1]:
!pip3 install torch transformers



In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data and model directories used by
# this notebook are addressed through that mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset, random_split
import pandas as pd
import numpy as np
import random
import time
import datetime

from pathlib import Path
# Locations of the dataset and the model files on the mounted Google Drive.
# Both paths are relative, so they only resolve inside the Drive mounted at
# /content/drive when the working directory is /content.
PATH_TO_DATA = Path("drive/My Drive/Masterpiecer/data")
PATH_TO_MODELS = Path("drive/My Drive/Masterpiecer/models")

In [0]:
def format_time(elapsed):
    '''
    Render a duration given in seconds as a human-readable string.

    The value is rounded to a whole number of seconds and formatted by
    ``datetime.timedelta``, i.e. as ``h:mm:ss`` (with a leading
    ``"N day(s), "`` prefix once the duration reaches 24 hours).
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [0]:
# Load the tokenizer shipped with the pretrained RuBERT checkpoint directory.
# The checkpoint name says "cased", and lower-casing is disabled to match.
tokenizer_dir = str(PATH_TO_MODELS / "rubert_cased_L-12_H-768_A-12_pt")
tokenizer = BertTokenizer.from_pretrained(tokenizer_dir, do_lower_case=False)

In [6]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Show which accelerator is in use. torch.cuda.get_device_name() raises for a
# CPU device, so it is only queried when CUDA was actually selected.
torch.cuda.get_device_name(device) if device.type == "cuda" else "cpu"

'Tesla P4'

In [0]:
# Load the pretrained RuBERT weights with a sequence-classification head that
# has a single output (num_labels=1), then move the model to the chosen device.
# The path is passed as a string, matching how the tokenizer is loaded, rather
# than as a raw Path object.
model = BertForSequenceClassification.from_pretrained(
    str(PATH_TO_MODELS / "rubert_cased_L-12_H-768_A-12_pt"),
    num_labels=1,
    output_attentions=False,
    output_hidden_states=False
).to(device)

In [0]:
# Read the Kinopoisk dataset and extract the synopsis texts as an array,
# with embedded newlines replaced by spaces.
data = pd.read_csv(PATH_TO_DATA / "kinopoisk.csv")
synopsis_column = data['synopsis']
X_raw = synopsis_column.str.replace('\n', ' ').values

In [9]:
# Tokenize every synopsis to a fixed length of 320 tokens (padded as needed)
# and collect the token ids together with their attention masks.
input_ids = []
attention_masks = []

for sentence in X_raw:
    encoded_dict = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=320,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Stack the per-sentence tensors along the batch dimension.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(data['kinopoisk_score'].values)

dataset = TensorDataset(input_ids, attention_masks, labels)

# 90% of the samples go to training, the remainder to validation.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# The global seeds are only set later, in the training cell, so the split uses
# its own seeded generator. Without it the train/validation membership would
# change on every run and validation losses would not be comparable.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

print(train_size, " training samples")
print(val_size, " validation samples")

4307  training samples
479  validation samples


In [0]:
batch_size = 4

# Training batches are drawn in random order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

# Validation batches are read sequentially.
val_sampler = SequentialSampler(val_dataset)
validation_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=batch_size)

In [0]:
epochs = 4

# AdamW over all model parameters.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# One scheduler step is taken per training batch, so the schedule spans
# (number of batches per epoch) * (number of epochs) steps, with no warmup.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [14]:
# Seed every random number generator in use so the training run itself
# (batch order, dropout) is repeatable.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# One summary dict per completed epoch (losses and timings).
training_stats = []

total_t0 = time.time()

torch.cuda.empty_cache()

for epoch_i in range(epochs):

    # ---------------------------- Training ----------------------------

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        # Progress report every 40 batches (skipping the very first one).
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Batch layout follows the TensorDataset: (input ids, attention mask, labels).
        b_input_ids = batch[0].to(device)
        b_attention_mask = batch[1].to(torch.float32).to(device)
        b_labels = batch[2].to(torch.float32).to(device)

        # NOTE(review): emptying the CUDA cache on every step slows training;
        # kept as in the original - confirm whether it is still needed.
        torch.cuda.empty_cache()
        model.zero_grad()

        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_attention_mask,
                        labels=b_labels)

        # Index the result instead of tuple-unpacking it: this works both when
        # the model returns a plain tuple and when it returns a ModelOutput
        # (where unpacking would yield the field names, not the tensors).
        # With labels supplied, the loss is the first element.
        loss = outputs[0]

        total_train_loss += loss.item()

        loss.float().backward()

        # Clip the gradient norm to 1.0 before the optimizer update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        # Advance the learning-rate schedule once per optimizer step.
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # --------------------------- Validation ---------------------------

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0

    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_attention_mask = batch[1].to(torch.float32).to(device)
        b_labels = batch[2].to(torch.float32).to(device)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_attention_mask,
                            labels=b_labels)

        total_eval_loss += outputs[0].item()

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record this epoch's summary so the run can be inspected afterwards.
    training_stats.append({
        'epoch': epoch_i + 1,
        'train_loss': avg_train_loss,
        'val_loss': avg_val_loss,
        'training_time': training_time,
        'validation_time': validation_time,
    })

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))



Training...
  Batch    40  of  1,077.    Elapsed: 0:00:19.
  Batch    80  of  1,077.    Elapsed: 0:00:38.
  Batch   120  of  1,077.    Elapsed: 0:00:57.
  Batch   160  of  1,077.    Elapsed: 0:01:15.
  Batch   200  of  1,077.    Elapsed: 0:01:34.
  Batch   240  of  1,077.    Elapsed: 0:01:53.
  Batch   280  of  1,077.    Elapsed: 0:02:12.
  Batch   320  of  1,077.    Elapsed: 0:02:31.
  Batch   360  of  1,077.    Elapsed: 0:02:50.
  Batch   400  of  1,077.    Elapsed: 0:03:10.
  Batch   440  of  1,077.    Elapsed: 0:03:29.
  Batch   480  of  1,077.    Elapsed: 0:03:48.
  Batch   520  of  1,077.    Elapsed: 0:04:07.
  Batch   560  of  1,077.    Elapsed: 0:04:26.
  Batch   600  of  1,077.    Elapsed: 0:04:45.
  Batch   640  of  1,077.    Elapsed: 0:05:04.
  Batch   680  of  1,077.    Elapsed: 0:05:23.
  Batch   720  of  1,077.    Elapsed: 0:05:42.
  Batch   760  of  1,077.    Elapsed: 0:06:01.
  Batch   800  of  1,077.    Elapsed: 0:06:20.
  Batch   840  of  1,077.    Elapsed: 0:06:39.


In [0]:
# Persist the fine-tuned model next to the pretrained checkpoints.
# torch.save on the model object pickles the whole module, not just its weights.
trained_model_path = PATH_TO_MODELS / "rater_trained.dump"
torch.save(model, str(trained_model_path))
