In [None]:
#importing libraries
import pandas as pd
import datasets
import numpy as np
import os
#importing Roberta
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification,Trainer, TrainingArguments
#importing pytorch
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader
#importing scikit-learn metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
##
from tqdm import tqdm
import wandb


In [None]:
# Load the IMDB 'train' and 'test' splits in a single call; the files are
# kept under cache_dir so later runs reuse them instead of downloading again.
train_data, test_data = datasets.load_dataset(
    'imdb',
    split=['train', 'test'],
    cache_dir='/media/data_files/github/website_tutorials/data',
)

Reusing dataset imdb (/media/data_files/github/website_tutorials/data/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3)


In [None]:
# Load the pretrained 'roberta-base' checkpoint with a sequence-classification
# head (the classifier weights are newly initialised and must be fine-tuned).
model = RobertaForSequenceClassification.from_pretrained('roberta-base')
# Matching fast tokenizer. `model_max_length` is the tokenizer setting that
# truncation=True falls back to when no explicit max_length is given at call
# time; a bare `max_length` keyword here is not that setting.
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', model_max_length = 512)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [None]:
# tokenisation function can tokenize the input and return the output
def tokenization(batched_text):
    """Tokenize one batch of examples.

    Reads the batch's 'text' entry, runs it through the module-level
    ``tokenizer`` with padding and truncation enabled, and returns the
    tokenizer's output unchanged.
    """
    texts = batched_text['text']
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded
# Tokenize each split. batch_size is the full split length, so the whole split
# is handed to `tokenization` as one batch.
train_data = train_data.map(tokenization, batch_size=len(train_data), batched=True)
test_data = test_data.map(tokenization, batch_size=len(test_data), batched=True)
# Return torch tensors and expose only the model inputs plus the label.
_model_columns = ['input_ids', 'attention_mask', 'label']
for _split in (train_data, test_data):
    _split.set_format('torch', columns=_model_columns)

Loading cached processed dataset at /media/data_files/github/website_tutorials/data/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3/cache-a99d4d251a632ae8.arrow
Loading cached processed dataset at /media/data_files/github/website_tutorials/data/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3/cache-4f8f6cc4e515c73f.arrow


In [None]:
# we are defining the accuracy metrics
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    ``pred`` must expose ``label_ids`` (the reference labels) and
    ``predictions``; the predicted class is taken as the argmax of
    ``predictions`` over its last axis. Precision, recall and F1 use
    ``average='binary'``.

    Returns a dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # The fourth element of the returned tuple (support) is not needed.
    prf_scores = precision_recall_fscore_support(y_true, y_pred, average='binary')
    precision, recall, f1 = prf_scores[:3]
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=f1,
        precision=precision,
        recall=recall,
    )

# Training configuration handed to the Trainer.
training_args = TrainingArguments(
    # Output locations and run label.
    output_dir = '/media/data_files/github/website_tutorials/results',
    logging_dir='/media/data_files/github/website_tutorials/logs',
    run_name = 'roberta-classification_titan',
    # Optimisation schedule.
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # 4 examples per device x 16 accumulation steps = 64 examples per
    # optimizer step on each device.
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 16,
    per_device_eval_batch_size= 8,
    # Evaluate once per epoch and reload the best checkpoint when training ends.
    # NOTE(review): newer transformers releases require the save strategy to
    # match the evaluation strategy when load_best_model_at_end=True -- confirm
    # against the installed version before upgrading.
    evaluation_strategy = "epoch",
    load_best_model_at_end=True,
    # Logging / progress reporting.
    logging_steps = 8,
    disable_tqdm = False,
    # Mixed precision needs a CUDA device; fall back to full precision when
    # none is available so the notebook still runs on a CPU-only machine.
    fp16 = torch.cuda.is_available(),
    dataloader_num_workers = 0,
)

In [None]:
# Assemble the Trainer from the model, the training arguments, the metric
# function and the two tokenized splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics,
)
# Show which device is available. `device` is only displayed by this cell;
# it is not passed to the Trainer.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [None]:
# Training the model: run the training loop configured in training_args.
# The call returns a TrainOutput, which the notebook displays below the
# per-epoch metrics table.
trainer.train()

roberta-classification_titan


[34m[1mwandb[0m: Currently logged in as: [33mjlealtru[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.10.1
[34m[1mwandb[0m: Run data is saved locally in wandb/run-20201128_144117-1w57uczk
[34m[1mwandb[0m: Syncing run [33mroberta-classification_titan[0m


  return torch.tensor(x, **format_kwargs)





Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,0.215055,0.160051,0.94104,0.941429,0.93526,0.94768
1,0.143972,0.132966,0.95132,0.951415,0.949558,0.95328
2,0.056242,0.167104,0.95376,0.95416,0.945982,0.96248


TrainOutput(global_step=1170, training_loss=0.1801864493606437)

After training has completed, we can evaluate the model's performance and check that the right model was loaded.

In [None]:
# Evaluate on the eval dataset given to the Trainer (the test split).
# The accuracy is about 95%. The reported eval_loss equals the epoch-1
# validation loss in the training table, which is the lowest of the three
# epochs and is consistent with load_best_model_at_end=True.
trainer.evaluate()

{'eval_loss': 0.13296626508235931,
 'eval_accuracy': 0.95132,
 'eval_f1': 0.9514152261567328,
 'eval_precision': 0.9495577336839589,
 'eval_recall': 0.95328,
 'epoch': 2.9984}