Install the required libraries

In [None]:
# for cpu !pip3 install torch torchvision torchaudio
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip3 install transformers
!pip3 install sklearn
!pip3 install pandas
!pip3 install numpy
!pip3 install tabulate
!pip3 install tqdm
!pip3 install sentencepiece

Load the required libraries

In [1]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tabulate import tabulate
from tqdm import trange
import random
from sklearn.metrics import accuracy_score, f1_score
import time

Code for RoBERTa Classifier

In [2]:
# Load the dataset
data = pd.read_csv('IMDB_Reviews_Top_250_preprocessed_without_stopwords.csv')

In [3]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,Review Title,Review Text,Rating,Movie Title
0,0,One Step Masterpiece !,'s honestly absurd good `` Spider-Verse '' mov...,10.0,Spider-Man: Across the Spider-Verse
1,1,Masterful IMAX Spider-Man,"animation , flow everything , genius character...",10.0,Spider-Man: Across the Spider-Verse
2,2,One best sequels anything ever made,"n't already obvious first film , 's officially...",10.0,Spider-Man: Across the Spider-Verse
3,3,new Empire Strikes Back,film visual concert . animation character desi...,10.0,Spider-Man: Across the Spider-Verse
4,5,movie 2023 far .,'s time since 've walked cinema feeling satisf...,10.0,Spider-Man: Across the Spider-Verse


In [4]:
# Set the threshold for binary conversion
threshold = 7.0

# Convert the 'Rating' column to a binary variable
data['Binary Rating'] = data['Rating'].apply(lambda x: 1 if x >= threshold else 0)

In [5]:
# Extract reviews texts and ratings into arrays
reviews = data['Review Text'].values
labels = data['Binary Rating'].values

In [6]:
# Shouldn't be necessary but just to be safe
for review in reviews:
    str(review)  

In [7]:
# Count the number of rows with label 1
num_label_1 = data[data['Binary Rating'] == 1]['Binary Rating'].count()

# Count the number of rows with label 0
num_label_0 = data[data['Binary Rating'] == 0]['Binary Rating'].count()

print(f"Number of rows with Binary Rating 1: {num_label_1}")
print(f"Number of rows with Binary Rating 0: {num_label_0}")

Number of rows with Binary Rating 1: 214112
Number of rows with Binary Rating 0: 35009


In [8]:
reviews[0:5]

array(["'s honestly absurd good `` Spider-Verse '' movies . `` Across Spider-Verse '' great , better `` Spider-Verse '' . really n't know . `` Spider-Man : Across Spider-Verse '' fantastic ! Deftly juggles deeply heartfelt character beats crazy multiverse content , packed many delightful easter eggs . Loved Gwen 's story expanded , scenes Shea Whigham 's Captain Stacy truly special . Ca n't wait third one . Every frame movie gorgeous ! n't want blink n't want miss anything . watch film & & finds something new enjoy . surprises film truly n't expecting . `` Across Spider-Verse '' another milestone animation . Yes 's gorgeous & visually mind-blowing , sequel surpasses first always putting Miles family front center . movie feels like massive celebration Spider-Man fans everywhere , whether love movies , games , comics , etc . , cameos multiversal elements come second Miles Gwen 's story . stunning achievement . Yes , 's visually dazzling , expansive , expressive new realms movie 's real s

In [9]:
labels[0:5]

array([1, 1, 1, 1, 1], dtype=int64)

In [10]:
# Load RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [11]:
# testing tokens and token Ids for a random sentence
def print_rand_sentence():
    index = random.randint(0, len(reviews)-1) #random index in texts list
    table = np.array([tokenizer.tokenize(reviews[index]), 
                    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(reviews[index]))]).T #tokenize random text in texts list
    print(tabulate(table,
                 headers = ['Tokens', 'Token IDs'],
                 tablefmt = 'fancy_grid'))  #print in table format

print_rand_sentence()

╒═════════════════╤═════════════╕
│ Tokens          │   Token IDs │
╞═════════════════╪═════════════╡
│ Prep            │       44165 │
├─────────────────┼─────────────┤
│ oster           │       13991 │
├─────────────────┼─────────────┤
│ ous             │        1827 │
├─────────────────┼─────────────┤
│ Ġ,              │        2156 │
├─────────────────┼─────────────┤
│ Ġrep            │        2851 │
├─────────────────┼─────────────┤
│ ulsive          │       25170 │
├─────────────────┼─────────────┤
│ Ġ,              │        2156 │
├─────────────────┼─────────────┤
│ Ġmindless       │       41406 │
├─────────────────┼─────────────┤
│ ĠY              │         854 │
├─────────────────┼─────────────┤
│ ank             │        3153 │
├─────────────────┼─────────────┤
│ Ġtrash          │        8875 │
├─────────────────┼─────────────┤
│ Ġ.              │         479 │
├─────────────────┼─────────────┤
│ Ġlot            │         319 │
├─────────────────┼─────────────┤
│ Ġn          

In [12]:
# Find max sequensce lenght

MAX_LEN = 0
for review in reviews:
    tokenized = tokenizer(review,return_tensors='pt',add_special_tokens=True)
    MAX_LEN = max(MAX_LEN, tokenized['input_ids'].size()[1])

Token indices sequence length is longer than the specified maximum sequence length for this model (1157 > 512). Running this sequence through the model will result in indexing errors


In [13]:
MAX_LEN

2853

In [14]:
# Tokenize and encode texts then extract token ids, attention masks and labels in torch tensor format.
token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer):
    return tokenizer.encode_plus(                    #returns dictionary with token ids, attention masks and token type ids
                        input_text,
                        max_length = 512,            
                        padding = 'max_length',    #padding tokens to be of the same size
                        truncation=True,           # Truncate the text if it exceeds 512 tokens
                        return_attention_mask = True,
                        return_tensors = 'pt'        #torch tensor format
                   )


for sample in reviews:
    encoding_dict = preprocessing(sample, tokenizer)
    token_id.append(encoding_dict['input_ids']) 
    attention_masks.append(encoding_dict['attention_mask'])

#concatinating the tesnors in a single dimension
token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)

In [15]:
token_id

tensor([[    0,    18, 10728,  ...,     1,     1,     1],
        [    0, 25042,  1258,  ...,     1,     1,     1],
        [    0,   282,    75,  ...,     1,     1,     1],
        ...,
        [    0,   397,  3996,  ...,     1,     1,     1],
        [    0, 17425, 27941,  ...,     1,     1,     1],
        [    0,   991, 23745,  ...,     1,     1,     1]])

In [16]:
attention_masks

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

In [17]:
labels

tensor([1, 1, 1,  ..., 1, 1, 1])

In [18]:
def print_rand_sentence_encoding():
    index = random.randint(0, len(reviews) - 1)
    tokens = tokenizer.convert_ids_to_tokens(token_id[index].numpy().tolist())  # Convert token_ids to a list of strings
    token_ids = token_id[index].numpy()
    attention = attention_masks[index].numpy()

    table = np.array([tokens, token_ids, attention]).T
    print(tabulate(table, headers=['Tokens', 'Token IDs', 'Attention Mask'], tablefmt='fancy_grid'))

print_rand_sentence_encoding()

╒════════════════╤═════════════╤══════════════════╕
│ Tokens         │   Token IDs │   Attention Mask │
╞════════════════╪═════════════╪══════════════════╡
│ <s>            │           0 │                1 │
├────────────────┼─────────────┼──────────────────┤
│ film           │       21928 │                1 │
├────────────────┼─────────────┼──────────────────┤
│ Ġmany          │         171 │                1 │
├────────────────┼─────────────┼──────────────────┤
│ Ġhighs         │        5487 │                1 │
├────────────────┼─────────────┼──────────────────┤
│ Ġlows          │        6917 │                1 │
├────────────────┼─────────────┼──────────────────┤
│ Ġcompletely    │        2198 │                1 │
├────────────────┼─────────────┼──────────────────┤
│ Ġunderstand    │        1346 │                1 │
├────────────────┼─────────────┼──────────────────┤
│ Ġ90            │        1814 │                1 │
├────────────────┼─────────────┼──────────────────┤
│ Ġ'        

In [19]:
# split data in training and validation sets

val_ratio = 0.2 
batch_size = 16

# Indices of the train and validation splits stratified by labels
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels)

# Train and validation sets
train_set = TensorDataset(token_id[train_idx], 
                          attention_masks[train_idx], 
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx], 
                        attention_masks[val_idx], 
                        labels[val_idx])

# DataLoaders
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

In [20]:
# load RoBERTa model as a RobertaForSequenceClassification model
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

optimizer = torch.optim.AdamW(
                model.parameters(),
                lr=2e-5,     # You can also try 3e-5, 2e-5
                eps=1e-8     # AdamW's epsilon value. Probably optimal.
                )

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Define device and move the model to it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [22]:
checkpoint_path = "checkpoint_RoBERTa.pth"

In [23]:
# To resume training from the last checkpoint, you can use the following code:
# Restore the latest checkpoint if needed
# checkpoint = torch.load(checkpoint_path)
# model.load_state_dict(checkpoint['model_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [24]:
epochs = 4
debug_interval = 600  # Print a debug message every 10 minutes

# Define variables for early stopping
patience = 3
min_delta = 0.001
best_val_loss = float('inf')
counter = 0

start_time = time.time()  # Record the start time

for epoch in trange(epochs, desc='Epoch'):
    # Set model to training mode for training loop
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)

        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

        # Check if five seconds have passed, and print a debug message
        if time.time() - start_time > debug_interval:
            print(f"Epoch {epoch + 1}/{epochs}, Step {step + 1}/{len(train_dataloader)}, "
                  f"Train Loss: {tr_loss / nb_tr_steps:.4f}")
            start_time = time.time()  # Reset the start time for the next 10 minutes

    # After the training loop, save the model checkpoint for each epoch
    torch.save(model.state_dict(), f'model_checkpoint_epoch_RoBERTa_Classifier_{epoch + 1}.pt')

    # Save the checkpoint after each epoch
    checkpoint = {
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        # Add any other information you want to save
    }
    torch.save(checkpoint, checkpoint_path)
    
    # Set model to evaluation mode for validation loop
    model.eval()
    
    # Initialise metrics 
    val_loss = 0
    val_preds = []
    val_labels = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            # Forward pass
            eval_output = model(b_input_ids, 
                              token_type_ids=None, 
                              attention_mask=b_input_mask, 
                              labels=b_labels)
        logits = eval_output.logits
        eval_loss = eval_output.loss
        val_loss += eval_loss.item()

        # Calculate validation metrics
        preds = torch.argmax(logits, dim=1).cpu().detach().numpy()
        label_ids = b_labels.cpu().detach().numpy()
        val_preds.extend(preds)
        val_labels.extend(label_ids)

    # Calculate average metrics over all batches
    avg_val_loss = val_loss / len(validation_dataloader)
    avg_val_accuracy = accuracy_score(val_labels, val_preds)
    avg_val_f1 = f1_score(val_labels, val_preds, average='weighted')

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\n\t - Validation loss: {:.4f}'.format(avg_val_loss))
    print('\n\t - Validation accuracy: {:.4f}'.format(avg_val_accuracy))
    print('\n\t - Validation F1 score: {:.4f}'.format(avg_val_f1))
    
    # Check if validation loss improved
    if avg_val_loss < best_val_loss - min_delta * best_val_loss:
        best_val_loss = avg_val_loss
        counter = 0
    else:
        counter += 1
        if counter == patience:
            print("Validation loss did not improve for {} epochs. Early stopping...".format(patience))
            break

Epoch:   0%|                                                                                                     | 0/4 [00:00<?, ?it/s]

Epoch 1/4, Step 325/12456, Train Loss: 0.3453
Epoch 1/4, Step 652/12456, Train Loss: 0.2920
Epoch 1/4, Step 979/12456, Train Loss: 0.2642
Epoch 1/4, Step 1306/12456, Train Loss: 0.2525
Epoch 1/4, Step 1632/12456, Train Loss: 0.2401
Epoch 1/4, Step 1959/12456, Train Loss: 0.2324
Epoch 1/4, Step 2285/12456, Train Loss: 0.2255
Epoch 1/4, Step 2611/12456, Train Loss: 0.2218
Epoch 1/4, Step 2938/12456, Train Loss: 0.2191
Epoch 1/4, Step 3265/12456, Train Loss: 0.2164
Epoch 1/4, Step 3592/12456, Train Loss: 0.2133
Epoch 1/4, Step 3919/12456, Train Loss: 0.2108
Epoch 1/4, Step 4246/12456, Train Loss: 0.2090
Epoch 1/4, Step 4573/12456, Train Loss: 0.2068
Epoch 1/4, Step 4899/12456, Train Loss: 0.2046
Epoch 1/4, Step 5225/12456, Train Loss: 0.2016
Epoch 1/4, Step 5551/12456, Train Loss: 0.1997
Epoch 1/4, Step 5877/12456, Train Loss: 0.1981
Epoch 1/4, Step 6203/12456, Train Loss: 0.1973
Epoch 1/4, Step 6529/12456, Train Loss: 0.1962
Epoch 1/4, Step 6855/12456, Train Loss: 0.1959
Epoch 1/4, Step 

Epoch:  25%|█████████████████████▎                                                               | 1/4 [6:59:21<20:58:03, 25161.07s/it]


	 - Train loss: 0.1814

	 - Validation loss: 0.1584

	 - Validation accuracy: 0.9409

	 - Validation F1 score: 0.9409
Epoch 2/4, Step 1/12456, Train Loss: 0.0495
Epoch 2/4, Step 323/12456, Train Loss: 0.1447
Epoch 2/4, Step 642/12456, Train Loss: 0.1431
Epoch 2/4, Step 960/12456, Train Loss: 0.1417
Epoch 2/4, Step 1186/12456, Train Loss: 0.1437
Epoch 2/4, Step 1401/12456, Train Loss: 0.1412
Epoch 2/4, Step 1633/12456, Train Loss: 0.1399
Epoch 2/4, Step 1865/12456, Train Loss: 0.1407
Epoch 2/4, Step 2097/12456, Train Loss: 0.1402
Epoch 2/4, Step 2328/12456, Train Loss: 0.1416
Epoch 2/4, Step 2560/12456, Train Loss: 0.1423
Epoch 2/4, Step 2791/12456, Train Loss: 0.1424
Epoch 2/4, Step 3023/12456, Train Loss: 0.1424
Epoch 2/4, Step 3254/12456, Train Loss: 0.1419
Epoch 2/4, Step 3485/12456, Train Loss: 0.1422
Epoch 2/4, Step 3717/12456, Train Loss: 0.1426
Epoch 2/4, Step 3944/12456, Train Loss: 0.1426
Epoch 2/4, Step 4191/12456, Train Loss: 0.1428
Epoch 2/4, Step 4437/12456, Train Loss: 0

Epoch:  50%|██████████████████████████████████████████                                          | 2/4 [14:58:14<15:08:45, 27262.53s/it]


	 - Train loss: 0.1402

	 - Validation loss: 0.1543

	 - Validation accuracy: 0.9413

	 - Validation F1 score: 0.9419
Epoch 3/4, Step 1/12456, Train Loss: 0.1208
Epoch 3/4, Step 297/12456, Train Loss: 0.1001
Epoch 3/4, Step 605/12456, Train Loss: 0.1049
Epoch 3/4, Step 907/12456, Train Loss: 0.1139
Epoch 3/4, Step 1222/12456, Train Loss: 0.1109
Epoch 3/4, Step 1552/12456, Train Loss: 0.1126
Epoch 3/4, Step 1883/12456, Train Loss: 0.1117
Epoch 3/4, Step 2211/12456, Train Loss: 0.1118
Epoch 3/4, Step 2539/12456, Train Loss: 0.1123
Epoch 3/4, Step 2869/12456, Train Loss: 0.1127
Epoch 3/4, Step 3205/12456, Train Loss: 0.1121
Epoch 3/4, Step 3541/12456, Train Loss: 0.1117
Epoch 3/4, Step 3877/12456, Train Loss: 0.1128
Epoch 3/4, Step 4204/12456, Train Loss: 0.1129
Epoch 3/4, Step 4534/12456, Train Loss: 0.1130
Epoch 3/4, Step 4847/12456, Train Loss: 0.1130
Epoch 3/4, Step 5169/12456, Train Loss: 0.1139
Epoch 3/4, Step 5486/12456, Train Loss: 0.1142
Epoch 3/4, Step 5785/12456, Train Loss: 0

Epoch:  50%|██████████████████████████████████████████                                          | 2/4 [21:48:21<21:48:21, 39251.00s/it]


	 - Train loss: 0.1156

	 - Validation loss: 0.1583

	 - Validation accuracy: 0.9444

	 - Validation F1 score: 0.9434
Validation loss did not improve for 3 epochs. Early stopping...



