Install the required libraries

In [None]:
# for cpu !pip3 install torch torchvision torchaudio
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip3 install transformers
!pip3 install sklearn
!pip3 install pandas
!pip3 install numpy
!pip3 install tabulate
!pip3 install tqdm
!pip3 install sentencepiece

Load the required libraries

In [1]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tabulate import tabulate
from tqdm import trange
import random
from sklearn.metrics import accuracy_score, f1_score
import time

Code for RoBERTa Classifier

In [2]:
# Load the dataset
data = pd.read_csv('IMDB_Reviews_Top_250_preprocessed_without_stopwords.csv')

In [3]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,Review Title,Review Text,Rating,Movie Title
0,0,One Step Masterpiece !,'s honestly absurd good `` Spider-Verse '' mov...,10.0,Spider-Man: Across the Spider-Verse
1,1,Masterful IMAX Spider-Man,"animation , flow everything , genius character...",10.0,Spider-Man: Across the Spider-Verse
2,2,One best sequels anything ever made,"n't already obvious first film , 's officially...",10.0,Spider-Man: Across the Spider-Verse
3,3,new Empire Strikes Back,film visual concert . animation character desi...,10.0,Spider-Man: Across the Spider-Verse
4,4,movie 2023 far .,'s time since 've walked cinema feeling satisf...,10.0,Spider-Man: Across the Spider-Verse


In [4]:
# Set the threshold for binary conversion
threshold = 7.0

# Convert the 'Rating' column to a binary variable
data['Binary Rating'] = data['Rating'].apply(lambda x: 1 if x >= threshold else 0)

In [5]:
# Extract reviews texts and ratings into arrays
reviews = data['Review Text'].values
labels = data['Binary Rating'].values

In [6]:
# Shouldn't be necessary but just to be safe
for review in reviews:
    str(review)  

In [7]:
# Count the number of rows with label 1
num_label_1 = data[data['Binary Rating'] == 1]['Binary Rating'].count()

# Count the number of rows with label 0
num_label_0 = data[data['Binary Rating'] == 0]['Binary Rating'].count()

print(f"Number of rows with Binary Rating 1: {num_label_1}")
print(f"Number of rows with Binary Rating 0: {num_label_0}")

Number of rows with Binary Rating 1: 214112
Number of rows with Binary Rating 0: 35009


In [8]:
reviews[0:5]

array(["'s honestly absurd good `` Spider-Verse '' movies . `` Across Spider-Verse '' great , better `` Spider-Verse '' . really n't know . `` Spider-Man : Across Spider-Verse '' fantastic ! Deftly juggles deeply heartfelt character beats crazy multiverse content , packed many delightful easter eggs . Loved Gwen 's story expanded , scenes Shea Whigham 's Captain Stacy truly special . Ca n't wait third one . Every frame movie gorgeous ! n't want blink n't want miss anything . watch film & & finds something new enjoy . surprises film truly n't expecting . `` Across Spider-Verse '' another milestone animation . Yes 's gorgeous & visually mind-blowing , sequel surpasses first always putting Miles family front center . movie feels like massive celebration Spider-Man fans everywhere , whether love movies , games , comics , etc . , cameos multiversal elements come second Miles Gwen 's story . stunning achievement . Yes , 's visually dazzling , expansive , expressive new realms movie 's real s

In [9]:
labels[0:5]

array([1, 1, 1, 1, 1], dtype=int64)

In [10]:
# Load RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [11]:
# testing tokens and token Ids for a random sentence
def print_rand_sentence():
    index = random.randint(0, len(reviews)-1) #random index in texts list
    table = np.array([tokenizer.tokenize(reviews[index]), 
                    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(reviews[index]))]).T #tokenize random text in texts list
    print(tabulate(table,
                 headers = ['Tokens', 'Token IDs'],
                 tablefmt = 'fancy_grid'))  #print in table format

print_rand_sentence()

╒═══════════════╤═════════════╕
│ Tokens        │   Token IDs │
╞═══════════════╪═════════════╡
│ Boost         │       43389 │
├───────────────┼─────────────┤
│ ed            │         196 │
├───────────────┼─────────────┤
│ Ġinteresting  │        2679 │
├───────────────┼─────────────┤
│ Ġanimated     │       12847 │
├───────────────┼─────────────┤
│ Ġstyle        │        2496 │
├───────────────┼─────────────┤
│ Ġgood         │         205 │
├───────────────┼─────────────┤
│ Ġscript       │        8543 │
├───────────────┼─────────────┤
│ Ġ,            │        2156 │
├───────────────┼─────────────┤
│ ĠIron         │        9940 │
├───────────────┼─────────────┤
│ ĠGiant        │       21606 │
├───────────────┼─────────────┤
│ Ġentertaining │       11110 │
├───────────────┼─────────────┤
│ Ġscience      │        2866 │
├───────────────┼─────────────┤
│ -             │          12 │
├───────────────┼─────────────┤
│ fiction       │       25175 │
├───────────────┼─────────────┤
│ Ġadven

In [12]:
# Find max sequensce lenght

MAX_LEN = 0
for review in reviews:
    tokenized = tokenizer(review,return_tensors='pt',add_special_tokens=True)
    MAX_LEN = max(MAX_LEN, tokenized['input_ids'].size()[1])

Token indices sequence length is longer than the specified maximum sequence length for this model (1157 > 512). Running this sequence through the model will result in indexing errors


In [13]:
MAX_LEN

2853

In [14]:
# Tokenize and encode texts then extract token ids, attention masks and labels in torch tensor format.
token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer):
    return tokenizer.encode_plus(                    #returns dictionary with token ids, attention masks and token type ids
                        input_text,
                        max_length = 512,            
                        padding = 'max_length',    #padding tokens to be of the same size
                        truncation=True,           # Truncate the text if it exceeds 512 tokens
                        return_attention_mask = True,
                        return_tensors = 'pt'        #torch tensor format
                   )


for sample in reviews:
    encoding_dict = preprocessing(sample, tokenizer)
    token_id.append(encoding_dict['input_ids']) 
    attention_masks.append(encoding_dict['attention_mask'])

#concatinating the tesnors in a single dimension
token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)

In [15]:
token_id

tensor([[    0,    18, 10728,  ...,     1,     1,     1],
        [    0, 25042,  1258,  ...,     1,     1,     1],
        [    0,   282,    75,  ...,     1,     1,     1],
        ...,
        [    0,   397,  3996,  ...,     1,     1,     1],
        [    0, 17425, 27941,  ...,     1,     1,     1],
        [    0,   991, 23745,  ...,     1,     1,     1]])

In [16]:
attention_masks

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

In [17]:
labels

tensor([1, 1, 1,  ..., 1, 1, 1])

In [18]:
def print_rand_sentence_encoding():
    index = random.randint(0, len(reviews) - 1)
    tokens = tokenizer.convert_ids_to_tokens(token_id[index].numpy().tolist())  # Convert token_ids to a list of strings
    token_ids = token_id[index].numpy()
    attention = attention_masks[index].numpy()

    table = np.array([tokens, token_ids, attention]).T
    print(tabulate(table, headers=['Tokens', 'Token IDs', 'Attention Mask'], tablefmt='fancy_grid'))

print_rand_sentence_encoding()

╒══════════╤═════════════╤══════════════════╕
│ Tokens   │   Token IDs │   Attention Mask │
╞══════════╪═════════════╪══════════════════╡
│ <s>      │           0 │                1 │
├──────────┼─────────────┼──────────────────┤
│ Ca       │       38593 │                1 │
├──────────┼─────────────┼──────────────────┤
│ Ġn       │         295 │                1 │
├──────────┼─────────────┼──────────────────┤
│ 't       │          75 │                1 │
├──────────┼─────────────┼──────────────────┤
│ Ġtalk    │        1067 │                1 │
├──────────┼─────────────┼──────────────────┤
│ Ġ,       │        2156 │                1 │
├──────────┼─────────────┼──────────────────┤
│ Ġgo      │         213 │                1 │
├──────────┼─────────────┼──────────────────┤
│ Ġwatch   │        1183 │                1 │
├──────────┼─────────────┼──────────────────┤
│ Ġ!       │       27785 │                1 │
├──────────┼─────────────┼──────────────────┤
│ ĠCo      │         944 │        

In [19]:
# split data in training and validation sets

val_ratio = 0.2 
batch_size = 16

# Indices of the train and validation splits stratified by labels
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels)

# Train and validation sets
train_set = TensorDataset(token_id[train_idx], 
                          attention_masks[train_idx], 
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx], 
                        attention_masks[val_idx], 
                        labels[val_idx])

# DataLoaders
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

In [20]:
# load RoBERTa model as a RobertaForSequenceClassification model
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

optimizer = torch.optim.AdamW(
                model.parameters(),
                lr=3e-5,     # You can also try 3e-5, 2e-5
                eps=1e-8     # AdamW's epsilon value. Probably optimal.
                )

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Define device and move the model to it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [22]:
checkpoint_path = "checkpoint_RoBERTa.pth"

In [23]:
# To resume training from the last checkpoint, you can use the following code:
# Restore the latest checkpoint if needed
# checkpoint = torch.load(checkpoint_path)
# model.load_state_dict(checkpoint['model_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [24]:
epochs = 4
debug_interval = 600  # Print a debug message every 10 minutes

# Define variables for early stopping
patience = 3
min_delta = 0.001
best_val_loss = float('inf')
counter = 0

start_time = time.time()  # Record the start time

for epoch in trange(epochs, desc='Epoch'):
    # Set model to training mode for training loop
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)

        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

        # Check if five seconds have passed, and print a debug message
        if time.time() - start_time > debug_interval:
            print(f"Epoch {epoch + 1}/{epochs}, Step {step + 1}/{len(train_dataloader)}, "
                  f"Train Loss: {tr_loss / nb_tr_steps:.4f}")
            start_time = time.time()  # Reset the start time for the next 10 minutes

    # After the training loop, save the model checkpoint for each epoch
    torch.save(model.state_dict(), f'model_checkpoint_epoch_RoBERTa_Classifier_{epoch + 1}.pt')

    # Save the checkpoint after each epoch
    checkpoint = {
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        # Add any other information you want to save
    }
    torch.save(checkpoint, checkpoint_path)
    
    # Set model to evaluation mode for validation loop
    model.eval()
    
    # Initialise metrics 
    val_loss = 0
    val_preds = []
    val_labels = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            # Forward pass
            eval_output = model(b_input_ids, 
                              token_type_ids=None, 
                              attention_mask=b_input_mask, 
                              labels=b_labels)
        logits = eval_output.logits
        eval_loss = eval_output.loss
        val_loss += eval_loss.item()

        # Calculate validation metrics
        preds = torch.argmax(logits, dim=1).cpu().detach().numpy()
        label_ids = b_labels.cpu().detach().numpy()
        val_preds.extend(preds)
        val_labels.extend(label_ids)

    # Calculate average metrics over all batches
    avg_val_loss = val_loss / len(validation_dataloader)
    avg_val_accuracy = accuracy_score(val_labels, val_preds)
    avg_val_f1 = f1_score(val_labels, val_preds, average='weighted')

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\n\t - Validation loss: {:.4f}'.format(avg_val_loss))
    print('\n\t - Validation accuracy: {:.4f}'.format(avg_val_accuracy))
    print('\n\t - Validation F1 score: {:.4f}'.format(avg_val_f1))
    
    # Check if validation loss improved
    if avg_val_loss < best_val_loss - min_delta * best_val_loss:
        best_val_loss = avg_val_loss
        counter = 0
    else:
        counter += 1
        if counter == patience:
            print("Validation loss did not improve for {} epochs. Early stopping...".format(patience))
            break

Epoch:   0%|                                                                                                     | 0/4 [00:00<?, ?it/s]

Epoch 1/4, Step 312/12456, Train Loss: 0.3663
Epoch 1/4, Step 627/12456, Train Loss: 0.3266
Epoch 1/4, Step 942/12456, Train Loss: 0.3126
Epoch 1/4, Step 1257/12456, Train Loss: 0.3171
Epoch 1/4, Step 1572/12456, Train Loss: 0.3248
Epoch 1/4, Step 1887/12456, Train Loss: 0.3235
Epoch 1/4, Step 2202/12456, Train Loss: 0.3276
Epoch 1/4, Step 2516/12456, Train Loss: 0.3297
Epoch 1/4, Step 2831/12456, Train Loss: 0.3288
Epoch 1/4, Step 3146/12456, Train Loss: 0.3261
Epoch 1/4, Step 3461/12456, Train Loss: 0.3252
Epoch 1/4, Step 3776/12456, Train Loss: 0.3264
Epoch 1/4, Step 4091/12456, Train Loss: 0.3249
Epoch 1/4, Step 4406/12456, Train Loss: 0.3230
Epoch 1/4, Step 4721/12456, Train Loss: 0.3230
Epoch 1/4, Step 5036/12456, Train Loss: 0.3261
Epoch 1/4, Step 5351/12456, Train Loss: 0.3295
Epoch 1/4, Step 5666/12456, Train Loss: 0.3312
Epoch 1/4, Step 5980/12456, Train Loss: 0.3340
Epoch 1/4, Step 6295/12456, Train Loss: 0.3371
Epoch 1/4, Step 6610/12456, Train Loss: 0.3407
Epoch 1/4, Step 

Epoch:  25%|█████████████████████▎                                                               | 1/4 [6:54:08<20:42:25, 24848.44s/it]


	 - Train loss: 0.3717

	 - Validation loss: 0.4088

	 - Validation accuracy: 0.8599

	 - Validation F1 score: 0.7956
Epoch 2/4, Step 1/12456, Train Loss: 0.5170
Epoch 2/4, Step 347/12456, Train Loss: 0.4059
Epoch 2/4, Step 693/12456, Train Loss: 0.4025
Epoch 2/4, Step 1040/12456, Train Loss: 0.4034
Epoch 2/4, Step 1386/12456, Train Loss: 0.4072
Epoch 2/4, Step 1732/12456, Train Loss: 0.4066
Epoch 2/4, Step 2078/12456, Train Loss: 0.4069
Epoch 2/4, Step 2424/12456, Train Loss: 0.4062
Epoch 2/4, Step 2770/12456, Train Loss: 0.4052
Epoch 2/4, Step 3116/12456, Train Loss: 0.4058
Epoch 2/4, Step 3462/12456, Train Loss: 0.4060
Epoch 2/4, Step 3808/12456, Train Loss: 0.4055
Epoch 2/4, Step 4154/12456, Train Loss: 0.4054
Epoch 2/4, Step 4500/12456, Train Loss: 0.4057
Epoch 2/4, Step 4846/12456, Train Loss: 0.4058
Epoch 2/4, Step 5192/12456, Train Loss: 0.4059
Epoch 2/4, Step 5538/12456, Train Loss: 0.4061
Epoch 2/4, Step 5884/12456, Train Loss: 0.4062
Epoch 2/4, Step 6230/12456, Train Loss: 

Epoch:  50%|██████████████████████████████████████████                                          | 2/4 [13:28:44<13:25:18, 24159.01s/it]


	 - Train loss: 0.4073

	 - Validation loss: 0.4064

	 - Validation accuracy: 0.8595

	 - Validation F1 score: 0.7945
Epoch 3/4, Step 1/12456, Train Loss: 0.3745
Epoch 3/4, Step 346/12456, Train Loss: 0.4100
Epoch 3/4, Step 691/12456, Train Loss: 0.4004
Epoch 3/4, Step 1040/12456, Train Loss: 0.4048
Epoch 3/4, Step 1394/12456, Train Loss: 0.4022
Epoch 3/4, Step 1750/12456, Train Loss: 0.4005
Epoch 3/4, Step 2106/12456, Train Loss: 0.4018
Epoch 3/4, Step 2462/12456, Train Loss: 0.4038
Epoch 3/4, Step 2810/12456, Train Loss: 0.4055
Epoch 3/4, Step 3143/12456, Train Loss: 0.4062
Epoch 3/4, Step 3490/12456, Train Loss: 0.4063
Epoch 3/4, Step 3848/12456, Train Loss: 0.4064
Epoch 3/4, Step 4206/12456, Train Loss: 0.4059
Epoch 3/4, Step 4564/12456, Train Loss: 0.4054
Epoch 3/4, Step 4922/12456, Train Loss: 0.4061
Epoch 3/4, Step 5280/12456, Train Loss: 0.4058
Epoch 3/4, Step 5638/12456, Train Loss: 0.4055
Epoch 3/4, Step 5987/12456, Train Loss: 0.4052
Epoch 3/4, Step 6313/12456, Train Loss: 

Epoch:  50%|██████████████████████████████████████████                                          | 2/4 [20:10:21<20:10:21, 36310.89s/it]


	 - Train loss: 0.4074

	 - Validation loss: 0.4066

	 - Validation accuracy: 0.8595

	 - Validation F1 score: 0.7945
Validation loss did not improve for 3 epochs. Early stopping...



