Install the required libraries

In [None]:
# for cpu !pip3 install torch torchvision torchaudio
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip3 install transformers
!pip3 install sklearn
!pip3 install pandas
!pip3 install numpy
!pip3 install tabulate
!pip3 install tqdm
!pip3 install sentencepiece

Load the required libraries

In [1]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tabulate import tabulate
from tqdm import trange
import random
from sklearn.metrics import accuracy_score, f1_score
import time

Code for BERT Classifier

In [2]:
# Load the dataset
data = pd.read_csv('IMDB_Reviews_Top_250_preprocessed_without_stopwords.csv')

In [3]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,Review Title,Review Text,Rating,Movie Title
0,0,One Step Masterpiece !,'s honestly absurd good `` Spider-Verse '' mov...,10.0,Spider-Man: Across the Spider-Verse
1,1,Masterful IMAX Spider-Man,"animation , flow everything , genius character...",10.0,Spider-Man: Across the Spider-Verse
2,2,One best sequels anything ever made,"n't already obvious first film , 's officially...",10.0,Spider-Man: Across the Spider-Verse
3,3,new Empire Strikes Back,film visual concert . animation character desi...,10.0,Spider-Man: Across the Spider-Verse
4,4,movie 2023 far .,'s time since 've walked cinema feeling satisf...,10.0,Spider-Man: Across the Spider-Verse


In [4]:
# Set the threshold for binary conversion
threshold = 7.0

# Convert the 'Rating' column to a binary variable
data['Binary Rating'] = data['Rating'].apply(lambda x: 1 if x >= threshold else 0)

In [5]:
# Extract reviews texts and ratings into arrays
reviews = data['Review Text'].values
labels = data['Binary Rating'].values

In [6]:
# Shouldn't be necessary but just to be safe
for review in reviews:
    str(review)  

In [7]:
# Count the number of rows with label 1
num_label_1 = data[data['Binary Rating'] == 1]['Binary Rating'].count()

# Count the number of rows with label 0
num_label_0 = data[data['Binary Rating'] == 0]['Binary Rating'].count()

print(f"Number of rows with Binary Rating 1: {num_label_1}")
print(f"Number of rows with Binary Rating 0: {num_label_0}")

Number of rows with Binary Rating 1: 214112
Number of rows with Binary Rating 0: 35009


In [8]:
reviews[0:5]

array(["'s honestly absurd good `` Spider-Verse '' movies . `` Across Spider-Verse '' great , better `` Spider-Verse '' . really n't know . `` Spider-Man : Across Spider-Verse '' fantastic ! Deftly juggles deeply heartfelt character beats crazy multiverse content , packed many delightful easter eggs . Loved Gwen 's story expanded , scenes Shea Whigham 's Captain Stacy truly special . Ca n't wait third one . Every frame movie gorgeous ! n't want blink n't want miss anything . watch film & & finds something new enjoy . surprises film truly n't expecting . `` Across Spider-Verse '' another milestone animation . Yes 's gorgeous & visually mind-blowing , sequel surpasses first always putting Miles family front center . movie feels like massive celebration Spider-Man fans everywhere , whether love movies , games , comics , etc . , cameos multiversal elements come second Miles Gwen 's story . stunning achievement . Yes , 's visually dazzling , expansive , expressive new realms movie 's real s

In [9]:
labels[0:5]

array([1, 1, 1, 1, 1], dtype=int64)

In [10]:
# Load BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [11]:
# testing tokens and token Ids for a random sentence
def print_rand_sentence():
    index = random.randint(0, len(reviews)-1) #random index in texts list
    table = np.array([tokenizer.tokenize(reviews[index]), 
                    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(reviews[index]))]).T #tokenize random text in texts list
    print(tabulate(table,
                 headers = ['Tokens', 'Token IDs'],
                 tablefmt = 'fancy_grid'))  #print in table format

print_rand_sentence()

╒════════════╤═════════════╕
│ Tokens     │   Token IDs │
╞════════════╪═════════════╡
│ action     │        2168 │
├────────────┼─────────────┤
│ movies     │        5558 │
├────────────┼─────────────┤
│ go         │        1301 │
├────────────┼─────────────┤
│ ,          │         117 │
├────────────┼─────────────┤
│ best       │        1436 │
├────────────┼─────────────┤
│ one        │        1141 │
├────────────┼─────────────┤
│ ever       │        1518 │
├────────────┼─────────────┤
│ made       │        1189 │
├────────────┼─────────────┤
│ .          │         119 │
├────────────┼─────────────┤
│ movies     │        5558 │
├────────────┼─────────────┤
│ go         │        1301 │
├────────────┼─────────────┤
│ ,          │         117 │
├────────────┼─────────────┤
│ '          │         112 │
├────────────┼─────────────┤
│ s          │         188 │
├────────────┼─────────────┤
│ firmly     │        7487 │
├────────────┼─────────────┤
│ top        │        1499 │
├────────────┼

In [12]:
# Find max sequensce length

MAX_LEN = 0
for review in reviews:
    tokenized = tokenizer(review,return_tensors='pt',add_special_tokens=True)
    MAX_LEN = max(MAX_LEN, tokenized['input_ids'].size()[1])

Token indices sequence length is longer than the specified maximum sequence length for this model (1208 > 512). Running this sequence through the model will result in indexing errors


In [13]:
MAX_LEN

3620

In [14]:
# Tokenize and encode texts then extract token ids, attention masks and labels in torch tensor format.
token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer):
    return tokenizer.encode_plus(                    #returns dictionary with token ids, attention masks and token type ids
                        input_text,
                        add_special_tokens = True,   #[CLS], [SEP] tokens required by BERT
                        max_length = 512,            #calculated above
                        padding='max_length',    #padding tokens to be of the same size
                        truncation=True,           # Truncate the text if it exceeds 512 tokens
                        return_attention_mask = True,
                        return_tensors = 'pt'        #torch tensor format
                   )


for sample in reviews:
    encoding_dict = preprocessing(sample, tokenizer)
    token_id.append(encoding_dict['input_ids']) 
    attention_masks.append(encoding_dict['attention_mask'])


token_id = torch.cat(token_id, dim = 0) #concatinating the tesnors in a single dimension
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)

In [15]:
token_id

tensor([[  101,   112,   188,  ...,     0,     0,     0],
        [  101,  8794,   117,  ...,     0,     0,     0],
        [  101,   183,   112,  ...,     0,     0,     0],
        ...,
        [  101,  1299,  2542,  ...,     0,     0,     0],
        [  101,  3177, 16598,  ...,     0,     0,     0],
        [  101,  6438, 22055,  ...,     0,     0,     0]])

In [16]:
attention_masks

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

In [17]:
labels

tensor([1, 1, 1,  ..., 1, 1, 1])

In [19]:
#print ids and masks for a random sentence
def print_rand_sentence_encoding():
    index = random.randint(0, len(review) - 1)
    tokens = tokenizer.tokenize(tokenizer.decode(token_id[index]))
    token_ids = [i.numpy() for i in token_id[index]]
    attention = [i.numpy() for i in attention_masks[index]]
    table = np.array([tokens, token_ids, attention]).T
    print(tabulate(table, 
                 headers = ['Tokens', 'Token IDs', 'Attention Mask'],
                 tablefmt = 'fancy_grid'))

print_rand_sentence_encoding()

╒══════════════╤═════════════╤══════════════════╕
│ Tokens       │   Token IDs │   Attention Mask │
╞══════════════╪═════════════╪══════════════════╡
│ [CLS]        │         101 │                1 │
├──────────────┼─────────────┼──────────────────┤
│ rating       │        5261 │                1 │
├──────────────┼─────────────┼──────────────────┤
│ 9            │         130 │                1 │
├──────────────┼─────────────┼──────────────────┤
│ .            │         119 │                1 │
├──────────────┼─────────────┼──────────────────┤
│ 5            │         126 │                1 │
├──────────────┼─────────────┼──────────────────┤
│ /            │         120 │                1 │
├──────────────┼─────────────┼──────────────────┤
│ 10           │        1275 │                1 │
├──────────────┼─────────────┼──────────────────┤
│ Watch        │        8553 │                1 │
├──────────────┼─────────────┼──────────────────┤
│ ##ed         │        1174 │                1 │


In [20]:
# split data in training and validation sets

val_ratio = 0.2 
batch_size = 16

# Indices of the train and validation splits stratified by labels
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels)

# Train and validation sets
train_set = TensorDataset(token_id[train_idx], 
                          attention_masks[train_idx], 
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx], 
                        attention_masks[val_idx], 
                        labels[val_idx])

# DataLoaders
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

In [21]:
# load BERT model as a BertForSequenceClassification model

model = BertForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)


optimizer = torch.optim.AdamW(
                model.parameters(), 
                lr = 3e-5,  # can also try 3e-5, 2e-5
                eps = 1e-08 # AdamW's epsilon value. Probably optimal.
                )

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Define device and move the model to it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [23]:
checkpoint_path = "checkpoint_BERT.pth"

In [24]:
# To resume training from the last checkpoint, you can use the following code:
# Restore the latest checkpoint if needed
# checkpoint = torch.load(checkpoint_path)
# model.load_state_dict(checkpoint['model_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [25]:
epochs = 4
debug_interval = 600  # Print a debug message every 10 minutes

# Define variables for early stopping
patience = 3
min_delta = 0.001
best_val_loss = float('inf')
counter = 0

start_time = time.time()  # Record the start time

for epoch in trange(epochs, desc='Epoch'):
    # Set model to training mode for training loop
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)

        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

        # Check if five seconds have passed, and print a debug message
        if time.time() - start_time > debug_interval:
            print(f"Epoch {epoch + 1}/{epochs}, Step {step + 1}/{len(train_dataloader)}, "
                  f"Train Loss: {tr_loss / nb_tr_steps:.4f}")
            start_time = time.time()  # Reset the start time for the next 10 minutes

    # After the training loop, save the model checkpoint for each epoch
    torch.save(model.state_dict(), f'model_checkpoint_epoch_BERT_Classifier_{epoch + 1}.pt')

    # Save the checkpoint after each epoch
    checkpoint = {
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        # Add any other information you want to save
    }
    torch.save(checkpoint, checkpoint_path)
    
    # Set model to evaluation mode for validation loop
    model.eval()
    
    # Initialise metrics 
    val_loss = 0
    val_preds = []
    val_labels = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            # Forward pass
            eval_output = model(b_input_ids, 
                              token_type_ids=None, 
                              attention_mask=b_input_mask, 
                              labels=b_labels)
        logits = eval_output.logits
        eval_loss = eval_output.loss
        val_loss += eval_loss.item()

        # Calculate validation metrics
        preds = torch.argmax(logits, dim=1).cpu().detach().numpy()
        label_ids = b_labels.cpu().detach().numpy()
        val_preds.extend(preds)
        val_labels.extend(label_ids)

    # Calculate average metrics over all batches
    avg_val_loss = val_loss / len(validation_dataloader)
    avg_val_accuracy = accuracy_score(val_labels, val_preds)
    avg_val_f1 = f1_score(val_labels, val_preds, average='weighted')

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\n\t - Validation loss: {:.4f}'.format(avg_val_loss))
    print('\n\t - Validation accuracy: {:.4f}'.format(avg_val_accuracy))
    print('\n\t - Validation F1 score: {:.4f}'.format(avg_val_f1))
    
    # Check if validation loss improved
    if avg_val_loss < best_val_loss - min_delta * best_val_loss:
        best_val_loss = avg_val_loss
        counter = 0
    else:
        counter += 1
        if counter == patience:
            print("Validation loss did not improve for {} epochs. Early stopping...".format(patience))
            break

Epoch:   0%|                                                                                                     | 0/4 [00:00<?, ?it/s]

Epoch 1/4, Step 338/12456, Train Loss: 0.2950
Epoch 1/4, Step 690/12456, Train Loss: 0.2661
Epoch 1/4, Step 1042/12456, Train Loss: 0.2550
Epoch 1/4, Step 1394/12456, Train Loss: 0.2458
Epoch 1/4, Step 1746/12456, Train Loss: 0.2374
Epoch 1/4, Step 2098/12456, Train Loss: 0.2324
Epoch 1/4, Step 2450/12456, Train Loss: 0.2297
Epoch 1/4, Step 2802/12456, Train Loss: 0.2253
Epoch 1/4, Step 3154/12456, Train Loss: 0.2226
Epoch 1/4, Step 3506/12456, Train Loss: 0.2201
Epoch 1/4, Step 3858/12456, Train Loss: 0.2176
Epoch 1/4, Step 4210/12456, Train Loss: 0.2160
Epoch 1/4, Step 4562/12456, Train Loss: 0.2138
Epoch 1/4, Step 4914/12456, Train Loss: 0.2116
Epoch 1/4, Step 5266/12456, Train Loss: 0.2102
Epoch 1/4, Step 5618/12456, Train Loss: 0.2078
Epoch 1/4, Step 5970/12456, Train Loss: 0.2067
Epoch 1/4, Step 6322/12456, Train Loss: 0.2051
Epoch 1/4, Step 6674/12456, Train Loss: 0.2049
Epoch 1/4, Step 7035/12456, Train Loss: 0.2036
Epoch 1/4, Step 7404/12456, Train Loss: 0.2028
Epoch 1/4, Step

Epoch:  25%|█████████████████████▎                                                               | 1/4 [6:06:26<18:19:18, 21986.25s/it]


	 - Train loss: 0.1925

	 - Validation loss: 0.1729

	 - Validation accuracy: 0.9349

	 - Validation F1 score: 0.9308
Epoch 2/4, Step 1/12456, Train Loss: 0.0434
Epoch 2/4, Step 370/12456, Train Loss: 0.1318
Epoch 2/4, Step 739/12456, Train Loss: 0.1281
Epoch 2/4, Step 1108/12456, Train Loss: 0.1321
Epoch 2/4, Step 1477/12456, Train Loss: 0.1345
Epoch 2/4, Step 1846/12456, Train Loss: 0.1341
Epoch 2/4, Step 2215/12456, Train Loss: 0.1352
Epoch 2/4, Step 2584/12456, Train Loss: 0.1352
Epoch 2/4, Step 2953/12456, Train Loss: 0.1356
Epoch 2/4, Step 3322/12456, Train Loss: 0.1370
Epoch 2/4, Step 3691/12456, Train Loss: 0.1367
Epoch 2/4, Step 4060/12456, Train Loss: 0.1371
Epoch 2/4, Step 4429/12456, Train Loss: 0.1372
Epoch 2/4, Step 4798/12456, Train Loss: 0.1378
Epoch 2/4, Step 5167/12456, Train Loss: 0.1377
Epoch 2/4, Step 5536/12456, Train Loss: 0.1380
Epoch 2/4, Step 5905/12456, Train Loss: 0.1383
Epoch 2/4, Step 6274/12456, Train Loss: 0.1387
Epoch 2/4, Step 6643/12456, Train Loss: 

Epoch:  50%|██████████████████████████████████████████                                          | 2/4 [12:49:56<12:56:28, 23294.22s/it]


	 - Train loss: 0.1429

	 - Validation loss: 0.1903

	 - Validation accuracy: 0.9364

	 - Validation F1 score: 0.9329
Epoch 3/4, Step 1/12456, Train Loss: 0.0655
Epoch 3/4, Step 385/12456, Train Loss: 0.0977
Epoch 3/4, Step 769/12456, Train Loss: 0.0991
Epoch 3/4, Step 1151/12456, Train Loss: 0.1005
Epoch 3/4, Step 1527/12456, Train Loss: 0.1005
Epoch 3/4, Step 1903/12456, Train Loss: 0.1051
Epoch 3/4, Step 2279/12456, Train Loss: 0.1048
Epoch 3/4, Step 2655/12456, Train Loss: 0.1038
Epoch 3/4, Step 3031/12456, Train Loss: 0.1044
Epoch 3/4, Step 3407/12456, Train Loss: 0.1052
Epoch 3/4, Step 3783/12456, Train Loss: 0.1064
Epoch 3/4, Step 4142/12456, Train Loss: 0.1064
Epoch 3/4, Step 4486/12456, Train Loss: 0.1066
Epoch 3/4, Step 4826/12456, Train Loss: 0.1071
Epoch 3/4, Step 5184/12456, Train Loss: 0.1066
Epoch 3/4, Step 5477/12456, Train Loss: 0.1067
Epoch 3/4, Step 5769/12456, Train Loss: 0.1066
Epoch 3/4, Step 6061/12456, Train Loss: 0.1067
Epoch 3/4, Step 6353/12456, Train Loss: 

Epoch:  50%|██████████████████████████████████████████                                          | 2/4 [19:05:51<19:05:51, 34375.95s/it]


	 - Train loss: 0.1115

	 - Validation loss: 0.1837

	 - Validation accuracy: 0.9383

	 - Validation F1 score: 0.9363
Validation loss did not improve for 3 epochs. Early stopping...



