Install the required libraries

In [None]:
# for cpu !pip3 install torch torchvision torchaudio
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip3 install transformers
!pip3 install sklearn
!pip3 install pandas
!pip3 install numpy
!pip3 install tabulate
!pip3 install tqdm
!pip3 install sentencepiece

Load the required libraries

In [1]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import XLNetForSequenceClassification, AutoTokenizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tabulate import tabulate
from tqdm import trange
import random
from sklearn.metrics import accuracy_score, f1_score
import time

Code for XLNet Classifier

In [2]:
# Load the dataset
data = pd.read_csv('IMDB_Reviews_Top_250_preprocessed_without_stopwords.csv')

In [3]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,Review Title,Review Text,Rating,Movie Title
0,0,One Step Masterpiece !,'s honestly absurd good `` Spider-Verse '' mov...,10.0,Spider-Man: Across the Spider-Verse
1,1,Masterful IMAX Spider-Man,"animation , flow everything , genius character...",10.0,Spider-Man: Across the Spider-Verse
2,2,One best sequels anything ever made,"n't already obvious first film , 's officially...",10.0,Spider-Man: Across the Spider-Verse
3,3,new Empire Strikes Back,film visual concert . animation character desi...,10.0,Spider-Man: Across the Spider-Verse
4,4,movie 2023 far .,'s time since 've walked cinema feeling satisf...,10.0,Spider-Man: Across the Spider-Verse


In [4]:
# Set the threshold for binary conversion
threshold = 7.0

# Convert the 'Rating' column to a binary variable
data['Binary Rating'] = data['Rating'].apply(lambda x: 1 if x >= threshold else 0)

In [5]:
# Extract reviews texts and ratings into arrays
reviews = data['Review Text'].values
labels = data['Binary Rating'].values

In [6]:
# Shouldn't be necessary but just to be safe
for review in reviews:
    str(review)  

In [7]:
# Count the number of rows with label 1
num_label_1 = data[data['Binary Rating'] == 1]['Binary Rating'].count()

# Count the number of rows with label 0
num_label_0 = data[data['Binary Rating'] == 0]['Binary Rating'].count()

print(f"Number of rows with Binary Rating 1: {num_label_1}")
print(f"Number of rows with Binary Rating 0: {num_label_0}")

Number of rows with Binary Rating 1: 214112
Number of rows with Binary Rating 0: 35009


In [8]:
reviews[0:5]

array(["'s honestly absurd good `` Spider-Verse '' movies . `` Across Spider-Verse '' great , better `` Spider-Verse '' . really n't know . `` Spider-Man : Across Spider-Verse '' fantastic ! Deftly juggles deeply heartfelt character beats crazy multiverse content , packed many delightful easter eggs . Loved Gwen 's story expanded , scenes Shea Whigham 's Captain Stacy truly special . Ca n't wait third one . Every frame movie gorgeous ! n't want blink n't want miss anything . watch film & & finds something new enjoy . surprises film truly n't expecting . `` Across Spider-Verse '' another milestone animation . Yes 's gorgeous & visually mind-blowing , sequel surpasses first always putting Miles family front center . movie feels like massive celebration Spider-Man fans everywhere , whether love movies , games , comics , etc . , cameos multiversal elements come second Miles Gwen 's story . stunning achievement . Yes , 's visually dazzling , expansive , expressive new realms movie 's real s

In [9]:
labels[0:5]

array([1, 1, 1, 1, 1], dtype=int64)

In [10]:
# Load XLNet tokenizer
tokenizer = AutoTokenizer.from_pretrained('xlnet-base-cased')

In [11]:
# testing tokens and token Ids for a random sentence
def print_rand_sentence():
    index = random.randint(0, len(reviews)-1) #random index in texts list
    table = np.array([tokenizer.tokenize(reviews[index]), 
                    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(reviews[index]))]).T #tokenize random text in texts list
    print(tabulate(table,
                 headers = ['Tokens', 'Token IDs'],
                 tablefmt = 'fancy_grid'))  #print in table format

print_rand_sentence()

╒════════════════╤═════════════╕
│ Tokens         │   Token IDs │
╞════════════════╪═════════════╡
│ ▁need          │         214 │
├────────────────┼─────────────┤
│ ▁suspend       │       10366 │
├────────────────┼─────────────┤
│ ▁disbelief     │       21714 │
├────────────────┼─────────────┤
│ ▁like          │         115 │
├────────────────┼─────────────┤
│ ▁              │          17 │
├────────────────┼─────────────┤
│ '              │          26 │
├────────────────┼─────────────┤
│ ve             │         189 │
├────────────────┼─────────────┤
│ ▁never         │         287 │
├────────────────┼─────────────┤
│ ▁suspended     │        4161 │
├────────────────┼─────────────┤
│ ▁disbelief     │       21714 │
├────────────────┼─────────────┤
│ ▁watch         │        1628 │
├────────────────┼─────────────┤
│ ▁film          │         468 │
├────────────────┼─────────────┤
│ ▁              │          17 │
├────────────────┼─────────────┤
│ ,              │          19 │
├─────────

In [12]:
# Find max sequensce lenght

MAX_LEN = 0
for review in reviews:
    tokenized = tokenizer(review,return_tensors='pt',add_special_tokens=True)
    MAX_LEN = max(MAX_LEN, tokenized['input_ids'].size()[1])

In [13]:
MAX_LEN

3356

In [14]:
# Tokenize and encode texts then extract token ids, attention masks and labels in torch tensor format.
token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer):
    return tokenizer.encode_plus(                    #returns dictionary with token ids, attention masks and token type ids
                        input_text,
                        max_length = 512,
                        add_special_tokens = True,
                        padding = 'max_length',    #padding tokens to be of the same size
                        truncation=True,           # Truncate the text if it exceeds 512 tokens
                        return_attention_mask = True,
                        return_tensors = 'pt'        #torch tensor format
                   )


for sample in reviews:
    encoding_dict = preprocessing(sample, tokenizer)
    token_id.append(encoding_dict['input_ids']) 
    attention_masks.append(encoding_dict['attention_mask'])

#concatinating the tesnors in a single dimension
token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)

In [15]:
token_id

tensor([[  5,   5,   5,  ...,   9,   4,   3],
        [  5,   5,   5,  ...,   9,   4,   3],
        [  5,   5,   5,  ...,   9,   4,   3],
        ...,
        [  5,   5,   5,  ..., 197,   4,   3],
        [  5,   5,   5,  ...,   9,   4,   3],
        [  5,   5,   5,  ...,   9,   4,   3]])

In [16]:
attention_masks

tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1]])

In [17]:
labels

tensor([1, 1, 1,  ..., 1, 1, 1])

In [18]:
def print_rand_sentence_encoding():
    index = random.randint(0, len(reviews) - 1)
    tokens = tokenizer.convert_ids_to_tokens(token_id[index].numpy().tolist())  # Convert token_ids to a list of strings
    token_ids = token_id[index].numpy()
    attention = attention_masks[index].numpy()

    table = np.array([tokens, token_ids, attention]).T
    print(tabulate(table, headers=['Tokens', 'Token IDs', 'Attention Mask'], tablefmt='fancy_grid'))

print_rand_sentence_encoding()

╒══════════════╤═════════════╤══════════════════╕
│ Tokens       │   Token IDs │   Attention Mask │
╞══════════════╪═════════════╪══════════════════╡
│ <pad>        │           5 │                0 │
├──────────────┼─────────────┼──────────────────┤
│ <pad>        │           5 │                0 │
├──────────────┼─────────────┼──────────────────┤
│ <pad>        │           5 │                0 │
├──────────────┼─────────────┼──────────────────┤
│ <pad>        │           5 │                0 │
├──────────────┼─────────────┼──────────────────┤
│ <pad>        │           5 │                0 │
├──────────────┼─────────────┼──────────────────┤
│ <pad>        │           5 │                0 │
├──────────────┼─────────────┼──────────────────┤
│ <pad>        │           5 │                0 │
├──────────────┼─────────────┼──────────────────┤
│ <pad>        │           5 │                0 │
├──────────────┼─────────────┼──────────────────┤
│ <pad>        │           5 │                0 │


In [19]:
# split data in training and validation sets

val_ratio = 0.2 
batch_size = 16

# Indices of the train and validation splits stratified by labels
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels)

# Train and validation sets
train_set = TensorDataset(token_id[train_idx], 
                          attention_masks[train_idx], 
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx], 
                        attention_masks[val_idx], 
                        labels[val_idx])

# DataLoaders
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

In [20]:
model = XLNetForSequenceClassification.from_pretrained(
    "xlnet-base-cased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

optimizer = torch.optim.AdamW(
                model.parameters(),
                lr=3e-5,     # You can also try 3e-5, 2e-5
                eps=1e-8     # AdamW's epsilon value. Probably optimal.
                )

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Define device and move the model to it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0-11): 12 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (sequence_summary): SequenceSummary(
    (summary): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
    (first_dropout): Identity()
    (last

In [22]:
checkpoint_path = "checkpoint_XLNet.pth"

In [23]:
# To resume training from the last checkpoint, you can use the following code:
# Restore the latest checkpoint if needed
# checkpoint = torch.load(checkpoint_path)
# model.load_state_dict(checkpoint['model_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [24]:
epochs = 4
debug_interval = 600  # Print a debug message every 10 minutes

# Define variables for early stopping
patience = 3
min_delta = 0.001
best_val_loss = float('inf')
counter = 0

start_time = time.time()  # Record the start time

for epoch in trange(epochs, desc='Epoch'):
    # Set model to training mode for training loop
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)

        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

        # Check if five seconds have passed, and print a debug message
        if time.time() - start_time > debug_interval:
            print(f"Epoch {epoch + 1}/{epochs}, Step {step + 1}/{len(train_dataloader)}, "
                  f"Train Loss: {tr_loss / nb_tr_steps:.4f}")
            start_time = time.time()  # Reset the start time for the next 10 minutes

    # After the training loop, save the model checkpoint for each epoch
    torch.save(model.state_dict(), f'model_checkpoint_epoch_XLNet_Classifier_{epoch + 1}.pt')

    # Save the checkpoint after each epoch
    checkpoint = {
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        # Add any other information you want to save
    }
    torch.save(checkpoint, checkpoint_path)
    
    # Set model to evaluation mode for validation loop
    model.eval()
    
    # Initialise metrics 
    val_loss = 0
    val_preds = []
    val_labels = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            # Forward pass
            eval_output = model(b_input_ids, 
                              token_type_ids=None, 
                              attention_mask=b_input_mask, 
                              labels=b_labels)
        logits = eval_output.logits
        eval_loss = eval_output.loss
        val_loss += eval_loss.item()

        # Calculate validation metrics
        preds = torch.argmax(logits, dim=1).cpu().detach().numpy()
        label_ids = b_labels.cpu().detach().numpy()
        val_preds.extend(preds)
        val_labels.extend(label_ids)

    # Calculate average metrics over all batches
    avg_val_loss = val_loss / len(validation_dataloader)
    avg_val_accuracy = accuracy_score(val_labels, val_preds)
    avg_val_f1 = f1_score(val_labels, val_preds, average='weighted')

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\n\t - Validation loss: {:.4f}'.format(avg_val_loss))
    print('\n\t - Validation accuracy: {:.4f}'.format(avg_val_accuracy))
    print('\n\t - Validation F1 score: {:.4f}'.format(avg_val_f1))
    
    # Check if validation loss improved
    if avg_val_loss < best_val_loss - min_delta * best_val_loss:
        best_val_loss = avg_val_loss
        counter = 0
    else:
        counter += 1
        if counter == patience:
            print("Validation loss did not improve for {} epochs. Early stopping...".format(patience))
            break

Epoch:   0%|                                                                                                     | 0/4 [00:00<?, ?it/s]

Epoch 1/4, Step 117/12456, Train Loss: 0.4328
Epoch 1/4, Step 238/12456, Train Loss: 0.3667
Epoch 1/4, Step 365/12456, Train Loss: 0.3257
Epoch 1/4, Step 493/12456, Train Loss: 0.3047
Epoch 1/4, Step 621/12456, Train Loss: 0.2929
Epoch 1/4, Step 749/12456, Train Loss: 0.2786
Epoch 1/4, Step 877/12456, Train Loss: 0.2712
Epoch 1/4, Step 1005/12456, Train Loss: 0.2647
Epoch 1/4, Step 1133/12456, Train Loss: 0.2581
Epoch 1/4, Step 1261/12456, Train Loss: 0.2510
Epoch 1/4, Step 1389/12456, Train Loss: 0.2456
Epoch 1/4, Step 1517/12456, Train Loss: 0.2414
Epoch 1/4, Step 1645/12456, Train Loss: 0.2398
Epoch 1/4, Step 1773/12456, Train Loss: 0.2371
Epoch 1/4, Step 1901/12456, Train Loss: 0.2346
Epoch 1/4, Step 2029/12456, Train Loss: 0.2327
Epoch 1/4, Step 2157/12456, Train Loss: 0.2306
Epoch 1/4, Step 2285/12456, Train Loss: 0.2284
Epoch 1/4, Step 2413/12456, Train Loss: 0.2263
Epoch 1/4, Step 2541/12456, Train Loss: 0.2238
Epoch 1/4, Step 2669/12456, Train Loss: 0.2221
Epoch 1/4, Step 2797

Epoch:  25%|█████████████████████                                                               | 1/4 [17:34:30<52:43:32, 63270.97s/it]


	 - Train loss: 0.1860

	 - Validation loss: 0.1620

	 - Validation accuracy: 0.9396

	 - Validation F1 score: 0.9395
Epoch 2/4, Step 1/12456, Train Loss: 0.0226
Epoch 2/4, Step 111/12456, Train Loss: 0.1308
Epoch 2/4, Step 221/12456, Train Loss: 0.1361
Epoch 2/4, Step 331/12456, Train Loss: 0.1370
Epoch 2/4, Step 441/12456, Train Loss: 0.1360
Epoch 2/4, Step 551/12456, Train Loss: 0.1316
Epoch 2/4, Step 661/12456, Train Loss: 0.1356
Epoch 2/4, Step 771/12456, Train Loss: 0.1379
Epoch 2/4, Step 881/12456, Train Loss: 0.1413
Epoch 2/4, Step 991/12456, Train Loss: 0.1414
Epoch 2/4, Step 1101/12456, Train Loss: 0.1418
Epoch 2/4, Step 1211/12456, Train Loss: 0.1396
Epoch 2/4, Step 1321/12456, Train Loss: 0.1405
Epoch 2/4, Step 1431/12456, Train Loss: 0.1393
Epoch 2/4, Step 1541/12456, Train Loss: 0.1396
Epoch 2/4, Step 1651/12456, Train Loss: 0.1399
Epoch 2/4, Step 1761/12456, Train Loss: 0.1401
Epoch 2/4, Step 1871/12456, Train Loss: 0.1400
Epoch 2/4, Step 1981/12456, Train Loss: 0.1403


Epoch:  50%|██████████████████████████████████████████                                          | 2/4 [38:27:05<39:02:02, 70261.23s/it]


	 - Train loss: 0.1458

	 - Validation loss: 0.1555

	 - Validation accuracy: 0.9412

	 - Validation F1 score: 0.9411
Epoch 3/4, Step 1/12456, Train Loss: 0.1278
Epoch 3/4, Step 144/12456, Train Loss: 0.1197
Epoch 3/4, Step 287/12456, Train Loss: 0.1119
Epoch 3/4, Step 430/12456, Train Loss: 0.1128
Epoch 3/4, Step 573/12456, Train Loss: 0.1090
Epoch 3/4, Step 716/12456, Train Loss: 0.1033
Epoch 3/4, Step 859/12456, Train Loss: 0.1092
Epoch 3/4, Step 1002/12456, Train Loss: 0.1114
Epoch 3/4, Step 1145/12456, Train Loss: 0.1119
Epoch 3/4, Step 1285/12456, Train Loss: 0.1125
Epoch 3/4, Step 1420/12456, Train Loss: 0.1139
Epoch 3/4, Step 1547/12456, Train Loss: 0.1142
Epoch 3/4, Step 1681/12456, Train Loss: 0.1148
Epoch 3/4, Step 1817/12456, Train Loss: 0.1155
Epoch 3/4, Step 1953/12456, Train Loss: 0.1151
Epoch 3/4, Step 2089/12456, Train Loss: 0.1160
Epoch 3/4, Step 2225/12456, Train Loss: 0.1187
Epoch 3/4, Step 2361/12456, Train Loss: 0.1186
Epoch 3/4, Step 2497/12456, Train Loss: 0.11

Epoch:  50%|█████████████████████████████████████████▌                                         | 2/4 [57:15:05<57:15:05, 103052.73s/it]


	 - Train loss: 0.1329

	 - Validation loss: 0.1694

	 - Validation accuracy: 0.9387

	 - Validation F1 score: 0.9378
Validation loss did not improve for 3 epochs. Early stopping...



