<a href="https://colab.research.google.com/github/Anoif01/Sentiment-Analysis-FrozenLayer/blob/main/Git_AllInOne_FrozenLayer_IMDB_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## This notebook covers a sentiment-analysis task in the field of NLP.
### The main tools used are PyTorch and Hugging Face Transformers.
#### Author: Xiaohua LU

In [None]:
# Colab setup: install the experiment-tracking (wandb) and dataset (datasets)
# libraries imported by the next cell. These two are installed unpinned.
!pip install wandb
!pip install datasets
# pyarrow is pinned to an exact version.
!pip install pyarrow==11.0.0

In [None]:
import os
import random
import gc
import time

import numpy as np
import pandas as pd
from tqdm import tqdm
import re
from argparse import Namespace
from matplotlib import pyplot as plt

import wandb

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split

from transformers import AutoModel, AutoTokenizer, AutoConfig, AdamW
from datasets import load_dataset

gc.collect()

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
# Bare expression so the notebook displays which device was selected.
device

## 0. Set the SEED value for reproducing the experiment.

In [None]:
# Seed every random number generator the notebook relies on with the same value.
# The NumPy call stays last: it returns None, so the cell produces no output.
torch.manual_seed(42)
random.seed(42)
np.random.seed(42)

In [None]:
# Master switch for Weights & Biases: gates the login/init below as well as the
# per-epoch logging and the final summary later in the notebook.
wb_log = True # @param {type:"boolean"}

# All hyperparameters live on one Namespace so they can be passed to the model
# and logged to wandb in a single call (config=vars(args)).
args = Namespace()
# When true, the model freezes every encoder parameter whose name starts with
# 'embeddings'. (The attribute name is spelled 'forze'; the model class reads it
# under exactly this name.)
args.forze_embed=True # @param {type:"boolean"}
# Selects the DistilBERT code path in the model class (parameter-name prefix and
# extra pre-classifier head). Must be kept consistent with `model_name` by hand.
args.if_distil = True # @param {type:"boolean"}
args.model_name = 'distilbert/distilbert-base-uncased'  # @param ['prajjwal1/bert-tiny','prajjwal1/bert-mini', 'prajjwal1/bert-small', 'prajjwal1/bert-medium', 'bert-base-uncased','distilbert/distilbert-base-uncased']
args.dataset_name = 'imdb' # @param
# Sequence length every review is padded / truncated to by the tokenizer.
args.max_length = 256 # @param
# Batch size used by all three DataLoaders.
args.bs = 64 # @param
# Number of leading transformer layers (indices 0..N-1) to freeze.
args.N=1 # @param

# Optimizer learning rate.
args.lr=2e-5 # @param
# Upper bound on training epochs; early stopping may end training sooner.
args.epoch = 50 # @param
# Not read by any code in this notebook; only recorded in the logged run config.
args.gpu = "A100" # @param ['A100', 'V100', 'T4']

if wb_log:
  wandb.login()
  run = wandb.init(
      # Set the project where this run will be logged
      project="Sentiment-Analysis-FrozenLayer", # @param
      # Model tags used in run names: BertBase, DistilBert, BertTiny, BertMini, BertSmall, BertMedium
      name="IMDB-DistilBert-N1",# @param
      # Track hyperparameters and run metadata
      config=vars(args),
  )

## 1. Prepare data

### Data Fields
The data fields are the same among all splits.

### Columns
- text: a string feature.

- label: a classification label, with possible values **neg (0)** and **pos (1)**.

In [None]:
def clean_data(example):
  """Normalise the review text of a single dataset row.

  The text is stripped of HTML tags, lower-cased, and stripped of punctuation.

  Args:
    example: mapping with a string under the key 'text'. It is updated in
      place and also returned, which is the form `Dataset.map` expects.

  Returns:
    The same mapping with 'text' replaced by the cleaned string.
  """
  text = example['text']
  # Replace HTML tags (<...>) with a space instead of deleting them outright:
  # reviews separate sentences with "<br /><br />", and removing the tag with
  # nothing in its place glues the neighbouring words into one token.
  cleaned_text = re.sub(r"<.*?>", " ", text)
  cleaned_text = cleaned_text.lower()
  # Remove punctuation (anything that is neither a word character nor whitespace)
  cleaned_text = re.sub(r"[^\w\s]", "", cleaned_text)
  example['text'] = cleaned_text
  return example

def tokenize_function(example):
  """Tokenize one row and return it with the tokenizer's fields added.

  Uses the module-level `tokenizer` and pads / truncates every text to
  `args.max_length` tokens. Tokenizer fields win on key collisions.
  """
  encoded = tokenizer(
      example["text"],
      padding="max_length",
      truncation=True,
      max_length=args.max_length,
  )
  merged = dict(example)
  merged.update(encoded)
  return merged

In [None]:
# Create Custom Torch Dataset
class IMDBTorchDataset(Dataset):
    """Tuple-returning torch Dataset over an already-tokenized dataset.

    Args:
        hf_dataset: indexable dataset whose rows provide 'input_ids',
            'attention_mask' and (when `if_train` is true) 'label'.
        if_train: when true, items are `(input_ids, attention_mask, label)`;
            otherwise `(input_ids, attention_mask)`.
    """

    def __init__(self, hf_dataset, if_train=True):
        self.hf_dataset = hf_dataset
        self.if_train = if_train

    @property
    def texts(self):
        """The 'text' column, read on demand.

        Kept so existing `dataset.texts` reads still work; it is no longer
        materialised eagerly in `__init__` just to compute the length.
        """
        return self.hf_dataset['text']

    def __getitem__(self, i):
        item = self.hf_dataset[i]
        input_ids = torch.tensor(item['input_ids'])
        attention_mask = torch.tensor(item['attention_mask'])
        if not self.if_train:
            return (input_ids, attention_mask)
        # The label is returned as stored in the row (not wrapped in a tensor).
        return (input_ids, attention_mask, item['label'])

    def __len__(self):
        # Ask the wrapped dataset for its row count instead of counting a
        # fully-loaded copy of the text column.
        return len(self.hf_dataset)

In [None]:
# Load the tokenizer that matches the chosen checkpoint. `do_lower_case` is the
# lower-casing option of the BERT-family tokenizers; the previously passed
# `lower=True` is not one of their documented options.
tokenizer = AutoTokenizer.from_pretrained(args.model_name, do_lower_case=True)

hf_train_dataset = load_dataset(args.dataset_name, split="train", num_proc=2)
hf_test_dataset = load_dataset(args.dataset_name, split="test", num_proc=2)

# Clean first, then tokenize, for both splits.
hf_train_dataset = hf_train_dataset.map(clean_data, num_proc=2)
hf_train_dataset = hf_train_dataset.map(tokenize_function, num_proc=4)
hf_test_dataset = hf_test_dataset.map(clean_data, num_proc=2)
hf_test_dataset = hf_test_dataset.map(tokenize_function, num_proc=4)

# Avoid dict format with Custom Dataset
train_dataset = IMDBTorchDataset(hf_train_dataset, if_train=True)
# if_train=True here as well: the test split is labeled and the labels are
# needed to compute validation / test accuracy.
test_dataset = IMDBTorchDataset(hf_test_dataset, if_train=True)
del hf_train_dataset, hf_test_dataset

# Create validation dataset: hold out 20% of the test split. The first length is
# computed as the remainder so the two always sum to len(test_dataset), which
# random_split requires (two independent int() truncations may fall one short).
n_valid = int(len(test_dataset) * 0.2)
lengths = [len(test_dataset) - n_valid, n_valid]
test_dataset, valid_dataset = random_split(test_dataset, lengths=lengths, generator=torch.Generator().manual_seed(42))

# Split the dataset by batch (only the training data is shuffled)
train_dataloader = DataLoader(train_dataset, batch_size=args.bs, shuffle=True)
val_dataloader = DataLoader(valid_dataset, batch_size=args.bs)
test_dataloader = DataLoader(test_dataset, batch_size=args.bs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map (num_proc=4):   0%|          | 0/25000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/25000 [00:00<?, ? examples/s]

## 2. Modeling

In [None]:
class Model(nn.Module):
    """Pretrained encoder plus a 2-class linear head, with optional freezing.

    Args:
        N: encoder layers with index < N get `requires_grad = False`.
        args: namespace providing `model_name`, `forze_embed` (freeze parameters
            whose name starts with 'embeddings') and `if_distil` (use the
            DistilBERT parameter naming and the extra pre-classifier head).
    """

    def __init__(self, N=5, args=args):
        super().__init__()

        self.N = N
        self.forze_embed = args.forze_embed
        self.if_distil = args.if_distil

        model_config = AutoConfig.from_pretrained(args.model_name)
        self.model = AutoModel.from_pretrained(args.model_name)

        # The two encoder families name their layer stack differently; the
        # layer index is the third dot-separated field in both cases.
        if self.if_distil:
            print('Config for distilBert')
            layer_prefix = 'transformer.layer'
        else:
            layer_prefix = 'encoder.layer'

        for name, param in self.model.named_parameters():
            # Freeze the first N Transformer layers
            in_frozen_layer = name.startswith(layer_prefix) and int(name.split('.')[2]) < self.N
            # Freeze the embedding layer (only when requested)
            in_frozen_embed = self.forze_embed and name.startswith('embeddings')
            if in_frozen_layer or in_frozen_embed:
                param.requires_grad = False

        hidden_size = model_config.hidden_size
        if self.if_distil:
            # Linear layer applied to the first-token state before the
            # classifier (followed by tanh and dropout in `forward`).
            self.pre_classifier = nn.Linear(hidden_size, hidden_size)
            self.dropout = nn.Dropout(0.2)

        # Classifier: encoder hidden size in, 2 outputs (number of labels).
        self.classifier = nn.Linear(hidden_size, 2)

    def forward(self, input_ids, attention_mask):
        """Return the classifier output for a batch of token ids and masks."""
        model_output = self.model(input_ids=input_ids, attention_mask=attention_mask)

        if self.if_distil:
            # State at sequence position 0 of the last hidden layer,
            # i.e. [batch_size, hidden_size] taken from a rank-3 tensor.
            cls_token_state = model_output.last_hidden_state[:, 0, :]
            # Linear -> tanh -> dropout, then classify.
            pooled = torch.tanh(self.pre_classifier(cls_token_state))
            return self.classifier(self.dropout(pooled))

        # Non-distil path: the second element of the encoder output is fed
        # straight into the classifier.
        return self.classifier(model_output[1])


In [None]:
class EarlyStopping:
    """Stop training when validation accuracy stops improving.

    Call the instance once per epoch with the validation accuracy and the
    model. An epoch counts as an improvement when its accuracy is at least
    `best_score + delta`; each improvement saves the model's state dict to
    `path` and resets the counter. After `patience` consecutive epochs without
    improvement, `early_stop` becomes True.

    Args:
        patience: non-improving epochs tolerated before stopping.
        verbose: when true, report every checkpoint save through `trace_func`.
        delta: minimum increase over the best score that counts as improvement.
        path: file the best model's state dict is written to.
        trace_func: callable used for messages (defaults to `print`).
    """

    def __init__(self, patience=5, verbose=False, delta=0.0001, path='checkpoint.pt', trace_func=print):
        self.patience = patience
        self.verbose = verbose
        self.delta = delta
        self.path = path
        self.trace_func = trace_func

        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # `np.inf` rather than the `np.Inf` alias, which NumPy 2.0 removed.
        self.val_acc_max = -np.inf

    def __call__(self, val_acc, model):
        score = val_acc

        if self.best_score is None:
            # First epoch: always becomes the baseline and is checkpointed.
            self.best_score = score
            self.save_checkpoint(val_acc, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_acc, model)
            self.counter = 0

    def save_checkpoint(self, val_acc, model):
        """Save the model when validation accuracy increases"""
        if self.verbose:
            self.trace_func(f'Validation accuracy increased ({self.val_acc_max:.6f} --> {val_acc:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_acc_max = val_acc  # Updating the maximum valid accuracy

In [None]:
# Model initialization, optimizer setup, and loss function definition.
model = Model(N=args.N, args=args)

# Calculate the number of trainable parameters (frozen ones are excluded)
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

model.to(device)
# Use PyTorch's AdamW instead of the deprecated optimizer shipped with
# transformers. eps and weight_decay are set explicitly to the defaults of the
# transformers implementation (1e-6 and 0.0) so the training setup is unchanged;
# PyTorch's own defaults differ (1e-8 and 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, eps=1e-6, weight_decay=0.0)
criterion = nn.CrossEntropyLoss()
# Stop after 3 consecutive epochs without a validation-accuracy gain of at
# least 1e-4; the best weights are written to 'checkpoint.pt'.
early_stopping = EarlyStopping(patience=3, verbose=True, delta=0.0001, path='checkpoint.pt')
gc.collect()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Config for distilBert




297

In [None]:
# Per-epoch history, consumed by the wandb summary cell at the end.
total_loss = []
total_val_acc = []
total_train_time = []
total_forward_time = []
total_backward_time = []


def _sync_device():
    """Block until queued CUDA work has finished (no-op on CPU).

    CUDA kernels are launched asynchronously, so reading a wall-clock timer
    without synchronizing measures launch time rather than execution time.
    """
    if device.type == 'cuda':
        torch.cuda.synchronize()


for epoch in range(args.epoch):
    # Clearing unused cache memory
    torch.cuda.empty_cache()

    start_train_time_epoch = time.time()
    model.train()

    epoch_loss = []
    epoch_forward_time_total = 0
    epoch_backward_time_total = 0

    for input_ids, attention_mask, target in tqdm(train_dataloader):
        # Data Migration to Cuda Devices for acceleration
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        target = target.to(device)

        # Zero the accumulated gradient
        optimizer.zero_grad()

        # Forward propagation (synchronize on both sides so only the forward
        # pass is attributed to the forward timer)
        _sync_device()
        start_time_forward = time.time()
        y_pred = model(input_ids, attention_mask)
        _sync_device()
        forward_time = time.time() - start_time_forward
        epoch_forward_time_total += forward_time

        # Loss Calculation and Backpropagation and Optimizer Steps
        start_time_backward = time.time()
        loss = criterion(y_pred, target)
        loss.backward()
        optimizer.step()
        _sync_device()
        backward_time = time.time() - start_time_backward
        epoch_backward_time_total += backward_time

        epoch_loss.append(loss.item())

    mean_epoch_loss = np.mean(epoch_loss)
    total_loss.append(mean_epoch_loss)
    end_train_time_epoch = time.time()
    epoch_train_time = end_train_time_epoch - start_train_time_epoch

    total_forward_time.append(epoch_forward_time_total)
    total_backward_time.append(epoch_backward_time_total)
    total_train_time.append(epoch_train_time)

    # Free up GPU memory: drop the references to the last training batch
    del input_ids, attention_mask, target, y_pred, loss
    gc.collect()

    # Calculate the acc of the valid set. Correct predictions are counted over
    # all samples, so a shorter final batch is not over-weighted the way it is
    # when per-batch accuracies are averaged.
    val_correct = 0
    val_seen = 0
    model.eval()
    # No gradients are needed for evaluation; skipping the autograd graph saves memory.
    with torch.no_grad():
        for input_ids, attention_mask, target in tqdm(val_dataloader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            target = target.to(device)

            y_pred = model(input_ids, attention_mask).argmax(dim=-1)

            val_correct += (y_pred == target).sum().item()
            val_seen += target.size(0)
    mean_epoch_acc = val_correct / max(val_seen, 1)
    total_val_acc.append(mean_epoch_acc)

    del input_ids, attention_mask, target, y_pred
    gc.collect()

    print(f"Epoch: {epoch+1}, Loss: {mean_epoch_loss}, Val-Acc: {mean_epoch_acc}, "
          f"Forward Time: {epoch_forward_time_total:.2f}s, Backward Time: {epoch_backward_time_total:.2f}s, "
          f"Total Epoch Train Time: {epoch_train_time:.2f}s")

    if wb_log:
        wandb.log({"epoch": epoch+1, "train_loss": mean_epoch_loss, "val_accuracy": mean_epoch_acc,
                   "epoch_train_time": epoch_train_time, "epoch_forward_time": epoch_forward_time_total, "epoch_backward_time": epoch_backward_time_total})

    # Call early stop at the end of the epoch
    early_stopping(mean_epoch_acc, model)
    if early_stopping.early_stop:
        print("Early stopping")
        gc.collect()
        break

    gc.collect()

100%|██████████| 391/391 [01:54<00:00,  3.42it/s]
  acc = torch.mean((torch.tensor(y_pred.cpu() == target.cpu(), dtype=torch.float)))
100%|██████████| 79/79 [00:09<00:00,  8.10it/s]


Epoch: 1, Loss: 0.3081406127956822, Val-Acc: 0.905063271522522, Forward Time: 3.36s, Backward Time: 5.11s, Total Epoch Train Time: 114.25s
Validation accuracy increased (-inf --> 0.905063).  Saving model ...


100%|██████████| 391/391 [01:53<00:00,  3.44it/s]
100%|██████████| 79/79 [00:09<00:00,  8.06it/s]


Epoch: 2, Loss: 0.21509576360206775, Val-Acc: 0.889438271522522, Forward Time: 2.82s, Backward Time: 4.92s, Total Epoch Train Time: 113.70s
EarlyStopping counter: 1 out of 3


100%|██████████| 391/391 [01:53<00:00,  3.44it/s]
100%|██████████| 79/79 [00:09<00:00,  8.08it/s]


Epoch: 3, Loss: 0.1609851393200781, Val-Acc: 0.9145569801330566, Forward Time: 2.79s, Backward Time: 4.93s, Total Epoch Train Time: 113.79s
Validation accuracy increased (0.905063 --> 0.914557).  Saving model ...


100%|██████████| 391/391 [01:53<00:00,  3.44it/s]
100%|██████████| 79/79 [00:09<00:00,  8.04it/s]


Epoch: 4, Loss: 0.10969914494039458, Val-Acc: 0.9104034900665283, Forward Time: 2.80s, Backward Time: 4.94s, Total Epoch Train Time: 113.79s
EarlyStopping counter: 1 out of 3


100%|██████████| 391/391 [01:53<00:00,  3.43it/s]
 73%|███████▎  | 58/79 [00:07<00:02,  7.96it/s]

In [None]:
# Get the acc of the test set
# Evaluate the best weights saved by early stopping instead of whatever the
# last epoch left behind. `best_score` is only set once this EarlyStopping
# instance has written a checkpoint, so a stale file from another run is never loaded.
if early_stopping.best_score is not None:
    model.load_state_dict(torch.load(early_stopping.path, map_location=device))
model.eval()

predictions = []
labels = []

# Inference only: disable gradient tracking to save memory.
with torch.no_grad():
    for input_ids, attention_mask, target in tqdm(test_dataloader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels += target.tolist()

        y_preds = model(input_ids, attention_mask)
        # Predicted class = index of the larger of the two outputs
        y_pred = y_preds.argmax(dim=-1)
        predictions += y_pred.cpu().tolist()

test_acc = np.mean(np.array(predictions) == np.array(labels))
print(test_acc)

# Drop the references to the last batch so its GPU memory can be released
del input_ids, attention_mask, y_preds, y_pred
gc.collect()

In [None]:
# Record the run-level results in the wandb summary, then close the run.
if wb_log:
    summary_values = {
        'total_loss': total_loss,
        'total_val_acc': total_val_acc,
        'mean_train_time': np.mean(total_train_time),
        'mean_forward_time': np.mean(total_forward_time),
        'mean_backward_time': np.mean(total_backward_time),
        'test_acc': test_acc,
        'trainable_params': trainable_params,
    }
    for summary_key, summary_value in summary_values.items():
        wandb.summary[summary_key] = summary_value

    wandb.finish()