## This structure is for BERT training

In [1]:
from transformers import AutoModel, AutoTokenizer, BertModel, PreTrainedTokenizer, get_cosine_schedule_with_warmup
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset 
from torch.nn.utils.rnn import pad_sequence # for dynamic padding
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# set seeds
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
set_seed(114514)

In [3]:
# import pretrained model
model : BertModel = AutoModel.from_pretrained("microsoft/deberta-v3-base")
tokenizer : PreTrainedTokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

model



DebertaV2Model(
  (embeddings): DebertaV2Embeddings(
    (word_embeddings): Embedding(128100, 768, padding_idx=0)
    (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
    (dropout): StableDropout()
  )
  (encoder): DebertaV2Encoder(
    (layer): ModuleList(
      (0-11): 12 x DebertaV2Layer(
        (attention): DebertaV2Attention(
          (self): DisentangledSelfAttention(
            (query_proj): Linear(in_features=768, out_features=768, bias=True)
            (key_proj): Linear(in_features=768, out_features=768, bias=True)
            (value_proj): Linear(in_features=768, out_features=768, bias=True)
            (pos_dropout): StableDropout()
            (dropout): StableDropout()
          )
          (output): DebertaV2SelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
            (dropout): StableDropout()
          )
        )
        (intermedia

In [4]:
# import datasets

train_df = pd.read_csv("../preprocessed_dataset/train.csv").iloc[:,1:]
validation_df = pd.read_csv("../preprocessed_dataset/validation.csv").iloc[:,1:]
test_df = pd.read_csv("../preprocessed_dataset/test.csv").iloc[:,1:]
train_df

Unnamed: 0,text,label
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1
...,...,...
8525,any enjoyment will be hinge from a personal th...,0
8526,if legendary shlockmeister ed wood had ever ma...,0
8527,hardly a nuanced portrait of a young woman's b...,0
8528,"interminably bleak , to say nothing of boring .",0


In [5]:
tokenizer("hello world")

{'input_ids': [1, 12018, 447, 2], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [6]:
## Define model
class SentimentClassifier(nn.Module):
    def __init__(
            self, 
            bert : BertModel
        ) -> None:
        super().__init__()
        self.bert = bert
        self.fc = nn.Linear(768, 1) # total 768 dim output
        
    def forward(
            self,
            input_ids : torch.Tensor,
            attention_mask : torch.Tensor
        ) -> torch.Tensor:
        outputs = self.bert.forward(
            input_ids = input_ids,
            attention_mask = attention_mask
        )
        # get the output of [CLS] position (first position) for training 
        cls_output = outputs.last_hidden_state[:,0,:] # [B, seqlen, embed]
        return self.fc.forward(cls_output)

In [7]:
tokenizer("efsdfs")

{'input_ids': [1, 46733, 268, 32392, 268, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [8]:
# define dataset and data loader
from typing import Any


class CustomizeDataset(Dataset):
    def __init__(
            self,
            tokenizer : PreTrainedTokenizer,
            df : pd.DataFrame
        ) -> None:
        super().__init__()
        self.df = df
        self.tokenizer = tokenizer
        
    def __len__(self) -> int:
        return len(self.df)
    
    def __getitem__(
            self, 
            index : int
        ) -> dict:
        
        inputs = self.df.iloc[index,0] # 0 is text
        label = self.df.iloc[index, 1] # 1 is label
        
        tok = self.tokenizer(inputs)
        
        return {
            "input_ids" : tok["input_ids"],
            "label" : label
        }
        
# collater function
class Collater:
    def __init__(
            self,
            tokenizer : PreTrainedTokenizer
        ) -> None:
        self.tokenizer = tokenizer 
    
    def __call__(
        self,
        instances : list
        ) -> Any:
        input_ids = [torch.tensor(instance["input_ids"], dtype = torch.int64) for instance in instances]
        label = [torch.tensor(instance["label"], dtype = torch.int64) for instance in instances]
        input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
        attention_mask = input_ids.ne(self.tokenizer.pad_token_id).type(torch.int64)
        
        return {
            "input_ids": input_ids,
            "label": torch.tensor(label),
            "attention_mask": attention_mask # attention mask 本质就是找到不等于 pad_token_id 的位置，就是有效位置
        }

In [9]:
# Hyper paramaters

num_train_epochs = 2

batch_size = 16
lr = 2e-4
weight_decay = 1e-6

warmup_ratio=0.05
max_grad_norm = 1

In [10]:
# create dataset and data loader

collate_fn = Collater(tokenizer)

train_ds = CustomizeDataset(
    tokenizer = tokenizer,
    df = train_df
)

train_loader = DataLoader(
    dataset = train_ds,
    batch_size = batch_size,
    collate_fn = collate_fn,
    shuffle = True
)

val_ds = CustomizeDataset(
    tokenizer = tokenizer,
    df = validation_df
)

val_loader = DataLoader(
    dataset = val_ds,
    batch_size = batch_size,
    collate_fn = collate_fn
)

test_ds = CustomizeDataset(
    tokenizer = tokenizer,
    df = test_df
)

test_loader = DataLoader(
    dataset = test_ds,
    batch_size = batch_size,
    collate_fn = collate_fn
)

In [11]:
# compute warmup status
num_training_steps = num_train_epochs * len(train_loader)
num_warmup_steps = int(num_training_steps * warmup_ratio)
print(f"train:{num_training_steps}, warm up: {num_warmup_steps}")

train:1068, warm up: 53


In [12]:
# Set optimizer, loss_fn and so on

cls_model = SentimentClassifier(model)

loss_fn = nn.BCEWithLogitsLoss()

optimizer = AdamW(
    params = cls_model.parameters(),
    lr = lr,
    weight_decay = weight_decay
)

scheduler  = get_cosine_schedule_with_warmup(
    optimizer = optimizer,
    num_warmup_steps = num_warmup_steps,
    num_training_steps = num_training_steps
)

In [13]:
# compute accuracy

def compute_accuracy(data_loader: DataLoader) -> float:
    
    cls_model.eval()
    
    correct = 0
    total = 0
    
    with torch.no_grad():
        for data in data_loader:
            input_ids = data["input_ids"].to(device)
            attention_mask = data["attention_mask"].to(device)
            labels = data["label"].view(-1, 1).float().to(device)
            
            logits = cls_model(input_ids, attention_mask=attention_mask)
     
            predictions = (logits > 0).long()
            
            correct += (predictions == labels).sum().item()
            total += labels.size(0)
    
    accuracy = correct / total
    return accuracy

In [14]:
# Training loop

cls_model.to(device)

optimizer.zero_grad()

step = 0

train_losses = []

for epoch in range(num_train_epochs):
    
    # train loop
    cls_model.train()
    for data in train_loader:
        input_ids = data["input_ids"].to(device)
        attention_mask = data["attention_mask"].to(device)
        label = data["label"].view(-1, 1).float().to(device)
        
        logits = cls_model.forward(input_ids, attention_mask)
        
        
        loss = loss_fn.forward(
            input = logits,
            target = label
        )
        
        loss.backward()
        optimizer.step() # update paramater
        scheduler.step() 
        optimizer.zero_grad() # clear gradient
        
        vis_loss = loss.detach().cpu()
        
        print(f"[{step + 1}/{num_training_steps}] train loss: [{vis_loss:.4f}] (epoch [{epoch + 1}/{num_train_epochs}])")
        
        
        train_losses.append(vis_loss)
        
        step += 1
        
    val_loss = 0
    
    cls_model.eval()
    with torch.no_grad():
        for data in val_loader:
            input_ids = data["input_ids"].to(device)
            attention_mask = data["attention_mask"].to(device)
            labels = data["label"].view(-1, 1).float().to(device)
            
            logits = cls_model.forward(input_ids, attention_mask)
            loss = loss_fn.forward(
                input = logits,
                target = labels
            )
            
            val_loss += loss.detach().cpu()
            
        val_loss /= len(val_loader)
        
    val_acc = compute_accuracy(val_loader)
    print(f"epoch [{epoch + 1}/{num_train_epochs}] validation loss: [{val_loss:.4f}] validation accuracy: [{val_acc}]")

[1/1068] train loss: [0.7727] (epoch [1/2])
[2/1068] train loss: [0.7663] (epoch [1/2])
[3/1068] train loss: [0.6256] (epoch [1/2])
[4/1068] train loss: [0.7094] (epoch [1/2])
[5/1068] train loss: [0.6739] (epoch [1/2])
[6/1068] train loss: [0.6398] (epoch [1/2])
[7/1068] train loss: [0.6929] (epoch [1/2])
[8/1068] train loss: [0.7200] (epoch [1/2])
[9/1068] train loss: [0.6619] (epoch [1/2])
[10/1068] train loss: [0.5983] (epoch [1/2])
[11/1068] train loss: [0.7275] (epoch [1/2])
[12/1068] train loss: [0.7151] (epoch [1/2])
[13/1068] train loss: [0.6762] (epoch [1/2])
[14/1068] train loss: [0.7005] (epoch [1/2])
[15/1068] train loss: [0.6514] (epoch [1/2])
[16/1068] train loss: [0.6439] (epoch [1/2])
[17/1068] train loss: [0.6480] (epoch [1/2])
[18/1068] train loss: [0.7064] (epoch [1/2])
[19/1068] train loss: [0.7021] (epoch [1/2])
[20/1068] train loss: [0.7008] (epoch [1/2])
[21/1068] train loss: [0.5955] (epoch [1/2])
[22/1068] train loss: [0.5675] (epoch [1/2])
[23/1068] train los

In [None]:
# test loss
cls_model.eval()

test_loss = 0
with torch.no_grad():
    for data in test_loader:
        input_ids = data["input_ids"].to(device)
        attention_mask = data["attention_mask"].to(device)
        label = data["label"].view(-1, 1).float().to(device)
        
        logits = cls_model.forward(input_ids, attention_mask)
        loss = loss_fn.forward(
            input = logits,
            target = label
        )
        
        test_loss += loss.detach().cpu()
        
    test_loss /= len(val_loader)
    
test_acc = compute_accuracy(test_loader)

print(f"Test Loss: [{test_loss:.4f}]\nTest accuracy: [{test_acc:.4f}]")

Test Loss: [0.3094753324985504]
Test accuracy: [0.8930581613508443]
