## This structure is for BERT training

In [1]:
from transformers import AutoModel, AutoTokenizer, BertModel, PreTrainedTokenizer, get_cosine_schedule_with_warmup
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset 
from torch.nn.utils.rnn import pad_sequence # for dynamic padding
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# set seeds
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
set_seed(114514)

In [None]:
# import pretrained model
model : BertModel = AutoModel.from_pretrained("bert-base-cased")
tokenizer : PreTrainedTokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

model



BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [None]:
# import datasets

train_df = pd.read_csv("../preprocessed_dataset/train.csv").iloc[:,1:]
validation_df = pd.read_csv("../preprocessed_dataset/validation.csv").iloc[:,1:]
test_df = pd.read_csv("../preprocessed_dataset/test.csv").iloc[:,1:]
train_df

Unnamed: 0,text,label
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1
...,...,...
8525,any enjoyment will be hinge from a personal th...,0
8526,if legendary shlockmeister ed wood had ever ma...,0
8527,hardly a nuanced portrait of a young woman's b...,0
8528,"interminably bleak , to say nothing of boring .",0


In [None]:
tokenizer("hello world")

{'input_ids': [101, 19082, 1362, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [None]:
## Define model
class SentimentClassifier(nn.Module):
    def __init__(
            self, 
            bert : BertModel
        ) -> None:
        super().__init__()
        self.bert = bert
        self.fc = nn.Linear(768, 1) # total 768 dim output
        
    def forward(
            self,
            input_ids : torch.Tensor,
            attention_mask : torch.Tensor
        ) -> torch.Tensor:
        outputs = self.bert.forward(
            input_ids = input_ids,
            attention_mask = attention_mask
        )
        # get the output of [CLS] position (first position) for training 
        cls_output = outputs.last_hidden_state[:,0,:] # [B, seqlen, embed]
        return self.fc.forward(cls_output)

In [None]:
tokenizer("efsdfs")

{'input_ids': [101, 174, 22816, 1181, 22816, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [None]:
# define dataset and data loader
from typing import Any


class CustomizeDataset(Dataset):
    def __init__(
            self,
            tokenizer : PreTrainedTokenizer,
            df : pd.DataFrame
        ) -> None:
        super().__init__()
        self.df = df
        self.tokenizer = tokenizer
        
    def __len__(self) -> int:
        return len(self.df)
    
    def __getitem__(
            self, 
            index : int
        ) -> dict:
        
        inputs = self.df.iloc[index,0] # 0 is text
        label = self.df.iloc[index, 1] # 1 is label
        
        tok = self.tokenizer(inputs)
        
        return {
            "input_ids" : tok["input_ids"],
            "label" : label
        }
        
# collater function
class Collater:
    def __init__(
            self,
            tokenizer : PreTrainedTokenizer
        ) -> None:
        self.tokenizer = tokenizer 
    
    def __call__(
        self,
        instances : list
        ) -> Any:
        input_ids = [torch.tensor(instance["input_ids"], dtype = torch.int64) for instance in instances]
        label = [torch.tensor(instance["label"], dtype = torch.int64) for instance in instances]
        input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
        attention_mask = input_ids.ne(self.tokenizer.pad_token_id).type(torch.int64)
        
        return {
            "input_ids": input_ids,
            "label": torch.tensor(label),
            "attention_mask": attention_mask # attention mask 本质就是找到不等于 pad_token_id 的位置，就是有效位置
        }

In [None]:
# Hyper paramaters

num_train_epochs = 4

batch_size = 32
lr = 1e-4
weight_decay = 1e-6

warmup_ratio=0.05
max_grad_norm = 1

In [None]:
# create dataset and data loader

collate_fn = Collater(tokenizer)

train_ds = CustomizeDataset(
    tokenizer = tokenizer,
    df = train_df
)

train_loader = DataLoader(
    dataset = train_ds,
    batch_size = batch_size,
    collate_fn = collate_fn,
    shuffle = True
)

val_ds = CustomizeDataset(
    tokenizer = tokenizer,
    df = validation_df
)

val_loader = DataLoader(
    dataset = val_ds,
    batch_size = batch_size,
    collate_fn = collate_fn
)

test_ds = CustomizeDataset(
    tokenizer = tokenizer,
    df = test_df
)

test_loader = DataLoader(
    dataset = test_ds,
    batch_size = batch_size,
    collate_fn = collate_fn
)

In [None]:
# compute warmup status
num_training_steps = num_train_epochs * len(train_loader)
num_warmup_steps = int(num_training_steps * warmup_ratio)
print(f"train:{num_training_steps}, warm up: {num_warmup_steps}")

train:2136, warm up: 106


In [None]:
# Set optimizer, loss_fn and so on

cls_model = SentimentClassifier(model)

loss_fn = nn.BCEWithLogitsLoss()

optimizer = AdamW(
    params = cls_model.parameters(),
    lr = lr,
    weight_decay = weight_decay
)

scheduler  = get_cosine_schedule_with_warmup(
    optimizer = optimizer,
    num_warmup_steps = num_warmup_steps,
    num_training_steps = num_training_steps
)

In [None]:
# compute accuracy

def compute_accuracy(data_loader: DataLoader) -> float:
    
    cls_model.eval()
    
    correct = 0
    total = 0
    
    with torch.no_grad():
        for data in data_loader:
            input_ids = data["input_ids"].to(device)
            attention_mask = data["attention_mask"].to(device)
            labels = data["label"].view(-1, 1).float().to(device)
            
            logits = cls_model(input_ids, attention_mask=attention_mask)
     
            predictions = (logits > 0).long()
            
            correct += (predictions == labels).sum().item()
            total += labels.size(0)
    
    accuracy = correct / total
    return accuracy

In [None]:
# Training loop

cls_model.to(device)

optimizer.zero_grad()

step = 0

train_losses = []

for epoch in range(num_train_epochs):
    
    # train loop
    cls_model.train()
    for data in train_loader:
        input_ids = data["input_ids"].to(device)
        attention_mask = data["attention_mask"].to(device)
        label = data["label"].view(-1, 1).float().to(device)
        
        logits = cls_model.forward(input_ids, attention_mask)
        
        
        loss = loss_fn.forward(
            input = logits,
            target = label
        )
        
        loss.backward()
        optimizer.step() # update paramater
        scheduler.step() 
        optimizer.zero_grad() # clear gradient
        
        vis_loss = loss.detach().cpu()
        
        print(f"[{step + 1}/{num_training_steps}] train loss: [{vis_loss:.4f}] (epoch [{epoch + 1}/{num_train_epochs}])")
        
        
        train_losses.append(vis_loss)
        
        step += 1
        
    val_loss = 0
    
    cls_model.eval()
    with torch.no_grad():
        for data in val_loader:
            input_ids = data["input_ids"].to(device)
            attention_mask = data["attention_mask"].to(device)
            labels = data["label"].view(-1, 1).float().to(device)
            
            logits = cls_model.forward(input_ids, attention_mask)
            loss = loss_fn.forward(
                input = logits,
                target = labels
            )
            
            val_loss += loss.detach().cpu()
            
        val_loss /= len(val_loader)
        
    val_acc = compute_accuracy(val_loader)
    print(f"epoch [{epoch + 1}/{num_train_epochs}] validation loss: [{val_loss:.4f}] validation accuracy: [{val_acc}]")

  attn_output = torch.nn.functional.scaled_dot_product_attention(


[1/2136] train loss: [0.6802] (epoch [1/4])
[2/2136] train loss: [0.6365] (epoch [1/4])
[3/2136] train loss: [0.7885] (epoch [1/4])
[4/2136] train loss: [0.7278] (epoch [1/4])
[5/2136] train loss: [0.7504] (epoch [1/4])
[6/2136] train loss: [0.7761] (epoch [1/4])
[7/2136] train loss: [0.7422] (epoch [1/4])
[8/2136] train loss: [0.7169] (epoch [1/4])
[9/2136] train loss: [0.7840] (epoch [1/4])
[10/2136] train loss: [0.8355] (epoch [1/4])
[11/2136] train loss: [0.6911] (epoch [1/4])
[12/2136] train loss: [0.6757] (epoch [1/4])
[13/2136] train loss: [0.7800] (epoch [1/4])
[14/2136] train loss: [0.7433] (epoch [1/4])
[15/2136] train loss: [0.8212] (epoch [1/4])
[16/2136] train loss: [0.7852] (epoch [1/4])
[17/2136] train loss: [0.7730] (epoch [1/4])
[18/2136] train loss: [0.6851] (epoch [1/4])
[19/2136] train loss: [0.7209] (epoch [1/4])
[20/2136] train loss: [0.7254] (epoch [1/4])
[21/2136] train loss: [0.7846] (epoch [1/4])
[22/2136] train loss: [0.7598] (epoch [1/4])
[23/2136] train los

In [None]:
# test loss
cls_model.eval()

test_loss = 0
with torch.no_grad():
    for data in test_loader:
        input_ids = data["input_ids"].to(device)
        attention_mask = data["attention_mask"].to(device)
        label = data["label"].view(-1, 1).float().to(device)
        
        logits = cls_model.forward(input_ids, attention_mask)
        loss = loss_fn.forward(
            input = logits,
            target = label
        )
        
        test_loss += loss.detach().cpu()
        
    test_loss /= len(val_loader)
    
test_acc = compute_accuracy(test_loader)

print(f"Test Loss: [{test_loss}]\nTest accuracy: [{test_acc}]")

Test Loss: [0.379112184047699]
Test accuracy: [0.8395872420262664]
