In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load Data

In [None]:
!pip install --upgrade nlpaug datasets

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m31.

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertForSequenceClassification

from tqdm.notebook import tqdm
import wandb

batch_size = 32
aug_mode = 'mixed'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

wandb.login()
wandb.init(project="Contrastive Learning with TinyBERT", name=f"{aug_mode}_aug_run")

train_df = pd.read_table('/content/drive/MyDrive/NLP_Final_Project/data/train.tsv')
val_df = pd.read_table('/content/drive/MyDrive/NLP_Final_Project/data/dev.tsv')
test_df = pd.read_table('/content/drive/MyDrive/NLP_Final_Project/data/test.tsv')

device

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtungcs1703[0m ([33muetai[0m). Use [1m`wandb login --relogin`[0m to force relogin


device(type='cuda')

In [None]:
print("Train dataset:")
print(train_df['class'].value_counts())

print("\nVal dataset:")
print(val_df['class'].value_counts())

print("\nTest dataset:")
print(test_df['class'].value_counts())

Train dataset:
class
1    3610
0    3310
Name: count, dtype: int64

Val dataset:
class
1    444
0    428
Name: count, dtype: int64

Test dataset:
class
0    912
1    909
Name: count, dtype: int64


## Balance train dataset

In [None]:
test_class_0 = test_df[test_df['class'] == 0]
test_class_1 = test_df[test_df['class'] == 1]

sampled_test_class_0 = test_class_0.sample(n=600, random_state=42)
sampled_test_class_1 = test_class_1.sample(n=300, random_state=42)

sampled_test_data = pd.concat([sampled_test_class_0, sampled_test_class_1])

test_df = test_df.drop(sampled_test_data.index)
train_df = pd.concat([train_df, sampled_test_data])

In [None]:
print("Size of train_df:", len(train_df))
print("Size of test_df:", len(test_df))

Size of train_df: 7820
Size of test_df: 921


# Data Augmentation

In [None]:
import random
import nlpaug.augmenter.word as naw

tokenizer = BertTokenizer.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')

synonym_aug = naw.SynonymAug(aug_src='wordnet', aug_p=0.6)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
def find_max_tokens(df, tokenizer):
    max_tokens = 0
    for text in df['text']:
        tokens = tokenizer.encode(text, add_special_tokens=True)
        max_tokens = max(max_tokens, len(tokens))
    return max_tokens

max_train_tokens = find_max_tokens(train_df, tokenizer)
max_val_tokens = find_max_tokens(val_df, tokenizer)
max_test_tokens = find_max_tokens(test_df, tokenizer)

print(f"Max tokens in train set: {max_train_tokens}")
print(f"Max tokens in val set: {max_val_tokens}")
print(f"Max tokens in test set: {max_test_tokens}")

In [None]:
class AugmentedDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, mode='delete_word', max_length=80): # Mode: delete_word, synnonym, mixed
        self.sentences = sentences
        self.synonym = [synonym_aug.augment(sent) for sent in sentences]
        self.labels = labels
        self.tokenizer = tokenizer
        self.mode = mode
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        label = self.labels[idx]
        words = sentence.split()
        if self.mode == 'synnonym':
            aug_sentence = self.synonym[idx]
        elif self.mode == 'delete_word':
            if len(words) > 1:
                del_index = random.randint(0, len(words) - 1)
                aug_sentence = " ".join(words[:del_index] + words[del_index + 1:])
            else:
                aug_sentence = sentence
        # Mixed mode
        else:
            augmentation_type = random.choice(['synonym', 'delete_one_word'])
            if augmentation_type == 'synonym':
                aug_sentence = self.synonym[idx]
            else:
                if len(words) > 1:
                    del_index = random.randint(0, len(words) - 1)
                    aug_sentence = " ".join(words[:del_index] + words[del_index + 1:])
                else:
                    aug_sentence = sentence

        encoding_original = self.tokenizer(
            sentence,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )

        encoding_augmented = self.tokenizer(
            aug_sentence,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )

        return {
            'input_ids_origin': encoding_original['input_ids'].flatten(),
            'attention_mask_origin': encoding_original['attention_mask'].flatten(),
            'input_ids_positive': encoding_augmented['input_ids'].flatten(),
            'attention_mask_positive': encoding_augmented['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Prepare dataloader

In [None]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [None]:
train_dataset = AugmentedDataset(train_df['sentence'].tolist(), train_df['class'].tolist(), tokenizer, mode=aug_mode)
val_dataset = AugmentedDataset(val_df['sentence'].tolist(), val_df['class'].tolist(), tokenizer, mode=aug_mode)
test_dataset = AugmentedDataset(test_df['sentence'].tolist(), test_df['class'].tolist(), tokenizer, mode=aug_mode)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
len(train_loader), len(val_loader), len(test_loader)

(245, 28, 29)

# Contrastive model

In [None]:
class TinyBERTContrastive(nn.Module):
    def __init__(self, hidden_size=312):
        super(TinyBERTContrastive, self).__init__()
        self.bert = BertModel.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')
        self.projection_head = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_size, hidden_size)
        )
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(hidden_size, 2)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        en_output = outputs.last_hidden_state[:, 0]
        projection = self.projection_head(en_output)
        logits = self.classifier(projection)
        return projection, logits

## Loss function: CE + InfoNCE

In [None]:
class SentimentLoss(nn.Module):
    def __init__(self):
        super(SentimentLoss, self).__init__()
        self.cross_entropy = nn.CrossEntropyLoss()

    def infonce(self, z1, z2, temperature=0.05):
        z1 = F.normalize(z1, p=2, dim=-1)
        z2 = F.normalize(z2, p=2, dim=-1)

        z = torch.cat([z1, z2], dim=0)
        n_samples = z.shape[0]

        cov = torch.mm(z, z.t().contiguous())
        sim = torch.exp(cov / temperature)

        full = torch.eye(n_samples)
        mask = torch.eq(full, 0).to(sim.device)

        neg = sim.masked_select(mask).view(n_samples, -1).sum(dim=-1)

        pos = torch.exp(torch.sum(z1 * z2, dim=-1) / temperature)
        pos = torch.cat([pos, pos], dim=0)
        loss = -torch.log(pos / neg).mean()
        return loss

    def forward(self, z1, z2, logits, label):
        return self.cross_entropy(logits, label) + self.infonce(z1, z2)

# Training

In [None]:
def train_epoch(model, train_loader, batch_size, loss_fn, optimizer, scaler, epoch, device=device):
    model.train()
    total, total_loss = 0, 0
    total_acc = 0
    pbar = tqdm(train_loader)
    for batch in pbar:
        input_ids_origin = batch['input_ids_origin'].to(device)
        attention_mask_origin = batch['attention_mask_origin'].to(device)
        input_ids_positive = batch['input_ids_positive'].to(device)
        attention_mask_positive = batch['attention_mask_positive'].to(device)
        labels = batch['label'].to(device)

        z1, logits = model(input_ids_origin, attention_mask_origin)
        z2, _ = model(input_ids_positive, attention_mask_positive)
        predicted = torch.argmax(logits, dim=1)

        loss = loss_fn(z1, z2, logits, labels)

        optimizer.zero_grad()
        optimizer.step()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total += batch_size
        total_loss += loss.item()
        total_acc += torch.sum(predicted == labels)

        pbar.set_description(f"Epoch {epoch+1}: Train | Loss = {total_loss/total:.4f}")

    avg_loss = total_loss / total
    print(f"Train Accuracy = {total_acc/total*100:.2f}%")
    return avg_loss, total_acc/total

def evaluate(model, val_loader, batch_size, loss_fn, scheduler, epoch, device=device):
    model.eval()
    total, total_loss = 0, 0
    total_acc = 0
    with torch.no_grad():
        pbar = tqdm(val_loader)
        for batch in pbar:
            input_ids_origin = batch['input_ids_origin'].to(device)
            attention_mask_origin = batch['attention_mask_origin'].to(device)
            input_ids_positive = batch['input_ids_positive'].to(device)
            attention_mask_positive = batch['attention_mask_positive'].to(device)
            labels = batch['label'].to(device)

            z1, logits = model(input_ids_origin, attention_mask_origin)
            z2, _ = model(input_ids_positive, attention_mask_positive)
            predicted = torch.argmax(logits, dim=1)

            loss = loss_fn(z1, z2, logits, labels)

            total += batch_size
            total_loss += loss.item()
            total_acc += torch.sum(predicted == labels)

            pbar.set_description(f"Epoch {epoch+1}: Val | Loss = {total_loss/total:.4f}")

    avg_val_loss = total_loss / total
    scheduler.step(avg_val_loss)
    print(f"Val Accuracy = {total_acc/total*100:.2f}%")
    return avg_val_loss, total_acc/total

def train(model, train_loader, val_loader, loss_fn, optimizer, scaler,
          scheduler, batch_size=batch_size, max_epochs=10, device=device):
    best_val_loss = float('inf')
    for epoch in range(max_epochs):
        ## train loop
        train_loss, train_acc = train_epoch(model, train_loader, batch_size, loss_fn, optimizer, scaler, epoch, device)
        ## val loop
        val_loss, val_acc = evaluate(model, val_loader, batch_size, loss_fn, scheduler, epoch, device)
        wandb.log({"Train Loss": train_loss, "Validation Loss": val_loss}, step=epoch)
        wandb.log({"Train Accuracy": train_acc, "Validation Accuracy": val_acc}, step=epoch)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            ckpt_path = f'/content/drive/MyDrive/NLP_Final_Project/best_{aug_mode}.pth'
            torch.save(model.state_dict(), ckpt_path)
            print(f"Checkpoint saved at epoch {epoch+1} with validation loss: {best_val_loss:.4f}")

In [None]:
contrastive_model = TinyBERTContrastive().to(device)
loss_fn = SentimentLoss()
optimizer = torch.optim.AdamW(contrastive_model.parameters(), lr=2e-5)
scaler = torch.cuda.amp.GradScaler()
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.5)

pytorch_model.bin:   0%|          | 0.00/62.7M [00:00<?, ?B/s]

  scaler = torch.cuda.amp.GradScaler()


## Model structure

In [None]:
num_params = sum(p.numel() for p in contrastive_model.parameters() if p.requires_grad)
print(f"Number of trainable parameters: {round(num_params/1000000, 1)}M")
contrastive_model

Number of trainable parameters: 14.5M


TinyBERTContrastive(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 312, padding_idx=0)
      (position_embeddings): Embedding(512, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-12, element

In [None]:
train(contrastive_model, train_loader, val_loader, loss_fn, optimizer, scaler, scheduler, max_epochs=15)
# Luu state_dict sau khi train xong
torch.save(contrastive_model.state_dict(), f'/content/drive/MyDrive/NLP_Final_Project/last_{aug_mode}.pth')

  0%|          | 0/245 [00:00<?, ?it/s]

Train Accuracy = 59.23%


  0%|          | 0/28 [00:00<?, ?it/s]

Val Accuracy = 71.43%
Checkpoint saved at epoch 1 with validation loss: 0.0234


  0%|          | 0/245 [00:00<?, ?it/s]

Train Accuracy = 70.43%


  0%|          | 0/28 [00:00<?, ?it/s]

Val Accuracy = 74.89%
Checkpoint saved at epoch 2 with validation loss: 0.0207


  0%|          | 0/245 [00:00<?, ?it/s]

Train Accuracy = 74.03%


  0%|          | 0/28 [00:00<?, ?it/s]

Val Accuracy = 75.89%
Checkpoint saved at epoch 3 with validation loss: 0.0194


  0%|          | 0/245 [00:00<?, ?it/s]

Train Accuracy = 76.71%


  0%|          | 0/28 [00:00<?, ?it/s]

Val Accuracy = 78.46%
Checkpoint saved at epoch 4 with validation loss: 0.0176


  0%|          | 0/245 [00:00<?, ?it/s]

Train Accuracy = 79.27%


  0%|          | 0/28 [00:00<?, ?it/s]

Val Accuracy = 79.80%
Checkpoint saved at epoch 5 with validation loss: 0.0154


  0%|          | 0/245 [00:00<?, ?it/s]

Train Accuracy = 81.44%


  0%|          | 0/28 [00:00<?, ?it/s]

Val Accuracy = 78.13%
Checkpoint saved at epoch 6 with validation loss: 0.0148


  0%|          | 0/245 [00:00<?, ?it/s]

Train Accuracy = 83.20%


  0%|          | 0/28 [00:00<?, ?it/s]

Val Accuracy = 82.70%
Checkpoint saved at epoch 7 with validation loss: 0.0137


  0%|          | 0/245 [00:00<?, ?it/s]

Train Accuracy = 85.46%


  0%|          | 0/28 [00:00<?, ?it/s]

Val Accuracy = 84.60%
Checkpoint saved at epoch 8 with validation loss: 0.0125


  0%|          | 0/245 [00:00<?, ?it/s]

Train Accuracy = 87.35%


  0%|          | 0/28 [00:00<?, ?it/s]

Val Accuracy = 83.37%
Checkpoint saved at epoch 9 with validation loss: 0.0118


  0%|          | 0/245 [00:00<?, ?it/s]

Train Accuracy = 88.15%


  0%|          | 0/28 [00:00<?, ?it/s]

Val Accuracy = 84.15%
Checkpoint saved at epoch 10 with validation loss: 0.0113


  0%|          | 0/245 [00:00<?, ?it/s]

Train Accuracy = 89.90%


  0%|          | 0/28 [00:00<?, ?it/s]

Val Accuracy = 85.60%
Checkpoint saved at epoch 11 with validation loss: 0.0100


  0%|          | 0/245 [00:00<?, ?it/s]

Train Accuracy = 91.62%


  0%|          | 0/28 [00:00<?, ?it/s]

Val Accuracy = 85.04%


  0%|          | 0/245 [00:00<?, ?it/s]

Train Accuracy = 92.35%


  0%|          | 0/28 [00:00<?, ?it/s]

Val Accuracy = 86.72%
Checkpoint saved at epoch 13 with validation loss: 0.0095


  0%|          | 0/245 [00:00<?, ?it/s]

Train Accuracy = 94.09%


  0%|          | 0/28 [00:00<?, ?it/s]

Val Accuracy = 86.05%


  0%|          | 0/245 [00:00<?, ?it/s]

Train Accuracy = 94.96%


  0%|          | 0/28 [00:00<?, ?it/s]

Val Accuracy = 85.38%
Checkpoint saved at epoch 15 with validation loss: 0.0092


# Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time

def evaluate_classification(model, test_loader, base_model=False, device=device):
    model.eval()
    all_predictions = []
    all_labels = []
    inference_times = []
    with torch.no_grad():
        for batch in tqdm(test_loader):
            input_ids = batch['input_ids_origin'].to(device)
            attention_mask = batch['attention_mask_origin'].to(device)
            labels = batch['label'].to(device)

            start = time.time()
            outputs = model(input_ids, attention_mask)
            end = time.time()
            inference_times.append(end - start)
            if base_model:
                predicted = torch.argmax(outputs.logits, 1) # Base Bert Output là phần tử với key là logits
            else:
                predicted = torch.argmax(outputs[1], 1)

            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_inference_time = sum(inference_times) / len(inference_times)
    accuracy = accuracy_score(all_labels, all_predictions)
    precision = precision_score(all_labels, all_predictions, average='weighted')
    recall = recall_score(all_labels, all_predictions, average='weighted')
    f1 = f1_score(all_labels, all_predictions, average='weighted')

    print(f"Average inference time: {avg_inference_time:.4f} seconds")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")

    return accuracy, precision, recall, f1

## Contrastive model

In [None]:
eval_model = TinyBERTContrastive().to(device)
eval_model.load_state_dict(torch.load(f'/content/drive/MyDrive/NLP_Final_Project/best_{aug_mode}.pth'))
metrics = evaluate_classification(eval_model, test_loader)

pytorch_model.bin:   0%|          | 0.00/62.7M [00:00<?, ?B/s]

  eval_model.load_state_dict(torch.load(f'/content/drive/MyDrive/NLP_Final_Project/best_{aug_mode}.pth'))


  0%|          | 0/29 [00:00<?, ?it/s]

Average inference time: 0.0052 seconds
Accuracy: 0.8947
Precision: 0.8966
Recall: 0.8947
F1-score: 0.8953


## TinyBERT model

In [26]:
tinybert_ckpt_path = '/content/drive/MyDrive/NLP_Final_Project/tinybert_ckpt/checkpoint-980'
tinybert_model = BertForSequenceClassification.from_pretrained(tinybert_ckpt_path).to(device)
metrics = evaluate_classification(tinybert_model, test_loader, base_model=True)

  0%|          | 0/29 [00:00<?, ?it/s]

Average inference time: 0.0046 seconds
Accuracy: 0.8784
Precision: 0.8780
Recall: 0.8784
F1-score: 0.8782


## BERT model

In [None]:
bert_ckpt_path = '/content/drive/MyDrive/NLP_Final_Project/bert_ckpt/checkpoint-980'
bert_model = BertForSequenceClassification.from_pretrained(bert_ckpt_path).to(device)
metrics = evaluate_classification(bert_model, test_loader, base_model=True)

  0%|          | 0/29 [00:00<?, ?it/s]

Average inference time: 0.0312 seconds
Accuracy: 0.9077
Precision: 0.9078
Recall: 0.9077
F1-score: 0.9077
