In [1]:
!pip install datasets==2.15.0

Collecting datasets==2.15.0
  Downloading datasets-2.15.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow-hotfix (from datasets==2.15.0)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting fsspec<=2023.10.0,>=2023.1.0 (from fsspec[http]<=2023.10.0,>=2023.1.0->datasets==2.15.0)
  Downloading fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading fsspec-2023.10.0-py3-none-any.whl (166 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.4/166.4 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Installing collected packages: pyarrow-hotfix, fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2023.12.2
    Uninstalling fsspec-2023.12.2:
      S

In [2]:
import os
from tqdm import tqdm

import torch
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from datasets import load_dataset

import torch
from contextlib import nullcontext
from torch.cuda.amp import GradScaler, autocast


torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn

# -----------------------------------------------------------------------------
# Load the dataset
dataset = load_dataset("sepidmnorozy/Vietnamese_sentiment")

# Initialize the model, tokenizer, and training settings
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
config = BertConfig.from_pretrained("bert-base-multilingual-cased")
model = BertForSequenceClassification(config)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/329k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/52.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/99.2k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


Generating validation split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


Generating test split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [None]:
# Preprocess the data using the datasets library
def tokenize_and_encode(batch):
    encoded = tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)
    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "labels": batch["label"],
    }

encoded_train_dataset = dataset["train"].map(tokenize_and_encode, batched=True, remove_columns=["text"])
encoded_eval_dataset = dataset["test"].map(tokenize_and_encode, batched=True, remove_columns=["text"])
encoded_train_dataset.set_format("torch")
encoded_eval_dataset.set_format("torch")

# Create the DataLoaders
train_dataloader = DataLoader(
    encoded_train_dataset,
    sampler=RandomSampler(encoded_train_dataset),
    batch_size=64,
    collate_fn=lambda x: {
        "input_ids": torch.stack([sample["input_ids"] for sample in x]),
        "attention_mask": torch.stack([sample["attention_mask"] for sample in x]),
        "labels": torch.tensor([sample["labels"] for sample in x]),
    },
)

eval_dataloader = DataLoader(
    encoded_eval_dataset,
    sampler=SequentialSampler(encoded_eval_dataset),
    batch_size=64,
    collate_fn=lambda x: {
        "input_ids": torch.stack([sample["input_ids"] for sample in x]),
        "attention_mask": torch.stack([sample["attention_mask"] for sample in x]),
        "labels": torch.tensor([sample["labels"] for sample in x]),
    },
)

Map:   0%|          | 0/2384 [00:00<?, ? examples/s]

Map:   0%|          | 0/685 [00:00<?, ? examples/s]

In [None]:
mixed_precision_dtype = torch.float16 # torch.bfloat16
ctx = nullcontext() if mixed_precision_dtype == None else torch.amp.autocast(device_type='cuda', dtype=mixed_precision_dtype)
# Initialize GradScaler for mixed precision training
scaler = GradScaler()

In [None]:
for epoch in range(3):
    # Training
    model.train()
    total_train_loss, total_train_correct = 0, 0
    train_progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1} [Training]", position=0, leave=True)
    for batch in train_progress_bar:
        input_ids, attention_masks, labels = (batch["input_ids"].to(device), batch["attention_mask"].to(device), batch["labels"].to(device))

        optimizer.zero_grad()

        # Use autocast to automatically cast tensor types for mixed precision training
        with ctx:
            outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
            loss = criterion(outputs.logits, labels)
        if mixed_precision_dtype:
            # Scale the loss and backpropagate with the help of GradScaler
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            optimizer.step()

        total_train_loss += loss.item()
        preds = torch.argmax(outputs.logits, dim=1)
        total_train_correct += (preds == labels).sum().item()
        
    avg_train_loss = total_train_loss / len(train_dataloader)
    avg_train_accuracy = total_train_correct / len(encoded_train_dataset)
    print(f"Epoch {epoch + 1}, Train Loss: {avg_train_loss}, Train Accuracy: {avg_train_accuracy}")

    
    # Evaluation
    model.eval()
    total_eval_loss, total_eval_correct = 0, 0
    eval_progress_bar = tqdm(eval_dataloader, desc=f"Epoch {epoch + 1} [Evaluation]", position=0, leave=True)
    for batch in eval_progress_bar:
        input_ids, attention_masks, labels = (batch["input_ids"].to(device), batch["attention_mask"].to(device), batch["labels"].to(device))

        with torch.no_grad():
            with ctx:
                outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
                loss = criterion(outputs.logits, labels)
        total_eval_loss += loss.item()
        preds = torch.argmax(outputs.logits, dim=1)
        total_eval_correct += (preds == labels).sum().item()

    avg_eval_loss = total_eval_loss / len(eval_dataloader)
    avg_eval_accuracy = total_eval_correct / len(encoded_eval_dataset)
    print(f"Epoch {epoch + 1}, Evaluation Loss: {avg_eval_loss}, Evaluation Accuracy: {avg_eval_accuracy}")

Epoch 1 [Training]: 100%|██████████| 38/38 [00:15<00:00,  2.41it/s]


Epoch 1, Train Loss: 0.6983088443153783, Train Accuracy: 0.5331375838926175


Epoch 1 [Evaluation]: 100%|██████████| 11/11 [00:01<00:00,  8.90it/s]


Epoch 1, Evaluation Loss: 0.6822684407234192, Evaluation Accuracy: 0.5562043795620438


Epoch 2 [Training]: 100%|██████████| 38/38 [00:14<00:00,  2.61it/s]


Epoch 2, Train Loss: 0.6732683683696546, Train Accuracy: 0.5641778523489933


Epoch 2 [Evaluation]: 100%|██████████| 11/11 [00:01<00:00,  8.68it/s]


Epoch 2, Evaluation Loss: 0.6756156953898343, Evaluation Accuracy: 0.5518248175182482


Epoch 3 [Training]: 100%|██████████| 38/38 [00:14<00:00,  2.57it/s]


Epoch 3, Train Loss: 0.6141160663805509, Train Accuracy: 0.6577181208053692


Epoch 3 [Evaluation]: 100%|██████████| 11/11 [00:01<00:00,  8.52it/s]

Epoch 3, Evaluation Loss: 0.8228270519863475, Evaluation Accuracy: 0.5635036496350365



