In [1]:
# Check hardware env
import os
import torch

# Restrict this process to a single GPU slice, identified by its MIG UUID.
os.environ['CUDA_VISIBLE_DEVICES'] = 'MIG-5b5e7d9c-4282-5dbc-9e77-5217dd9cd485'

print(f'PyTorch version: {torch.__version__}')

# Probe CUDA once and reuse the answer for both the report and the device choice.
cuda_ready = torch.cuda.is_available()

if not cuda_ready:
    print("CUDA is not available")
else:
    gpu_count = torch.cuda.device_count()
    print(f"Number of GPUs: {gpu_count}")
    for gpu_index in range(gpu_count):
        print(f"GPU {gpu_index}: {torch.cuda.get_device_name(gpu_index)}")

# `device` is reused by later cells (e.g. `model.to(device)`).
device = torch.device("cuda") if cuda_ready else torch.device("cpu")
print(f'Using device: {device}')

PyTorch version: 2.5.0.dev20240718+cu124
Number of GPUs: 1
GPU 0: NVIDIA A100-PCIE-40GB MIG 4g.20gb
Using device: cuda


In [2]:
# Here are the arguments you should define
from transformers import TrainingArguments

# Locations of the cleaned train / test splits (parquet files)
train_path = '../data/clean/train.parquet'
test_path = '../data/clean/test.parquet'

# Pretrained checkpoint to fine-tune, and the directory passed as `cache_dir` when loading it
checkpoint = 'hfl/chinese-roberta-wwm-ext'
cache_dir = '../src/'

# Logging, evaluation and checkpointing all fire on the same step interval
_STEP_INTERVAL = 300

# Training arguments, grouped by concern and unpacked into TrainingArguments below
_training_kwargs = dict(
    # output locations
    output_dir="../src/train_official",
    logging_dir='logs/',
    # optimisation schedule
    num_train_epochs=4,
    per_device_train_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    # step-based logging / evaluation / checkpointing
    logging_steps=_STEP_INTERVAL,
    eval_strategy="steps",
    eval_steps=_STEP_INTERVAL,
    save_strategy="steps",
    save_steps=_STEP_INTERVAL,
    # keep at most three checkpoints and reload the best one when training ends
    save_total_limit=3,
    load_best_model_at_end=True,
)
training_args = TrainingArguments(**_training_kwargs)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load dataset, tokenizer and model
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

# Read both splits from the parquet paths defined in the configuration cell
train, test = (Dataset.from_parquet(split_path) for split_path in (train_path, test_path))
print(f'Train set size: {len(train)}\nTest set size: {len(test)}')

# Tokenizer for the chosen checkpoint, plus a collator built around that tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint, cache_dir=cache_dir)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Sequence-classification model with two output labels, loaded from the same checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    cache_dir=cache_dir,
    num_labels=2,
)

Train set size: 138117
Test set size: 82872


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-roberta-wwm-ext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Move the model onto the device chosen in the hardware-check cell. The call is the
# cell's last expression, so the module structure is echoed as the cell output.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [5]:
# Define tokenize and prepare functions
def tokenize_function(example):
    """Tokenize the "tweets" field of `example` with the notebook-level `tokenizer`.

    Inputs are truncated to at most 512 tokens. This function is passed to
    `Dataset.map(..., batched=True)` in `tokenize_prepare`.

    Args:
        example: mapping that exposes the raw text under the "tweets" key.

    Returns:
        Whatever the tokenizer returns for that text.
    """
    texts = example["tweets"]
    return tokenizer(texts, max_length=512, truncation=True)


def tokenize_prepare(dataset: Dataset) -> Dataset:
    """Tokenize `dataset` and reshape its columns for training.

    Steps, in order:
      1. apply `tokenize_function` over the dataset in batches;
      2. drop the "tweets", "label" and "idx" columns;
      3. rename "label_id" to "labels";
      4. switch the output format to "torch".

    Args:
        dataset: dataset containing the "tweets", "label", "idx" and "label_id" columns.

    Returns:
        The tokenized dataset with the column changes above applied.
    """
    prepared = (
        dataset
        .map(tokenize_function, batched=True)
        .remove_columns(["tweets", "label", "idx"])
        .rename_column("label_id", "labels")
    )
    # set_format is applied as a separate statement, exactly as in the step list above
    prepared.set_format("torch")
    return prepared

In [6]:
# Tokenize datasets
tokenized_train = tokenize_prepare(train)
tokenized_test = tokenize_prepare(test)

In [7]:
# Prepare metrics (accuracy and f1)
import evaluate
import numpy as np

# Metric objects are loaded once and reused: compute_metrics runs on every evaluation
# pass, and reloading the metrics each time is loop-invariant work.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it only on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_preds):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_preds: a pair that unpacks into `(logits, labels)`; the predicted
            class for each row is the argmax of `logits` over the last axis.

    Returns:
        dict with two entries: "accuracy" and "f1" (macro average).
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    accuracy = _get_metric("accuracy").compute(
        predictions=predictions, references=labels
    )
    f1 = _get_metric("f1").compute(
        predictions=predictions, references=labels, average='macro'
    )
    return {"accuracy": accuracy["accuracy"], "f1": f1["f1"]}

In [8]:
# Prepare trainer
from transformers import Trainer

# Wire the model, the training arguments, both tokenized splits, the padding collator,
# the tokenizer and the metric function into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [9]:
# Run fine-tuning with the Trainer configured above. The call is the cell's last
# expression, so the returned TrainOutput (global step, training loss, runtime
# metrics) is shown as the cell output.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
300,0.1296,0.005846,0.998552,0.998552
600,0.0106,0.006344,0.998709,0.998709
900,0.0105,0.003122,0.9993,0.9993
1200,0.0052,0.004509,0.99895,0.99895
1500,0.0033,0.017386,0.996513,0.996513
1800,0.0081,0.00326,0.999373,0.999373
2100,0.009,0.010741,0.99749,0.99749
2400,0.0076,0.004308,0.999179,0.999179
2700,0.0071,0.002499,0.99936,0.99936
3000,0.0049,0.003754,0.999059,0.999059


TrainOutput(global_step=17268, training_loss=0.007380581783307235, metrics={'train_runtime': 21723.2825, 'train_samples_per_second': 25.432, 'train_steps_per_second': 0.795, 'total_flos': 4.2371846813413976e+16, 'train_loss': 0.007380581783307235, 'epoch': 4.0})