In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = 'MIG-8b3c9b9c-b08f-5ec0-af05-8ac9882721e5'

In [2]:
import torch

print(f'PyTorch version: {torch.__version__}')

if torch.cuda.is_available():
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("CUDA is not available")

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

PyTorch version: 2.5.0.dev20240718+cu124
Number of GPUs: 1
GPU 0: NVIDIA A100-PCIE-40GB MIG 4g.20gb
Using device: cuda:0


In [3]:
# Prepare dataset, tokenizer and model, transfer model to GPU
from datasets import Dataset
from create_training_set import split_train_dev_test
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

dev_set = Dataset.from_parquet('../data/clean/dev.parquet')
train, dev, test = split_train_dev_test(dev_set, percentage=0.8)

checkpoint = 'hfl/chinese-roberta-wwm-ext'
cache_dir = '../src/'

tokenizer = AutoTokenizer.from_pretrained(checkpoint, cache_dir=cache_dir)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2, cache_dir=cache_dir)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-roberta-wwm-ext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [5]:
# Define tokenize and prepare functions
def tokenize_function(example):
    return tokenizer(example["tweets"], truncation=True, max_length=512)


def tokenize_prepare(dataset: Dataset) -> Dataset:
  tokenized_dataset = dataset.map(tokenize_function, batched=True)

  tokenized_dataset = tokenized_dataset.remove_columns(["tweets", "label", "idx"])
  tokenized_dataset = tokenized_dataset.rename_column("label_id", "labels")
  tokenized_dataset.set_format("torch")

  return tokenized_dataset

In [6]:
# Tokenize datasets
tokenized_train = tokenize_prepare(train)
tokenized_test = tokenize_prepare(test)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 23458/23458 [00:03<00:00, 7676.49 examples/s]
Map: 100%|██████████| 2933/2933 [00:00<00:00, 9583.47 examples/s] 


In [7]:
# Prepare metrics
import evaluate
import numpy as np

def compute_metrics(eval_preds):
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='macro')
    return {"accuracy": accuracy["accuracy"],"f1": f1["f1"]}

In [8]:
from transformers import Trainer
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="../src/test_train",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='../logs',
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_total_limit=3,
    load_best_model_at_end=True
)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [9]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
100,0.3248,0.024184,0.994886,0.994883
200,0.0152,0.010521,0.997613,0.997612
300,0.0198,0.009316,0.998295,0.998295
400,0.0189,0.001185,1.0,1.0
500,0.0079,0.004146,0.998636,0.998636
600,0.0071,0.004813,0.998636,0.998635
700,0.0118,0.00522,0.998295,0.998294
800,0.0027,0.012434,0.997272,0.997271
900,0.0062,0.012316,0.99625,0.996247
1000,0.009,0.007783,0.998295,0.998294


TrainOutput(global_step=2202, training_loss=0.019996422934705865, metrics={'train_runtime': 1023.1915, 'train_samples_per_second': 68.779, 'train_steps_per_second': 2.152, 'total_flos': 5435734625989440.0, 'train_loss': 0.019996422934705865, 'epoch': 3.0})