In [2]:
# download requirements
!pip install transformers datasets evaluate peft

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.13.0->peft)
  Downloading nvidia_cubl

In [3]:
# Import libraries

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, set_seed
from peft import get_peft_model, LoraConfig, TaskType
import evaluate

2025-09-16 09:07:11.364499: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758013631.741854      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758013631.854760      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# Seed the random number generators up front so repeated runs are comparable.
set_seed(seed=42)

In [5]:
# Download the SST-2 dataset; the result holds the train, validation and test splits.
data = load_dataset(path='sst2')

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [6]:
# Show the dataset structure: the available splits, their columns and row counts.
data

DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 872
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 1821
    })
})

In [7]:
# Bind every split to its own name so later cells do not have to index into `data`.
train_data, test_data, valid_data = (
    data[split_name] for split_name in ('train', 'test', 'validation')
)

In [8]:
# Preview the first example of every split.
def show_first_example(title, split):
    """Print the idx, sentence and label of the first row of `split`.

    Args:
        title: Heading printed in front of the fields (e.g. 'validation part').
        split: A dataset split that supports integer row indexing and whose
            rows are mappings with 'idx', 'sentence' and 'label' keys.
    """
    # Read a single row instead of pulling three whole columns just to take
    # element 0 of each one.
    first = split[0]
    print(f"{title}: \n idx: {first['idx']} \n sentence: {first['sentence']} \n label: {first['label']}")


separator = '========================================================================================================================='
show_first_example('traning part', train_data)
print(separator)
show_first_example('validation part', valid_data)
print(separator)
show_first_example('test part', test_data)
# Test labels are all -1: the gold labels are withheld, so predictions on the
# test split have to be submitted to GLUE to be scored.

traning part: 
 idx: 0 
 sentence: hide new secretions from the parental units  
 label: 0
validation part: 
 idx: 0 
 sentence: it 's a charming and often affecting journey .  
 label: 1
test part: 
 idx: 0 
 sentence: uneasy mishmash of styles and genres . 
 label: -1


In [9]:
# Load the tokenizer and the sequence-classification model from the same
# pretrained checkpoint; the classifier is configured for two labels.
MODEL_CHECKPOINT = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=2)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Tokenization helper applied to the train, validation and test splits.
def tokenize_dataset(data, max_length=128):
    """Tokenize the 'sentence' field of a batch into fixed-length model inputs.

    Args:
        data: A batch (mapping) whose 'sentence' entry holds the raw text(s).
        max_length: Length every sequence is truncated / padded to. It must not
            exceed the model's maximum sequence length (512 for BERT-base; the
            limit of any Hugging Face model is listed in its config on the
            model page). Padding every example to the full 512 makes each
            batch far more expensive than short sentences require, so the
            default is 128.
            NOTE(review): assumes SST-2 sentences fit in 128 tokens — check the
            token-length distribution if in doubt, or pass max_length=512 to
            pad to the model maximum.

    Returns:
        The output of the module-level `tokenizer` for the batch.
    """
    return tokenizer(
        data['sentence'],
        max_length=max_length,
        truncation=True,       # cut anything longer than max_length
        padding='max_length',  # pad shorter inputs so all rows share one length
    )

In [11]:
# Apply tokenize_dataset to the train, validation and test splits in batches;
# each name is rebound to the tokenized version of its split.
train_data, valid_data, test_data = (
    split.map(tokenize_dataset, batched=True)
    for split in (train_data, valid_data, test_data)
)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [12]:
# Switch the output format of every split to torch and restrict it to the
# listed columns (this selects existing columns; it does not add any).
MODEL_COLUMNS = ('input_ids', 'attention_mask', 'label')
for split in (train_data, valid_data, test_data):
    split.set_format('torch', columns=list(MODEL_COLUMNS))

In [13]:
# Load the accuracy metric that compute_metrics uses to score predictions.
accuracy = evaluate.load(path='accuracy')

def compute_metrics(pred_data):
    """Score a (logits, labels) pair with the module-level `accuracy` metric.

    Args:
        pred_data: A two-element iterable; the first element holds the scores
            (it must provide `.argmax(axis=-1)`), the second the reference
            labels.

    Returns:
        Whatever `accuracy.compute` returns for the predicted classes.
    """
    scores, gold_labels = pred_data
    # The predicted class is the index of the highest score along the last axis.
    return accuracy.compute(
        references=gold_labels,
        predictions=scores.argmax(axis=-1),
    )

Downloading builder script: 0.00B [00:00, ?B/s]

In [14]:
# Show the model's module structure before the LoRA config is applied.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [15]:
# LoRA settings for the sequence-classification task.
lora_settings = dict(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias='none',
)
lora_config = LoraConfig(**lora_settings)

# Wrap the base model according to the LoRA config.
# NOTE(review): `model` is rebound to the wrapped model, so re-running this cell
# would pass an already wrapped model to get_peft_model — restart the kernel
# before running it again.
model = get_peft_model(model=model, peft_config=lora_config)

In [16]:
# Show the model's module structure after the LoRA config has been applied.
model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default

In [17]:
# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Output locations and reporting.
    output_dir='./fine_tuned_bert_on_stt2',
    logging_dir='./logs',
    report_to='none',
    # Optimisation.
    num_train_epochs=4,
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Cadence of logging, evaluation and checkpointing, all counted in steps.
    logging_strategy='steps',
    logging_steps=50,
    eval_strategy='steps',
    eval_steps=100,
    save_strategy='steps',
    save_steps=200,
    save_total_limit=2,
    # Best-checkpoint selection at the end of training.
    # NOTE(review): 'accuracy' is presumably the key returned by compute_metrics — confirm.
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)

In [None]:
# Assemble the Trainer from the model, the training arguments, the train and
# validation splits and the metric function, then start training.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=valid_data,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

trainer.train()