# Overview



In [1]:
import os
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
login(token=user_secrets.get_secret("HUGGINGFACE_TOKEN"))

os.environ["WANDB_API_KEY"]=user_secrets.get_secret("WANDB_API_KEY")
os.environ["WANDB_PROJECT"] = "Fine-tuning bert-base-uncased"
os.environ["WANDB_NAME"] = "ft-bert-base-uncased-for-sentiment-classification"
os.environ["MODEL_NAME"] = "bert-base-uncased"
os.environ["TOKENIZER_NAME"] = "bert-base-uncased"
os.environ["DATASET"] = "https://huggingface.co/datasets/takala/financial_phrasebank"

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [2]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification

# Load the Financial PhraseBank dataset with a specified configuration
ds = load_dataset("financial_phrasebank", "sentences_allagree", trust_remote_code=True)

Downloading builder script:   0%|          | 0.00/6.04k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.88k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/682k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2264 [00:00<?, ? examples/s]

In [3]:
ds

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 2264
    })
})

## Preprocess the data

In [4]:
tokenizer = BertTokenizer.from_pretrained(os.getenv("MODEL_NAME"))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [5]:
def tokenize_function(example):
    return tokenizer(example['sentence'], padding="max_length", truncation=True)

tokenized_ds=ds.map(tokenize_function, batched=True)

Map:   0%|          | 0/2264 [00:00<?, ? examples/s]

In [6]:
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2264
    })
})

In [7]:
tokenized_ds=tokenized_ds.rename_column("label", "labels")

In [8]:
train_test_split=tokenized_ds['train'].train_test_split(test_size=0.1)

In [9]:
train_ds=train_test_split['train']
val_ds=train_test_split['test']

In [10]:
train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [11]:
train_ds[0]

{'labels': tensor(1),
 'input_ids': tensor([  101, 26850,  5620,  7479,  1012, 26850,  5620,  1012,  7367,  2097,
          2468,  2112,  1997, 18906, 17619,  2474,  5946, 15451,  3775,  4063,
          2422, 12278,  3131,  1012,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,  

# Load the model

In [12]:
tokenizer = BertTokenizer.from_pretrained(os.getenv("MODEL_NAME"))
model=BertForSequenceClassification.from_pretrained(os.getenv("MODEL_NAME"), device_map="cuda", num_labels=3) # 3 sentiment labels

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
model.device

device(type='cuda', index=0)

# Training

In [15]:
from transformers import Trainer, TrainingArguments
import numpy as np
# from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Define training arguments
training_args = TrainingArguments(
    output_dir=os.getenv("WANDB_NAME"),
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="tensorboard",
    run_name=os.getenv('WANDB_NAME')
)

# Define a compute_metrics function for evaluation
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
#     acc = accuracy_score(labels, predictions)
#     return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
#     compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.1649,0.13192
2,0.1322,0.123158
3,0.0092,0.112018


TrainOutput(global_step=384, training_loss=0.19538592130993493, metrics={'train_runtime': 352.0204, 'train_samples_per_second': 17.36, 'train_steps_per_second': 1.091, 'total_flos': 1607886095735808.0, 'train_loss': 0.19538592130993493, 'epoch': 3.0})

# Upload model to HF(optional)

In [16]:
kwargs={
    'model_name': os.getenv("WANDB_NAME"),
    'finetuned_from': os.getenv('MODEL_NAME'),
#     'tasks': 'Text-Generation',
#     'dataset_tags':'',
    'dataset': os.getenv("DATASET")
}

tokenizer.push_to_hub(os.getenv("WANDB_NAME"))
trainer.push_to_hub(**kwargs)

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/aisuko/ft-bert-base-uncased-for-sentiment-classification/commit/3e930e0e0b724017039c2c053a1dfa38b98d2009', commit_message='End of training', commit_description='', oid='3e930e0e0b724017039c2c053a1dfa38b98d2009', pr_url=None, pr_revision=None, pr_num=None)

# Acknowledgements

* https://medium.com/gopenai/day-13-fine-tuning-llms-for-specific-use-cases-278c4535a468
* https://www.kaggle.com/code/aisuko/mock-intermediate-level-challenge-1/notebook