# Fine-tuning Sandbox

#### Environment Instrustions

1. conda install -c huggingface -c conda-forge datasets
2. conda install conda-forge::transformers
3. pip install peft
4. pip install evaluate
5. pip install torch
6. pip install scikit-learn (accuracy evaluation)

In [1]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig  #state-of-the-art Parameter-Efficient Fine-Tuning (PEFT) methods
import evaluate
import torch
import numpy as np

### dataset

In [7]:
# # how dataset was generated

# # load imdb data
# imdb_dataset = load_dataset("imdb")

# # define subsample size
# N = 1000
# # generate indexes for random subsample
# rand_idx = np.random.randint(24999, size=N)

# # extract train and test data
# x_train = imdb_dataset['train'][rand_idx]['text']
# y_train = imdb_dataset['train'][rand_idx]['label']

# x_test = imdb_dataset['test'][rand_idx]['text']
# y_test = imdb_dataset['test'][rand_idx]['label']

# # create new dataset
# dataset = DatasetDict({'train':Dataset.from_dict({'label':y_train,'text':x_train}),
#                              'validation':Dataset.from_dict({'label':y_test,'text':x_test})})

In [8]:
# load dataset
dataset = load_dataset('shawhin/imdb-truncated')
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/592 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/836k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/853k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [9]:
# display % of training data with label=1
np.array(dataset['train']['label']).sum()/len(dataset['train']['label'])

0.5

### model

In [10]:
model_checkpoint = 'distilbert-base-uncased'
# model_checkpoint = 'roberta-base' # you can alternatively use roberta-base but this model is bigger thus training will take longer

# define label maps
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative":0, "Positive":1}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# display architecture
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

### preprocess data

In [12]:
# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_spaces=True) #tokenizer is used for converting text data into numberical form for LLM to understand it


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [13]:
#Playing with Autokonenizer!!!
text= "I enjoy learning about Large Language Models"
res= tokenizer(text)
print(res)
tokens= tokenizer.tokenize(text)
print(tokens)
ids=tokenizer.convert_tokens_to_ids(tokens)
print(ids)
decoded_string= tokenizer.convert_ids_to_tokens(ids)
print(decoded_string)

{'input_ids': [101, 1045, 5959, 4083, 2055, 2312, 2653, 4275, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
['i', 'enjoy', 'learning', 'about', 'large', 'language', 'models']
[1045, 5959, 4083, 2055, 2312, 2653, 4275]
['i', 'enjoy', 'learning', 'about', 'large', 'language', 'models']


#### Token Embedding Matrix

1. Every sentence is converted in tokens(words) and each token will have a specific token id associate with it.
2. Also, every token id can be converted into token vectors, thus a sentemce can be represented in the form of a token embedding matrix

In [14]:
print(len(tokenizer))

30522


In [15]:
#The pad token is used to fill shorter sequences in a batch to match the longest sequence's length, ensuring uniform input size for model processing.
#Note, when adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of the model so that its embedding matrix matches the tokenizer.
# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

print(len(tokenizer))

30522


**What is truncation?**

Removing a token from the longest sequence in the pair if a pair of sequences (or a batch of pairs) is provided.


In [16]:
# create tokenize function
def tokenize_function(examples):
    # extract text
    text = examples["text"]

    #tokenize and truncate text
    tokenizer.truncation_side = "left" #trancation side means side from which token should be removed form longest sequence
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

In [17]:
# tokenize training and validation datasets
# This adds ids and attentionmamsk columns to the dataset for each sequence/text in rows
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

**What is Data Collator?**

A data collator is a function that takes a list of samples from a Dataset and collates them into a batch.

In [18]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) #Data collator that will dynamically pad the inputs received.

### evaluation

In [19]:
# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [20]:
# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

### Apply untrained model to text

**Inference on a sample text**

In [21]:
inputs = tokenizer.encode("i am fine", return_tensors="pt")# Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary.

# compute logits
logits = model(inputs).logits
print("logits: ", logits)

# convert logits to label
predictions = torch.argmax(logits)
print("predictions: ", predictions)

print(text + " - " + id2label[predictions.tolist()])

logits:  tensor([[0.1016, 0.0735]], grad_fn=<AddmmBackward0>)
predictions:  tensor(0)
I enjoy learning about Large Language Models - Negative


In [22]:
# define list of examples
text_list = ["It was good.", "Not a fan, don't recommed.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")# Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary.

    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

Untrained model predictions:
----------------------------
It was good. - Positive
Not a fan, don't recommed. - Negative
Better than the first one. - Negative
This is not worth watching even once. - Positive
This one is a pass. - Negative


### Train model

In [23]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=4,
                        lora_alpha=32,
                        lora_dropout=0.01,
                        target_modules = ['q_lin'])

In [24]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=4, target_modules={'q_lin'}, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

In [25]:
# We only have to train 0.93% of total trainable parameters.This is just AMAZING!!!!!!!!
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [26]:
# hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 10

In [27]:
# define training arguments
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [28]:
# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.35525,{'accuracy': 0.879}
2,0.420800,0.476451,{'accuracy': 0.889}
3,0.420800,0.626745,{'accuracy': 0.888}
4,0.178300,0.691926,{'accuracy': 0.887}
5,0.178300,0.817288,{'accuracy': 0.894}
6,0.053700,0.928935,{'accuracy': 0.884}
7,0.053700,1.043775,{'accuracy': 0.891}
8,0.020600,1.091993,{'accuracy': 0.881}
9,0.020600,1.098184,{'accuracy': 0.887}
10,0.003600,1.113841,{'accuracy': 0.884}


Trainer is attempting to log a value of "{'accuracy': 0.879}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.889}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.888}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.887}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.894}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This i

TrainOutput(global_step=2500, training_loss=0.13538675351142884, metrics={'train_runtime': 476.3618, 'train_samples_per_second': 20.992, 'train_steps_per_second': 5.248, 'total_flos': 1112883852759936.0, 'train_loss': 0.13538675351142884, 'epoch': 10.0})

### Generate prediction

In [29]:
model.to('cpu') # moving to mps for Mac (can alternatively do 'cpu')

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cpu") # moving to mps for Mac (can alternatively do 'cpu')

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
It was good. - Positive
Not a fan, don't recommed. - Negative
Better than the first one. - Positive
This is not worth watching even once. - Negative
This one is a pass. - Negative


### Optional: push model to hub

In [36]:
# # option 1: notebook login
# from huggingface_hub import notebook_login
# notebook_login() # ensure token gives write access

# option 2: key login
from huggingface_hub import login
write_key = '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' # paste token here
login(write_key)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [39]:
hf_name = 'Ashwin13' # your hf username or org name
model_id = hf_name + "/" + model_checkpoint + "-lora-text-classification" # you can name the model whatever you want

In [40]:
model.push_to_hub(model_id) # save model

adapter_model.safetensors:   0%|          | 0.00/2.52M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Ashwin13/distilbert-base-uncased-lora-text-classification/commit/13df1596fa9466220b5b3198ce8471d72231e967', commit_message='Upload model', commit_description='', oid='13df1596fa9466220b5b3198ce8471d72231e967', pr_url=None, pr_revision=None, pr_num=None)

In [41]:
trainer.push_to_hub(model_id) # save trainer

events.out.tfevents.1718802036.c4744fbfc554.324.0:   0%|          | 0.00/9.03k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Ashwin13/distilbert-base-uncased-lora-text-classification/commit/bee2401061623286fa366b61a816595c81c970b2', commit_message='Ashwin13/distilbert-base-uncased-lora-text-classification', commit_description='', oid='bee2401061623286fa366b61a816595c81c970b2', pr_url=None, pr_revision=None, pr_num=None)

### Optional: load peft model

In [42]:
# how to load peft model from hub for inference
config = PeftConfig.from_pretrained(model_id)
inference_model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path, num_labels=2, id2label=id2label, label2id=label2id
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(inference_model, model_id)

adapter_config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


adapter_model.safetensors:   0%|          | 0.00/2.52M [00:00<?, ?B/s]