# **DistilBERT Sentiment Classifier with LoRA**
## This project focuses on fine-tuning a DistilBERT model for sentiment analysis, utilizing LoRA (Low-Rank Adaptation) to enhance model efficiency. The approach enables faster training with fewer parameters while maintaining high accuracy in sentiment classification tasks.

In [1]:
!pip install datasets transformers peft evaluate torch numpy


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Col

In [1]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

# **Load Dataset**

In [2]:
# # how dataset was generated

# # load imdb data
imdb_dataset = load_dataset("imdb")

# # define subsample size
N = 1000
# # generate indexes for random subsample
rand_idx = np.random.randint(24999, size=N)

# # extract train and test data
x_train = imdb_dataset['train'][rand_idx]['text']
y_train = imdb_dataset['train'][rand_idx]['label']

x_test = imdb_dataset['test'][rand_idx]['text']
y_test = imdb_dataset['test'][rand_idx]['label']

# # create new dataset
dataset = DatasetDict({'train':Dataset.from_dict({'label':y_train,'text':x_train}),'validation':Dataset.from_dict({'label':y_test,'text':x_test})})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

# Check for imbalance

In [4]:
import numpy as np

# Get labels from training dataset
labels = np.array(dataset['train']['label'])

# Count occurrences of each label
unique, counts = np.unique(labels, return_counts=True)

# Calculate percentage for each label
label_distribution = {label: count / len(labels) * 100 for label, count in zip(unique, counts)}

# Display the percentage of each label
print(label_distribution)


{0: 51.2, 1: 48.8}


# Define the model

In [5]:

model_checkpoint = 'distilbert-base-uncased'
# model_checkpoint = 'roberta-base' # you can alternatively use roberta-base but this model is bigger thus training will take longer

# define label maps
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative":0, "Positive":1}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


# Data Preprocessing

In [7]:
# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
def tokenize_function(examples):
    """
    Tokenizes and truncates text input for a transformer model.

    Args:
        examples (dict): A dictionary containing text data with a key "text".

    Returns:
        dict: A dictionary containing tokenized input IDs and attention masks.
    """

    # Extract text from input dictionary
    text = examples["text"]

    # Set truncation to remove excess tokens from the left side if needed
    tokenizer.truncation_side = "left"

    # Tokenize and truncate the text
    tokenized_inputs = tokenizer(
        text,                  # Input text to be tokenized
        return_tensors="np",    # Return NumPy arrays instead of PyTorch/TensorFlow tensors
        truncation=True,        # Enable truncation for long sequences
        max_length=512          # Limit tokenized input to 512 tokens (common for transformer models)
    )

    # Return tokenized inputs, including input IDs and attention masks
    return tokenized_inputs


In [9]:
# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [10]:

# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define Evaluation Metric

In [11]:
# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [12]:
# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

# Inference before finetuning

In [13]:
# define list of examples
text_list = ["It was good.", "Not a fan, don't recommed.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

Untrained model predictions:
----------------------------
It was good. - Positive
Not a fan, don't recommed. - Positive
Better than the first one. - Positive
This is not worth watching even once. - Positive
This one is a pass. - Positive


# finetuning with LoRA


# Define LoRA Configurations

In [14]:
from peft import LoraConfig

# Create a LoRA configuration for fine-tuning a transformer model
peft_config = LoraConfig(
    task_type="SEQ_CLS",  # Task type: "SEQ_CLS" (Sequence Classification)
    r=4,                  # Rank of LoRA adaptation matrices (smaller values reduce parameters)
    lora_alpha=32,        # Scaling factor for LoRA updates (higher values increase impact of LoRA weights)
    lora_dropout=0.01,    # Dropout probability for LoRA layers (helps regularization)
    target_modules=['q_lin']  # Specifies which layers to apply LoRA (e.g., query projection in attention)
)


In [15]:
peft_config

LoraConfig(task_type='SEQ_CLS', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=4, target_modules={'q_lin'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)

# Apply PEFT (LoRA) configuration to the model

In [16]:

model = get_peft_model(model, peft_config)

# Print the number of trainable parameters in the model
model.print_trainable_parameters()


trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


# Define Hyperparamters

In [17]:
# Hyperparameters
lr = 1e-3            # Learning rate
batch_size = 4       # Batch size
num_epochs = 10      # Number of epochs


In [18]:
# Define training arguments for fine-tuning the model
training_args = TrainingArguments(
    output_dir=model_checkpoint + "-lora-sentiment-classification",  # Directory where the model will be saved
    learning_rate=lr,  # The learning rate for optimization
    per_device_train_batch_size=batch_size,  # The batch size for training (how many examples per step)
    per_device_eval_batch_size=batch_size,  # The batch size for evaluation (how many examples per step during evaluation)
    num_train_epochs=num_epochs,  # The number of times the model will train over the entire dataset
    weight_decay=0.01,  # Regularization term to prevent overfitting by adding a penalty to large weights
    evaluation_strategy="epoch",  # When to run evaluation (here, after each epoch)
    save_strategy="epoch",  # When to save the model (here, after each epoch)
    load_best_model_at_end=True,  # Load the best model (based on evaluation metrics) after training is finished
    report_to=["none"],
)




# Train the model

In [19]:
# Create Trainer object
import os
os.environ["WANDB_DISABLED"] = "true"

trainer = Trainer(
    model=model,  # The model that will be fine-tuned
    args=training_args,  # The training arguments (hyperparameters and settings)
    train_dataset=tokenized_dataset["train"],  # The training dataset (tokenized)
    eval_dataset=tokenized_dataset["validation"],  # The validation dataset (tokenized)
    tokenizer=tokenizer,  # The tokenizer used for tokenizing text inputs
    data_collator=data_collator,  # Data collator to dynamically pad batches to equal length
    compute_metrics=compute_metrics,  # A function to compute evaluation metrics (e.g., accuracy)

)

# Train the model
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.544833,{'accuracy': 0.856}
2,0.430900,0.587993,{'accuracy': 0.83}
3,0.430900,0.769975,{'accuracy': 0.858}
4,0.173700,0.984964,{'accuracy': 0.85}
5,0.173700,1.108024,{'accuracy': 0.843}
6,0.061700,1.34429,{'accuracy': 0.847}
7,0.061700,1.401265,{'accuracy': 0.843}
8,0.029300,1.333711,{'accuracy': 0.847}
9,0.029300,1.412108,{'accuracy': 0.852}
10,0.010300,1.386904,{'accuracy': 0.849}


TrainOutput(global_step=2500, training_loss=0.14119732971191407, metrics={'train_runtime': 464.5254, 'train_samples_per_second': 21.527, 'train_steps_per_second': 5.382, 'total_flos': 1119148137296064.0, 'train_loss': 0.14119732971191407, 'epoch': 10.0})

# Inference using finetuned model

In [20]:
model.to('cpu') # moving to mps for Mac (can alternatively do 'cpu')
# define list of examples
text_list = ["It was good.", "Not a fan, don't recommed.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]
print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cpu") # moving to mps for Mac (can alternatively do 'cpu')

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
It was good. - Positive
Not a fan, don't recommed. - Negative
Better than the first one. - Positive
This is not worth watching even once. - Negative
This one is a pass. - Positive


In [21]:
# Save the model and tokenizer locally in Colab
model.save_pretrained('/content/sentiment_finetuned_model')
tokenizer.save_pretrained('/content/tokenizer')

('/content/tokenizer/tokenizer_config.json',
 '/content/tokenizer/special_tokens_map.json',
 '/content/tokenizer/vocab.txt',
 '/content/tokenizer/added_tokens.json',
 '/content/tokenizer/tokenizer.json')

# Pushing Model to Hugging Face

In [None]:
!pip install huggingface_hub



In [25]:
from huggingface_hub import login

# Login with your Hugging Face token
login(token="hf_Opxxxxxxxxxxxxxx")


In [26]:
# Push your model to the Hugging Face Hub
model.push_to_hub("Anitha0495/Sentiment-finetuning-LoRA")
tokenizer.push_to_hub("Anitha0495/Sentiment-finetuning-LoRA")


No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/Anitha0495/Sentiment-finetuning-LoRA/commit/9e7d95794eade285264e659173be04390996880c', commit_message='Upload tokenizer', commit_description='', oid='9e7d95794eade285264e659173be04390996880c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Anitha0495/Sentiment-finetuning-LoRA', endpoint='https://huggingface.co', repo_type='model', repo_id='Anitha0495/Sentiment-finetuning-LoRA'), pr_revision=None, pr_num=None)

# Loading the Fine-Tuned PEFT Model for Inference

In [29]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the model and tokenizer from the saved directory
model_path = '/content/sentiment_finetuned_model'

# Load the model
model = AutoModelForSequenceClassification.from_pretrained(model_path).to('cpu')

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# ID to label mapping (this should match your training labels)
id2label = {0: "Negative", 1: "Positive"}  # Example, update based on your training setup

def infer_text(text):
    # Tokenize the input text
    inputs = tokenizer.encode(text, return_tensors="pt").to('cpu')

    # Get the model output
    logits = model(inputs).logits
    predictions = torch.max(logits, 1).indices

    # Print the predicted label
    return id2label[predictions.tolist()[0]]

# Loop to accept user input continuously and show predictions
while True:
    text_input = input("Enter text to analyze sentiment (or 'exit' to quit): ")
    if text_input.lower() == 'exit':
        break
    prediction = infer_text(text_input)
    print(f"Prediction: {prediction}")


Enter text to analyze sentiment (or 'exit' to quit): I loved this movie!
Prediction: Positive
Enter text to analyze sentiment (or 'exit' to quit): This was a waste of time.
Prediction: Negative
Enter text to analyze sentiment (or 'exit' to quit): exit


In [35]:
from IPython.display import display, HTML, Javascript

# Define the HTML structure for your UI
html_content = """
<div style="text-align:center;">
    <h2>Sentiment Analysis Prediction</h2>
    <textarea id="inputText" style="width: 100%; height: 100px;" placeholder="Type your text here..."></textarea><br><br>
    <button onclick="predictSentiment()">Predict Sentiment</button><br><br>
    <div id="outputDiv" style="font-size: 20px; color: green;"></div>
</div>

<script>
    function predictSentiment() {
        var inputText = document.getElementById('inputText').value;

        google.colab.kernel.invokeFunction('notebook.predict_sentiment', [inputText], {});
    }
</script>
"""

# Display the HTML content
display(HTML(html_content))

# Define a Python function that will be called from JavaScript
from google.colab import output

def predict_sentiment(text):
    # call the model for sentiment prediction
    inputs = tokenizer.encode(text, return_tensors="pt").to('cpu')
    logits = model(inputs).logits
    predictions = torch.max(logits, 1).indices
    sentiment = id2label[predictions.tolist()[0]]

    # Return the result back to the HTML interface
    display(HTML(f"<script>document.getElementById('outputDiv').innerText = 'Predicted Sentiment: ' + '{sentiment}'</script>"))

output.register_callback('notebook.predict_sentiment', predict_sentiment)
