In [2]:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

In [3]:
model_checkpt = 'distilbert-base-uncased'

In [4]:
# Define the label maps (Positive --> 1, Negative --> 0)
id2label = {0: "Negative",
            1: "Positive"}
label2id = {"Negative": 0,
            "Positive": 1}

# Classification model from the prescribed model ckpt
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpt,
    num_labels = 2,
    id2label = id2label,
    label2id = label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Load dataset
dataset = load_dataset("shawhin/imdb-truncated")

In [6]:
# Dataset Structure
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [7]:
# Create Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpt, add_prefix_space = True)

In [8]:
# Creating tokenizer function
def tokenize_function(examples):
    # Extract text
    text = examples['text']

    # Now truncate and tokenize the reviewer's text
    tokenizer.truncate_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

In [9]:
# Pad tokens if none exist
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [10]:
# Now tokenize datasets -> train and validation
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [11]:
# Create data collator
dataCollator = DataCollatorWithPadding(tokenizer=tokenizer)

In [12]:
# Evaluation metric --> Accuracy
accuracy = evaluate.load('accuracy')

In [13]:
# Define eval function that we will later use to pass through trainer
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": accuracy.compute(predictions=predictions,
                                         references=labels)}

In [14]:
# Before training the model!

text_list = [
    "It was good.",
    "Not a fan, don't recommend.",
    "Better than the first one",
    "This is not worth watching even once.",
    "This one is a pass."
]

print("Untrained model predictions:")
print("----------------------------\n")

for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt")
    logits = model(inputs).logits
    predictions = torch.argmax(logits)
    print(f"{text} - {id2label[predictions.tolist()]}")

Untrained model predictions:
----------------------------

It was good. - Negative
Not a fan, don't recommend. - Negative
Better than the first one - Negative
This is not worth watching even once. - Negative
This one is a pass. - Negative


In [15]:
# Apply LoRA
peft_config = LoraConfig(task_type="SEQ_CLS",       # Sequence Classification
                        r=4,                        # Rank
                        lora_alpha=32,              # Learning rate
                        lora_dropout=0.01,          # Dropout Probability
                         target_modules=['q_lin']   # Apply LoRA to query layer
                         )

In [16]:
# Update model
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9306847223789819


In [17]:
# Hyperparameters
lr = 1e-3 
batchSize = 4
numEpochs = 10

# Training arguments
training_args = TrainingArguments(
    output_dir=model_checkpt + "-sentiment-analysis-using-LoRA",
    learning_rate=lr,
    per_device_train_batch_size=batchSize,
    per_device_eval_batch_size=batchSize,
    num_train_epochs=numEpochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

In [18]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_dataset['train'],
    eval_dataset = tokenized_dataset['validation'],
    tokenizer = tokenizer,
    data_collator = dataCollator,
    compute_metrics = compute_metrics
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.38284415006637573, 'eval_accuracy': {'accuracy': 0.859}, 'eval_runtime': 32.3703, 'eval_samples_per_second': 30.892, 'eval_steps_per_second': 7.723, 'epoch': 1.0}




{'loss': 0.4364, 'grad_norm': 14.797685623168945, 'learning_rate': 0.0008, 'epoch': 2.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.4836050271987915, 'eval_accuracy': {'accuracy': 0.88}, 'eval_runtime': 19.1837, 'eval_samples_per_second': 52.128, 'eval_steps_per_second': 13.032, 'epoch': 2.0}




  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 1.2064400911331177, 'eval_accuracy': {'accuracy': 0.836}, 'eval_runtime': 19.2073, 'eval_samples_per_second': 52.064, 'eval_steps_per_second': 13.016, 'epoch': 3.0}




{'loss': 0.2305, 'grad_norm': 0.0324484184384346, 'learning_rate': 0.0006, 'epoch': 4.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.6465535759925842, 'eval_accuracy': {'accuracy': 0.887}, 'eval_runtime': 19.5147, 'eval_samples_per_second': 51.243, 'eval_steps_per_second': 12.811, 'epoch': 4.0}




  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.7836781144142151, 'eval_accuracy': {'accuracy': 0.892}, 'eval_runtime': 19.0922, 'eval_samples_per_second': 52.377, 'eval_steps_per_second': 13.094, 'epoch': 5.0}




{'loss': 0.0653, 'grad_norm': 0.00022772687952965498, 'learning_rate': 0.0004, 'epoch': 6.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.8337900638580322, 'eval_accuracy': {'accuracy': 0.889}, 'eval_runtime': 19.3765, 'eval_samples_per_second': 51.609, 'eval_steps_per_second': 12.902, 'epoch': 6.0}




  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.9208342432975769, 'eval_accuracy': {'accuracy': 0.887}, 'eval_runtime': 19.1313, 'eval_samples_per_second': 52.27, 'eval_steps_per_second': 13.068, 'epoch': 7.0}




{'loss': 0.0154, 'grad_norm': 0.0009204890229739249, 'learning_rate': 0.0002, 'epoch': 8.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 1.0305984020233154, 'eval_accuracy': {'accuracy': 0.888}, 'eval_runtime': 19.3171, 'eval_samples_per_second': 51.767, 'eval_steps_per_second': 12.942, 'epoch': 8.0}




  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 1.0186481475830078, 'eval_accuracy': {'accuracy': 0.885}, 'eval_runtime': 19.1585, 'eval_samples_per_second': 52.196, 'eval_steps_per_second': 13.049, 'epoch': 9.0}




{'loss': 0.0079, 'grad_norm': 5.9334747675166e-06, 'learning_rate': 0.0, 'epoch': 10.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 1.040765404701233, 'eval_accuracy': {'accuracy': 0.885}, 'eval_runtime': 19.456, 'eval_samples_per_second': 51.398, 'eval_steps_per_second': 12.85, 'epoch': 10.0}




{'train_runtime': 722.1403, 'train_samples_per_second': 13.848, 'train_steps_per_second': 3.462, 'train_loss': 0.15112618370056152, 'epoch': 10.0}


TrainOutput(global_step=2500, training_loss=0.15112618370056152, metrics={'train_runtime': 722.1403, 'train_samples_per_second': 13.848, 'train_steps_per_second': 3.462, 'train_loss': 0.15112618370056152, 'epoch': 10.0})

In [20]:
model.to('mps')

print("Trained model predictions:")
print("----------------------------\n")

for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("mps")
    logits = model(inputs).logits
    predictions = torch.max(logits, 1).indices
    print(f"{text} - {id2label[predictions.tolist()[0]]}")

Trained model predictions:
----------------------------

It was good. - Positive
Not a fan, don't recommend. - Negative
Better than the first one - Positive
This is not worth watching even once. - Negative
This one is a pass. - Negative
