In [None]:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np
    Trainer,
)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_checkpoint = "distilbert-base-uncased"

id2label = {0: "Negative", 1: "Positive"}

label2id = {"Negative": 0, "Positive": 1}

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id
)

# load dataset

dataset = load_dataset("shawhin/imdb-truncated")
dataset

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)


def tokenize_function(examples):
    text = examples["text"]
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text, return_tensors="np", truncation=True, max_length=512
    )
    return tokenized_inputs


if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    model.resize_token_embeddings(len(tokenizer))

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


accuracy = evaluate.load("accuracy")


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

In [4]:
text_list = [
    "It was good",
    "Not a fan, don't recommend",
    "It was awful",
    "Absolutely loved it, would watch again",
    "Meh, it was okay but not great",
    "Worst experience ever",
    "Exceeded my expectations!",
    "Pretty boring, fell asleep halfway",
    "Decent, but I've seen better",
    "Highly recommend to everyone",
]

print("untrained")
print("----------------------")

for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt")
    logits = model(inputs).logits
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

untrained
----------------------
It was good - Positive
Not a fan, don't recommend - Positive
It was awful - Positive
Absolutely loved it, would watch again - Positive
Meh, it was okay but not great - Positive
Worst experience ever - Positive
Exceeded my expectations! - Positive
Pretty boring, fell asleep halfway - Positive
Decent, but I've seen better - Positive
Highly recommend to everyone - Positive


In [5]:
peft_config = LoraConfig(
    task_type="SEQ_CLS", r=4, lora_alpha=32, lora_dropout=0.01, target_modules=["q_lin"]
)

In [6]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [7]:
lr = 1e-3
batch_size = 4
num_epochs = 10

training_args = TrainingArguments(
    output_dir=model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_eval_batch_size=batch_size,
    per_gpu_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    eval_strategy="epoch",
    load_best_model_at_end=True,
    save_strategy="epoch",
)

In [8]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.307356,{'accuracy': 0.876}
2,No log,0.432229,{'accuracy': 0.87}
3,No log,0.414737,{'accuracy': 0.895}
4,0.249500,0.554577,{'accuracy': 0.904}
5,0.249500,0.629825,{'accuracy': 0.899}
6,0.249500,0.670036,{'accuracy': 0.899}
7,0.249500,0.771188,{'accuracy': 0.901}
8,0.023000,0.817501,{'accuracy': 0.897}
9,0.023000,0.845029,{'accuracy': 0.901}
10,0.023000,0.851171,{'accuracy': 0.898}


Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_de

TrainOutput(global_step=1250, training_loss=0.11041775598526, metrics={'train_runtime': 665.0453, 'train_samples_per_second': 15.037, 'train_steps_per_second': 1.88, 'total_flos': 1253694805157184.0, 'train_loss': 0.11041775598526, 'epoch': 10.0})

In [10]:
device = torch.device("cuda")

print("Trained model predictions")

for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to(device)

    logits = model(inputs).logits
    predictions = torch.argmax(logits, dim=1)

    print(text, " - ", id2label[predictions.item()])

Trained model predictions
It was good  -  Positive
Not a fan, don't recommend  -  Negative
It was awful  -  Negative
Absolutely loved it, would watch again  -  Positive
Meh, it was okay but not great  -  Negative
Worst experience ever  -  Negative
Exceeded my expectations!  -  Negative
Pretty boring, fell asleep halfway  -  Negative
Decent, but I've seen better  -  Negative
Highly recommend to everyone  -  Positive
