In [1]:
%pip install datasets
%pip install transformers
%pip install evaluate
#pip install accelerate
%pip install peft
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)

from peft import PeftModel, PeftConfig, LoraConfig, get_peft_model
import evaluate
import torch
import numpy as np

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.9 MB/s[0m eta [36m0:00:0

In [2]:
model_checkpoint = "distilbert-base-uncased"

id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
dataset = load_dataset("shawhin/imdb-truncated")

dataset

README.md:   0%|          | 0.00/592 [00:00<?, ?B/s]

(…)-00000-of-00001-5a744bf76a1d84b2.parquet:   0%|          | 0.00/836k [00:00<?, ?B/s]

(…)-00000-of-00001-a3a52fabb70c739f.parquet:   0%|          | 0.00/853k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint,add_prefix_space=True)

def tokenize_function(examples):
    text = examples["text"]
    tokenizer.truncation_side = "left"
    tokenizer_inputs = tokenizer(text, return_tensors="np", truncation=True, max_length=512)
    return tokenizer_inputs

if tokenizer.pad_token is None:
   tokenizer.add_special_tokens({'pad_token': '[PAD]'})
   model.resize_token_embeddings(len(tokenizer))

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [5]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [6]:
accuracy = evaluate.load("accuracy")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [7]:
text_list = ["it was good.", "it was bad.", "better than the first one.", "this is not worth watching even once." ,"this one is a pass."]

print("Untrained model predictions:")
print("----------------------------")

for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt")
    logits = model(inputs).logits
    predictions = torch.argmax(logits)
    print(text+" : "+id2label[predictions.tolist()])


Untrained model predictions:
----------------------------
it was good. : POSITIVE
it was bad. : POSITIVE
better than the first one. : POSITIVE
this is not worth watching even once. : POSITIVE
this one is a pass. : POSITIVE


In [8]:
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    r=4,
    lora_alpha=32,
    lora_dropout=0.01,
    target_modules=["q_lin"])


In [9]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [10]:
lr=1e-3
batch_size=4
num_epochs=10

training_args = TrainingArguments(
    output_dir=model_checkpoint+"-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.394214,{'accuracy': 0.874}
2,0.425000,0.523011,{'accuracy': 0.882}
3,0.425000,0.779962,{'accuracy': 0.863}
4,0.195000,0.629125,{'accuracy': 0.899}
5,0.195000,0.915033,{'accuracy': 0.871}
6,0.055700,0.953046,{'accuracy': 0.894}
7,0.055700,1.055084,{'accuracy': 0.894}
8,0.023600,0.985982,{'accuracy': 0.895}
9,0.023600,1.075716,{'accuracy': 0.886}
10,0.004100,1.00979,{'accuracy': 0.893}


Trainer is attempting to log a value of "{'accuracy': 0.874}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.882}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.863}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.899}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.871}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This i

TrainOutput(global_step=2500, training_loss=0.14066927766799928, metrics={'train_runtime': 463.204, 'train_samples_per_second': 21.589, 'train_steps_per_second': 5.397, 'total_flos': 1112883852759936.0, 'train_loss': 0.14066927766799928, 'epoch': 10.0})

In [12]:
model.to('cuda')

print("trained model predictions:")
print("--------------------------")

for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to('cuda')
    logits = model(inputs).logits
    predictions = torch.max(logits, axis=1).indices

    # Loop through each prediction if there's more than one per input
    for pred in predictions.tolist():
        print(text + " : " + id2label[pred])

trained model predictions:
--------------------------
it was good. : POSITIVE
it was bad. : NEGATIVE
better than the first one. : POSITIVE
this is not worth watching even once. : POSITIVE
this one is a pass. : NEGATIVE
