In [None]:
# %pip install  peft
# %pip install datasets
# %pip install torch torchvision
# %pip install evaluate
# %pip install scikit-learn
# %pip install openai

In [None]:
from datasets import load_dataset

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification, 
    DataCollatorWithPadding, 
    TrainingArguments, 
    Trainer)

import evaluate
import torch
import numpy as np

from peft import get_peft_model, LoraConfig

In [3]:
# load dataset
dataset = load_dataset('shawhin/imdb-truncated')
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [4]:
# display % of training data with label=1
np.array(dataset['train']['label']).sum()/len(dataset['train']['label'])

0.5

In [5]:
model_checkpoint = 'distilbert/distilbert-base-uncased'

# define label maps
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative":0, "Positive":1}

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels =2, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# model

#### Preprocess the model/dataset

In [7]:
# initialize the tokenizer for the pretrained model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space = True)

In [8]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [9]:
def tokenize(examples):
    text = examples['text']

    tokenizer.truncation_side = "left"
    tokenized_input = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )
    return tokenized_input

In [10]:
tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [11]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

#### Evaluation

In [12]:
# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")

In [13]:
# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

In [14]:
# define list of examples
text_list = ["I love this product!", 
             "I hate this experience.", 
             "I am definitely not smiling", 
             "What a beautiful day it is", 
             "My dad is an angry man", 
             "when i am hungy i am sad", 
             "a babys laugh is the most beautiful thing",
             "Not a fan, don't recommed.", 
             "Better than the first one.", 
             "This is not worth watching even once."]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

Untrained model predictions:
----------------------------
I love this product! - Positive
I hate this experience. - Positive
I am definitely not smiling - Positive
What a beautiful day it is - Positive
My dad is an angry man - Positive
when i am hungy i am sad - Positive
a babys laugh is the most beautiful thing - Positive
Not a fan, don't recommed. - Positive
Better than the first one. - Positive
This is not worth watching even once. - Positive


In [15]:
peft_config = LoraConfig(
    task_type = 'SEQ_CLS',
    r = 4,
    lora_alpha = 32,
    lora_dropout = 0.01,
    target_modules = ['q_lin']
   )
peft_config

LoraConfig(task_type='SEQ_CLS', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=4, target_modules={'q_lin'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)

In [16]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [17]:
# hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 5

In [18]:
# define training arguments
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [19]:
# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [20]:
# train model
trainer.train()

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.3471267819404602, 'eval_accuracy': {'accuracy': 0.892}, 'eval_runtime': 132.2889, 'eval_samples_per_second': 7.559, 'eval_steps_per_second': 1.89, 'epoch': 1.0}
{'loss': 0.42, 'grad_norm': 1.529360055923462, 'learning_rate': 0.0006, 'epoch': 2.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.5276089906692505, 'eval_accuracy': {'accuracy': 0.874}, 'eval_runtime': 105.0386, 'eval_samples_per_second': 9.52, 'eval_steps_per_second': 2.38, 'epoch': 2.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.6624936461448669, 'eval_accuracy': {'accuracy': 0.878}, 'eval_runtime': 148.5353, 'eval_samples_per_second': 6.732, 'eval_steps_per_second': 1.683, 'epoch': 3.0}
{'loss': 0.1698, 'grad_norm': 0.38184523582458496, 'learning_rate': 0.0002, 'epoch': 4.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.725511372089386, 'eval_accuracy': {'accuracy': 0.885}, 'eval_runtime': 158.9696, 'eval_samples_per_second': 6.291, 'eval_steps_per_second': 1.573, 'epoch': 4.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.7371582984924316, 'eval_accuracy': {'accuracy': 0.884}, 'eval_runtime': 173.6926, 'eval_samples_per_second': 5.757, 'eval_steps_per_second': 1.439, 'epoch': 5.0}
{'train_runtime': 1907.0933, 'train_samples_per_second': 2.622, 'train_steps_per_second': 0.655, 'train_loss': 0.24899883880615234, 'epoch': 5.0}


TrainOutput(global_step=1250, training_loss=0.24899883880615234, metrics={'train_runtime': 1907.0933, 'train_samples_per_second': 2.622, 'train_steps_per_second': 0.655, 'total_flos': 556790525519424.0, 'train_loss': 0.24899883880615234, 'epoch': 5.0})

In [21]:
model.to('mps') # moving to mps for Mac (can alternatively do 'cpu')

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("mps") # moving to mps for Mac (can alternatively do 'cpu')

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
I love this product! - Positive
I hate this experience. - Negative
I am definitely not smiling - Negative
What a beautiful day it is - Positive
My dad is an angry man - Negative
when i am hungy i am sad - Negative
a babys laugh is the most beautiful thing - Positive
Not a fan, don't recommed. - Negative
Better than the first one. - Positive
This is not worth watching even once. - Negative


#### Save model to Hugging Face Hub

In [22]:
from huggingface_hub import notebook_login

notebook_login()

In [24]:
model.push_to_hub("sentiment-analysis-model")
tokenizer.push_to_hub("sentiment-analysis-model")

adapter_model.safetensors:   0%|          | 0.00/2.52M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/D0te/sentiment-analysis-model/commit/43f39567c3bfce0f66b0422bffd876f524d2b7e5', commit_message='Upload tokenizer', commit_description='', oid='43f39567c3bfce0f66b0422bffd876f524d2b7e5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/D0te/sentiment-analysis-model', endpoint='https://huggingface.co', repo_type='model', repo_id='D0te/sentiment-analysis-model'), pr_revision=None, pr_num=None)