In [1]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import(
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

In [3]:
model_checkpoint = 'distilbert-base-uncased'

# Define label maps
id2label = {0: 'Negative' , 1: 'Positive'}
label2id = {'Negative': 0 , 'Positive' : 1}

# Generate Classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels = 2 , id2label= id2label , label2id = label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Load dataset
dataset = load_dataset('shawhin/imdb-truncated')
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [8]:
# Create Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space = True)

In [10]:
# Create tokenize function
def tokenize_function(examples):
    # Extract text
    text = examples['text']
    
    # Tokenize and Truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors = 'np',
        truncation = True,
        max_length = 512
        )
    
    return tokenized_inputs

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': ['PAD']})
    model.resize_token_embeddings(len(tokenizer))
    
# Tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched= True)
tokenized_dataset

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [11]:
# Create Data Collator for Dynamic Padding
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

In [13]:
# Define Evaluation Metrics
accuracy = evaluate.load("accuracy")

# Define an Evaluation Function to pass into Trainer later.
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis = 1)
    
    return {'accuracy': accuracy.compute(predictions= predictions,
                                         references= labels)}

In [15]:
# Untrained Model Performance
# Define list of examples
text_list = ["It was good." , "Not a fan don't recommed.", "Better than the first one.", "Greatest of all the time.",
             "Worst app evet...", "This is not worth watching even once.", "This one is a pass."]

print("Untrained model Predictions:")
print("----------------------------")
for text in text_list:
    # Tokenize text
    inputs = tokenizer.encode(text, return_tensors='pt')
    # Compute Logits
    logits = model(inputs).logits
    # Convert Logits to Label
    predictions = torch.argmax(logits)
    
    print(text + " - " + id2label[predictions.tolist()])
    
# Really bad results with base model...

Untrained model Predictions:
----------------------------
It was good. - Negative
Not a fan don't recommed. - Positive
Better than the first one. - Negative
Greatest of all the time. - Negative
Worst app evet... - Positive
This is not worth watching even once. - Positive
This one is a pass. - Negative


In [17]:
# Fine tuning with LoRA
peft_config = LoraConfig(task_type= "SEQ_CLS", # Sequence Classification
                         r = 4, # Intrinsic rank of Trainable weight matrix
                         lora_alpha= 32, # This is like a learning rate.
                         lora_dropout= 0.01, # Probability of dropout
                         target_modules= ['q_lin'] # We apply lora to query layer.
                         )

In [18]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters() # Thats the power of LoRA

trainable params: 1,221,124 || all params: 67,584,004 || trainable%: 1.8068239934408148


In [21]:
# Define Hyperparameters
lr = 1e-3 
batch_size = 4
num_epochs = 10

# Define Training Arguments
training_args = TrainingArguments(
    output_dir=model_checkpoint + "-lora-text-classication",
    learning_rate= lr,
    per_device_train_batch_size= batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs= num_epochs,
    weight_decay = 0.01,
    evaluation_strategy= 'epoch',
    save_strategy= 'epoch',
    load_best_model_at_end= True
    )

In [22]:
# Define Trainer Object

trainer = Trainer(
    model= model, # Our peft model
    args = training_args, # hyperparameters
    train_dataset= tokenized_dataset['train'],
    eval_dataset = tokenized_dataset['validation'],
    tokenizer = tokenizer, # Our tokenizer
    data_collator= data_collator, # Dynamic padding
    compute_metrics= compute_metrics # Our Evaluation metrics function
)

trainer.train()

  0%|          | 0/2500 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/250 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.867}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.3731820285320282, 'eval_accuracy': {'accuracy': 0.867}, 'eval_runtime': 47.1111, 'eval_samples_per_second': 21.226, 'eval_steps_per_second': 5.307, 'epoch': 1.0}
{'loss': 0.4264, 'learning_rate': 0.0008, 'epoch': 2.0}


  0%|          | 0/250 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.877}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.39617154002189636, 'eval_accuracy': {'accuracy': 0.877}, 'eval_runtime': 48.0948, 'eval_samples_per_second': 20.792, 'eval_steps_per_second': 5.198, 'epoch': 2.0}


KeyboardInterrupt: 

In [None]:
# Trained Model Performance
# model.to('gpu') # You can use 'cpu' too.

print('Trained model predictions:')
print("-------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors= 'pt')  #.to('gpu')
    
    logits = model(inputs).logits
    predictions = torch.max(logits, 1).indices
    
    print(text + " - " + id2label(predictions.tolist()[0]))
    

In [None]:
# Done

In [None]:
# Overfitting a little bit. YOu can use typical Transfer learning and fine tune only head of layers alternatively.