# Initialization
## Load libraries

In [14]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)

from peft import get_peft_model, LoraConfig
import evaluate
import torch
from torch.utils.data import DataLoader
import numpy as np
from sklearn.metrics import accuracy_score
import plotly.graph_objects as go

# Load & Process Dataset

In [2]:
imdb_dataset = load_dataset("imdb")

# Number of reviews drawn from each split, and the seed that makes the draw
# repeatable under Restart & Run All.
N = 2000
SEED = 42
rng = np.random.default_rng(SEED)

# Each split gets its own index draw, bounded by that split's length.
# replace=False guarantees N distinct reviews (no duplicated rows), and using
# len(split) as the population keeps the final example reachable.
train_idx = rng.choice(len(imdb_dataset["train"]), size=N, replace=False)
test_idx = rng.choice(len(imdb_dataset["test"]), size=N, replace=False)

train_subset = imdb_dataset["train"][train_idx]
test_subset = imdb_dataset["test"][test_idx]

x_train = train_subset["text"]
y_train = train_subset["label"]

x_test = test_subset["text"]
y_test = test_subset["label"]

# The sampled test reviews serve as the validation set during training.
dataset = DatasetDict(
    {
        "train": Dataset.from_dict({"label": y_train, "text": x_train}),
        "validation": Dataset.from_dict({"label": y_test, "text": x_test}),
    }
)

In [3]:
# Class-balance sanity check: fraction of training examples labelled 1.
train_labels = np.array(dataset['train']['label'])
positive_case_ratio = train_labels.sum() / len(train_labels)

print(f"{positive_case_ratio*100:.2f}% of the cases are positive cases.")

49.65% of the cases are positive cases.


# Load model

In [4]:
# Small classification model for local training
model_checkpoint = 'distilbert-base-uncased'

# Class index <-> human-readable name; the reverse map is derived so the two
# can never drift apart.
id2label = {0: "Negative", 1: "Positive"}
label2id = {name: idx for idx, name in id2label.items()}

# NOTE(review): force_download=True presumably re-fetches the checkpoint on
# every run instead of using the local cache -- confirm this is intended.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    force_download=True,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Model summary: leaving `model` as the cell's last expression makes the
# notebook display the module's repr (the layer tree shown in the output).
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


# Process data

In [6]:
# Tokenizer that matches the checkpoint loaded above.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# Only when the tokenizer ships without a padding token: register one and grow
# the model's embedding matrix so the new token id has a row.
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [7]:
def tokenize_function(examples):
    """Tokenize a batch of reviews, truncating each to at most 512 tokens.

    Args:
        examples: A batch mapping as passed by ``Dataset.map(..., batched=True)``;
            only its ``"text"`` entry (a list of strings) is read.

    Returns:
        The tokenizer's output for the batch as plain Python lists. Sequences
        are deliberately left unpadded and of varying length, so no tensor type
        is requested here: a ragged batch cannot form a rectangular array.
        Padding is applied per batch by the ``DataCollatorWithPadding``
        defined later in this notebook.

    Side effects:
        Sets ``tokenizer.truncation_side`` to ``"left"`` on the shared,
        module-level tokenizer, so any later call that truncates with it also
        drops tokens from the start of the text.
    """
    # Over-long reviews keep their ending rather than their beginning.
    tokenizer.truncation_side = "left"

    return tokenizer(
        examples["text"],
        truncation=True,
        # Matches the model's 512 position embeddings.
        max_length=512,
    )

In [8]:
# Tokenize both splits; batched=True hands tokenize_function a batch of
# examples per call instead of a single example.
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)

Map: 100%|██████████| 2000/2000 [00:00<00:00, 15363.64 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 17055.70 examples/s]


In [9]:
# Collator handed to the Trainer below. The tokenized examples are stored
# unpadded, so padding has to happen when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Base model evaluation

In [None]:
# First five validation reviews; reused after training for a before/after
# comparison, so the names example_text / example_labels must stay stable.
example_cases = dataset["validation"][0:5]
example_text = example_cases["text"]
example_labels = example_cases["label"]

print("Untrained model prediction examples:")
print("----------------------------")
# Pure inference: switch off dropout and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    for text, label in zip(example_text, example_labels):
        # Truncate to the model's 512 positions so long reviews cannot
        # overflow the position embeddings.
        input_ids = tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=512)
        logits = model(input_ids).logits
        # One example per forward pass, so the argmax over the class axis is a
        # single class id.
        predicted_id = torch.argmax(logits, dim=-1).item()

        print(f"Prediction: {id2label[predicted_id]} | Label: {id2label.get(label)} - {text}")

Untrained model predictions:
----------------------------
Prediction: Negative | Label: Positive - The Outsiders is undoubtedly a classic Australian TV series. Well defined characters, tight scripts, varied and interesting locales, great guest stars and a filmic ambiance all combined to make this series a special one.<br /><br />Sadly, Andrew Keir has passed on & Sascha Hehn from Germany does not appear (unfortunately) to have enjoyed small screen success in his native country. The ABC has repeated the series many times yet a DVD release is yet to happen.<br /><br />The series is one which is timeless. It is as likely to strike a resonant chord with viewers today as it did in its own day. Come on ABC...release The Outsiders on DVD!!!!!
Prediction: Negative | Label: Positive - Do not miss this picture that defies ages. With no hesitation, a masterpiece. Not only the script and the music but also choregraphy, casting,<br /><br />cut : everything contributes to the perfect achievement. No

In [29]:
def calculate_accuracy(dataset, model, label="Pre-training"):
    """Classify every example one at a time and report the accuracy.

    Args:
        dataset: Iterable of examples, each exposing ``"text"`` (str) and
            ``"label"`` (int class id).
        model: Sequence-classification model whose output has a ``.logits``
            attribute.
        label: Prefix for the printed line. The default reproduces the
            original "Pre-training accuracy: ..." message.

    Returns:
        The accuracy as a fraction in [0, 1], as computed by
        ``sklearn.metrics.accuracy_score``. The same value is also printed as
        a percentage.
    """
    # Build inputs on whichever device the model currently lives on, so the
    # function works both before training and after the Trainer moved it.
    device = next(model.parameters()).device

    # Evaluate deterministically (dropout off), then restore the caller's mode.
    was_training = model.training
    model.eval()

    all_predictions = []
    all_labels = []

    try:
        # Inference only: no autograd graph needed.
        with torch.no_grad():
            for example in dataset:
                inputs = tokenizer.encode(
                    example["text"],
                    return_tensors="pt",
                    truncation=True,
                    max_length=512,
                ).to(device)

                logits = model(inputs).logits
                all_predictions.append(torch.argmax(logits, dim=1).item())
                all_labels.append(example["label"])
    finally:
        model.train(was_training)

    accuracy = accuracy_score(all_labels, all_predictions)

    print(f"{label} accuracy: {accuracy*100:.2f}%")
    return accuracy

In [30]:
# Baseline: accuracy of the not-yet-fine-tuned model on the validation split.
calculate_accuracy(dataset=dataset["validation"], model=model)

Pre-training accuracy: 50.35%


# Train model
## Model evaluation config

In [None]:
# Accuracy metric object; compute_metrics below calls its .compute() on the
# predictions and labels it receives during evaluation.
accuracy = evaluate.load("accuracy")

In [None]:
def compute_metrics(p):
    """Turn raw evaluation outputs into an accuracy score.

    Args:
        p: A pair that unpacks into ``(scores, labels)``, where ``scores`` has
            one row per example and one column per class.

    Returns:
        Whatever the module-level ``accuracy`` metric's ``compute`` returns
        for the arg-max class of each row versus ``labels``.
    """
    class_scores, references = p
    # Highest-scoring class per example.
    predicted_classes = np.argmax(class_scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

## Training config

In [None]:

# LoRA settings for sequence classification. Adapters are attached only to the
# modules named `q_lin`, i.e. the query projections in the attention blocks
# listed in the model summary.
lora_settings = {
    "task_type": "SEQ_CLS",
    "target_modules": ['q_lin'],
    "r": 6,
    "lora_alpha": 32,
    "lora_dropout": 0.01,
}
peft_config = LoraConfig(**lora_settings)

In [None]:
# Wrap the base model with the LoRA adapters described by peft_config.
# NOTE(review): this rebinds `model`, so the cell is not idempotent --
# re-running it without reloading the base model would wrap an already
# wrapped model. Restart and run all cells when changing peft_config.
model = get_peft_model(model, peft_config)
# Report how many parameters remain trainable versus the total.
model.print_trainable_parameters()

trainable params: 647,426 || all params: 67,602,436 || trainable%: 0.9577


In [None]:
# Optimisation hyperparameters.
lr = 5e-5
batch_size = 8
# Upper bound only: the Trainer is built with an early-stopping callback.
num_epochs = 20

training_args = TrainingArguments(
    output_dir=f"{model_checkpoint}-lora-text-classification",
    # Optimiser / schedule
    learning_rate=lr,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    num_train_epochs=num_epochs,
    # Batch sizes
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and checkpoint once per epoch, then restore the checkpoint with
    # the highest validation accuracy when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
)

In [None]:
# Early-stopping callback with a patience of 3 (evaluation runs once per
# epoch, see training_args).
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [None]:
# Run fine-tuning; `hist` keeps the value returned by trainer.train().
# Per-epoch evaluation metrics are read later from trainer.state.log_history.
hist = trainer.train()

                                                  
  5%|▌         | 250/5000 [00:21<04:40, 16.94it/s]

{'eval_loss': 0.32496020197868347, 'eval_accuracy': 0.861, 'eval_runtime': 6.2335, 'eval_samples_per_second': 320.845, 'eval_steps_per_second': 40.106, 'epoch': 1.0}


 10%|█         | 500/5000 [00:37<04:43, 15.86it/s]  

{'loss': 0.4454, 'grad_norm': 6.880882740020752, 'learning_rate': 4.5e-05, 'epoch': 2.0}


                                                  
 10%|█         | 500/5000 [00:43<04:43, 15.86it/s]

{'eval_loss': 0.2932773530483246, 'eval_accuracy': 0.8795, 'eval_runtime': 6.2757, 'eval_samples_per_second': 318.688, 'eval_steps_per_second': 39.836, 'epoch': 2.0}


                                                    
 15%|█▌        | 750/5000 [01:05<04:19, 16.41it/s]

{'eval_loss': 0.30461883544921875, 'eval_accuracy': 0.885, 'eval_runtime': 6.3635, 'eval_samples_per_second': 314.293, 'eval_steps_per_second': 39.287, 'epoch': 3.0}


 20%|██        | 1000/5000 [01:20<03:51, 17.29it/s] 

{'loss': 0.2617, 'grad_norm': 6.223516941070557, 'learning_rate': 4e-05, 'epoch': 4.0}


                                                   
 20%|██        | 1000/5000 [01:27<03:51, 17.29it/s]

{'eval_loss': 0.31465062499046326, 'eval_accuracy': 0.887, 'eval_runtime': 6.2874, 'eval_samples_per_second': 318.095, 'eval_steps_per_second': 39.762, 'epoch': 4.0}


                                                     
 25%|██▌       | 1250/5000 [01:48<03:42, 16.82it/s]

{'eval_loss': 0.31751206517219543, 'eval_accuracy': 0.885, 'eval_runtime': 6.2616, 'eval_samples_per_second': 319.409, 'eval_steps_per_second': 39.926, 'epoch': 5.0}


 25%|██▌       | 1250/5000 [01:49<05:27, 11.44it/s]

{'train_runtime': 109.2693, 'train_samples_per_second': 366.068, 'train_steps_per_second': 45.759, 'train_loss': 0.3329040771484375, 'epoch': 5.0}





In [None]:
# Move the model to the CPU before the manual prediction cells below, which
# feed it tensors straight from tokenizer.encode without moving them to
# another device. Assigning to `_` keeps the cell from displaying the
# returned module.
_ = model.to("cpu")

# Evaluate model

In [None]:
print("Trained model prediction examples:")
print("----------------------------")
# Pure inference: switch off dropout and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    for text, label in zip(example_text, example_labels):
        # Truncate exactly like the pre-training check; without it any review
        # longer than the model's 512 positions would fail in the forward pass.
        input_ids = tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=512)
        logits = model(input_ids).logits
        # convert logits to label
        predicted_id = torch.argmax(logits, dim=-1).item()

        print(f"Prediction: {id2label[predicted_id]} | Label: {id2label.get(label)} - {text}")

Trained model predictions:
----------------------------
Prediction: Positive | Label: Positive - There is so much that can be said about this film. It is not your typical nunsploitation. Of course, there is nudity and sex with nuns, but that is almost incidental to the story.<br /><br />It is set in 15th Century Italy, at the time of the martyrdom of 800 Christians at Otranto. The battle between the Muslims and the Christians takes up a good part of the film. It was interesting when everyone was running from the Muslim hoards, that the mother superior would ask, "Why do you fear the Muslims,; they will not do anything that the Christians have done to you?" Certainly, there was enough torture on both sides.<br /><br />Sister Flavia (Florinda Bolkan) is sent to a convent for defying her father. In the process, she witnesses and endures many things: the gelding of a stallion, the rape of a local woman by a new Duke, the torture of a nun who was overcome during a visit by the Tarantula Sec

In [None]:
# Collect the validation accuracy of every evaluation pass, in logging order.
# The log also holds entries without that key, so those are skipped.
eval_accuracy_lst = [
    entry['eval_accuracy']
    for entry in trainer.state.log_history
    if "eval_accuracy" in entry
]

In [None]:
fig = go.Figure()

# Evaluation runs once at the end of each epoch and the first logged entry is
# epoch 1, so the k-th accuracy belongs to epoch k. Number the x-axis from 1
# to match the "Epoch" axis label.
epochs = list(range(1, len(eval_accuracy_lst) + 1))

fig.add_trace(
    go.Scatter(
        x=epochs,
        y=eval_accuracy_lst,
        mode='lines+markers',
        name='Eval Accuracy',
    )
)

fig.update_layout(
    title='Evaluation Accuracy Over Epochs',
    xaxis_title='Epoch',
    yaxis_title='Accuracy',
    template='seaborn',
    width=1000,
    height=500
)

fig.show()