<a href="https://colab.research.google.com/github/AnzorGozalishvili/active_learning_playground/blob/main/notebooks/active_learning_experiments_on_sms_spam_classification_problem_using_baal_library.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Hugging Face transformers into the runtime.
# NOTE(review): the version is not pinned; the recorded run loaded
# transformers 4.14.0 (see the model config output below) — consider pinning.
!pip install transformers

In [None]:
# Install/upgrade the Hugging Face `datasets` library and the `baal`
# active-learning library.
# NOTE(review): `-U` without a version pin means a re-run may pick up newer
# releases than the ones that produced the recorded outputs — consider pinning.
!pip install -U datasets
!pip install -U baal

# Load `sms spam` dataset from huggingface datasets

In [35]:
import datasets

In [2]:
# Load the `sms_spam` dataset from the Hugging Face hub (or the local cache).
# The recorded run yields a single 'train' split with 5574 rows and 2 columns.
sms_spam_dataset = datasets.load_dataset('sms_spam', )

Reusing dataset sms_spam (/root/.cache/huggingface/datasets/sms_spam/plain_text/1.0.0/53f051d3b5f62d99d61792c91acefe4f1577ad3e4c216fb0ad39e30b9f20019c)


  0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
sms_spam_dataset.shape

{'train': (5574, 2)}

## Let's split our dataset into Train/Test splits (since it only contains train set)

Let's hold out 500 samples for the test set, since the overall dataset has only about 5,500 samples.

In [7]:
RANDOM_SEED = 42

In [17]:
# Shuffle with a fixed seed and hold out exactly 500 rows as the test split;
# the remaining 5074 rows stay in the train split.
splitted_sms_spam_dataset = sms_spam_dataset['train'].train_test_split(test_size=500, shuffle=True, seed=RANDOM_SEED)

Loading cached split indices for dataset at /root/.cache/huggingface/datasets/sms_spam/plain_text/1.0.0/53f051d3b5f62d99d61792c91acefe4f1577ad3e4c216fb0ad39e30b9f20019c/cache-49fa1f1338b1121b.arrow and /root/.cache/huggingface/datasets/sms_spam/plain_text/1.0.0/53f051d3b5f62d99d61792c91acefe4f1577ad3e4c216fb0ad39e30b9f20019c/cache-c1d02dfc7f23e5bf.arrow


In [19]:
splitted_sms_spam_dataset.shape

{'test': (500, 2), 'train': (5074, 2)}

In [21]:
# Unpack the two splits into separate names used by the rest of the notebook.
train_ds, test_ds = splitted_sms_spam_dataset['train'], splitted_sms_spam_dataset['test']

In [22]:
train_ds.shape, test_ds.shape

((5074, 2), (500, 2))

# Load small pretrained Language Model from huggingface transformers library

In [3]:
import transformers

In [105]:
# Load pretrained DistilBERT (cased) with a sequence-classification head
# configured for 2 labels.
model = transformers.AutoModelForSequenceClassification.from_pretrained("distilbert-base-cased", num_labels=2)

loading configuration file https://huggingface.co/distilbert-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/ebe1ea24d11aa664488b8de5b21e33989008ca78f207d4e30ec6350b693f073f.302bfd1b5e031cc1b17796e0b6e5b242ba2045d31d00f97589e12b458ebff27a
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.14.0",
  "vocab_size": 28996
}

loading weights file https://huggingface.co/distilbert-base-cased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/9c9f39769dba4c5fe379b4bc82973eb01297bd607954621434

In [39]:
# Load the tokenizer from the same checkpoint as the model so the vocabulary
# and special tokens ([CLS]/[SEP]) match.
tokenizer = transformers.AutoTokenizer.from_pretrained("distilbert-base-cased")

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [40]:
tokenizer('sample sentence')

{'input_ids': [101, 6876, 5650, 102], 'attention_mask': [1, 1, 1, 1]}

In [42]:
tokenizer.decode(tokenizer('sample sentence')['input_ids'])

'[CLS] sample sentence [SEP]'

# Convert Huggingface Dataset into ActiveLearningDataset

In [36]:
from baal.active.dataset.nlp_datasets import active_huggingface_dataset

In [43]:
train_ds[0]

{'label': 0, 'sms': 'Well I might not come then...\n'}

In [45]:
# Wrap the train split in a baal active-learning dataset: the 'sms' column is
# the tokenizer input and the 'label' column is the target.
active_set = active_huggingface_dataset(dataset=train_ds, tokenizer=tokenizer, target_key='label', input_key='sms')

# Define Active Learning Experiment Configurations

In [27]:
from dataclasses import dataclass

In [28]:
@dataclass
class ExperimentConfig:
    """Hyperparameters for the active-learning experiment.

    Every field has a default, so ``ExperimentConfig()`` gives the baseline
    setup. Several fields are not read by the cells visible in this notebook
    (the trainer and the active loop use literals instead); they are marked
    below.
    """

    # Training epochs per active-learning step (passed as `num_train_epochs`).
    # 4500 // 128 == 35; presumably pool size over query size — TODO confirm.
    epoch: int = 4500 // 128

    # Training batch size. Not referenced by the visible cells.
    batch_size: int = 32

    # Number of samples labelled at random before the first training round.
    initial_pool: int = 500

    # Samples to label per active-learning step. Not referenced by the visible cells.
    query_size: int = 128

    # Learning rate. Not referenced by the visible cells.
    lr: float = 0.001

    # Name of the acquisition heuristic handed to `get_heuristic`.
    heuristic: str = 'bald'

    # Presumably the number of stochastic prediction passes — TODO confirm.
    # Not referenced by the visible cells.
    iterations: int = 40

    # Number of train/query rounds executed by the active-learning loop.
    training_duration: int = 2

In [46]:
hyperparams = ExperimentConfig()

In [47]:
hyperparams

ExperimentConfig(epoch=35, batch_size=32, initial_pool=500, query_size=128, lr=0.001, heuristic='bald', iterations=40, training_duration=2)

In [56]:
# NOTE(review): the meaning of `can_label` is defined by baal; presumably
# False means the dataset already carries its targets and labelling only
# reveals them rather than overwriting — confirm against the baal docs.
active_set.can_label = False

In [57]:
# Seed the labelled pool with `initial_pool` (500) randomly chosen samples.
# NOTE(review): this cell sits before the cell that seeds `random`/`torch`,
# so the selection may differ between runs — confirm which RNG baal uses here.
active_set.label_randomly(hyperparams.initial_pool)

In [58]:
active_set.n_labelled

500

In [59]:
active_set[0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'input_ids': tensor([ 101, 6502,  119,  197,  197, 6346,  136,  197,  197,  102,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0

In [106]:
import random
import torch

In [109]:
# Detect GPU availability and seed the Python and PyTorch RNGs.
use_cuda = torch.cuda.is_available()
# NOTE(review): PyTorch's reproducibility notes warn that cuDNN benchmark mode
# may select different algorithms between runs — confirm this is acceptable
# alongside the fixed seeds below.
torch.backends.cudnn.benchmark = True
# NOTE(review): in notebook order the seeds are set after the model was
# created and after the initial random labelling, so those steps are not
# covered by these seeds.
random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

if not use_cuda:
    print("warning, the experiments would take ages to run on cpu")

# Display whether CUDA is in use as the cell output.
use_cuda

True

In [110]:
# Our dataset sizes. len(active_set) counts only the currently labelled
# samples (500 here), not the full 5074-row training pool.
len(active_set), len(test_ds)

(500, 500)

In [112]:
from baal.active import get_heuristic

In [111]:
# Build the acquisition heuristic named in the config ('bald' by default).
heuristic = get_heuristic(hyperparams.heuristic)

In [114]:
from baal.bayesian.dropout import patch_module
from copy import deepcopy

In [115]:
# Replace the model's dropout layers with baal's MCDropout variants
# (presumably so dropout stays active at prediction time — confirm in baal docs).
model = patch_module(model)

In [116]:
# Move the model to the GPU when one is available.
if use_cuda:
    model.cuda()
# Snapshot the starting weights; the training loop reloads them after every
# active-learning step. deepcopy keeps the snapshot independent of later updates.
init_weights = deepcopy(model.state_dict())

In [117]:
from transformers import TrainingArguments
from baal.transformers_trainer_wrapper import BaalTransformersTrainer
from baal.active.active_loop import ActiveLearningLoop

In [118]:
# Initialization for the Hugging Face trainer.
# NOTE(review): the train batch size is hard-coded to 16 and no learning rate
# is passed, so `hyperparams.batch_size` (32) and `hyperparams.lr` (0.001) are
# not used here — confirm whether that is intentional.
# NOTE(review): checkpoints and logs are written to the current working
# directory (the run log shows ./checkpoint-500 and ./checkpoint-1000).
training_args = TrainingArguments(
    output_dir='.',  # output directory
    num_train_epochs=hyperparams.epoch,  # total # of training epochs per AL step
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,  # batch size for evaluation
    weight_decay=0.01,  # strength of weight decay
    logging_dir='.',  # directory for storing logs
    )

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [119]:
# Create the trainer through the baal wrapper around the Hugging Face Trainer.
# It trains on `active_set`, i.e. on the currently labelled samples.
# tokenizer=None: items of `active_set` already contain `input_ids` and
# `attention_mask` tensors, so presumably no tokenizer is needed here.
baal_trainer = BaalTransformersTrainer(model=model,
                                       args=training_args,
                                       train_dataset=active_set,
                                       tokenizer=None)

In [120]:
# Build the active-learning loop: predictions on the unlabelled pool come from
# `baal_trainer.predict_on_dataset` and are ranked by `heuristic`.
# The positional 10 is the number of samples labelled per step (the labelled
# set grows 500 -> 510 -> 520 in the run below).
# `iterations=3` is presumably forwarded to the prediction function as the
# number of stochastic passes — TODO confirm.
# NOTE(review): `hyperparams.query_size` (128) and `hyperparams.iterations` (40)
# are not used here; the literals 10 and 3 are.
active_loop = ActiveLearningLoop(active_set,
                                 baal_trainer.predict_on_dataset,
                                 heuristic, 10, iterations=3)

In [123]:
# Active-learning rounds: train on the labelled set, query new samples, reset.
# Note that `epoch` here counts active-learning steps, not training epochs.
for epoch in range(hyperparams.training_duration):
    baal_trainer.train()

    # Run one query step; its return value decides whether to keep going.
    should_continue = active_loop.step()

    # We reset the model weights to relearn from the new train set.
    # NOTE(review): the reset also runs after the final step, so the model
    # presumably leaves this loop holding its initial weights — confirm before
    # saving or evaluating it.
    baal_trainer.load_state_dict(init_weights)
    # Drop the scheduler, presumably so the next train() call builds a fresh one — TODO confirm.
    baal_trainer.lr_scheduler = None
    if not should_continue:
        break

# Each active-learning step adds 10 samples to the labelled data. With
# training_duration=2 that is 20 new samples, so the labelled set grows from 500 to 520.
print(len(active_set))

***** Running training *****
  Num examples = 500
  Num Epochs = 35
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1120


Step,Training Loss
500,0.0232
1000,0.0


Saving model checkpoint to ./checkpoint-500
Configuration saved in ./checkpoint-500/config.json
Model weights saved in ./checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./checkpoint-1000
Configuration saved in ./checkpoint-1000/config.json
Model weights saved in ./checkpoint-1000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




[303-MainThread  ] [baal.transformers_trainer_wrapper:predict_on_dataset_generator:67] [2m2021-12-15T18:52:57.916156Z[0m [[32m[1minfo     [0m] [1mStart Predict                 [0m [36mdataset[0m=[35m4574[0m


100%|██████████| 72/72 [01:40<00:00,  1.39s/it]
***** Running training *****
  Num examples = 510
  Num Epochs = 35
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1120


Step,Training Loss
500,0.0304
1000,0.0


Saving model checkpoint to ./checkpoint-500
Configuration saved in ./checkpoint-500/config.json
Model weights saved in ./checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./checkpoint-1000
Configuration saved in ./checkpoint-1000/config.json
Model weights saved in ./checkpoint-1000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




[303-MainThread  ] [baal.transformers_trainer_wrapper:predict_on_dataset_generator:67] [2m2021-12-15T19:01:32.180269Z[0m [[32m[1minfo     [0m] [1mStart Predict                 [0m [36mdataset[0m=[35m4564[0m


100%|██████████| 72/72 [01:40<00:00,  1.39s/it]

520





In [124]:
# Persist the experiment state: model weights, the active-learning dataset
# state and the labelling progress, all in a single checkpoint file.
# NOTE(review): the training loop reloads `init_weights` after every step,
# including the last one, so `model` presumably holds the initial weights
# here — confirm that this is what should be checkpointed.
model_weight = model.state_dict()
dataset = active_set.state_dict()

# No earlier cell in this notebook defines `labelling_progress`, so on a fresh
# kernel (Restart & Run All) the original cell raised NameError. A value
# defined elsewhere is kept as-is; otherwise fall back to the number of
# labelled samples so the checkpoint can still be written.
try:
    labelling_progress
except NameError:
    labelling_progress = active_set.n_labelled

torch.save({'model':model_weight, 'dataset':dataset, 'labelling_progress':labelling_progress},
           'checkpoint.pth')
print(model_weight.keys(), dataset.keys(), labelling_progress)

odict_keys(['distilbert.embeddings.word_embeddings.weight', 'distilbert.embeddings.position_embeddings.weight', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.0.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.k_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.0.sa_layer_norm.weight', 'distilbert.transformer.layer.0.sa_layer_norm.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transformer.layer.0.ffn.lin2.weight', 'distilbert.transformer.layer.0.ffn.lin2.bias', 'distilbert.transformer.layer.0.output_layer_norm.