In [None]:
! pip install modAL

In [None]:
%%bash
# Install skorch from a source checkout.
# Stop at the first failing command instead of carrying on with later steps.
set -euo pipefail

# Clone only when the checkout is missing so the cell can be re-run safely.
if [ ! -d skorch ]; then
    git clone https://github.com/skorch-dev/skorch.git
fi
cd skorch
python -m pip install -r requirements.txt
python -m pip install .

In [2]:
import datasets
from modAL.models import ActiveLearner
import numpy as np
import torch
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from skorch import NeuralNetClassifier
from skorch.callbacks import LRScheduler, ProgressBar
from skorch.hf import HuggingfacePretrainedTokenizer
from torch import nn
from torch.optim.lr_scheduler import LambdaLR
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

In [3]:
# Configs
DATASET = "bergr7/weakly_supervised_ag_news"  # name passed to datasets.load_dataset
TRANSFORMER_MODEL = "distilbert-base-uncased"  # checkpoint used by tokenizer and model
# Class names are read from the label feature of the ag_news train split.
LABELS = datasets.load_dataset('ag_news')["train"].features["label"].names
NUM_SAMPLES = 3

# model hyper-parameters
OPTIMIZER = torch.optim.AdamW
# Original (misspelled) name, kept as an alias so existing references keep working.
OPTMIZER = OPTIMIZER
LR = 5e-5
MAX_EPOCHS = 3
CRITERION = nn.CrossEntropyLoss
BATCH_SIZE = 8

# device
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

Using custom data configuration default
Found cached dataset ag_news (/root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Display which device string was selected in the config cell.
DEVICE

'cuda'

In [6]:
# Load the dataset named by DATASET; no split is requested, so the result
# holds every available split.
ag_news_data = datasets.load_dataset(DATASET)

Using custom data configuration bergr7--weakly_supervised_ag_news-4dbef0c6a5e5ed58
Found cached dataset csv (/root/.cache/huggingface/datasets/bergr7___csv/bergr7--weakly_supervised_ag_news-4dbef0c6a5e5ed58/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
# Display the loaded splits with their feature names and row counts.
ag_news_data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 37340
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 24000
    })
})

In [9]:
# Step budget for the learning-rate schedule: MAX_EPOCHS times the number of
# batches per pass over the train split (floor division plus one).
# NOTE(review): the net in the pipeline below is built with max_epochs=1, not
# MAX_EPOCHS, so this budget is larger than the steps actually run -- confirm
# which value is intended.
num_training_steps = MAX_EPOCHS * (ag_news_data['train'].num_rows // BATCH_SIZE + 1)

def lr_schedule(current_step):
    """Return the linear-decay multiplier applied to the base learning rate.

    The multiplier falls linearly from 1.0 at step 0 to 0.0 at
    ``num_training_steps`` (a module-level value read at call time) and is
    clamped at 0.0 for any later step.

    Parameters
    ----------
    current_step : int
        Number of scheduler steps taken so far.

    Returns
    -------
    float
        Multiplier in the range [0.0, 1.0] for non-negative ``current_step``.
    """
    remaining_steps = num_training_steps - current_step
    # max(1, ...) guards against division by zero when the step budget is 0.
    factor = float(remaining_steps) / float(max(1, num_training_steps))
    # Clamp instead of asserting: an assert would abort training on the very
    # step that reaches the budget, and asserts are stripped under ``-O``.
    return max(0.0, factor)

In [11]:
class BertModule(nn.Module):
    """Thin ``nn.Module`` wrapper around a pretrained sequence classifier.

    The wrapped model is created with
    ``AutoModelForSequenceClassification.from_pretrained`` and ``forward``
    returns only the ``logits`` attribute of its output.

    Parameters
    ----------
    name
        Checkpoint identifier forwarded to ``from_pretrained``.
    num_labels
        Forwarded to ``from_pretrained`` as ``num_labels``.
    """

    def __init__(self, name, num_labels):
        super().__init__()
        self.name, self.num_labels = name, num_labels
        # Build the wrapped model right away so the module is usable after init.
        self.reset_weights()

    def reset_weights(self):
        """(Re)load the pretrained model into ``self.bert``."""
        pretrained = AutoModelForSequenceClassification.from_pretrained(
            self.name, num_labels=self.num_labels
        )
        self.bert = pretrained

    def forward(self, **kwargs):
        """Pass all keyword arguments to the wrapped model; return its logits."""
        return self.bert(**kwargs).logits

In [14]:
# Two-step pipeline: a pretrained tokenizer followed by a skorch classifier
# that wraps BertModule.
pipeline = Pipeline([
    ('tokenizer', HuggingfacePretrainedTokenizer(TRANSFORMER_MODEL)),
    ('net', NeuralNetClassifier(
        BertModule,
        module__name=TRANSFORMER_MODEL,
        # Size the classification head from the label names instead of a
        # hard-coded 4 so it stays in sync with LABELS.
        module__num_labels=len(LABELS),
        optimizer=OPTMIZER,
        lr=LR,
        # NOTE(review): a single epoch is used here although MAX_EPOCHS feeds
        # num_training_steps for the LR schedule -- confirm this is intended.
        max_epochs=1,
        criterion=CRITERION,
        batch_size=BATCH_SIZE,
        iterator_train__shuffle=True,
        device=DEVICE,
        callbacks=[
            # The schedule is stepped per batch, so lr_schedule receives a
            # batch-level step counter.
            LRScheduler(LambdaLR, lr_lambda=lr_schedule, step_every='batch'),
            ProgressBar(),
        ],
    )),
])

In [38]:
# Training inputs are the 'text' column of the train split; targets are the
# 'label' column converted to a NumPy array.
X_train = ag_news_data['train']['text']
y_train = np.array(ag_news_data['train']['label'])

In [24]:
# Seed every random number generator used below from one constant so the
# value only needs changing in a single place.
SEED = 0
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)

In [39]:
# Fit the full pipeline on the training data; %time reports how long the call took.
%time pipeline.fit(X_train, y_train)

Re-initializing module because the following parameters were re-set: name, num_labels.


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

Re-initializing criterion.
Re-initializing optimizer.


  0%|          | 0/4668 [00:00<?, ?it/s]

  epoch    train_loss    valid_acc    valid_loss       dur
-------  ------------  -----------  ------------  --------
      1        [36m0.0955[0m       [32m0.9877[0m        [35m0.0413[0m  201.0551
CPU times: user 3min 35s, sys: 23.5 s, total: 3min 59s
Wall time: 3min 26s
