In [1]:
import transformers
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments, AdamW
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm

## Preprocessing

In [2]:
dataset = load_dataset("imdb")

Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [4]:
train_texts, train_labels = dataset['train']['text'], dataset['train']['label']
test_texts, test_labels = dataset['test']['text'], dataset['test']['label']

In [5]:
len(train_texts), len(test_texts)

(25000, 25000)

In [6]:
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)

In [7]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [8]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [9]:
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask'])

In [10]:
len(train_encodings['input_ids']), len(train_encodings['attention_mask'])

(20000, 20000)

In [11]:
print(train_encodings['input_ids'][0])

[101, 13970, 3367, 9496, 2912, 2081, 2009, 2153, 1012, 2178, 17743, 1012, 1037, 11034, 4038, 2440, 1997, 2010, 2219, 16209, 1010, 2007, 1037, 10424, 8625, 4588, 6348, 1998, 2116, 14013, 5312, 1010, 2057, 4191, 1998, 4191, 1010, 2054, 1037, 2283, 999, 1996, 2189, 2003, 7249, 1010, 1998, 2036, 1996, 5008, 1010, 1996, 4176, 1010, 1996, 4689, 21123, 1010, 3348, 1998, 6429, 11721, 28682, 1998, 21644, 1010, 2673, 14231, 2135, 5107, 2000, 20432, 2069, 1012, 5760, 5988, 1999, 11305, 1012, 1037, 6919, 3325, 2000, 3422, 1012, 1998, 2028, 2003, 11974, 8794, 2144, 2204, 22092, 2024, 2061, 4678, 1010, 1998, 2061, 6919, 1012, 2092, 1010, 2023, 2003, 2028, 1010, 1998, 2065, 2017, 5632, 13970, 3367, 9496, 3540, 1005, 1055, 3025, 3152, 1010, 2017, 1005, 2222, 2293, 2023, 1010, 2348, 1010, 2004, 1999, 2035, 22092, 1010, 2009, 2003, 2055, 1037, 5072, 4668, 1010, 1998, 2017, 2031, 2000, 2022, 1999, 1996, 6888, 2005, 2009, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [12]:
print(train_encodings['attention_mask'][0])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

## Custom Dataset

In [13]:
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

In [14]:
print(next(iter(train_dataset)))

{'input_ids': tensor([  101, 13970,  3367,  9496,  2912,  2081,  2009,  2153,  1012,  2178,
        17743,  1012,  1037, 11034,  4038,  2440,  1997,  2010,  2219, 16209,
         1010,  2007,  1037, 10424,  8625,  4588,  6348,  1998,  2116, 14013,
         5312,  1010,  2057,  4191,  1998,  4191,  1010,  2054,  1037,  2283,
          999,  1996,  2189,  2003,  7249,  1010,  1998,  2036,  1996,  5008,
         1010,  1996,  4176,  1010,  1996,  4689, 21123,  1010,  3348,  1998,
         6429, 11721, 28682,  1998, 21644,  1010,  2673, 14231,  2135,  5107,
         2000, 20432,  2069,  1012,  5760,  5988,  1999, 11305,  1012,  1037,
         6919,  3325,  2000,  3422,  1012,  1998,  2028,  2003, 11974,  8794,
         2144,  2204, 22092,  2024,  2061,  4678,  1010,  1998,  2061,  6919,
         1012,  2092,  1010,  2023,  2003,  2028,  1010,  1998,  2065,  2017,
         5632, 13970,  3367,  9496,  3540,  1005,  1055,  3025,  3152,  1010,
         2017,  1005,  2222,  2293,  2023,  1010, 

## Fine-tuning with Trainer

In [15]:
# training_args = TrainingArguments(
#     output_dir='./results',          # output directory
#     num_train_epochs=3,              # total number of training epochs
#     per_device_train_batch_size=16,  # batch size per device during training
#     per_device_eval_batch_size=64,   # batch size for evaluation
#     warmup_steps=500,                # number of warmup steps for learning rate scheduler
#     weight_decay=0.01,               # strength of weight decay
#     logging_dir='./logs',            # directory for storing logs
#     logging_steps=10,
# )

# model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

# trainer = Trainer(
#     model=model,                         # the instantiated 🤗 Transformers model to be trained
#     args=training_args,                  # training arguments, defined above
#     train_dataset=train_dataset,         # training dataset
#     eval_dataset=val_dataset             # evaluation dataset
# )

# trainer.train()

## Fine-tuning with PyTorch

In [16]:
epoch = 3
batch_size = 16

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
optim = AdamW(model.parameters(), lr=5e-5)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier

In [17]:
import numpy as np

In [18]:
model.train()
counter = 0
print_every = 125

for e in range(epoch):
    for batch in tqdm(train_loader):
        #counter += 1
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()

        # loss stats
        if counter % print_every == 0:

            # Get validation loss
            with torch.no_grad():
                val_losses = []
                model.eval()
                for batch in val_loader:
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    labels = batch['labels'].to(device)
                    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                    val_loss = outputs[0]

                    val_losses.append(val_loss.item())

            model.train()
            print("Epoch: {}/{}...".format(e+1, epoch),
                    "Step: {}...".format(counter),
                    "Loss: {:.6f}...".format(loss.item()),
                    "Val Loss: {:.6f}".format(np.mean(val_losses)))
            torch.cuda.empty_cache()
        counter += 1

  0%|          | 1/1250 [02:54<60:35:27, 174.64s/it]

Epoch: 1/3... Step: 0... Loss: 0.679406... Val Loss: 0.706733


 10%|█         | 126/1250 [08:54<16:38:37, 53.31s/it]

Epoch: 1/3... Step: 125... Loss: 0.316720... Val Loss: 0.321483


 20%|██        | 251/1250 [14:53<14:47:30, 53.30s/it]

Epoch: 1/3... Step: 250... Loss: 0.212398... Val Loss: 0.257095


 30%|███       | 375/1250 [17:59<21:46,  1.49s/it]

Epoch: 1/3... Step: 375... Loss: 0.420270... Val Loss: 0.241729


 40%|████      | 501/1250 [26:52<11:05:48, 53.34s/it]

Epoch: 1/3... Step: 500... Loss: 0.204247... Val Loss: 0.239940


 50%|█████     | 625/1250 [29:58<15:36,  1.50s/it]

Epoch: 1/3... Step: 625... Loss: 0.218531... Val Loss: 0.259758


 60%|██████    | 750/1250 [35:57<12:24,  1.49s/it]

Epoch: 1/3... Step: 750... Loss: 0.514910... Val Loss: 0.228883


 70%|███████   | 876/1250 [44:51<5:32:21, 53.32s/it]

Epoch: 1/3... Step: 875... Loss: 0.304450... Val Loss: 0.223661


 80%|████████  | 1001/1250 [50:51<3:41:17, 53.32s/it]

Epoch: 1/3... Step: 1000... Loss: 0.638322... Val Loss: 0.240005


 90%|█████████ | 1126/1250 [56:51<1:50:12, 53.32s/it]

Epoch: 1/3... Step: 1125... Loss: 0.263447... Val Loss: 0.209437


100%|██████████| 1250/1250 [59:56<00:00,  2.88s/it]
  0%|          | 0/1250 [00:00<?, ?it/s]

Epoch: 2/3... Step: 1250... Loss: 0.211591... Val Loss: 0.235009


 10%|█         | 126/1250 [08:53<16:39:16, 53.34s/it]

Epoch: 2/3... Step: 1375... Loss: 0.045247... Val Loss: 0.247071


 20%|██        | 251/1250 [14:53<14:48:28, 53.36s/it]

Epoch: 2/3... Step: 1500... Loss: 0.155732... Val Loss: 0.250328


 30%|███       | 375/1250 [17:58<21:45,  1.49s/it]

Epoch: 2/3... Step: 1625... Loss: 0.330825... Val Loss: 0.316729


 40%|████      | 501/1250 [26:53<11:05:55, 53.34s/it]

Epoch: 2/3... Step: 1750... Loss: 0.024369... Val Loss: 0.267838


 50%|█████     | 625/1250 [29:58<15:37,  1.50s/it]

Epoch: 2/3... Step: 1875... Loss: 0.350336... Val Loss: 0.436761


 60%|██████    | 750/1250 [35:58<12:25,  1.49s/it]

Epoch: 2/3... Step: 2000... Loss: 0.167020... Val Loss: 0.254176


 70%|███████   | 875/1250 [41:57<09:21,  1.50s/it]

Epoch: 2/3... Step: 2125... Loss: 0.069914... Val Loss: 0.240855


 80%|████████  | 1000/1250 [47:57<06:14,  1.50s/it]

Epoch: 2/3... Step: 2250... Loss: 0.033023... Val Loss: 0.236016


 90%|█████████ | 1125/1250 [53:54<03:04,  1.48s/it]

Epoch: 2/3... Step: 2375... Loss: 0.149282... Val Loss: 0.232959


100%|██████████| 1250/1250 [59:50<00:00,  2.87s/it]
  0%|          | 1/1250 [02:52<59:47:05, 172.32s/it]

Epoch: 3/3... Step: 2500... Loss: 0.041254... Val Loss: 0.237078


 10%|█         | 125/1250 [05:56<27:48,  1.48s/it]

Epoch: 3/3... Step: 2625... Loss: 0.090395... Val Loss: 0.275787


 20%|██        | 251/1250 [14:44<14:38:41, 52.77s/it]

Epoch: 3/3... Step: 2750... Loss: 0.078724... Val Loss: 0.288673


 30%|███       | 376/1250 [20:41<12:49:39, 52.84s/it]

Epoch: 3/3... Step: 2875... Loss: 0.062338... Val Loss: 0.255020


 40%|████      | 500/1250 [23:45<18:30,  1.48s/it]

Epoch: 3/3... Step: 3000... Loss: 0.031759... Val Loss: 0.283907


 50%|█████     | 626/1250 [32:34<9:09:09, 52.80s/it]

Epoch: 3/3... Step: 3125... Loss: 0.018721... Val Loss: 0.258837


 60%|██████    | 751/1250 [38:31<7:19:14, 52.81s/it]

Epoch: 3/3... Step: 3250... Loss: 0.008591... Val Loss: 0.271964


 70%|███████   | 876/1250 [44:27<5:28:53, 52.76s/it]

Epoch: 3/3... Step: 3375... Loss: 0.121791... Val Loss: 0.284998


 80%|████████  | 1000/1250 [47:31<06:11,  1.49s/it]

Epoch: 3/3... Step: 3500... Loss: 0.027951... Val Loss: 0.312407


 90%|█████████ | 1125/1250 [53:27<03:05,  1.49s/it]

Epoch: 3/3... Step: 3625... Loss: 0.006860... Val Loss: 0.314242


100%|██████████| 1250/1250 [59:24<00:00,  2.85s/it]


In [26]:
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

acc = []

with torch.no_grad():
    model.eval()
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        preds = torch.argmax(outputs[1], dim=1)
        acc.append(preds==labels)


In [30]:
l = [item for sublist in acc for item in sublist]

In [33]:
sum(l)/len(l)

tensor(0.9132, device='cuda:0')

In [24]:
preds

tensor([[ 3.1015, -2.8244],
        [-2.4668,  2.7631],
        [-2.7337,  3.0756],
        [ 3.9998, -3.7140],
        [ 4.2666, -3.9509],
        [-3.0056,  3.3769],
        [-2.5205,  2.8736],
        [-3.0279,  3.4406],
        [ 2.8228, -2.6343],
        [ 4.0771, -3.7320],
        [-2.7734,  3.1581],
        [ 4.3178, -3.9619],
        [-1.1888,  1.2081],
        [-2.7504,  3.1162],
        [-0.8515,  0.9962],
        [ 3.9523, -3.6329]], device='cuda:0')