In [1]:
# Transformers installation
#! pip install transformers datasets
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git


In [2]:
import pathlib
import sklearn
import datasets
import pandas as pd
import torch

import numpy as np
import transformers

In [3]:
dataset_path = 'datasets/imdb'

In [4]:
raw_datasets = datasets.load_from_disk(dataset_path)
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})

In [5]:
raw_datasets['train'].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], names_file=None, id=None)}

In [6]:
raw_datasets['train'][0]

{'text': 'This film is terrible. The story concerns a woman trying to find out what has happened to her sister. The film struggles with its identity, lurching from Noir/thriller to erotic, with elements of horror thrown in for good measure. The film has a very confused structure, for example with frequent use of flashbacks without tying these into the story. The plot is poorly developed, and the characterisation made it difficult to distinguish between who was who and the part they were playing. Some implausibilities exist in many films, but the scene where the main protagonist willingly accompanies a virtual stranger to his home, then agrees to go upstairs alone (to where he says she will find a phone), minus the gun she had brought with her, to call the Police, was too hard to believe. Some of the cinematography is very poor: we were watching on a 42" TV so how anyone with a smaller set could work out what was happening in the scenes taken in almost complete darkness is beyond me. Ov

In [7]:
labels = raw_datasets['train'].features['label'].names
labels

['neg', 'pos']

In [8]:
tokenizer = transformers.AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [9]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Loading cached processed dataset at datasets/imdb/train/cache-0a48eff5a4233792.arrow


  0%|          | 0/25 [00:00<?, ?ba/s]

Loading cached processed dataset at datasets/imdb/valid/cache-29ae3ce3a2e3c446.arrow


In [10]:
subset = 1_000
subset = 25_000
subset = 1_000


train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(subset)) 
eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(subset)) 


Loading cached shuffled indices for dataset at datasets/imdb/train/cache-a1f5964dd7b7ee9a.arrow


In [11]:
model = transformers.AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(labels))

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.w

In [12]:

training_args = transformers.TrainingArguments("test-trainer"
                                  , evaluation_strategy="epoch"
                                  , save_strategy="epoch"
                                  , logging_steps=100
                                 # , eval_steps=100
                                  , load_best_model_at_end=True
                                 
                                 )

In [13]:
metric = datasets.load_metric("accuracy")


In [14]:


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [15]:

trainer = transformers.Trainer(
    model=model, 
    args=training_args, 
    train_dataset=train_dataset, 
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    

)




In [16]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 1000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 375
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mnpatta01[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4717,0.430448,0.831
2,0.3066,0.407093,0.871
3,0.1775,0.404263,0.897


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to test-trainer/checkpoint-125
Configuration saved in test-trainer/checkpoint-125/config.json
Model weights saved in test-trainer/checkpoint-125/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to test-trainer/checkpoint-250
Configuration saved in test-trainer/checkpoint-250/config.json
Model weights saved in test-trainer/checkpoint-250/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
***** Run

TrainOutput(global_step=375, training_loss=0.26826307169596353, metrics={'train_runtime': 214.5529, 'train_samples_per_second': 13.983, 'train_steps_per_second': 1.748, 'total_flos': 397402195968000.0, 'train_loss': 0.26826307169596353, 'epoch': 3.0})

In [17]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


{'eval_loss': 0.4042629897594452,
 'eval_accuracy': 0.897,
 'eval_runtime': 17.0668,
 'eval_samples_per_second': 58.593,
 'eval_steps_per_second': 7.324,
 'epoch': 3.0}

In [18]:
tokenizer.encode_plus('men shoes', return_token_type_ids = True, return_attention_mask=True)

{'input_ids': [101, 2273, 6007, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [19]:
tokenizer.convert_ids_to_tokens([101, 2273, 6007, 102])

['[CLS]', 'men', 'shoes', '[SEP]']

# Saving artifacts

In [20]:
?model.save_pretrained

[0;31mSignature:[0m
[0mmodel[0m[0;34m.[0m[0msave_pretrained[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0msave_directory[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mos[0m[0;34m.[0m[0mPathLike[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msave_config[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstate_dict[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mdict[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msave_function[0m[0;34m:[0m [0mCallable[0m [0;34m=[0m [0;34m<[0m[0mfunction[0m [0msave[0m [0mat[0m [0;36m0x7f411729a680[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpush_to_hub[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m**[0m[0mkwargs[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m


In [21]:
model_dir ='artifacts'

In [22]:
trainer.save_model(model_dir)

Saving model checkpoint to artifacts
Configuration saved in artifacts/config.json
Model weights saved in artifacts/pytorch_model.bin


In [23]:
tokenizer.save_pretrained(model_dir)

tokenizer config file saved in artifacts/tokenizer_config.json
Special tokens file saved in artifacts/special_tokens_map.json


('artifacts/tokenizer_config.json',
 'artifacts/special_tokens_map.json',
 'artifacts/vocab.txt',
 'artifacts/added_tokens.json',
 'artifacts/tokenizer.json')

In [26]:
model2 = transformers.AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=len(labels))

loading configuration file artifacts/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.11.1",
  "vocab_size": 30522
}

loading weights file artifacts/pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at artifacts.
If your task is similar to the task the model of 

# Loading Artifacts

In [28]:
tokenizer2 = transformers.AutoTokenizer.from_pretrained(
                model_dir
            )

Didn't find file artifacts/added_tokens.json. We won't load it.
loading file artifacts/vocab.txt
loading file artifacts/tokenizer.json
loading file None
loading file artifacts/special_tokens_map.json
loading file artifacts/tokenizer_config.json


# Predicting on new example

In [29]:
tokenizer2('men shoes')

{'input_ids': [101, 2273, 6007, 102], 'attention_mask': [1, 1, 1, 1]}

In [30]:
query = 'this movie sucks'
query = 'this movie is awesome'

In [31]:
res = tokenizer2.encode_plus(query, return_tensors="pt")

In [32]:
res

{'input_ids': tensor([[  101,  2023,  3185,  2003, 12476,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [33]:
model_res = model2(**res)
model_res

SequenceClassifierOutput(loss=None, logits=tensor([[-2.4571,  2.8177]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)

In [34]:
model_res[0]

tensor([[-2.4571,  2.8177]], grad_fn=<AddmmBackward>)

In [35]:
list ( zip (labels , torch.softmax(model_res.logits, dim=1).tolist()[0] ) )

[('neg', 0.005093016661703587), ('pos', 0.9949069619178772)]

# FIN 

<a id='additional-resources'></a>

## Additional resources

To look at more fine-tuning examples you can refer to:

- [🤗 Transformers Examples](https://github.com/huggingface/transformers/tree/master/examples) which includes scripts
  to train on all common NLP tasks in PyTorch and TensorFlow.

- [🤗 Transformers Notebooks](https://huggingface.co/transformers/notebooks.html) which contains various notebooks and in particular one per task (look
  for the *how to finetune a model on xxx*).