In [1]:
# move into the project folder; the fine-tuned model is later saved to and reloaded from `custom_model` inside it
# NOTE(review): assumes Google Drive is already mounted at /content/drive -- the mounting step is not shown in this notebook
%cd /content/drive/MyDrive/Code_challenge

/content/drive/MyDrive/Code_challenge


In [2]:
# install hugging face datasets module
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.3.2-py3-none-any.whl (362 kB)
[K     |████████████████████████████████| 362 kB 24.1 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 68.5 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 57.5 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 61.4 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |███████████████████████████

In [3]:
# install transformers library
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.0-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 30.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 43.5 MB/s 
Installing collected packages: tokenizers, transformers
Successfully installed tokenizers-0.12.1 transformers-4.20.0


In [4]:
# import necessary packages
import transformers, pandas as pd
from datasets import Dataset, load_metric
import numpy as np
from transformers import AutoTokenizer, DataCollatorWithPadding, TrainingArguments, AutoModelForSequenceClassification, Trainer
from transformers import pipeline

### Prepare the Dataset

In [5]:
# the eleven example e-mail sentences used as the (entire) training corpus;
# their order must stay aligned with `mail_labels`
sample_mails = [
    "I will share your email",
    "I shall share your email",
    "I've shared your email",
    "May I share your email",
    "Should I share your email",
    "I already shared the email",
    "I've just shared your email",
    "Am I allowed to share your email",
    "Am I able to share your email",
    "I am able to share your email",
    "Will you help my friends if I share your email with them?",
]

In [6]:
# the two target classes, spelled out once so every sample uses identical label text
WANTS_TO_SHARE = "Student wants to know if can share"
HAS_SHARED = "Student has shared"

# one hand-assigned label per entry of `sample_mails`, in the same order
mail_labels = [
    WANTS_TO_SHARE,  # I will share your email
    WANTS_TO_SHARE,  # I shall share your email
    HAS_SHARED,      # I've shared your email
    WANTS_TO_SHARE,  # May I share your email
    WANTS_TO_SHARE,  # Should I share your email
    HAS_SHARED,      # I already shared the email
    HAS_SHARED,      # I've just shared your email
    WANTS_TO_SHARE,  # Am I allowed to share your email
    WANTS_TO_SHARE,  # Am I able to share your email
    HAS_SHARED,      # I am able to share your email -- NOTE(review): confirm this label is intended
    WANTS_TO_SHARE,  # Will you help my friends if I share your email with them?
]

In [7]:
# pair every sample mail with its label in a two-column pandas DataFrame
df = pd.DataFrame(
    {
        'sample_mails': sample_mails,
        'labels': mail_labels,
    }
)

# display the frame to check the pairing
df

Unnamed: 0,sample_mails,labels
0,I will share your email,Student wants to know if can share
1,I shall share your email,Student wants to know if can share
2,I've shared your email,Student has shared
3,May I share your email,Student wants to know if can share
4,Should I share your email,Student wants to know if can share
5,I already shared the email,Student has shared
6,I've just shared your email,Student has shared
7,Am I allowed to share your email,Student wants to know if can share
8,Am I able to share your email,Student wants to know if can share
9,I am able to share your email,Student has shared


In [8]:
# wrap the DataFrame in a Hugging Face `Dataset`, the format the rest of the notebook works with
dataset = Dataset.from_pandas(df)

# confirm the conversion produced a `Dataset` object
type(dataset)

datasets.arrow_dataset.Dataset

### Preprocessing the dataset

In [9]:
# encode the string labels as integer class ids (the `labels` column becomes a ClassLabel feature)
# NOTE(review): `dataset` is rebound here, so re-running this cell acts on the already-encoded
# column -- confirm that is safe, or restart the kernel and run all cells in order
dataset = dataset.class_encode_column('labels')



Casting to class labels:   0%|          | 0/1 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

In [10]:
# look at one encoded sample to see which integer its (known) label was mapped to
dataset[2]

{'labels': 0, 'sample_mails': "I've shared your email"}

From the above output, we see that the integer `0` encodes the label `Student has shared` (sample 2 was labelled that way); therefore the integer `1` encodes `Student wants to know if can share`.

In [11]:
# inspect the feature schema; the ClassLabel `names` list shows the label text behind each integer id
dataset.features

{'labels': ClassLabel(num_classes=2, names=['Student has shared', 'Student wants to know if can share'], id=None),
 'sample_mails': Value(dtype='string', id=None)}

### Tokenization

In [12]:
# name of the pretrained checkpoint; the same name is reused later to load the model,
# so tokenizer and model always come from the same checkpoint
checkpoint = "bert-base-uncased"

# load the tokenizer that belongs to that checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [13]:
# helper passed to `Dataset.map` to tokenize the `sample_mails` column
def tokenize_function(example):
    """Tokenize the text stored under ``example["sample_mails"]``.

    Uses the notebook-level ``tokenizer`` with truncation enabled and returns
    whatever the tokenizer returns for that input. No padding is applied here;
    padding is left to the data collator at batch time.
    """
    mail_texts = example["sample_mails"]
    return tokenizer(mail_texts, truncation=True)

In [14]:
# tokenize the whole dataset; with batched=True the function receives many rows per call
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# display the result to check which columns tokenization added
tokenized_datasets

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['sample_mails', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 11
})

In [15]:
# apply dynamic padding -- when samples are batched together, pad each one to the length of the longest sample in that batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

To test the data collator, we take the samples that we would like to batch together — here, the whole dataset. We drop the `sample_mails` column, since it contains strings and we can't create tensors from strings (the filter below also lists `idx`, a column this dataset does not have, so that entry has no effect), and then have a look at the lengths of each entry in the batch:

In [16]:
# columns that must not reach the collator (string columns cannot become tensors)
excluded_columns = ["idx", "sample_mails"]

# take every row of the tokenized dataset and keep only the remaining columns
samples = {
    name: column
    for name, column in tokenized_datasets[:].items()
    if name not in excluded_columns
}

# token count of each sample before padding
[len(token_ids) for token_ids in samples["input_ids"]]

[7, 7, 8, 7, 7, 7, 9, 9, 9, 9, 15]

No surprise, we get samples of varying length, from 7 to 15. Dynamic padding means the samples in this batch should all be padded to a length of 15, the maximum length inside the batch. Without dynamic padding, all of the samples would have to be padded to the maximum length in the whole dataset, or the maximum length the model can accept. (Here the batch is the whole dataset, so the batch maximum and the dataset maximum are both 15; the benefit shows up when the data is split into smaller batches.) Let's double-check that our `data_collator` is dynamically padding the batch properly:

In [17]:
# collate the samples into one padded batch
batch = data_collator(samples)
# show the shape of every entry in the batch to verify the padding
{k: v.shape for k, v in batch.items()}

{'attention_mask': torch.Size([11, 15]),
 'input_ids': torch.Size([11, 15]),
 'labels': torch.Size([11]),
 'token_type_ids': torch.Size([11, 15])}

In [18]:
# check whether the loaded tokenizer reports itself as a "fast" tokenizer
tokenizer.is_fast

True

In [19]:
# map the token ids of the last sample back to their token strings to inspect how it was tokenized
tokenizer.convert_ids_to_tokens(tokenized_datasets['input_ids'][-1])

['[CLS]',
 'will',
 'you',
 'help',
 'my',
 'friends',
 'if',
 'i',
 'share',
 'your',
 'email',
 'with',
 'them',
 '?',
 '[SEP]']

### Training

The first step before we can define our `Trainer` is to create a `TrainingArguments` object that contains all the hyperparameters the `Trainer` will use for training and evaluation. The only argument you have to provide is the directory where the trained model, and the checkpoints saved along the way, will be written. In our case we also want to change the number of training epochs. For all the rest, you can leave the defaults, which should work pretty well for a basic fine-tuning.

In [20]:
# accuracy metric object used by `compute_metrics` below
metric = load_metric("accuracy")


# callback handed to the Trainer through its `compute_metrics` argument
def compute_metrics(eval_pred):
    """Score a ``(logits, labels)`` pair with the notebook-level ``metric``.

    The predicted class of each row is the index of its largest logit along
    the last axis; the result of ``metric.compute`` is returned unchanged.
    """
    raw_scores, true_labels = eval_pred
    predicted_ids = np.argmax(raw_scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=true_labels)

In [21]:
# training configuration: outputs go to the `training_args` directory and
# training runs for 20 epochs; every other hyperparameter keeps its default
training_args = TrainingArguments(
    output_dir='training_args',
    num_train_epochs=20,
)

The second step is to define our model. We will use the AutoModelForSequenceClassification class, with two labels:

In [22]:
# load the pretrained checkpoint with a sequence-classification head sized for our two labels
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

You will notice that you get a warning after instantiating this pretrained model. This is because BERT has not been pretrained on classifying sequences, so the head of the pretrained model has been discarded and a new head suitable for sequence classification has been added instead. The warnings indicate that some weights were not used (the ones corresponding to the dropped pretraining head) and that some others were randomly initialized (the ones for the new head). It concludes by encouraging you to train the model, which is exactly what we are going to do now.

In [23]:
# bundle model, hyperparameters, data, collator, metric callback and tokenizer into one Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

To fine-tune the model on our dataset, we just have to call the train() method of our Trainer:

In [24]:
# fine-tune the model on the tokenized training set using the settings in `training_args`
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sample_mails. If sample_mails are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 11
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 40


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=40, training_loss=0.1826045036315918, metrics={'train_runtime': 3.2339, 'train_samples_per_second': 68.029, 'train_steps_per_second': 12.369, 'total_flos': 1399833036720.0, 'train_loss': 0.1826045036315918, 'epoch': 20.0})

In [25]:
# save the trained model together with the tokenizer in the `custom_model` directory
# (relative path, so it is created inside the current working directory)
trainer.save_model('custom_model')

Saving model checkpoint to custom_model
Configuration saved in custom_model/config.json
Model weights saved in custom_model/pytorch_model.bin
tokenizer config file saved in custom_model/tokenizer_config.json
Special tokens file saved in custom_model/special_tokens_map.json


### Evaluation

For this task, we will evaluate the model on the training set, given that the dataset is extremely small and could not be split into train-test sets

In [26]:
# run the fine-tuned model over the same dataset it was trained on
predictions = trainer.predict(tokenized_datasets)

# shapes of the logits and of the reference labels, followed by the full output
logits_shape = predictions.predictions.shape
labels_shape = predictions.label_ids.shape
print(logits_shape, labels_shape, '\n')
print(predictions)

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sample_mails. If sample_mails are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 11
  Batch size = 8


(11, 2) (11,) 

PredictionOutput(predictions=array([[-2.8567455,  2.6125207],
       [-2.8680387,  2.5676455],
       [ 2.3883843, -2.2533398],
       [-2.8665829,  2.6149411],
       [-2.7801325,  2.618008 ],
       [ 2.4355724, -2.1244042],
       [ 2.404393 , -2.2528536],
       [-2.8637493,  2.6661575],
       [-2.777838 ,  2.69716  ],
       [ 2.2913795, -2.1420698],
       [-2.8798716,  2.544542 ]], dtype=float32), label_ids=array([1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1]), metrics={'test_loss': 0.006455942988395691, 'test_accuracy': 1.0, 'test_runtime': 0.0593, 'test_samples_per_second': 185.544, 'test_steps_per_second': 33.735})


The output of the `predict()` method is a named tuple with three fields: `predictions`, `label_ids`, and `metrics`. The `metrics` field contains the loss on the dataset passed, some time metrics (the total runtime and the number of samples and steps processed per second), and, because we passed `compute_metrics` to the `Trainer`, the accuracy on that dataset.

As we can see, predictions is a two-dimensional array with shape 11 x 2 (11 being the number of elements in the dataset we used). Those are the logits for each element of the dataset we passed to `predict()`. To transform them into predictions that we can compare to our labels, we need to take the index with the maximum value on the second axis:

In [27]:
# predicted class id per sample: index of the largest logit along the last axis
preds = np.argmax(predictions.predictions, axis=-1)

We can now compare those `preds` to the labels. As a second check, we will again rely on the metrics from the 🤗 Datasets library: the `load_metric()` function can load the metric associated with the GLUE MRPC benchmark, which, like our task, has two classes and reports F1 in addition to accuracy. The object returned has a `compute()` method we can use to do the metric calculation. Wrapping everything together, we get our `compute_metrics_mrpc()` function:

In [29]:
def compute_metrics_mrpc(eval_preds):
    """Score a prediction output with the GLUE/MRPC metric.

    ``eval_preds`` must expose ``predictions`` (logits) and ``label_ids``
    (reference labels), as the object returned by ``trainer.predict`` does.
    The predicted class of each row is the index of its largest logit along
    the last axis. The metric is loaded on every call and the result of its
    ``compute`` method is returned unchanged.
    """
    glue_mrpc = load_metric("glue", "mrpc")
    raw_scores = eval_preds.predictions
    true_labels = eval_preds.label_ids
    predicted_ids = np.argmax(raw_scores, axis=-1)
    return glue_mrpc.compute(predictions=predicted_ids, references=true_labels)

In [30]:
# score the predictions obtained above with the GLUE/MRPC metric
compute_metrics_mrpc(predictions)

{'accuracy': 1.0, 'f1': 1.0}

In [31]:
# raw logits, one row per sample and one column per class
predictions.predictions

array([[-2.8567455,  2.6125207],
       [-2.8680387,  2.5676455],
       [ 2.3883843, -2.2533398],
       [-2.8665829,  2.6149411],
       [-2.7801325,  2.618008 ],
       [ 2.4355724, -2.1244042],
       [ 2.404393 , -2.2528536],
       [-2.8637493,  2.6661575],
       [-2.777838 ,  2.69716  ],
       [ 2.2913795, -2.1420698],
       [-2.8798716,  2.544542 ]], dtype=float32)

In [32]:
# reference (true) label ids, in the same order as the logits rows
predictions.label_ids

array([1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1])

From the above, we see that the model predicts perfectly on the data it was trained on. This is not a reliable measure of quality: a perfect score on the training set is exactly what overfitting looks like, and it says nothing about how the model handles unseen emails. Since the sample data is too small to hold out a test set, the most we can conclude is that, after 20 epochs, the model has fully learned the training dataset.

### Inference

In [37]:
# reload the fine-tuned model from the directory written by `trainer.save_model('custom_model')`
# NOTE(review): this is an absolute path under the working directory set in the first cell,
# so it only resolves in that environment
inf_model = AutoModelForSequenceClassification.from_pretrained('/content/drive/MyDrive/Code_challenge/custom_model/')

loading configuration file /content/drive/MyDrive/Code_challenge/custom_model/config.json
Model config BertConfig {
  "_name_or_path": "/content/drive/MyDrive/Code_challenge/custom_model/",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.20.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file /content/drive/MyDrive/Code_challenge/custom_model/pytorch_model.bin
All model checkpoint weights were 

In [39]:
# load the tokenizer from the same directory as the fine-tuned model (it was saved there alongside the weights)
# NOTE(review): absolute path; it only resolves in the environment where that directory exists
inf_tokenizer = AutoTokenizer.from_pretrained('/content/drive/MyDrive/Code_challenge/custom_model/')

Didn't find file /content/drive/MyDrive/Code_challenge/custom_model/added_tokens.json. We won't load it.
loading file /content/drive/MyDrive/Code_challenge/custom_model/vocab.txt
loading file /content/drive/MyDrive/Code_challenge/custom_model/tokenizer.json
loading file None
loading file /content/drive/MyDrive/Code_challenge/custom_model/special_tokens_map.json
loading file /content/drive/MyDrive/Code_challenge/custom_model/tokenizer_config.json


In [50]:
# two sentences that are not in the training data, used to try out inference
sequences = [
    'I want to know if I should send your email',
    'I sent your email a long time ago',
]

In [43]:
# create a text-classification pipeline around the reloaded model and tokenizer.
# `pipeline` is imported together with the other packages in the imports cell at
# the top of the notebook, so all dependencies are declared in one place.
classifier = pipeline(task='text-classification', model=inf_model, tokenizer=inf_tokenizer)

In [51]:
# classify the unseen sentences; the output holds one predicted label and score per sentence
classifier(sequences)

Disabling tokenizer parallelism, we're using DataLoader multithreading already


[{'label': 'LABEL_1', 'score': 0.6547055840492249},
 {'label': 'LABEL_0', 'score': 0.9795986413955688}]

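From the above output, the model assigns both unseen sentences the label we would expect: `LABEL_1` (`Student wants to know if can share`) for the first and `LABEL_0` (`Student has shared`) for the second. Note, however, that the first prediction has a fairly low score (about 0.65), and that two examples are far too few to say with confidence how well the model performs on inference.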