# Fine Tuning a Pre-trained Model

Since our dataset is large(>10K), we'll free layers close to input and train the later layers.

In [15]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification,  TrainingArguments, Trainer
from datasets import Dataset, Image, Value, ClassLabel, Features

import os

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        os.path.join(dirname, filename)
base_dir = "../input/dogsvscatsredux"
train_dir = os.path.join(base_dir, "train/train")

In [16]:
pd.set_option('max_colwidth', 60)
train_filenames = os.listdir(base_dir+ '/train/train')
train_data = pd.DataFrame(columns=['image', 'labels'])
train_data['image'] = train_filenames 
train_data['labels'] = train_data['image'].str[0:3].map({"cat":0,"dog":1})
train_data['image'] = (train_dir +"/" +train_data['image']).astype(str)
train_data.head()

Unnamed: 0,image,labels
0,../input/dogsvscatsredux/train/train/cat.12461.jpg,0
1,../input/dogsvscatsredux/train/train/dog.3443.jpg,1
2,../input/dogsvscatsredux/train/train/dog.7971.jpg,1
3,../input/dogsvscatsredux/train/train/dog.10728.jpg,1
4,../input/dogsvscatsredux/train/train/dog.1942.jpg,1


In [17]:
features = Features({"image": Value("string"), "labels": ClassLabel(num_classes=2, names=["cat","dog"])})
ds_raw = Dataset.from_pandas(train_data[['labels','image']],features=features).cast_column('image', Image())
ds_raw

Dataset({
    features: ['image', 'labels'],
    num_rows: 25000
})

In [18]:
from transformers import ViTFeatureExtractor

model_name_or_path = 'google/vit-base-patch16-224-in21k'
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name_or_path)

loading feature extractor configuration file https://huggingface.co/google/vit-base-patch16-224-in21k/resolve/main/preprocessor_config.json from cache at /root/.cache/huggingface/transformers/7c7f3e780b30eeeacd3962294e5154788caa6d9aa555ed6d5c2f0d2c485eba18.c322cbf30b69973d5aae6c0866f5cba198b5fe51a2fe259d2a506827ec6274bc
Feature extractor ViTFeatureExtractor {
  "do_normalize": true,
  "do_resize": true,
  "feature_extractor_type": "ViTFeatureExtractor",
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 2,
  "size": 224
}



In [19]:
def transform(example_batch):
    # Take a list of PIL images and turn them to pixel values
    inputs = feature_extractor([x for x in example_batch["image"]], return_tensors='pt')

    # Don't forget to include the labels!
    inputs["labels"] = example_batch["labels"]
    return inputs

In [20]:
prepared_ds = ds_raw.with_transform(transform)

In [21]:
def collate_fn(batch):
    return {
        'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
        'labels': torch.tensor([x['labels'] for x in batch])
    }

In [22]:
from datasets import load_metric
metric = load_metric("accuracy")
def compute_metrics(p):
    return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids)

We'll freeze the first two layers.

In [23]:
from transformers import ViTForImageClassification

labels = ds_raw.features['labels'].names

model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(labels),
    id2label={str(i): c for i, c in enumerate(labels)},
    label2id={c: str(i) for i, c in enumerate(labels)}
)
for name,param in model.named_parameters():
    if name.startswith("vit.embeddings"):
        param.requires_grad = False
    if any(x in name for x in ['.'+ str(x)+'.' for x in range (0,2)]): 
        param.requires_grad = False

loading configuration file https://huggingface.co/google/vit-base-patch16-224-in21k/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/7bba26dd36a6ff9f6a9b19436dec361727bea03ec70fbfa82b70628109163eaa.92995a56e2eabab0c686015c4ad8275b4f9cbd858ed228f6a08936f2c31667e7
Model config ViTConfig {
  "_name_or_path": "google/vit-base-patch16-224-in21k",
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "cat",
    "1": "dog"
  },
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "cat": "0",
    "dog": "1"
  },
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.20.1"
}

loading weights file https://huggingface.co/google/vit-base-

In [24]:
for name,param in model.named_parameters():
    print(name,param.requires_grad)

vit.embeddings.cls_token False
vit.embeddings.position_embeddings False
vit.embeddings.patch_embeddings.projection.weight False
vit.embeddings.patch_embeddings.projection.bias False
vit.encoder.layer.0.attention.attention.query.weight False
vit.encoder.layer.0.attention.attention.query.bias False
vit.encoder.layer.0.attention.attention.key.weight False
vit.encoder.layer.0.attention.attention.key.bias False
vit.encoder.layer.0.attention.attention.value.weight False
vit.encoder.layer.0.attention.attention.value.bias False
vit.encoder.layer.0.attention.output.dense.weight False
vit.encoder.layer.0.attention.output.dense.bias False
vit.encoder.layer.0.intermediate.dense.weight False
vit.encoder.layer.0.intermediate.dense.bias False
vit.encoder.layer.0.output.dense.weight False
vit.encoder.layer.0.output.dense.bias False
vit.encoder.layer.0.layernorm_before.weight False
vit.encoder.layer.0.layernorm_before.bias False
vit.encoder.layer.0.layernorm_after.weight False
vit.encoder.layer.0.layer

In [25]:
prepared_ds

Dataset({
    features: ['image', 'labels'],
    num_rows: 25000
})

In [26]:
train_prop = 0.85
ds_train = prepared_ds.select(range(int(len(prepared_ds)*train_prop)))
ds_eval = prepared_ds.select(range(int(len(prepared_ds)*train_prop), len(prepared_ds)))

In [27]:
os.environ["WANDB_DISABLED"] = "true"

In [28]:
training_args = TrainingArguments(num_train_epochs=1,
                                  per_device_train_batch_size=10,
                                  per_device_eval_batch_size=10,
                                  evaluation_strategy="steps",
                                  eval_steps=300,
                                  do_train=True,
                                  report_to=None,
                                  learning_rate= 5e-06, 
                                  output_dir="/kaggle/working",
                                  remove_unused_columns=False,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset= ds_train,
    eval_dataset= ds_eval,
    tokenizer=feature_extractor,
)
train_results = trainer.train()
trainer.evaluate(ds_eval)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
***** Running training *****
  Num examples = 21250
  Num Epochs = 1
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 2125


Step,Training Loss,Validation Loss,Accuracy
300,No log,0.131856,0.9952
600,0.276500,0.056331,0.993333
900,0.276500,0.040671,0.994133
1200,0.048000,0.031742,0.995733
1500,0.036100,0.031429,0.993867
1800,0.036100,0.027466,0.9952
2100,0.026400,0.026739,0.995467


***** Running Evaluation *****
  Num examples = 3750
  Batch size = 10
Saving model checkpoint to /kaggle/working/checkpoint-500
Configuration saved in /kaggle/working/checkpoint-500/config.json
Model weights saved in /kaggle/working/checkpoint-500/pytorch_model.bin
Feature extractor saved in /kaggle/working/checkpoint-500/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 3750
  Batch size = 10
***** Running Evaluation *****
  Num examples = 3750
  Batch size = 10
Saving model checkpoint to /kaggle/working/checkpoint-1000
Configuration saved in /kaggle/working/checkpoint-1000/config.json
Model weights saved in /kaggle/working/checkpoint-1000/pytorch_model.bin
Feature extractor saved in /kaggle/working/checkpoint-1000/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 3750
  Batch size = 10
***** Running Evaluation *****
  Num examples = 3750
  Batch size = 10
Saving model checkpoint to /kaggle/working/checkpoint-1500
Configuration saved in /

{'eval_loss': 0.026735765859484673,
 'eval_accuracy': 0.9954666666666667,
 'eval_runtime': 48.9278,
 'eval_samples_per_second': 76.643,
 'eval_steps_per_second': 7.664,
 'epoch': 1.0}

The pre-trained model contains 11 layers plus the classifier layer we added. I've tried to freeze from 1 to 10 layers close to input in the model but it doesn't seem to impact the accuracy.