In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers datasets

In [3]:
import torch
from torch import nn

from torch.utils.data import Dataset, DataLoader

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from transformers import ViTImageProcessor, ViTForImageClassification
from transformers import AutoFeatureExtractor, ResNetForImageClassification
from datasets import load_dataset

from tqdm.notebook import tqdm

torch.manual_seed(0);

In [4]:
import os
import time

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from datasets import load_dataset, load_metric
from transformers import TrainingArguments, Trainer

In [5]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device

'cuda:0'

In [38]:
feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/resnet-18")
student_model = ResNetForImageClassification.from_pretrained("microsoft/resnet-18")




In [39]:
student_model.classifier[1] = nn.Linear(512, 2)

In [7]:
processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
teacher_model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')
teacher_model.classifier = nn.Linear(768, 2)

In [8]:
total_params_deep = "{:,}".format(sum(p.numel() for p in teacher_model.parameters()))
print(f"teacher_model parameters: {total_params_deep}")
total_params_light = "{:,}".format(sum(p.numel() for p in student_model.parameters()))
print(f"student_model parameters: {total_params_light}")

teacher_model parameters: 85,800,194
student_model parameters: 11,178,050


In [None]:
teacher_model.load_state_dict(torch.load('/content/drive/MyDrive/models_weights/ViT_2.pt', map_location=torch.device('cpu')))
teacher_model.to(device)
teacher_model.eval()

In [10]:
ds = load_dataset('cats_vs_dogs')

In [11]:
indexes = list(range((len(ds['train']))))
train, test = train_test_split(indexes, test_size=0.2, random_state=0)

In [12]:
class CustomDataset(Dataset):
    def __init__(self, ids, dataset):
        self.ids = ids
        self.ds = dataset

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, index):
      image = self.ds['train'][index]['image']
      label = self.ds['train'][index]['labels']

      image = processor(
          image.convert("RGB"),
          return_tensors='pt'
          )

      image['pixel_values'] = image['pixel_values'].squeeze(0)

      return image, label

In [13]:
train_dataset = CustomDataset(
    ids=train,
    dataset=ds
)

val_dataset = CustomDataset(
    ids=test,
    dataset=ds
)

In [14]:
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=2, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, num_workers=2)

In [15]:
def train(model, train_loader, epochs, learning_rate, device):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    model.train()

    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            # inputs: A collection of batch_size images
            # labels: A vector of dimensionality batch_size with integers denoting class of each image
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)

            # outputs: Output of the network for the collection of images. A tensor of dimensionality batch_size x num_classes
            # labels: The actual labels of the images. Vector of dimensionality batch_size
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss / len(train_loader)}")

In [51]:
def test_2(model, test_loader, device):
  val_loss = []
  val_targets = []
  val_preds = []

  with torch.no_grad():
    for i, (batch, targets) in enumerate(tqdm(val_loader)):

      batch = batch.to(device)
      targets = targets.to(device)

      outputs = model(**batch)
      logits = outputs.logits
      val_targets.extend(targets.cpu().numpy())
      val_preds.extend(logits.argmax(axis=1).cpu().numpy())
  return f1_score(val_targets, val_preds, average='macro')

In [18]:
teacher_accuracy = test_2(teacher_model, val_loader, device)

Epoch: 5:   0%|          | 0/1171 [00:00<?, ?it/s]

In [19]:
teacher_accuracy

1.0

In [None]:
student_model.to(device)

In [21]:
torch.cuda.empty_cache()

In [41]:
def train_knowledge_distillation(teacher, student, train_loader, epochs, learning_rate, T, soft_target_loss_weight, ce_loss_weight, device):
    ce_loss = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(student.parameters(), lr=learning_rate)

    teacher.eval()  # Teacher set to evaluation mode
    student.train() # Student to train mode

    for epoch in range(epochs):
        running_loss = 0.0
        for i, (batch, targets) in enumerate(tqdm(val_loader, desc=f"Epoch: {epoch}")):
            batch = batch.to(device)
            targets = targets.to(device)
            teacher_outputs = teacher(**batch)
            student_outputs = student(**batch)

            optimizer.zero_grad()

            # Forward pass with the teacher model - do not save gradients here as we do not change the teacher's weights
            with torch.no_grad():
                teacher_logits = teacher_outputs.logits

            # Forward pass with the student model
            student_logits = student_outputs.logits

            #Soften the student logits by applying softmax first and log() second
            soft_targets = nn.functional.softmax(teacher_logits / T, dim=-1)
            soft_prob = nn.functional.log_softmax(student_logits / T, dim=-1)

            # Calculate the soft targets loss. Scaled by T**2 as suggested by the authors of the paper "Distilling the knowledge in a neural network"
            soft_targets_loss = -torch.sum(soft_targets * soft_prob) / soft_prob.size()[0] * (T**2)

            # Calculate the true label loss
            label_loss = ce_loss(student_logits, targets)

            # Weighted sum of the two losses
            loss = soft_target_loss_weight * soft_targets_loss + ce_loss_weight * label_loss

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss / len(train_loader)}")

# Apply ``train_knowledge_distillation`` with a temperature of 2. Arbitrarily set the weights to 0.75 for CE and 0.25 for distillation loss.
train_knowledge_distillation(teacher=teacher_model, student=student_model, train_loader=train_loader, epochs=5, learning_rate=0.001, T=2, soft_target_loss_weight=0.25, ce_loss_weight=0.75, device=device)
test_accuracy_light_ce_and_kd = test_2(student_model, val_loader, device)

# Compare the student test accuracy with and without the teacher, after distillation
print(f"Teacher accuracy: {teacher_accuracy:.2f}%")
# print(f"Student accuracy without teacher: {test_accuracy_light_ce:.2f}%")
print(f"Student accuracy with CE + KD: {test_accuracy_light_ce_and_kd:.2f}%")

Epoch: 0:   0%|          | 0/1171 [00:00<?, ?it/s]

Epoch 1/5, Loss: 0.023831283516898616


Epoch: 1:   0%|          | 0/1171 [00:00<?, ?it/s]

Epoch 2/5, Loss: 0.023079245342104825


Epoch: 2:   0%|          | 0/1171 [00:00<?, ?it/s]

Epoch 3/5, Loss: 0.023073922084474962


Epoch: 3:   0%|          | 0/1171 [00:00<?, ?it/s]

Epoch 4/5, Loss: 0.02307420998051069


Epoch: 4:   0%|          | 0/1171 [00:00<?, ?it/s]

Epoch 5/5, Loss: 0.023070158878486922


Epoch: 5:   0%|          | 0/1171 [00:00<?, ?it/s]

Teacher accuracy: 1.00%
Student accuracy with CE + KD: 1.00%


In [52]:
indexes = list(range((len(ds['train'][:100]))))

In [53]:
test_dataset = CustomDataset(
    ids=indexes,
    dataset=ds
)

In [54]:
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, num_workers=2)

In [56]:
start = time.time()
f1_student = test_2(student_model, test_loader, device)
total_time_student_model = time.time() - start
print("Общее время обработки 100 запросов моделью-учеником:", total_time_student_model)
print("F1 моделью-учеником:", f1_student)

  0%|          | 0/1171 [00:00<?, ?it/s]

Общее время обработки 100 запросов моделью-учеником: 55.200899839401245
F1 моделью-учеником: 1.0


In [57]:
start = time.time()
f1_teacher = test_2(teacher_model, test_loader, device)
total_time_teacher_model = time.time() - start
print("Общее время обработки 100 запросов моделью-учителем:", total_time_teacher_model)
print("F1 моделью-учителем:", f1_teacher)

  0%|          | 0/1171 [00:00<?, ?it/s]

Общее время обработки 100 запросов моделью-учителем: 74.99162483215332
F1 моделью-учителем: 1.0


In [58]:
decrease_in_time = (total_time_teacher_model - total_time_student_model) / total_time_teacher_model

print(f'Модель студента классифицирует быстрее на {decrease_in_time*100:.2f} %')

Модель студента классифицирует быстрее на 26.39 %
