In [1]:
import torch
from torchvision import transforms, datasets
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn

from transformers import ViTFeatureExtractor, ViTForImageClassification

import albumentations as A
from albumentations.pytorch import ToTensorV2

import random
import time
from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import cv2
import os
import copy

import pandas as pd

import timm
from timm.optim import create_optimizer_v2
from timm.utils import CheckpointSaver
import torch.nn.functional as F

matplotlib.use('TkAgg')

In [2]:
random.seed(42)
num_classes = 36
using_dataset = f"food-{num_classes}"
# with open(f"./{using_dataset}/meta/train.txt", 'r') as f_train: 
#     correct_images_filepaths = [f"./{using_dataset}/images/{line[:-1]}.jpg" for line in f_train.readlines()]
#     #correct_images_filepaths = [i for i in images_filepaths if cv2.imread(i) is not None]
#     random.shuffle(correct_images_filepaths)
#     train_images_paths = correct_images_filepaths

# print(len(train_images_paths))

# with open(f"./{using_dataset}/meta/test.txt", 'r') as f_test: 
#     test_images_paths = [f"./{using_dataset}/images/{line[:-1]}.jpg" for line in f_test.readlines()]
#     #test_images_paths = [i for i in test_images_paths if cv2.imread(i) is not None]
#     #random.shuffle(test_images_paths)
#     #test_images_paths = test_images_paths[:500]

# print(len(test_images_paths))
# train_images_paths[:20]

In [3]:
data_transform_train = transforms.Compose(
            [transforms.Resize([256,256]),
             transforms.RandomCrop([224,224]),
             transforms.RandomHorizontalFlip(),
             transforms.ToTensor(),
             transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                  std=[0.229, 0.224, 0.225]),
             ])
data_transform_test = transforms.Compose(
            [transforms.Resize([224,224]),
             transforms.ToTensor(),
             transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                  std=[0.229, 0.224, 0.225]),
             ])

ds_train = datasets.ImageFolder(f"{using_dataset}/train", transform=data_transform_train)
ds_test = datasets.ImageFolder(f"{using_dataset}/test", transform=data_transform_test)


label2id = ds_test.class_to_idx
id2label = {identifier: label for label, identifier in label2id.items()}

label2class = id2label
class2label = label2id

label2id

{'apple_pie': 0,
 'baby_back_ribs': 1,
 'baklava': 2,
 'beef_carpaccio': 3,
 'beef_tartare': 4,
 'beet_salad': 5,
 'beignets': 6,
 'bibimbap': 7,
 'bread_pudding': 8,
 'breakfast_burrito': 9,
 'bruschetta': 10,
 'caesar_salad': 11,
 'cannoli': 12,
 'caprese_salad': 13,
 'carrot_cake': 14,
 'ceviche': 15,
 'cheese_plate': 16,
 'cheesecake': 17,
 'chicken_curry': 18,
 'chicken_quesadilla': 19,
 'chicken_wings': 20,
 'chocolate_cake': 21,
 'chocolate_mousse': 22,
 'churros': 23,
 'clam_chowder': 24,
 'club_sandwich': 25,
 'crab_cakes': 26,
 'creme_brulee': 27,
 'croque_madame': 28,
 'cup_cakes': 29,
 'deviled_eggs': 30,
 'donuts': 31,
 'dumplings': 32,
 'edamame': 33,
 'eggs_benedict': 34,
 'escargots': 35}

In [4]:
BATCH_SIZE = 4
dl_train = DataLoader(ds_train, batch_size=BATCH_SIZE, shuffle=True)
dl_test = DataLoader(ds_test, batch_size=BATCH_SIZE, shuffle=False)
dl_train

<torch.utils.data.dataloader.DataLoader at 0x7fdfdc37fbb0>

In [5]:
def get_accuracy(predictions, labels):
    return (predictions.argmax(dim=1) == labels).float().mean()

def get_loss(predictions, labels):
    predictions = predictions.reshape(-1, predictions.shape[-1])
    #labels = labels.unsqueeze(1).expand(-1, 1).reshape(-1)
    return F.cross_entropy(predictions, labels)

In [6]:
from pprint import pprint
model_names = timm.list_models(pretrained=True)
pprint(model_names)

['adv_inception_v3',
 'cait_m36_384',
 'cait_m48_448',
 'cait_s24_224',
 'cait_s24_384',
 'cait_s36_384',
 'cait_xs24_384',
 'cait_xxs24_224',
 'cait_xxs24_384',
 'cait_xxs36_224',
 'cait_xxs36_384',
 'coat_lite_mini',
 'coat_lite_small',
 'coat_lite_tiny',
 'coat_mini',
 'coat_tiny',
 'convit_base',
 'convit_small',
 'convit_tiny',
 'cspdarknet53',
 'cspresnet50',
 'cspresnext50',
 'deit_base_distilled_patch16_224',
 'deit_base_distilled_patch16_384',
 'deit_base_patch16_224',
 'deit_base_patch16_384',
 'deit_small_distilled_patch16_224',
 'deit_small_patch16_224',
 'deit_tiny_distilled_patch16_224',
 'deit_tiny_patch16_224',
 'densenet121',
 'densenet161',
 'densenet169',
 'densenet201',
 'densenetblur121d',
 'dla34',
 'dla46_c',
 'dla46x_c',
 'dla60',
 'dla60_res2net',
 'dla60_res2next',
 'dla60x',
 'dla60x_c',
 'dla102',
 'dla102x',
 'dla102x2',
 'dla169',
 'dm_nfnet_f0',
 'dm_nfnet_f1',
 'dm_nfnet_f2',
 'dm_nfnet_f3',
 'dm_nfnet_f4',
 'dm_nfnet_f5',
 'dm_nfnet_f6',
 'dpn68',
 'dpn

In [7]:
model = timm.create_model('deit_tiny_distilled_patch16_224', pretrained=True, num_classes=num_classes)
opt = create_optimizer_v2(model, learning_rate=1e-5)

# PATH = f"./checkpoints/{using_dataset}/ViT/20211028-093107/model.pt"
# checkpoint = torch.load(PATH)
# model.load_state_dict(checkpoint['model_state_dict'])
# opt.load_state_dict(checkpoint['optimizer_state_dict'])
# last_epoch = checkpoint['epoch']
# loss = checkpoint['loss']
# acc = checkpoint['acc']


model.eval()

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 192, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=192, out_features=576, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=192, out_features=192, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=192, out_features=768, bias=True)
        (act): GELU()
        (fc2): Linear(in_features=768, out_features=192, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Block(
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (attn): 

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)



VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 192, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=192, out_features=576, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=192, out_features=192, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=192, out_features=768, bias=True)
        (act): GELU()
        (fc2): Linear(in_features=768, out_features=192, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Block(
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (attn): 

In [9]:
# df_val = pd.DataFrame()
# val_labels = []
# val_pred = []
# losses, accs = [], []
# with torch.no_grad():
#     for images, labels in dl_test:
#         images = images.to(device)
#         labels = labels.to(device)
#         pred = model(images)
#         loss = get_loss(pred, labels)
#         acc = get_accuracy(pred, labels)
#         accs.append(acc * images.shape[0])            
#         losses.append(loss * images.shape[0])
#         val_labels.append(labels)
#         val_pred.append(pred)

# df_val[f"epoch_prova_pred"] = val_pred
# df_val[f"epoch_prova_labels"] = val_labels
# df_val.to_csv(f"epoch_prova_val.csv")
# loss = torch.stack(losses).sum() / len(dl_test.dataset)
# acc = torch.stack(accs).sum() / len(dl_test.dataset)

# print(f'Epoch: {0+1:>2}    Loss: {loss.item():.3f}    Accuracy: {acc:.3f}')

In [None]:
EPOCHS = 20
df_val = pd.DataFrame()
df_train = pd.DataFrame()
output_dir = f"./checkpoints/{using_dataset}/DeiT/{datetime.now().strftime('%Y%m%d-%H%M%S')}"
os.system(f"mkdir {output_dir}")

for epoch in range(EPOCHS):
    losses, accs = [], []
    model.train()
    train_labels = []
    train_pred = []
    for images, labels in dl_train:
        opt.zero_grad()
        images = images.to(device)
        labels = labels.to(device)
        pred = model(images)
        loss = get_loss(pred[0], labels)
        train_pred.append(pred)
        train_labels.append(labels)
        loss.backward()
        opt.step()
        images.to('cpu')
        labels.to('cpu')
    df_train[f"epoch_{epoch}_pred"] = train_pred
    df_train[f"epoch_{epoch}_labels"] = train_labels
    df_train.to_csv(f"epoch_{epoch}_train.csv")
        
    model.eval()
    val_labels = []
    val_pred = []
    with torch.no_grad():
        for images, labels in dl_test:
            images = images.to(device)
            labels = labels.to(device)
            pred = model(images)
            loss = get_loss(pred[0], labels)
            acc = get_accuracy(pred, labels)
            accs.append(acc * images.shape[0])            
            losses.append(loss * images.shape[0])
            images.to('cpu')
            labels.to('cpu')
            val_labels.append(labels)
            val_pred.append(pred)
            
    df_val[f"epoch_{epoch}_pred"] = val_pred
    df_val[f"epoch_{epoch}_labels"] = val_labels
    df_val.to_csv(f"epoch_{epoch}_val.csv")
    loss = torch.stack(losses).sum() / len(dl_test.dataset)
    acc = torch.stack(accs).sum() / len(dl_test.dataset)

    print(f'Epoch: {epoch+1:>2}    Loss: {loss.item():.3f}    Accuracy: {acc:.3f}')
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': opt.state_dict(),
        'loss': loss.item(),
        'acc': acc
    }, f"{output_dir}/model.pt")