# (0) Download ImageNet (Tiny) 

In [14]:
!kaggle datasets download -d akash2sharma/tiny-imagenet

Downloading butterfly-images40-species.zip to /home/jovyan/work/project
 99%|███████████████████████████████████████▌| 395M/399M [00:11<00:00, 44.1MB/s]
100%|████████████████████████████████████████| 399M/399M [00:11<00:00, 36.1MB/s]


In [1]:
import zipfile
with zipfile.ZipFile('tiny-imagenet.zip', 'r') as zip_ref:
    zip_ref.extractall('data/tiny_imagenet')

# (1) Process TinyImageNet Data

In [None]:
import torch
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from PIL import Image
from torch.utils.data import Dataset, DataLoader, random_split
import random
import os

class ImageDataset(Dataset):
    def __init__(self, path_label, transform=None):
        self.path_label = path_label
        self.transform = transform

    def __len__(self):
        return len(self.path_label)

    def __getitem__(self, idx):
        path, label = self.path_label[idx]
        img = Image.open(path).convert('RGB')

        if self.transform is not None:
            img = self.transform(img)

        return img, label
    
# Create dataset from raw structure obtained from Kaggle
# (1): Download dataset fron Kaggle: https://www.kaggle.com/datasets/gpiosenka/butterfly-images40-species?select=train
# (2): Unzip the downloaded dataset to './data/butterfly'
# (3): Run the function
def create_butterfly_dataset(path = './data/butterfly/', img_reshape=(3, 224, 224)):
    C, H, W = img_reshape
    transform = transforms.Compose([
                                    transforms.Resize((H, W)),
                                    transforms.ToTensor()])
      
    train_path = path + 'train'
    test_path = path + 'test'
    
    class_names=sorted(os.listdir(train_path))
    N=list(range(len(class_names)))
    normal_mapping=dict(zip(class_names,N)) 
    reverse_mapping=dict(zip(N,class_names))

    paths0=[]
    # store an image for each class for adversarial attack
    sample_img_dataset = torch.zeros((len(class_names), C, H, W))
    seen = list()
    for dirname, _, filenames in os.walk(train_path):
        for filename in filenames:
            if filename[-4:]=='.jpg':
                path=os.path.join(dirname, filename)
                label=dirname.split('/')[-1]
                if label == '.ipynb_checkpoints':
                    continue
                paths0+=[(path,normal_mapping[label])]
            if label not in seen:
                image = Image.open(path).convert('RGB')
                image = transform(image)
                sample_img_dataset[normal_mapping[label], :, :, :] = image
                seen.append(label)
            
    tpaths0=[]
    for dirname, _, filenames in os.walk(test_path):
        for filename in filenames:
            if filename[-4:]=='.jpg':
                path=os.path.join(dirname, filename)
                label=dirname.split('/')[-1]
                if label == '.ipynb_checkpoints':
                    continue
                tpaths0+=[(path,normal_mapping[label])]
                
    random.seed(123)
    random.shuffle(paths0)            
    random.shuffle(tpaths0)  

    trainset = ImageDataset(paths0, transform)
    testset = ImageDataset(tpaths0, transform)
    
    return trainset, testset, normal_mapping, reverse_mapping, sample_img_dataset

def create_imagenet_dataset(path='./data/tiny_imagenet/tiny-imagenet-200/', img_reshape=(3, 224, 224), split_ratio = 0.8, num_classes = 10):
    C, H, W = img_reshape
    transform = transforms.Compose([
                                    transforms.Resize((H, W)),
                                    transforms.ToTensor()])
      
    data_path = path + 'train'
    
    class_names = sorted(os.listdir(data_path))
    N = list(range(len(class_names)))
    normal_mapping = dict(zip(class_names, N)) 
    reverse_mapping = dict(zip(N, class_names))

    paths0 = []
    sample_img_dataset = torch.zeros((len(class_names), C, H, W))
    seen = list()

    for label in class_names:
        if len(seen) > num_classes:
            break
        label_path = os.path.join(data_path, label, 'images')
        if not os.path.isdir(label_path):
            continue
        for filename in os.listdir(label_path):
            if filename.endswith('.JPEG'):
                path = os.path.join(label_path, filename)
                paths0.append((path, normal_mapping[label]))
                if label not in seen:
                    image = Image.open(path).convert('RGB')
                    image = transform(image)
                    sample_img_dataset[normal_mapping[label], :, :, :] = image
                    seen.append(label)
                
    random.seed(123)
    random.shuffle(paths0) 
    
    data = ImageDataset(paths0, transform)

    total_size = len(data)
    train_size = int(split_ratio * total_size)
    test_size = total_size - train_size

    trainset, testset = random_split(data, [train_size, test_size])

    return trainset, testset, normal_mapping, reverse_mapping, sample_img_dataset

In [None]:
from utils.data import create_imagenet_dataset
trainset, testset, normal_mapping, reverse_mapping, sample_img_dataset = create_imagenet_dataset()
assert len(trainset) == 80000, 'Size of train set not match'
assert len(testset) == 20000, 'Size of test set not match'

In [5]:
import torch
model = torch.hub.load('pytorch/vision:v0.10.','inception_v3', pretrained=True)

ValueError: Cannot find v0.10. in https://github.com/pytorch/vision. If it's a commit from a forked repo, please call hub.load() with forked repo directly.

# (2) Train Classifier

In [None]:
from model.butterfly_classifier import DenseNet121
from utils.base import train_classifier
from torch.utils.data import DataLoader
from torch import nn
import torch

# Params
batch_size=100
lr = 0.0001
device='cuda'

# Data loader
train_loader = DataLoader(dataset=trainset, batch_size=batch_size)
test_loader = DataLoader(dataset=testset, batch_size=batch_size)

# Model
model = DenseNet121(num_classes=len(normal_mapping)).to(device)
optimizer = torch.optim.AdamW(model.parameters(),lr=lr)
criterion = nn.CrossEntropyLoss()

# load states
# model.load_state_dict(torch.load('./model/states/butterfly_classifier.pth'))

In [None]:
from tqdm import trange

train_hist = list()
test_hist = list()
for epoch in trange(1, 5 + 1):
    train_loss, test_loss = train_classifier(model, train_loader, test_loader, optimizer, criterion, epoch)
    train_hist.append(train_loss)
    test_hist.append(test_loss)
    print('Epoch {}: Train: {}, Test: {}'.format(epoch, train_loss, test_loss))
    
torch.save(model.state_dict(), 'butterfly_classifier.pth')

# (3) Evaluate Classifier

In [None]:
from torch.utils.data import DataLoader
from utils.base import eval_accuracy


acc = eval_accuracy(model, testset,  batch_size=100)
print('Accuracy on test set is {}'.format(acc))