<a href="https://colab.research.google.com/github/AIFFEL-GN-2nd/TotochTeam3/blob/main/ex2_train_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Print the GPU status reported by nvidia-smi and the Python version of this runtime.
!nvidia-smi
!python --version

# 1. Custom Dataset과 DataLoader 만들기

Kaggle의 [Dogs vs. Cats dataset](https://www.kaggle.com/c/dogs-vs-cats/data)에서 데이터셋을 내려받습니다.  
(제 구글드라이브에 이미 파일을 업로드해두었으므로, 아래 명령어를 통해 바로 내려받을 수 있습니다.)

```
$ pip install gdown
$ gdown https://drive.google.com/uc?id=1RBVuPoOFTTSGioAOsUzy76wPSFg3hw5J
$ gdown https://drive.google.com/uc?id=1AIIsrHpftkGEw0-8iKoLLrPoRCNWHq5R
$ unzip -q ./train.zip
$ unzip -q ./test1.zip
```

## 1-1. 데이터셋 확인

In [None]:
# Download the two dataset archives from Google Drive with gdown, then
# unpack train.zip and test1.zip into the working directory.

!pip install gdown
!gdown https://drive.google.com/uc?id=1RBVuPoOFTTSGioAOsUzy76wPSFg3hw5J
!gdown https://drive.google.com/uc?id=1AIIsrHpftkGEw0-8iKoLLrPoRCNWHq5R
!unzip -q ./train.zip
!unzip -q ./test1.zip

In [None]:
# Count how many files the train and test folders contain.

import os

train_files = os.listdir("./train")
test_files = os.listdir("./test1")

# One count per line: train first, then test.
print(len(train_files), len(test_files), sep="\n")

In [None]:
# train data가 어떻게 구성되어있는지 확인하자.

import cv2
import matplotlib.pyplot as plt

train_root = "./train"

# Preview the first 16 training files on a 4x4 grid, each titled by file name.
fig = plt.figure(figsize=(10, 10))
for cell, name in enumerate(train_files[:16], start=1):
    # The image is read with OpenCV and converted BGR -> RGB before plotting.
    bgr = cv2.imread(os.path.join(train_root, name), cv2.IMREAD_COLOR)
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

    ax = fig.add_subplot(4, 4, cell)
    ax.set_title(name)
    ax.axis("off")
    ax.imshow(rgb)

plt.show()

In [None]:
# Inspect how the test data is organised.

test_root = "./test1"

# Preview the first 16 test files on a 4x4 grid, each titled by file name.
fig = plt.figure(figsize=(10, 10))
for i, f in enumerate(test_files[:16], 1):
    file_path = os.path.join(test_root, f)
    # The image is read with OpenCV and converted BGR -> RGB before plotting.
    img_bgr = cv2.imread(file_path, cv2.IMREAD_COLOR)
    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)

    ax = fig.add_subplot(4, 4, i)
    ax.set_title(f)
    ax.axis("off")

    ax.imshow(img_rgb)

# Render explicitly, as the train-data preview cell does, so the figure is
# shown even when the code does not run under an inline notebook backend.
plt.show()

### 정리

#### 데이터 개수
- train dataset : 25000장
- test dataset : 12500장

#### train 데이터 형태
- 파일 경로에 class명(dog/cat)이 포함되어 있음

#### test 데이터 형태
- 파일마다 numbering이 되어있음

## 1-2. Custom Dataset class 정의

파이토치 튜토리얼에서 Custom Dataset class는 다음과 같다.

```python
class CustomImageDataset(Dataset):
    def __init__(self):
        ...
        
    def __len__(self):
        ...

    def __getitem__(self, idx):
        ...
```

`__init__`
- image와 label의 경로와 transform 등을 설정합니다.

`__len__`
- 데이터셋의 샘플 개수를 반환합니다.

`__getitem__`
- 데이터셋에서 데이터를 불러옵니다.

In [None]:
# Custom Dataset 클래스를 정의하자
# 이미지 파일 경로, 레이블 정보, transform을 입력으로 받는다고 가정하자.

from torch.utils.data import Dataset, DataLoader
from PIL import Image

class DogCatDataset(Dataset):
    """Image dataset backed by a directory of image files.

    In training mode each sample is ``(image, label)``; otherwise it is
    ``(image, file_id)``, where ``file_id`` is the file name without its
    extension.

    Args:
        data_dir: Directory that contains the image files.
        file_list: Image file names, relative to ``data_dir``.
        label_list: Labels aligned index-by-index with ``file_list``.
            Required when ``train`` is true; unused otherwise.
        train: Whether samples carry a label.
        transform: Optional callable applied to the loaded PIL image.

    Raises:
        ValueError: If ``train`` is true and ``label_list`` is missing or its
            length differs from that of ``file_list``.
    """

    def __init__(self, data_dir, file_list, label_list=None, train=True, transform=None):
        # Fail at construction time instead of on the first __getitem__ call
        # (possibly inside a DataLoader worker, where errors are harder to read).
        if train:
            if label_list is None:
                raise ValueError("label_list is required when train=True")
            if len(label_list) != len(file_list):
                raise ValueError(
                    f"file_list has {len(file_list)} entries but "
                    f"label_list has {len(label_list)}"
                )

        self.data_dir = data_dir
        self.file_list = file_list
        self.transform = transform
        self.train = train
        # Always defined so the attribute exists in both modes.
        self.label_list = label_list

    def __len__(self):
        """Return the number of samples."""
        return len(self.file_list)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, label)`` or ``(image, file_id)``."""
        file_name = self.file_list[idx]

        # The context manager closes the file handle once the pixels have been
        # read. convert("RGB") guarantees three channels, so grayscale / RGBA /
        # CMYK files cannot break a 3-channel Normalize or batch collation.
        with Image.open(os.path.join(self.data_dir, file_name)) as img:
            image = img.convert("RGB")

        if self.transform is not None:
            image = self.transform(image)

        if self.train:
            return image, self.label_list[idx]

        # splitext handles extensions of any length (".jpg", ".jpeg", ...),
        # unlike slicing off a fixed four characters.
        return image, os.path.splitext(file_name)[0]

## 1-3. Dataset 생성

In [None]:
# Build the training labels from the file names:
# names containing "cat" get 0, every other name gets 1 (dog).

train_files = os.listdir("./train")
train_files.sort()
train_labels = [int("cat" not in name) for name in train_files]

print(len(train_files), len(train_labels))

In [None]:
# train set을 train과 validation set으로 나누자

from sklearn.model_selection import train_test_split

# Split the labelled files into train / validation subsets, stratified on the
# labels and seeded so the split is reproducible.
_split = train_test_split(
    train_files, train_labels, stratify=train_labels, random_state=42
)
train_files, val_files, train_labels, val_labels = _split

# Sanity check: number of files, number of labels, and per-class counts.
for split_files, split_labels in ((train_files, train_labels), (val_files, val_labels)):
    print(len(split_files), len(split_labels), split_labels.count(0), split_labels.count(1))

In [None]:
# transform을 정의하자.
# 참고 링크: https://pytorch.org/vision/stable/transforms.html

import torchvision.transforms as transforms

# Per-channel statistics shared by both pipelines.
_MEAN = (0.5, 0.5, 0.5)
_STD = (0.5, 0.5, 0.5)

# Training pipeline: the crop position is random.
train_transform = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.RandomCrop(224),
        transforms.ToTensor(),  # scale to 0~1
        transforms.Normalize(_MEAN, _STD),  # zero-centre to -1~1
    ]
)

# Evaluation pipeline: same steps, but with a fixed centre crop.
eval_transform = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),  # scale to 0~1
        transforms.Normalize(_MEAN, _STD),  # zero-centre to -1~1
    ]
)

In [None]:
# Build the train and validation datasets. Both read from ./train; only the
# file/label subsets and the transform differ.

_labelled_dir = "./train"

train_dataset = DogCatDataset(
    data_dir=_labelled_dir,
    file_list=train_files,
    label_list=train_labels,
    train=True,
    transform=train_transform,
)

val_dataset = DogCatDataset(
    data_dir=_labelled_dir,
    file_list=val_files,
    label_list=val_labels,
    train=True,
    transform=eval_transform,
)

In [None]:
# Build the test dataset the same way; no labels are passed, so train=False.

test_files = os.listdir("./test1")
test_files.sort()
test_dataset = DogCatDataset(
    data_dir="./test1",
    file_list=test_files,
    train=False,
    transform=eval_transform,
)

## 1-4. DataLoader 생성

In [None]:
# Wrap the three datasets in DataLoaders.

batch_size = 64
_loader_kwargs = {"batch_size": batch_size, "num_workers": 2}

# Only the training loader shuffles its samples.
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
valid_loader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_kwargs)

In [None]:
# Smoke-test train_loader: fetch one batch and display its first image.

images, labels = next(iter(train_loader))
print(images.shape)
print(labels)

# The transform normalised pixels with mean 0.5 / std 0.5, i.e. into [-1, 1].
# Map them back to [0, 1] before plotting; otherwise imshow clips the negative
# half of the range and the picture looks too dark. permute moves the channel
# axis last, which is the layout imshow expects.
preview = (images[0] * 0.5 + 0.5).clamp(0, 1)
plt.imshow(preview.permute(1, 2, 0))
plt.title(labels[0].item())
plt.axis("off")
plt.show()

In [None]:
# Smoke-test valid_loader: fetch one batch and display its first image.

images, labels = next(iter(valid_loader))
print(images.shape)
print(labels)

# The transform normalised pixels with mean 0.5 / std 0.5, i.e. into [-1, 1].
# Map them back to [0, 1] before plotting; otherwise imshow clips the negative
# half of the range and the picture looks too dark. permute moves the channel
# axis last, which is the layout imshow expects.
preview = (images[0] * 0.5 + 0.5).clamp(0, 1)
plt.imshow(preview.permute(1, 2, 0))
plt.title(labels[0].item())
plt.axis("off")
plt.show()

In [None]:
# Smoke-test test_loader: fetch one batch and display its first image.
# In test mode the second element of each sample is the file id, not a label.

images, labels = next(iter(test_loader))
print(images.shape)
print(labels)

# The transform normalised pixels with mean 0.5 / std 0.5, i.e. into [-1, 1].
# Map them back to [0, 1] before plotting; otherwise imshow clips the negative
# half of the range and the picture looks too dark. permute moves the channel
# axis last, which is the layout imshow expects.
preview = (images[0] * 0.5 + 0.5).clamp(0, 1)
plt.imshow(preview.permute(1, 2, 0))
plt.title(labels[0])
plt.axis("off")
plt.show()

# 2. 모델 구성하기

<img src="https://miro.medium.com/max/1400/1*gU5m4XO2awEM6Zp4DkirFA.png" width="75%" >

- Input shape
    - 3 x 224 x 224
- Convolution
    - 3x3 kernel size
    - stride 1
    - padding 1 (input, output resolution 유지)
- MaxPooling
    - 2x2 kernel size
    - stride 2 (output resolution = (1/2) * input resolution)
- Activation
    - 모든 hidden layer의 뒤에는 ReLU 사용

In [None]:
import torch
import torch.nn as nn

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(device)

In [None]:
# VGG16 - D
class VGG16(nn.Module):
    """VGG16 (configuration D) classifier for 3 x 224 x 224 inputs.

    Five convolutional stages (3x3 kernels, stride 1, padding 1, each conv
    followed by ReLU) end in a 2x2 / stride-2 max-pool, so a 224 x 224 input
    reaches the classifier as a 512 x 7 x 7 feature map. The classifier is
    three fully connected layers; the two hidden ones are followed by ReLU
    and dropout.

    Args:
        n_classes: Number of output logits.
    """

    def __init__(self, n_classes=1000):
        super().__init__()

        self.conv_block1 = self._conv_block(3, 64, n_convs=2)     # conv3-64 x 2
        self.conv_block2 = self._conv_block(64, 128, n_convs=2)   # conv3-128 x 2
        self.conv_block3 = self._conv_block(128, 256, n_convs=3)  # conv3-256 x 3
        self.conv_block4 = self._conv_block(256, 512, n_convs=3)  # conv3-512 x 3
        self.conv_block5 = self._conv_block(512, 512, n_convs=3)  # conv3-512 x 3

        # fc layer x 3
        self.fc1 = nn.Linear(512 * 7 * 7, 4096)
        self.dropout1 = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(4096, 4096)
        self.dropout2 = nn.Dropout(p=0.5)
        self.fc3 = nn.Linear(4096, n_classes)

        # weight initialization: Kaiming-normal for convs, Xavier-normal for
        # linear layers, zero biases everywhere.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                nn.init.constant_(m.bias, 0)

    @staticmethod
    def _conv_block(in_channels, out_channels, n_convs):
        """Return ``n_convs`` x (conv3 + ReLU) followed by a 2x2 max-pool.

        The layers are laid out as [conv, relu, conv, relu, ..., pool], so the
        indices inside the returned ``nn.Sequential`` (and therefore the
        ``state_dict`` keys) are the same as when each layer is listed by hand.
        """
        layers = []
        channels = in_channels
        for _ in range(n_convs):
            layers.append(nn.Conv2d(channels, out_channels, kernel_size=3, stride=1, padding=1))
            layers.append(nn.ReLU())
            channels = out_channels
        layers.append(nn.MaxPool2d(kernel_size=2, stride=2, padding=0))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return class logits for a batch of images ``x``."""
        x = self.conv_block1(x)
        x = self.conv_block2(x)
        x = self.conv_block3(x)
        x = self.conv_block4(x)
        x = self.conv_block5(x)

        # Flatten everything except the batch axis. Unlike view(-1, 512*7*7),
        # this cannot silently fold a wrong-sized feature map into the batch
        # axis; fc1 raises a shape error instead.
        x = x.flatten(1)

        # Every hidden layer is followed by ReLU. Without it the three linear
        # layers would collapse into a single linear map at inference time.
        x = self.dropout1(nn.functional.relu(self.fc1(x)))
        x = self.dropout2(nn.functional.relu(self.fc2(x)))
        x = self.fc3(x)

        return x


# Instantiate the network with two output classes and move it to `device`.
model = VGG16(n_classes=2)
model = model.to(device)
print(model)

In [None]:
from torch import optim

# Adam over all model parameters; the criterion is cross-entropy on the logits.
optimizer = optim.Adam(params=model.parameters(), lr=1e-5)
loss_fn = nn.CrossEntropyLoss()

In [None]:
def train_loop(data_loader, model, loss_fn, optimizer):
    """Run one optimisation pass of ``model`` over ``data_loader``.

    Puts the model in training mode, moves every batch to the module-level
    ``device``, takes one optimizer step per batch, and prints the loss and
    the number of samples seen at every 100th batch.

    Args:
        data_loader: Yields ``(inputs, targets)`` batches; its ``dataset``
            length is used for the progress output only.
        model: Callable mapping a batch of inputs to predictions.
        loss_fn: Callable ``loss_fn(predictions, targets)`` whose result
            supports ``backward()`` and ``item()``.
        optimizer: Object providing ``zero_grad()`` and ``step()``.
    """
    n_samples = len(data_loader.dataset)
    log_every = 100

    model.train()
    for step, (inputs, targets) in enumerate(data_loader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        loss = loss_fn(model(inputs), targets)

        # Clear stale gradients, backpropagate, then update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % log_every != 0:
            continue
        seen = step * len(inputs)
        print(f"loss: {loss.item():>7f} [{seen:>5d} / {n_samples:>5d}]")

In [None]:
def test_loop(data_loader, model, loss_fn):
    """Evaluate ``model`` on ``data_loader`` and print accuracy and mean loss.

    Puts the model in evaluation mode, runs it without gradient tracking on
    batches moved to the module-level ``device``, and prints the accuracy
    (correct predictions / dataset size) and the loss averaged over batches.

    Args:
        data_loader: Yields ``(inputs, targets)`` batches and exposes
            ``dataset``; both it and ``dataset`` must support ``len()``.
        model: Callable mapping a batch of inputs to per-class scores.
        loss_fn: Callable ``loss_fn(predictions, targets)`` whose result
            supports ``item()``.
    """
    n_samples = len(data_loader.dataset)
    n_batches = len(data_loader)
    loss_sum = 0
    n_correct = 0

    model.eval()  # for dropout
    with torch.no_grad():
        for inputs, targets in data_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            scores = model(inputs)
            loss_sum += loss_fn(scores, targets).item()
            # The predicted class is the index of the highest score.
            hits = scores.argmax(1) == targets
            n_correct += hits.type(torch.float).sum().item()

    avg_loss = loss_sum / n_batches
    accuracy = n_correct / n_samples
    print(f"Test Error: \n"
          f"Accuracy: {100 * accuracy:>0.1f}%, Avg loss: {avg_loss:>8f} \n")

In [None]:
# Alternate one training pass and one validation pass per epoch.
epochs = 50
for epoch in range(1, epochs + 1):
    print(f"Epoch {epoch}\n-------------------------------")
    train_loop(train_loader, model, loss_fn, optimizer)
    test_loop(valid_loader, model, loss_fn)
print("Done!")

In [None]:
# pandas is imported here because no visible cell of this notebook imports it.
import pandas as pd

file_nums_list = []
pred_labels_list = []

# Predict a class for every test image and collect it with the file id.
model.eval()
with torch.no_grad():
    for X, files in test_loader:
        pred = model(X.to(device))
        # The predicted label is the index of the highest logit.
        pred_labels_list.extend(pred.argmax(1).tolist())
        # In test mode the dataset yields the file name without its extension.
        file_nums_list.extend(files)


# Write the predictions as a two-column CSV (id, label).
submission = pd.DataFrame({"id": file_nums_list, "label": pred_labels_list})
submission.to_csv('preds_vgg16.csv', index=False)

In [None]:
# List the working directory (presumably to check that preds_vgg16.csv was written).
!ls