In [1]:
# Mount Google Drive inside the Colab VM at /content/drive so that files kept
# on Drive can be reached through the regular filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 13.7 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 70.2 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 60.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.2 transformers-4.24.0


In [3]:
import os

# Work from the project folder on the mounted Drive so that the relative paths
# used later ("train_images", "train.csv", "checkpoint", ...) resolve there.
# The path is absolute (Drive is mounted at /content/drive) so the cell is
# idempotent: a relative path fails on a second run because the working
# directory has already been changed.
os.chdir("/content/drive/MyDrive/Colab Notebooks/AOI")

In [4]:
import pandas as pd
import os
from PIL import Image
import torch
import torch.nn as nn
import torchvision.models as models
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from tqdm import tqdm
from transformers import get_scheduler
from transformers import ViTFeatureExtractor, ViTModel

# the class for training data
class AOI_Dataset(Dataset):
    """Labelled image dataset that yields two preprocessed views of each image.

    The annotation CSV is read with ``pd.read_csv``; column 0 holds the image
    file name (joined onto ``root_dir``) and column 1 holds the label.

    Args:
        root_dir: Directory that contains the image files.
        annotation_file: Path (or file-like object) of the annotation CSV.
        feature_extractor: Optional callable applied to each view after its
            transform. Skipped when ``None``.
        transform1: Optional callable producing the first view from the RGB
            PIL image. Skipped when ``None``.
        transform2: Optional callable producing the second view from the RGB
            PIL image. Skipped when ``None``.
    """

    def __init__(self, root_dir, annotation_file, feature_extractor=None, transform1=None, transform2=None):
        self.root_dir = root_dir
        self.annotations = pd.read_csv(annotation_file)

        # Preprocessing steps; each of them may be None (see _make_view).
        self.transform1 = transform1
        self.transform2 = transform2
        self.feature_extractor = feature_extractor

    def __len__(self):
        """Return the number of rows in the annotation file."""
        return len(self.annotations)

    def _make_view(self, img, transform):
        """Apply ``transform`` and then the feature extractor, skipping any that is None."""
        view = img if transform is None else transform(img)
        if self.feature_extractor is not None:
            view = self.feature_extractor(view)
        return view

    def __getitem__(self, index):
        """Return ``(view1, view2, label)`` for the row at position ``index``.

        The label is returned as a 0-dim float tensor built from column 1.
        """
        img_id = self.annotations.iloc[index, 0]
        img = Image.open(os.path.join(self.root_dir, img_id)).convert("RGB")
        y_label = torch.tensor(float(self.annotations.iloc[index, 1]))
        # The original code called the transforms / extractor unconditionally,
        # which raised TypeError for the advertised default of None.
        img1 = self._make_view(img, self.transform1)
        img2 = self._make_view(img, self.transform2)
        return (img1, img2, y_label)

In [5]:
# the structure of the model
class AOI_network(nn.Module):
  """Two-branch ViT classifier.

  Each input view is encoded by its own ``ViTModel``; the two pooled outputs
  are concatenated and mapped to ``num_classes`` logits by one linear layer.

  Args:
    num_classes: Number of output classes (size of the logit vector).
    model_name: Checkpoint name/path passed to ``ViTModel.from_pretrained``
      for both branches.
    dropout: Drop probability applied to the concatenated features.
  """

  def __init__(self, num_classes=6, model_name='google/vit-base-patch16-224', dropout=0.1):
    super().__init__()
    # Two independent encoders (weights are not shared between the views).
    self.ViT1 = ViTModel.from_pretrained(model_name)
    self.ViT2 = ViTModel.from_pretrained(model_name)
    self.out_fc = nn.Linear(self.ViT1.config.hidden_size * 2, num_classes)
    self.dropout = nn.Dropout(dropout)

  def forward(self, img1, img2):
    """Return raw class logits of shape ``(batch, num_classes)``.

    ``img1`` / ``img2`` are the pixel-value batches for the two views
    (assumed ``(batch, 3, 224, 224)`` float tensors -- TODO confirm).
    """
    features1 = self.ViT1(img1).pooler_output
    features2 = self.ViT2(img2).pooler_output
    features_t = torch.cat([features1, features2], dim=1)
    # Regularise the features, not the logits: dropping logits after the
    # classifier randomly zeroes class scores and distorts the loss.
    out = self.out_fc(self.dropout(features_t))
    return out

In [6]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Training-time preprocessing for the two model inputs.
# View 1: resize + small random rotation.
transform_01 = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.RandomRotation(degrees=10),
    transforms.ToTensor(),
])
# View 2: same as view 1, but the image is always sharpened first (p=1) so the
# second branch of the model sees a sharpened version of the picture.
transform_02 = transforms.Compose([
    transforms.RandomAdjustSharpness(sharpness_factor=4, p=1),
    transforms.Resize(size=(224, 224)),
    transforms.RandomRotation(degrees=10),
    transforms.ToTensor(),
])

# Optimisation and data-loading hyper-parameters.
num_epochs, learning_rate, batch_size = 5, 2e-4, 16
shuffle, num_workers = True, 1

In [7]:
# Build everything the training loop needs: preprocessing, data loader, model,
# loss, optimizer and learning-rate schedule.
# All the training images must be in the directory "train_images".
# NOTE(review): the feature extractor comes from the "-in21k" checkpoint while
# AOI_network loads 'google/vit-base-patch16-224' by default -- confirm that
# both checkpoints share the same preprocessing configuration.
model_name_or_path = 'google/vit-base-patch16-224-in21k'
feature_extractor_01 = ViTFeatureExtractor.from_pretrained(model_name_or_path)
dataset = AOI_Dataset(
    root_dir="train_images",
    annotation_file="train.csv",
    feature_extractor=feature_extractor_01,
    transform1=transform_01,
    transform2=transform_02,
)
train_loader = DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers,
)

model = AOI_network().to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)

# Linear decay without warm-up, spread over every optimizer step of the run.
learning_rate_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * num_epochs,
)

Downloading:   0%|          | 0.00/160 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/vit-base-patch16-224 were not used when initializing ViTModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing ViTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.weight', 'vit.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the model checkpoint at google/vit-base-patch16-224 were not used when initializing ViTM

In [None]:
# Optionally resume from a previous run: restore the model and optimizer state
# from the file "checkpoint". The cell does nothing when that file is missing,
# so a fresh "Restart & Run All" does not stop here with FileNotFoundError.
# NOTE: torch.load unpickles the file -- only load checkpoints you created.
if os.path.isfile("checkpoint"):
  checkpoint = torch.load("checkpoint", map_location=device)
  model.load_state_dict(checkpoint['state_dict'])
  optimizer.load_state_dict(checkpoint['optimizer'])
  epoch = checkpoint['epoch']
else:
  print("No checkpoint found; keeping the freshly initialised model and optimizer.")

In [None]:
# Show the optimizer configuration (parameter groups with lr, betas, eps,
# weight decay, ...) as a quick sanity check.
print(optimizer)

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    initial_lr: 0.0002
    lr: 0.0002
    maximize: False
    weight_decay: 0.01
)


In [8]:
# Fine-tune the model; a checkpoint is overwritten at the end of every epoch.
for epoch in range(num_epochs):
  model.train()
  loop = tqdm(train_loader, total=len(train_loader), leave=True)
  for batch_view1, batch_view2, batch_labels in loop:
    # Each view arrives as a mapping whose 'pixel_values' entry is a
    # one-element list holding the batched image tensor.
    pixels1 = batch_view1['pixel_values'][0].float().to(device)
    pixels2 = batch_view2['pixel_values'][0].float().to(device)
    # CrossEntropyLoss needs integer class indices.
    targets = batch_labels.long().to(device)

    optimizer.zero_grad()
    logits = model(pixels1, pixels2)
    loss = criterion(logits, targets)
    loss.backward()
    optimizer.step()
    # The schedule is defined per optimizer step, so advance it every batch.
    learning_rate_scheduler.step()

    loop.set_description(f"Epoch [{epoch+1}/{num_epochs}]")
    loop.set_postfix(loss=loss.item())

  # Persist what is needed to reload the model and optimizer later.
  checkpoint = dict(
      epoch=epoch,
      state_dict=model.state_dict(),
      optimizer=optimizer.state_dict(),
  )
  torch.save(checkpoint, "checkpoint")

Epoch [1/5]: 100%|██████████| 158/158 [19:16<00:00,  7.32s/it, loss=0.355]
Epoch [2/5]: 100%|██████████| 158/158 [03:02<00:00,  1.16s/it, loss=0.0452]
Epoch [3/5]: 100%|██████████| 158/158 [03:00<00:00,  1.14s/it, loss=0.00228]
Epoch [4/5]: 100%|██████████| 158/158 [03:00<00:00,  1.15s/it, loss=0.00309]
Epoch [5/5]: 100%|██████████| 158/158 [03:02<00:00,  1.16s/it, loss=0.0485]


In [9]:
# an extra class for testing data
class AOI_Dataset_TEST(Dataset):
    """Unlabelled image dataset that yields two preprocessed views of each image.

    The annotation CSV is read with ``pd.read_csv``; column 0 holds the image
    file name (joined onto ``root_dir``). No label column is read.

    Args:
        root_dir: Directory that contains the image files.
        annotation_file: Path (or file-like object) of the CSV listing the images.
        feature_extractor: Optional callable applied to each view after its
            transform. Skipped when ``None``.
        transform1: Optional callable producing the first view from the RGB
            PIL image. Skipped when ``None``.
        transform2: Optional callable producing the second view from the RGB
            PIL image. Skipped when ``None``.
    """

    def __init__(self, root_dir, annotation_file, feature_extractor=None, transform1=None, transform2=None):
        self.root_dir = root_dir
        self.annotations = pd.read_csv(annotation_file)

        # Preprocessing steps; each of them may be None (see _make_view).
        self.transform1 = transform1
        self.transform2 = transform2
        self.feature_extractor = feature_extractor

    def __len__(self):
        """Return the number of rows in the annotation file."""
        return len(self.annotations)

    def _make_view(self, img, transform):
        """Apply ``transform`` and then the feature extractor, skipping any that is None."""
        view = img if transform is None else transform(img)
        if self.feature_extractor is not None:
            view = self.feature_extractor(view)
        return view

    def __getitem__(self, index):
        """Return ``(view1, view2)`` for the row at position ``index``."""
        img_id = self.annotations.iloc[index, 0]
        img = Image.open(os.path.join(self.root_dir, img_id)).convert("RGB")
        # The original code called the transforms / extractor unconditionally,
        # which raised TypeError for the advertised default of None.
        img1 = self._make_view(img, self.transform1)
        img2 = self._make_view(img, self.transform2)

        return (img1, img2)

In [10]:
import numpy as np

# Inference: predict a class for every test image and write the result CSV.
# Switch to evaluation mode so the model's dropout layer is disabled.
model.eval()

# All the testing images must be in the directory "test_images".
# Test-time preprocessing is deterministic (no random rotation).
# NOTE: this rebinds transform_01 / transform_02, which were the training
# transforms; re-run the training configuration cell before training again.
transform_01 = transforms.Compose(
        [
            transforms.Resize((224,224)),
            transforms.ToTensor(),
        ]
    )
transform_02 = transforms.Compose(
        [
            transforms.RandomAdjustSharpness(4, p=1),
            transforms.Resize((224,224)),
            transforms.ToTensor(),
        ]
    )
dataset_TEST = AOI_Dataset_TEST("test_images", "test.csv", feature_extractor=feature_extractor_01, transform1=transform_01, transform2=transform_02)
# shuffle=False keeps predictions in the same order as the rows of test.csv.
test_loader = DataLoader(dataset=dataset_TEST, shuffle=False, batch_size=batch_size,num_workers=num_workers)

loop = tqdm(test_loader, total = len(test_loader), leave = True)
pred_labels = []

# No gradients are needed for prediction; skipping the autograd graph saves
# memory and time.
with torch.no_grad():
  for imgs1, imgs2 in loop:
    imgs1 = imgs1['pixel_values'][0].type(torch.FloatTensor).to(device)
    imgs2 = imgs2['pixel_values'][0].type(torch.FloatTensor).to(device)
    outputs = model(imgs1, imgs2)

    # argmax of the logits equals argmax of their softmax, so softmax is skipped.
    pred = torch.argmax(outputs, dim=-1)
    # Collect a flat list of ints. Stacking per-batch lists into an array
    # breaks when the last batch is smaller (ragged input to np.asarray).
    pred_labels.extend(pred.cpu().tolist())

res = np.array(pred_labels)

# the original csv file "test.csv" containing the names of all images
out_df = pd.read_csv("test.csv")
print(res)
print(len(out_df))
out_df["Label"] = res

# the new output file can be specified here
out_df.to_csv("ViT_multi_input_full.csv", encoding='utf-8', index=False)

100%|██████████| 634/634 [1:33:39<00:00,  8.86s/it]

[1 2 5 ... 1 3 1]
10142



