In [2]:
from pycocotools.coco import COCO

In [3]:
coco = COCO(annotation_file="/scratch/lt2316-h18-resources/coco/annotations/instances_train2017.json")

loading annotations into memory...
Done (t=19.33s)
creating index...
index created!


In [4]:
cat_cat = coco.getCatIds(catNms="cat")
horse_cat = coco.getCatIds(catNms="horse")

In [5]:
cat_cat, horse_cat

([17], [19])

In [6]:
cat_imgs = coco.getImgIds(catIds=cat_cat)

In [7]:
horse_imgs = coco.getImgIds(catIds=horse_cat)

In [8]:
import random

In [9]:
random.shuffle(cat_imgs)
cat_imgs_train = cat_imgs[:2200]
cat_imgs_val = cat_imgs[2200:2420]
cat_imgs_test = cat_imgs[2420:2640]


random.shuffle(horse_imgs)
horse_imgs_train = horse_imgs[:2200]
horse_imgs_test = horse_imgs[2200:2420]
horse_imgs_val = horse_imgs[2420:2640]

In [10]:
len(cat_imgs_train), len(cat_imgs_test)

(2200, 220)

In [11]:
len([x for x in cat_imgs[0:600] if x in horse_imgs[0:600]])

0

In [12]:
cat_meta_train = coco.loadImgs(ids=cat_imgs_train)
cat_meta_test = coco.loadImgs(ids=cat_imgs_test)
horse_meta_train = coco.loadImgs(ids=horse_imgs_train)
horse_meta_test = coco.loadImgs(ids=horse_imgs_test)
cat_meta_val = coco.loadImgs(ids=cat_imgs_val)
horse_meta_val = coco.loadImgs(ids=horse_imgs_val)

In [13]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

In [14]:
%matplotlib inline

In [15]:
from PIL import Image

In [16]:
import numpy as np

In [17]:
def get_data(meta, datadir="/scratch/lt2316-h18-resources/coco/train2017"):
    return [(x['file_name'], Image.open("{}/{}".format(datadir, x['file_name'])).resize((100,100))) for x in meta]

In [18]:
cat_data_train = get_data(cat_meta_train)

In [19]:
len(cat_data_train)

2200

In [20]:
horse_data_train = get_data(horse_meta_train)
cat_data_test = get_data(cat_meta_test)
horse_data_test = get_data(horse_meta_test)
cat_data_val = get_data(cat_meta_val)
horse_data_val = get_data(horse_meta_val)

In [21]:
import pandas as pd

In [22]:
cat_data_train_df = pd.DataFrame(cat_data_train)

In [23]:
cat_data_train_df['class'] = 'cat'

In [24]:
horse_data_train_df = pd.DataFrame(horse_data_train)
horse_data_train_df['class'] = 'horse'

cat_data_test_df = pd.DataFrame(cat_data_test)
cat_data_test_df['class'] = 'cat'

horse_data_test_df = pd.DataFrame(horse_data_test)
horse_data_test_df['class'] = 'horse'

cat_data_val_df = pd.DataFrame(cat_data_val)
cat_data_val_df['class'] = 'cat'

horse_data_val_df = pd.DataFrame(horse_data_val)
horse_data_val_df['class'] = 'horse'

In [25]:
train_df = pd.concat([cat_data_train_df, horse_data_train_df])
test_df = pd.concat([cat_data_test_df, horse_data_test_df])
val_df =pd.concat([cat_data_val_df, horse_data_val_df])

In [26]:
train_df.rename(columns={0: 'Path', 1: 'imgs', 'class':'label'}, inplace=True)
test_df.rename(columns={0: 'Path', 1: 'imgs', 'class':'label'}, inplace=True)
val_df.rename(columns={0: 'Path', 1: 'imgs', 'class':'label'}, inplace=True)
train_df.head(-10)

Unnamed: 0,Path,imgs,label
0,000000262727.jpg,<PIL.Image.Image image mode=RGB size=100x100 a...,cat
1,000000185030.jpg,<PIL.Image.Image image mode=RGB size=100x100 a...,cat
2,000000160743.jpg,<PIL.Image.Image image mode=RGB size=100x100 a...,cat
3,000000343552.jpg,<PIL.Image.Image image mode=RGB size=100x100 a...,cat
4,000000342244.jpg,<PIL.Image.Image image mode=RGB size=100x100 a...,cat
...,...,...,...
2185,000000279550.jpg,<PIL.Image.Image image mode=RGB size=100x100 a...,horse
2186,000000303227.jpg,<PIL.Image.Image image mode=RGB size=100x100 a...,horse
2187,000000129072.jpg,<PIL.Image.Image image mode=RGB size=100x100 a...,horse
2188,000000346849.jpg,<PIL.Image.Image image mode=RGB size=100x100 a...,horse


In [27]:
# Geting unique labels
labels = train_df['label'].unique()

print(labels)

label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

['cat' 'horse']


In [28]:
from transformers import AutoImageProcessor

checkpoint = "google/vit-base-patch16-224-in21k"

image_processor = AutoImageProcessor.from_pretrained(checkpoint)

In [29]:
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor

normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)

size = (

    image_processor.size["shortest_edge"]

    if "shortest_edge" in image_processor.size

    else (image_processor.size["height"], image_processor.size["width"])

)

_transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize])

In [30]:
def transforms(examples):
    examples["pixel_values"] = [_transforms(img) for img in examples["image"]]
    del examples["image"]
    return examples

In [31]:
from torch.utils.data import Dataset, DataLoader
import torch

class CustomDatasetCOCO(Dataset):
    def __init__(self, dataframe, transforms=None):
        self.dataframe = dataframe
        self.transforms = transforms
        self.dataframe['label'] = self.dataframe['label'].apply(lambda x: 0 if x == 'cat' else 1)

        
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, idx):
        image = self.dataframe.iloc[idx]['imgs']
        label = torch.tensor(self.dataframe.iloc[idx]['label'], dtype=torch.long)
        
        # Ensuring the image is RGB
        if image.mode != 'RGB':
            image = image.convert('RGB')


        if self.transforms:
            image = self.transforms(image)

        return {'pixel_values': image, 'label': label}


In [32]:
dataset_train = CustomDatasetCOCO(dataframe=train_df, transforms=_transforms)
dataset_test = CustomDatasetCOCO(dataframe=test_df, transforms=_transforms)
dataset_val = CustomDatasetCOCO(dataframe=val_df, transforms=_transforms)

In [33]:
import torchvision
sample = dataset_test[-10]
print("Sample keys:", sample.keys())
print("Image type:", type(sample['pixel_values']))
print("Label type:", type(sample['label']))
print("Label:", sample['label'])

image = sample['pixel_values']

# Converting from torch tensor back to PIL image for checking the mode
image = torchvision.transforms.ToPILImage()(image)

print("Image mode:", image.mode)

Sample keys: dict_keys(['pixel_values', 'label'])
Image type: <class 'torch.Tensor'>
Label type: <class 'torch.Tensor'>
Label: tensor(1)
Image mode: RGB


In [34]:
num_labels = int(len(id2label))

In [35]:
print(id2label)

{'0': 'cat', '1': 'horse'}


In [36]:
print(num_labels)

2


In [37]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [38]:
import evaluate

accuracy = evaluate.load("accuracy")

In [39]:
import numpy as np


def compute_metrics(eval_pred):

    predictions, labels = eval_pred

    predictions = np.argmax(predictions, axis=1)

    return accuracy.compute(predictions=predictions, references=labels)

In [40]:
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer

model = AutoModelForImageClassification.from_pretrained(

    checkpoint,

    num_labels=len(labels),

    id2label=id2label,

    label2id=label2id,

)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
training_args = TrainingArguments(

    output_dir="my_awesome_cat_model",

    remove_unused_columns=False,

    evaluation_strategy="epoch",

    save_strategy="epoch",

    learning_rate=5e-5,

    per_device_train_batch_size=16,

    gradient_accumulation_steps=4,

    per_device_eval_batch_size=16,

    num_train_epochs=3,

    warmup_ratio=0.1,

    logging_steps=10,

    load_best_model_at_end=True,

    metric_for_best_model="accuracy",

    push_to_hub=False,


)




trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
)


trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
0,0.2308,0.146226,0.965909
2,0.0778,0.08313,0.981818




TrainOutput(global_step=102, training_loss=0.20783318549978966, metrics={'train_runtime': 178.4335, 'train_samples_per_second': 73.977, 'train_steps_per_second': 0.572, 'total_flos': 1.0092556727404462e+18, 'train_loss': 0.20783318549978966, 'epoch': 2.96})

In [65]:
ds = val_df

image = ds.iloc[5]

In [66]:
print(ds.iloc[5])

Path                                      000000499054.jpg
imgs     <PIL.Image.Image image mode=RGB size=100x100 a...
label                                                    0
Name: 5, dtype: object


In [67]:
print(image['imgs'])

<PIL.Image.Image image mode=RGB size=100x100 at 0x7F7A61044550>


In [68]:
from transformers import pipeline

classifier = pipeline("image-classification", model="./my_awesome_cat_model/checkpoint-93")

classifier(image['imgs'])


[{'score': 0.9908796548843384, 'label': 'horse'},
 {'score': 0.009120305068790913, 'label': 'cat'}]

In [69]:
from transformers import AutoImageProcessor

import torch

image_processor = AutoImageProcessor.from_pretrained("./my_awesome_cat_model/checkpoint-93")

inputs = image_processor(image['imgs'], return_tensors="pt")

In [70]:
from transformers import AutoModelForImageClassification

model = AutoModelForImageClassification.from_pretrained("./my_awesome_cat_model/checkpoint-93")

with torch.no_grad():

    logits = model(**inputs).logits

In [71]:
predicted_label = logits.argmax(-1).item()

model.config.id2label[predicted_label]


'horse'