In [1]:
from transformers.utils import send_example_telemetry

# Report that this example notebook was run, passing the notebook name and
# the framework in use.
# NOTE(review): presumably usage telemetry sent to Hugging Face — confirm
# against the transformers documentation before sharing the notebook.
send_example_telemetry("image_classification_notebook", framework="pytorch")

In [None]:
pip install -U tf-keras

In [1]:
import pandas as pd
import numpy as np
import evaluate

import os
import gc

from tqdm.notebook import tqdm
import PIL

import torch
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer, AutoImageProcessor
from datasets import Dataset, Image
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor




In [2]:
# Load the training labels table (per the output below: unified_class,
# class_id and image_name columns) and display it.
df = pd.read_csv('data/train.csv')
df

Unnamed: 0,unified_class,class_id,image_name
0,Оленевые,5,3cf4207b958eade893a2f1618cf062b8.JPG
1,Кошки,2,37698901280c871f426d40afe5c373cd.JPG
2,Заяц,0,20e7b30026001cbfe0b5c0ee16c9ff56.JPG
3,Кошки,2,a1bc8ea546206ee8fc0f1836fda9a5c1.JPG
4,Оленевые,5,54eb76914b84db8a0d56f98125abf588.JPG
...,...,...,...
28010,Оленевые,5,07b420b4fe265b4ed918b46435c025d7.JPG
28011,Пантеры,6,2d1c5918357bbdd729bf79085e55d35e.JPG
28012,Заяц,0,1531efa9f8687e390adf780355acd606.JPG
28013,Кабан,1,2b15eaef0ce9b57b6570709f95a4bea4.JPG


In [3]:
#del df 
#gc.collect()

In [4]:
# Count the distinct class ids; this number is what `num_labels` must match
# when the classification model is created further down.
len(set(df.class_id))

10

In [5]:
# Drop the human-readable class name; the numeric `class_id` is the column
# used as the label later on. Rebinding the result (instead of inplace=True)
# together with errors='ignore' keeps this cell idempotent: re-running it no
# longer raises KeyError once the column is already gone.
df = df.drop(columns=['unified_class'], errors='ignore')
df

Unnamed: 0,class_id,image_name
0,5,3cf4207b958eade893a2f1618cf062b8.JPG
1,2,37698901280c871f426d40afe5c373cd.JPG
2,0,20e7b30026001cbfe0b5c0ee16c9ff56.JPG
3,2,a1bc8ea546206ee8fc0f1836fda9a5c1.JPG
4,5,54eb76914b84db8a0d56f98125abf588.JPG
...,...,...
28010,5,07b420b4fe265b4ed918b46435c025d7.JPG
28011,6,2d1c5918357bbdd729bf79085e55d35e.JPG
28012,0,1531efa9f8687e390adf780355acd606.JPG
28013,1,2b15eaef0ce9b57b6570709f95a4bea4.JPG


In [6]:
# Turn bare file names into paths relative to the notebook. An existing
# 'data/train/' prefix is stripped first, so re-running this cell does not
# produce 'data/train/data/train/...' paths.
df['image_name'] = 'data/train/' + df['image_name'].str.replace(r'^data/train/', '', regex=True)

In [7]:
def _has_channel_axis(path):
    """Return True when the image file at `path` decodes to a 3-D array.

    Images that decode to a 2-D array (no channel axis) give False.
    """
    # The context manager closes the underlying file handle deterministically;
    # a bare PIL.Image.open() call leaves that to the garbage collector, which
    # matters when every training file is opened once.
    with PIL.Image.open(path) as img:
        return len(np.array(img).shape) == 3


# Keep only the rows whose image has a channel axis.
df = df[df['image_name'].apply(_has_channel_axis)]

In [8]:
# Inspect the frame after the path prefix and the image filter were applied.
df

Unnamed: 0,class_id,image_name
0,5,data/train/3cf4207b958eade893a2f1618cf062b8.JPG
1,2,data/train/37698901280c871f426d40afe5c373cd.JPG
2,0,data/train/20e7b30026001cbfe0b5c0ee16c9ff56.JPG
3,2,data/train/a1bc8ea546206ee8fc0f1836fda9a5c1.JPG
4,5,data/train/54eb76914b84db8a0d56f98125abf588.JPG
...,...,...
28010,5,data/train/07b420b4fe265b4ed918b46435c025d7.JPG
28011,6,data/train/2d1c5918357bbdd729bf79085e55d35e.JPG
28012,0,data/train/1531efa9f8687e390adf780355acd606.JPG
28013,1,data/train/2b15eaef0ce9b57b6570709f95a4bea4.JPG


In [9]:
# Renumber rows 0..n-1 after the filter and discard the old index.
# NOTE(review): presumably done so Dataset.from_pandas does not carry the
# gappy index over as an extra column — confirm.
df = df.reset_index(drop=True)

In [10]:
# Convert the pandas frame into a `datasets.Dataset`.
# NOTE(review): this rebinds `df` to a different type, so the cell is
# unlikely to survive being re-run without re-running the cells above.
df = Dataset.from_pandas(df)

In [11]:
# Cast the path column to the `Image` feature.
# NOTE(review): presumably this makes the column yield decoded PIL images on
# access — load_image below relies on calling `.convert("RGB")` on each item.
dataset = df.cast_column("image_name", Image())

In [12]:
# Encode `class_id` as class labels (the progress output reports the column
# being stringified and then cast). The split below stratifies on this column.
# NOTE(review): because the ids are stringified first, the encoded label order
# is presumably lexicographic; that matches numeric order for ids 0-9 but
# should be re-checked if ids >= 10 ever appear.
dataset = dataset.class_encode_column("class_id")

Stringifying the column:   0%|          | 0/28014 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/28014 [00:00<?, ? examples/s]

In [13]:
# Show the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['class_id', 'image_name'],
    num_rows: 28014
})

In [None]:
# Hub checkpoint id; both the image processor and the classification model
# below are loaded from it.
m = "microsoft/resnet-50" # model setting

In [None]:
# Load the preprocessing configuration that belongs to checkpoint `m`; the
# resulting processor turns images into the `pixel_values` tensors used by
# load_image / load_test.
image_processor = AutoImageProcessor.from_pretrained(m) #preprocess image

preprocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [16]:
# Hold out 20% of the rows for validation, stratified on `class_id`.
# A fixed seed makes the split (and therefore the reported validation F1)
# reproducible across kernel restarts; without it every run gets a different
# random split.
splits = dataset.train_test_split(test_size=0.2, stratify_by_column='class_id', seed=42)
train = splits['train']
val = splits['test']

In [17]:
def load_image(example_batch):
    """Batch transform that turns raw images and class ids into model inputs.

    Args:
        example_batch: dict of column lists containing "image_name" (image
            objects exposing `.convert("RGB")`) and "class_id" (integer labels).

    Returns:
        The same dict with "image_name" and "class_id" removed and two keys
        added: "pixel_values" (one tensor for the whole batch, as returned by
        `image_processor`) and "labels" (a tensor built from the class ids).
    """
    # Force RGB so every image reaches the processor in the same colour mode.
    rgb_images = [image.convert("RGB") for image in example_batch["image_name"]]
    # One batched processor call replaces the per-image calls + torch.stack:
    # the processor already returns a tensor with a leading batch dimension.
    example_batch["pixel_values"] = image_processor(rgb_images, return_tensors="pt")["pixel_values"]
    example_batch["labels"] = torch.tensor(example_batch["class_id"])

    # Remove original columns no longer needed
    del example_batch["image_name"]
    del example_batch["class_id"]

    return example_batch


In [18]:
# Register load_image as the transform for both splits, so rows are returned
# as "pixel_values"/"labels" (see the train[0] output below).
train.set_transform(load_image)
val.set_transform(load_image)

In [19]:
# Sanity check: fetch one transformed training example.
train[0]

{'pixel_values': tensor([[[-0.8510, -0.8353, -0.8353,  ..., -0.0039, -0.0275, -0.0980],
          [-0.8588, -0.8431, -0.8353,  ..., -0.0667, -0.0588, -0.0902],
          [-0.8667, -0.8431, -0.8275,  ..., -0.1059, -0.0667, -0.0745],
          ...,
          [-0.9451, -0.9451, -0.9451,  ...,  0.4196,  0.4118,  0.4353],
          [-0.9451, -0.9451, -0.9451,  ...,  0.4745,  0.4745,  0.4510],
          [-0.9451, -0.9451, -0.9451,  ...,  0.4902,  0.5137,  0.4510]],
 
         [[-0.8902, -0.8745, -0.8824,  ..., -0.0824, -0.0980, -0.1451],
          [-0.8980, -0.8824, -0.8745,  ..., -0.1373, -0.1137, -0.1373],
          [-0.9059, -0.8824, -0.8745,  ..., -0.1529, -0.1137, -0.1216],
          ...,
          [-0.9451, -0.9451, -0.9451,  ...,  0.3176,  0.3098,  0.3333],
          [-0.9451, -0.9451, -0.9451,  ...,  0.3647,  0.3647,  0.3412],
          [-0.9451, -0.9451, -0.9451,  ...,  0.3804,  0.4039,  0.3412]],
 
         [[-0.8431, -0.8275, -0.8275,  ..., -0.0745, -0.0902, -0.1451],
          [-

In [20]:
# Load the F1 metric; compute_metrics calls it with average='macro'.
metric = evaluate.load("f1")

In [21]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with macro-averaged F1.

    Args:
        eval_pred: pair of (model scores, reference labels); the scores are
            reduced to class ids with an argmax over axis 1.

    Returns:
        Whatever `metric.compute` returns for the predicted ids, the
        references and average='macro'.
    """
    scores, references = eval_pred
    # Highest-scoring class per row becomes the predicted label.
    predicted_ids = np.argmax(scores, axis=1)
    result = metric.compute(
        predictions=predicted_ids,
        references=references,
        average='macro',
    )
    return result

In [22]:
# Load checkpoint `m` with a 10-way classification head.
# ignore_mismatched_sizes=True lets loading continue when the checkpoint's
# classifier shape differs from num_labels; the head is then newly initialised
# and must be trained (see the warning in the cell output).
# NOTE(review): num_labels=10 mirrors the count printed by
# len(set(df.class_id)) above — keep the two in sync.
model = AutoModelForImageClassification.from_pretrained(
    m,
    num_labels=10,
    ignore_mismatched_sizes=True
)

config.json:   0%|          | 0.00/791 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/343M [00:00<?, ?B/s]

Invalid model-index. Not loading eval results into CardData.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at ritheshSree/animal-classifier and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([4, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([10]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Training configuration: evaluate and checkpoint once per epoch, then reload
# the best checkpoint (by the "f1" metric) when training finishes.
args = TrainingArguments(
    # Keep the raw "image_name"/"class_id" columns: the set_transform callback
    # needs them to build "pixel_values"/"labels" on the fly.
    remove_unused_columns=False,
    # `eval_strategy` is the current name of the deprecated
    # `evaluation_strategy` argument.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=64,
    #gradient_accumulation_steps=4,
    per_device_eval_batch_size=64,
    num_train_epochs=5,
    #warmup_ratio=0.1,
    #logging_steps=10,
    load_best_model_at_end=True,
    # NOTE(review): presumably "f1" is the key returned by compute_metrics.
    metric_for_best_model="f1",
    output_dir='save'
)



In [24]:
# Wire model, data splits, metric function and preprocessing together.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train,
    eval_dataset=val,
    # `tokenizer=` is deprecated on Trainer in recent transformers releases
    # (it emits a FutureWarning); `processing_class=` is the replacement and
    # accepts the image processor.
    processing_class=image_processor,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [25]:
# Run fine-tuning, then ask PyTorch to release its cached GPU memory.
# NOTE(review): the stored output ends in KeyboardInterrupt, i.e. the recorded
# run was stopped manually before completing.
trainer.train()
torch.cuda.empty_cache()

  0%|          | 0/1755 [00:00<?, ?it/s]

  context_layer = torch.nn.functional.scaled_dot_product_attention(


model.safetensors:   0%|          | 0.00/343M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
# Load the submission template (image_name and predicted_class columns, per
# the output below) and display it.
sample = pd.read_csv('sample_submission.csv')
sample

Unnamed: 0,image_name,predicted_class
0,cc27b9b56583a615fb8501e352402eb9.JPG,0
1,87872711fe672676fd34a97e997f9c47.JPG,0
2,424aa1aa8eb5bbdd07275f88077bc86c.JPG,0
3,c5537eaa60525efd7bad4a5560607e83.JPG,0
4,e9f15b67ca49453e281b2b4f245eac13.JPG,0
...,...,...
12953,028668e733cd17ec9b9f1c7e2c657b36.JPG,0
12954,eb1f1152941fdfdd50ff9954010e622a.JPG,0
12955,bfd2dde9f4a5753c9f85b2a93bee9c03.JPG,0
12956,2eaf9c794958a93bb9984441fd5d7f61.JPG,0


In [None]:
# Turn bare file names into paths relative to the notebook. An existing
# 'data/test/' prefix is stripped first, so re-running this cell does not
# produce 'data/test/data/test/...' paths.
sample['image_name'] = 'data/test/' + sample['image_name'].str.replace(r'^data/test/', '', regex=True)

In [None]:
# Build a Dataset from the template without the `predicted_class` column, so
# only the image paths remain.
samp = Dataset.from_pandas(sample.drop('predicted_class',axis=1))

In [None]:
# Cast the path column to the `Image` feature, mirroring the training data.
testset = samp.cast_column("image_name", Image())

In [None]:
# Show the test dataset summary (feature names and row count).
testset

Dataset({
    features: ['image_name'],
    num_rows: 12958
})

In [None]:
def load_test(example_batch):
    """Batch transform that turns raw test images into model inputs.

    Args:
        example_batch: dict of column lists containing "image_name" (image
            objects exposing `.convert("RGB")`).

    Returns:
        The same dict with "image_name" removed and "pixel_values" added
        (one tensor for the whole batch, as returned by `image_processor`).
    """
    # Force RGB so every image reaches the processor in the same colour mode.
    rgb_images = [image.convert("RGB") for image in example_batch["image_name"]]
    # One batched processor call replaces the per-image calls + torch.stack:
    # the processor already returns a tensor with a leading batch dimension.
    example_batch["pixel_values"] = image_processor(rgb_images, return_tensors="pt")["pixel_values"]

    del example_batch["image_name"]

    return example_batch

In [None]:
# Register load_test as the transform, so rows are returned as "pixel_values".
testset.set_transform(load_test)

In [None]:
# Show the summary again; per the output, the listed features are still only
# 'image_name' after set_transform.
testset

Dataset({
    features: ['image_name'],
    num_rows: 12958
})

In [None]:
# Predict a class id for every test image, in dataset order.
# eval() is required before inference: it switches layers such as BatchNorm
# and Dropout to inference behaviour. Without it a model left in training
# mode (e.g. after an interrupted training run) gives unreliable predictions,
# especially with a batch size of 1.
model.eval()
# Follow the model's own device instead of assuming 'cuda'.
device = model.device
lst = []
with torch.no_grad():
    for example in tqdm(testset):
        # unsqueeze(0) adds the batch dimension without hard-coding the
        # 3x224x224 image shape.
        pixel_values = example['pixel_values'].unsqueeze(0).to(device)
        predicted = model(pixel_values).logits.argmax(-1)
        lst.append(int(predicted.item()))
lst[:5]

  0%|          | 0/12958 [00:00<?, ?it/s]

[4, 5, 0, 1, 6]

In [None]:
# Attach the predictions; `lst` was filled by iterating `testset`, which was
# built from `sample`, so the row order is expected to line up.
sample['predicted_class'] = lst

In [None]:
# Restore the bare file names expected in the submission file. Stripping the
# literal 'data/test/' prefix (instead of slicing off the first 10 characters)
# makes the intent explicit and keeps the cell idempotent: a second run no
# longer eats into the file names themselves.
sample['image_name'] = sample['image_name'].str.replace(r'^data/test/', '', regex=True)

In [None]:
# Inspect the finished submission table.
sample

Unnamed: 0,image_name,predicted_class
0,cc27b9b56583a615fb8501e352402eb9.JPG,4
1,87872711fe672676fd34a97e997f9c47.JPG,5
2,424aa1aa8eb5bbdd07275f88077bc86c.JPG,0
3,c5537eaa60525efd7bad4a5560607e83.JPG,1
4,e9f15b67ca49453e281b2b4f245eac13.JPG,6
...,...,...
12953,028668e733cd17ec9b9f1c7e2c657b36.JPG,1
12954,eb1f1152941fdfdd50ff9954010e622a.JPG,4
12955,bfd2dde9f4a5753c9f85b2a93bee9c03.JPG,5
12956,2eaf9c794958a93bb9984441fd5d7f61.JPG,6


In [None]:
# Write the submission file; index=False keeps the pandas row index out of
# the CSV.
sample.to_csv('submission.csv',index=False)