This notebook is meant to run in Google Colab.

# Installations

In [None]:
!pip install transformers -q
!pip install datasets -q
!pip install transformers[torch] -q
!pip install kornia -q

# Downloading the dataset from Kaggle

In [None]:
from google.colab import files
files.upload()

!rm -r ~/.kaggle
!mkdir ~/.kaggle
!mv ./kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c ukraine-ml-bootcamp-2023
!unzip -q ./ukraine-ml-bootcamp-2023.zip  -d ./dataset/

# Logging into Huggingface

In [None]:
# Interactive login to the Hugging Face Hub. The cells below push a private
# dataset and private model checkpoints, which require an authenticated session.
!huggingface-cli login

# Generating a huggingface dataset

In [None]:
import pandas as pd

# Source label table shipped with the competition, and the location where the
# renamed copy is written (inside the training image folder).
TRAIN_CSV_PATH = '/content/dataset/train.csv'
METADATA_CSV_PATH = '/content/dataset/images/train_images/metadata.csv'

# `header=0` together with `names=` replaces the file's own header row with the
# two column names used by the rest of the notebook.
csv = pd.read_csv(TRAIN_CSV_PATH, names=['file_name', 'label'], header=0)
csv.to_csv(METADATA_CSV_PATH, index=False)

In [None]:
from datasets import load_dataset

# Build the training set from the local image folder (the folder that now also
# holds `metadata.csv`). Asking for the "train" split yields a single dataset
# object that supports `len()` and integer indexing.
dataset = load_dataset(
    "imagefolder",
    split="train",
    data_dir="/content/dataset/images/train_images",
)

# Sanity check: show the first example and the number of examples.
(dataset[0], len(dataset))

# (Optional) Saving the dataset to hub and loading it

In [None]:
# Upload the dataset built above to a private Hub repository so later sessions
# can load it from the Hub instead of re-downloading it from Kaggle.
dataset.push_to_hub("dariia-artemova/yoga", private=True)

In [None]:
from datasets import load_dataset

# Without `split`, `load_dataset` returns a `DatasetDict` keyed by split name.
# `len(dataset)` would then count splits rather than images (breaking
# `steps_per_epoch`), and the `Trainer` would not receive a plain dataset.
# Requesting the "train" split keeps `dataset` the same kind of object as the
# one built from the local image folder.
dataset = load_dataset("dariia-artemova/yoga", split="train")

# Training

In [None]:
import torch
#import numpy as np
#from torch import nn
#from torch.utils.data import DataLoader
from torchvision.transforms import v2, InterpolationMode
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer
from kornia.augmentation import RandomCutMixV2

# CutMix augmentation used by `train_collator` on every training batch.
# `data_keys` makes the call take (images, class labels); `p=1` applies the
# augmentation with probability 1.
# NOTE(review): `cut_size=(0.7, 0.85)` presumably bounds the relative size of
# the pasted patch — confirm against the kornia documentation.
cutmix = RandomCutMixV2(cut_size=(0.7, 0.85), data_keys=['input', 'class'], p=1)

def train_collator(examples, num_classes=6):
  """Collate transformed examples into a CutMix-augmented batch with soft labels.

  Args:
    examples: list of dicts, each holding a 'pixel_values' image tensor and an
      integer 'label'.
    num_classes: width of the soft-label vectors. Defaults to the 6 classes
      used throughout this notebook.

  Returns:
    Dict with 'pixel_values' (the mixed image batch returned by `cutmix`) and
    'labels', a float32 tensor of shape (batch, num_classes).
  """
  pixel_values = torch.stack([example['pixel_values'] for example in examples])
  labels = torch.tensor([example['label'] for example in examples])
  pixel_values, mix_info = cutmix(pixel_values, labels)

  # One row per image: (first label, second label, weight of the second label).
  # `reshape` replaces a bare `squeeze()`, which also removed the batch axis
  # when the batch held a single example (e.g. a trailing partial batch) and
  # then failed while indexing a 0-d tensor. Relies on `cutmix` producing one
  # mix per image, as configured.
  mix_info = mix_info.reshape(-1, 3)
  first = mix_info[:, 0].long()
  second = mix_info[:, 1].long()
  lam = mix_info[:, 2].to(torch.float32)

  batch_size = mix_info.shape[0]
  rows = torch.arange(batch_size)
  soft_labels = torch.zeros([batch_size, num_classes], dtype=torch.float32)
  soft_labels[rows, first] = 1 - lam
  soft_labels[rows, second] = lam
  # When both labels coincide the image keeps a plain one-hot target.
  same = first == second
  soft_labels[rows[same], first[same]] = 1
  return {'pixel_values' : pixel_values, 'labels' : soft_labels}

# `v2.ToImageTensor` only exists in the early beta of the torchvision v2
# transforms API; later releases expose the same conversion as `v2.ToImage`.
# Use whichever the installed torchvision provides so the cell runs on both.
_to_image_tensor = v2.ToImage if hasattr(v2, 'ToImage') else v2.ToImageTensor

# Training-time augmentation: geometric and colour jitter first, then
# conversion to a float32 tensor.
train_transforms = v2.Compose([
    # Random crop covering 40-100% of the image, close to square, resized to
    # the 384x384 input resolution.
    v2.RandomResizedCrop(size=(384, 384),
                         scale=(0.4, 1),
                         ratio=(0.9, 1.1),
                         antialias=True,
                         interpolation=InterpolationMode.BICUBIC),
    v2.ColorJitter(brightness=0.3,
                   contrast=0.3,
                   saturation=(0.6, 1.2),
                   hue=0.02),
    v2.RandomGrayscale(p=0.05),
    v2.RandomHorizontalFlip(p=0.5),
    _to_image_tensor(),
    v2.ConvertImageDtype(dtype=torch.float32)
    ])

def transform_train(examples):
  """Add augmented 'pixel_values' to a batch of examples.

  Every entry of `examples['image']` is converted to RGB and passed through
  `train_transforms`; the results are stored under 'pixel_values' in the same
  dict, which is then returned.
  """
  augmented = []
  for image in examples['image']:
    rgb_image = image.convert('RGB')
    augmented.append(train_transforms(rgb_image))
  examples['pixel_values'] = augmented
  return examples

# Register the augmentation so it is applied when examples are read from the dataset.
dataset.set_transform(transform_train)

# Number of whole batches of 16 per pass over the data; the warm-up and
# checkpoint intervals of the training runs below are expressed in this unit.
steps_per_epoch = len(dataset) // 16

## Model 1

In [None]:
# First ensemble member: load the CvT-21 checkpoint with a 6-way classification head.
# NOTE(review): `ignore_mismatched_sizes=True` is presumably needed because the
# checkpoint's own head has a different number of outputs than `num_labels=6` — confirm.
model1 = AutoModelForImageClassification.from_pretrained(
    'microsoft/cvt-21-384-22k',
    num_labels=6,
    ignore_mismatched_sizes=True
    )

In [None]:
# Training run for model 1.
training_args = TrainingArguments(
    # Output location and Hub upload
    output_dir='yoga-competition-cvt-1',
    overwrite_output_dir=True,
    push_to_hub=True,
    hub_strategy='all_checkpoints',
    hub_private_repo=True,
    # Optimisation schedule
    num_train_epochs=100,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.05,
    lr_scheduler_type='cosine',
    warmup_steps=5 * steps_per_epoch,
    # Checkpoint every 5 epochs' worth of steps, keeping at most two
    save_strategy='steps',
    save_steps=5 * steps_per_epoch,
    save_total_limit=2,
    # Data loading and logging; the on-the-fly transform reads the 'image'
    # column, so unused columns are kept
    remove_unused_columns=False,
    dataloader_num_workers=2,
    logging_strategy='epoch',
    report_to='tensorboard',
    )

trainer = Trainer(
    args=training_args,
    model=model1,
    data_collator=train_collator,
    train_dataset=dataset,
    )

trainer.train()

## Model 2

In [None]:
# Second ensemble member: a fresh copy of the same CvT-21 checkpoint with a 6-way head.
# NOTE(review): `ignore_mismatched_sizes=True` is presumably needed because the
# checkpoint's own head has a different number of outputs than `num_labels=6` — confirm.
model2 = AutoModelForImageClassification.from_pretrained(
    'microsoft/cvt-21-384-22k',
    num_labels=6,
    ignore_mismatched_sizes=True
    )

In [None]:
# Training run for model 2: the settings of the model 1 run, written to a
# separate output directory.
training_args = TrainingArguments(
    # Output location and Hub upload
    output_dir='yoga-competition-cvt-2',
    overwrite_output_dir=True,
    push_to_hub=True,
    hub_strategy='all_checkpoints',
    hub_private_repo=True,
    # Optimisation schedule
    num_train_epochs=100,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.05,
    lr_scheduler_type='cosine',
    warmup_steps=5 * steps_per_epoch,
    # Checkpoint every 5 epochs' worth of steps, keeping at most two
    save_strategy='steps',
    save_steps=5 * steps_per_epoch,
    save_total_limit=2,
    # Data loading and logging; the on-the-fly transform reads the 'image'
    # column, so unused columns are kept
    remove_unused_columns=False,
    dataloader_num_workers=2,
    logging_strategy='epoch',
    report_to='tensorboard',
    )

trainer = Trainer(
    args=training_args,
    model=model2,
    data_collator=train_collator,
    train_dataset=dataset,
    )

trainer.train()

## Model 3

In [None]:
# Third ensemble member: a fresh copy of the same CvT-21 checkpoint with a 6-way head.
# NOTE(review): `ignore_mismatched_sizes=True` is presumably needed because the
# checkpoint's own head has a different number of outputs than `num_labels=6` — confirm.
model3 = AutoModelForImageClassification.from_pretrained(
    'microsoft/cvt-21-384-22k',
    num_labels=6,
    ignore_mismatched_sizes=True
    )

In [None]:
# Training run for model 3: same recipe as before, but trained for 180 epochs.
training_args = TrainingArguments(
    # Output location and Hub upload
    output_dir='yoga-competition-cvt-3',
    overwrite_output_dir=True,
    push_to_hub=True,
    hub_strategy='all_checkpoints',
    hub_private_repo=True,
    # Optimisation schedule
    num_train_epochs=180,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.05,
    lr_scheduler_type='cosine',
    warmup_steps=5 * steps_per_epoch,
    # Checkpoint every 5 epochs' worth of steps, keeping at most two
    save_strategy='steps',
    save_steps=5 * steps_per_epoch,
    save_total_limit=2,
    # Data loading and logging; the on-the-fly transform reads the 'image'
    # column, so unused columns are kept
    remove_unused_columns=False,
    dataloader_num_workers=2,
    logging_strategy='epoch',
    report_to='tensorboard',
    )

trainer = Trainer(
    args=training_args,
    model=model3,
    data_collator=train_collator,
    train_dataset=dataset,
    )

trainer.train()

## Model 4 (only used in 1 of the 2 selected submissions)

In [None]:
# Fourth ensemble member: a fresh copy of the same CvT-21 checkpoint with a 6-way head.
# NOTE(review): `ignore_mismatched_sizes=True` is presumably needed because the
# checkpoint's own head has a different number of outputs than `num_labels=6` — confirm.
model4 = AutoModelForImageClassification.from_pretrained(
    'microsoft/cvt-21-384-22k',
    num_labels=6,
    ignore_mismatched_sizes=True
    )

In [None]:
# Training run for model 4: same recipe as model 1 (100 epochs) but without
# weight decay.
training_args = TrainingArguments(
    # Own output directory / Hub repo. Reusing 'yoga-competition-cvt-3' with
    # `overwrite_output_dir=True` would clobber the model 3 run, and the loading
    # cell further down expects a 'yoga-competition-cvt-4' repository.
    output_dir='yoga-competition-cvt-4',
    num_train_epochs=100,
    remove_unused_columns=False,
    per_device_train_batch_size=16,
    logging_strategy='epoch',
    dataloader_num_workers=2,
    overwrite_output_dir=True,
    report_to='tensorboard',
    weight_decay=0,
    learning_rate=0.00002,
    lr_scheduler_type='cosine',
    warmup_steps=steps_per_epoch*5,
    save_strategy='steps',
    save_steps=steps_per_epoch*5,
    save_total_limit=2,
    push_to_hub=True,
    hub_strategy='all_checkpoints',
    hub_private_repo=True
    )

trainer = Trainer(
    # Train the freshly loaded model4, not the already trained model3.
    model=model4,
    args=training_args,
    train_dataset=dataset,
    data_collator=train_collator
    )

trainer.train()

# (Optional) Loading your trained models from the hub

In [None]:
from transformers import AutoModelForImageClassification
def _load_trained(repo_name):
  """Load one fine-tuned classifier from the 'dariia-artemova' Hub namespace."""
  return AutoModelForImageClassification.from_pretrained(f'dariia-artemova/{repo_name}')

# One fine-tuned checkpoint per training run above.
model1 = _load_trained('yoga-competition-cvt-1')
model2 = _load_trained('yoga-competition-cvt-2')
model3 = _load_trained('yoga-competition-cvt-3')
model4 = _load_trained('yoga-competition-cvt-4')

# Generating submission.csv

In [None]:
import torch
# Run inference on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
ensemble_members = (model1, model2, model3, model4)

# Move every model to the inference device first...
for member in ensemble_members:
  member.to(device)

# ...then put each of them into evaluation mode.
for member in ensemble_members:
  member.eval()

# Trailing expression: the cell echoes the last model, as `model4.eval()` did.
model4

In [None]:
# Folder holding the competition's test images. The trailing slash matters:
# `YogaDataset.__getitem__` builds file paths as `img_dir + img_id`.
test_dir = '/content/dataset/images/test_images/'

In [None]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.io import read_image, ImageReadMode
from torchvision.transforms.v2 import functional as F
from torchvision.transforms import InterpolationMode


class YogaDataset(Dataset):
  """Test-time dataset that serves every file of a directory as an image.

  Each item is a dict with:
    'img_id': the file name inside `img_dir`,
    'img': the RGB image after `F.resize(..., size=384)` (bicubic, antialiased),
    'center crop': a 384x384 center crop of that resized image.

  Args:
    img_dir: directory containing the image files, with or without a trailing
      path separator.
  """

  def __init__(self, img_dir):
    self.img_dir = img_dir
    # Sorted so that the index -> file mapping does not depend on the order in
    # which the file system happens to list the directory.
    self.img_ids = sorted(os.listdir(img_dir))

  def __len__(self):
    """Number of files found in `img_dir`."""
    return len(self.img_ids)

  def __getitem__(self, idx):
    """Read, resize and center-crop the image at position `idx`."""
    img_id = self.img_ids[idx]
    # `os.path.join` instead of string concatenation: the path is correct
    # whether or not `img_dir` ends with a separator.
    img_path = os.path.join(self.img_dir, img_id)
    img = read_image(img_path, mode=ImageReadMode.RGB)
    img = F.resize(img, size=384, antialias=True,
                   interpolation=InterpolationMode.BICUBIC)
    center_crop = F.center_crop(img, output_size=384)
    return {'img_id': img_id, 'img': img, 'center crop' : center_crop}

def collate(element):
  """Turn one dataset sample into two test-time-augmentation mini-batches.

  Both the resized image and its center crop are paired with their horizontal
  mirror and converted to float32.

  Args:
    element: dict with 'img_id', 'img' and 'center crop'.

  Returns:
    Dict with 'img_id' (unchanged), 'imgs' and 'center crops'; each of the two
    tensors stacks (image, mirrored image) along a new leading dimension.
  """
  def with_mirror(image):
    # Flip in the source dtype first, then convert both views to float32.
    mirrored = F.convert_image_dtype(F.hflip(image), dtype=torch.float32)
    upright = F.convert_image_dtype(image, dtype=torch.float32)
    return torch.stack((upright, mirrored), dim=0)

  return {
      'img_id' : element['img_id'],
      'imgs' : with_mirror(element['img']),
      'center crops' : with_mirror(element['center crop']),
      }

test_data = YogaDataset(test_dir)

# `batch_size=None` turns off automatic batching, so `collate` receives one
# sample at a time and builds the small augmentation batches itself.
# Shuffling brings nothing at inference time and only made the row order of the
# submission differ from run to run, so the loader iterates in dataset order.
test_loader = DataLoader(test_data,
                         collate_fn=collate,
                         shuffle=False,
                         batch_size=None,
                         num_workers=2)

In [None]:
# Three-model ensemble: use this to generate 1 of my 2 submissions...
# The next cell reassigns `models`; skip it to keep this selection.
models = [model1, model2, model3]

In [None]:
# ...and this four-model ensemble to generate the other one.
# Running this cell replaces whatever `models` was set to before.
models = [model1, model2, model3, model4]

In [None]:
from tqdm.auto import tqdm


def _ensemble_probs(views):
  """Softmax probabilities of every model in `models` for `views`, concatenated along dim 0."""
  return torch.cat([model(views).logits.softmax(dim=-1) for model in models],
                   dim=0)


img_ids = []
labels = []

for batch in tqdm(test_loader):
  img_ids.append(batch['img_id'])
  with torch.no_grad():
    imgs = batch['imgs']
    predictions = [_ensemble_probs(imgs.to(device))]

    # Non-square images additionally get predictions for their center crops.
    # Height and width are read from the tensor's last two dimensions, which
    # avoids depending on `F.get_spatial_size`, a helper of the beta v2 API.
    # NOTE(review): that helper appears to have been renamed `get_size` in
    # later torchvision releases — confirm.
    height, width = imgs.shape[-2:]
    if height != width:
      predictions.append(_ensemble_probs(batch['center crops'].to(device)))

    # Aggregation: average all probability rows (every model x every view),
    # then pick the most likely class.
    final_prediction = torch.cat(predictions, dim=0).mean(dim=0)
    labels.append(final_prediction.argmax().item())

In [None]:
import pandas as pd

# One row per test image: its file name and the predicted class index.
columns = dict(image_id=img_ids, class_6=labels)

submission = pd.DataFrame(data=columns)
submission.to_csv('submission.csv', index=False)

# Trailing expression so the cell renders the table.
submission