In [1]:
%matplotlib inline
import json

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from datasets import load_dataset

import torch.optim as optim
from torch.utils.data import DataLoader, IterableDataset

import torch.nn.functional as F

# from torchvision import transforms
from sklearn.metrics import accuracy_score

from typing import Any, Tuple, List

from cv2 import Mat
from numpy import dtype, floating, integer, ndarray

from tqdm.autonotebook import tqdm

plt.rcParams["figure.figsize"] = (16, 10)  # (w, h)

In [2]:
with open("kaggle/input/annotations/iwildcam2020_train_annotations.json") as f:
    data = json.load(f)


annotations = pd.DataFrame.from_dict(data["annotations"])
images_metadata = pd.DataFrame.from_dict(data["images"])
categories = pd.DataFrame.from_dict(data["categories"])

In [3]:
# convert datetime type and split into day/night time
def split_day_night_time(
    data: pd.DataFrame, day_start: str = "06:00:00", day_end: str = "18:00:00"
) -> pd.DataFrame:
    data = data.copy()
    data["datetime"] = pd.to_datetime(data["datetime"])
    data["is_day"] = data["datetime"].apply(
        lambda x: True
        if pd.Timestamp(day_start).time() <= x.time() < pd.Timestamp(day_end).time()
        else False
    )
    return data

In [4]:
def preprocess_dark_images(
    image: np.ndarray,
) -> Mat | ndarray[Any, dtype[integer[Any] | floating[Any]]]:
    img = cv2.cvtColor(image, cv2.COLOR_RGB2LUV)
    img_eq = img.copy()
    img_eq[:, :, 0] = cv2.equalizeHist(img[:, :, 0])
    final_img = cv2.cvtColor(img_eq, cv2.COLOR_LUV2RGB)
    return final_img

In [13]:
from pathlib import Path

class iWildCam2020Dataset(IterableDataset):
    def __init__(
        self,
        dataset: str,
        metadata: pd.DataFrame,
        batch_size: int = 16,
        resize_dim: Tuple[int, int] | None = None,
        num_samples: int = 1000,
        mean: np.ndarray | None = None,
        std: np.ndarray | None = None,
        save_dir: str | None = None,
        overwrite: bool = False,
        split: str = "train",
        val_ratio: float = 0.2,
    ):
        super().__init__()
        self.metadata = metadata

        self.split = split
        self.val_ratio = val_ratio
        self.train_size = int((1 - val_ratio) * num_samples)
        self.val_size = num_samples - self.train_size

        self.dataset = dataset
        self.batch_size = batch_size
        self.resize_dim = resize_dim

        self.num_samples = num_samples
        if self.split == "train":
            self.num_batches = (self.train_size + batch_size - 1) // batch_size
        else:
            self.num_batches= (self.val_size + batch_size - 1) // batch_size

        self.mean = torch.tensor(mean if mean is not None else [0.0, 0.0, 0.0]).view(
            3, 1, 1
        )
        self.std = torch.tensor(std if std is not None else [1.0, 1.0, 1.0]).view(
            3, 1, 1
        )

        self.save_dir = Path(save_dir) if save_dir else None
        if self.save_dir:
            self.save_dir.mkdir(parents=True, exist_ok=True)
        self.overwrite = overwrite

    def save_image(self, img_tensor: torch.Tensor, idx: int):
        if self.save_dir:
            save_path = self.save_dir / f"image_{idx}.pt"
            torch.save(img_tensor, save_path)
    
    def load_image(self, idx: int) -> torch.Tensor | None:
        if self.save_dir:
            save_path = self.save_dir / f"image_{idx}.pt"
            if save_path.exists():
                return torch.load(save_path, weights_only=True)
        return None
    
    def __len__(self):
        return self.num_batches

    def __iter__(self):
        if self.split == "train":
            start_idx, end_idx = 0, self.train_size
        else:
            start_idx, end_idx = self.train_size, self.num_samples
        
        for idx, image_batch in enumerate(self.dataset.iter(self.batch_size)):
            # to get consistent part of dataset + val / train split
            batch_start = idx * self.batch_size
            if batch_start >= end_idx:
                break
            if batch_start < start_idx:
                continue
            
            is_day = self.metadata[idx * self.batch_size : (idx + 1) * self.batch_size][
                "is_day"
            ].values
            image_batch = image_batch["image"]
            imgs_ = []

            dark_idx = set(np.where(~is_day)[0].tolist())
            for i in range(len(image_batch)):
                img_tensor = self.load_image(idx * self.batch_size + i)
                if img_tensor is None:
                    img = np.transpose(image_batch[i].numpy())
                    if i in dark_idx:
                        img = preprocess_dark_images(img)
                    img = cv2.resize(img, self.resize_dim, interpolation=cv2.INTER_AREA)
                    img_tensor = (
                        torch.tensor(np.transpose(img, (2, 0, 1)), dtype=torch.float32)
                        / 255.0
                    )

                    if self.save_dir:
                        self.save_image(img_tensor, idx * self.batch_size + i)

                imgs_.append(img_tensor)
            yield torch.stack(imgs_)

In [5]:
def calculate_mean_std(dataset, batch_size=32, resize_dim=(224, 224), num_samples=1000):
    means = []
    stds = []
    for idx, image_batch in tqdm(enumerate(dataset.iter(batch_size)), total = ((num_samples + batch_size - 1) // batch_size)):
        if idx * batch_size >= num_samples:
            break

        imgs_ = []
        for image in image_batch["image"]:
            img = np.transpose(image.numpy(), (1, 2, 0))
            img = cv2.resize(img, resize_dim, interpolation=cv2.INTER_AREA)
            img = img / 255.0
            imgs_.append(img)

        imgs_array = np.stack(imgs_)
        means.append(imgs_array.mean(axis=(0, 1, 2)))
        stds.append(imgs_array.std(axis=(0, 1, 2)))

    mean = np.mean(means, axis=0)
    std = np.mean(stds, axis=0)
    return mean, std

In [23]:
def train(
    model,
    criterion,
    optimizer,
    train_loader,
    val_loader,
    batch_size,
    device,
    num_epochs=1,
    ckpt_path="best.pt"
):
    best = 0.0
    for epoch in range(num_epochs):
        train_loop = tqdm(
            enumerate(train_loader, 0),
            total=len(train_loader),
            desc=f"Epoch {epoch}: train",
        )

        model.train()
        train_loss = 0.0

        for i, batch in train_loop:
            images = batch.to(device)
            labels = torch.tensor(
                annotations["category_id"][
                    epoch * (len(train_loader) * batch_size) + batch_size * i : min(
                        epoch * (len(train_loader) * batch_size) + batch_size * (i + 1),
                        len(annotations["category_id"]),
                    )
                ].values
            ).to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs.to(device), labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_loop.set_postfix({"loss": loss.item()})

        correct = 0
        total = 0
        val_loss = 0
        with torch.no_grad():
            model.eval()

            val_loop = tqdm(
                enumerate(val_loader, 0),
                total=len(val_loader),
                desc=f"Val",
            )

            for i, batch in val_loop:
                images = batch.to(device)
                labels = torch.tensor(
                    annotations["category_id"][
                        epoch * len(train_loader) * batch_size
                        + batch_size * i : min(
                            epoch * len(train_loader) * batch_size
                            + batch_size * (i + 1),
                            len(annotations["category_id"]),
                        )
                    ].values
                ).to(device)

                outputs = model(images)
                _, predicted = torch.max(outputs, 1)

                loss = criterion(outputs.to(device), labels)
                val_loss += loss.item()

                total += labels.size(0)
                correct += (predicted == labels).sum().item()
                val_loop.set_postfix({"acc": correct / total, "loss": val_loss / (i + 1)})


            val_accuracy = correct / total

            print(f"Epoch [{epoch + 1}/{num_epochs}], Training Loss: {train_loss / len(train_loader):.6f}")
            print(f"Epoch [{epoch + 1}/{num_epochs}], Validation Accuracy: {val_accuracy:.6f}, Validation Loss: {val_loss:.6f}")

            if val_accuracy > best:
                torch.save(model.state_dict(), ckpt_path)
                best = correct / total

In [7]:
dataset = load_dataset(
    "anngrosha/iWildCam2020", split="train", streaming=True
).with_format("torch")

Resolving data files:   0%|          | 0/190 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/52 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/190 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/52 [00:00<?, ?it/s]

In [8]:
images_metadata = split_day_night_time(images_metadata)

Performing singe-batch overfitting to see if model capable enought for our task

In [10]:
batch_size = 5
img_size = 640
resize_dim = (img_size, img_size)
num_classes = max(annotations["category_id"])

num_samples = 5000
val_ratio = 0.1

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [11]:
mean, std = calculate_mean_std(
    dataset, batch_size=batch_size, resize_dim=resize_dim, num_samples=num_samples
)
mean, std

  0%|          | 0/1000 [00:00<?, ?it/s]

(array([0.38960105, 0.39468021, 0.36564374]),
 array([0.25637901, 0.25818906, 0.26048803]))

In [17]:
train_dataset = iWildCam2020Dataset(
    dataset=dataset,
    metadata=images_metadata,
    batch_size=batch_size,
    resize_dim=resize_dim,
    num_samples=num_samples,
    mean=mean,
    std=std,
    save_dir="working/data/train",
    split="train",
    val_ratio=val_ratio
)

val_dataset = iWildCam2020Dataset(
    dataset=dataset,
    metadata=images_metadata,
    batch_size=batch_size,
    resize_dim=resize_dim,
    num_samples=num_samples,
    mean=mean,
    std=std,
    save_dir="working/data/val",
    split="val",
    val_ratio=val_ratio
)


train_loader = DataLoader(train_dataset, batch_size=None)
val_loader = DataLoader(val_dataset, batch_size=None)

In [18]:
from IPython.display import clear_output

In [19]:
!pip install ultralytics
!pip install -U ipywidgets
clear_output()

This is baseline YOLO to compare our trained model to the SOTA model as YOLO (YOLO could be used for classification, so this mode is used). 

Basic issues:
1. YOLO trained on COCO dataset with general animal classes (we get them by hand down in the code + tried to make auto-mapping)
2. For now we will judge YOLO if it finds animals (general classes) in the images (same data chunks as other model(s)), so this is easier task (12 classes vs. 267 classes)

In [20]:
from ultralytics import YOLO

model = YOLO("yolo11n-cls.yaml").load("yolo11n-cls.pt").to(device)

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/Users/diazzz/Library/Application Support/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
YOLO11n-cls summary: 151 layers, 1,633,584 parameters, 1,633,584 gradients, 3.3 GFLOPs
Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n-cls.pt to 'yolo11n-cls.pt'...
⚠️ Download failure, retrying 1/3 https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n-cls.pt...


######################################################################## 100.0%


Transferred 234/236 items from pretrained weights


In [21]:
model = torch.load("yolo11n-cls.pt")["model"].to(device)
for param in model.parameters():
    param.requires_grad = True
model.float()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adadelta(model.parameters(), lr=1, rho=0.95, weight_decay=1e-5)

In [24]:
train(
    model,
    criterion,
    optimizer,
    train_loader,
    val_loader,
    batch_size,
    device,
    num_epochs=5
)

Epoch 0: train:   0%|          | 0/900 [00:00<?, ?it/s]

Val:   0%|          | 0/100 [00:00<?, ?it/s]

Epoch [1/5], Training Loss: 1.316656
Epoch [1/5], Validation Accuracy: 0.250000, Validation Loss: 682.500370


Epoch 1: train:   0%|          | 0/900 [00:00<?, ?it/s]

Val:   0%|          | 0/100 [00:00<?, ?it/s]

Epoch [2/5], Training Loss: 1.197736
Epoch [2/5], Validation Accuracy: 0.000000, Validation Loss: 690.946805


Epoch 2: train:   0%|          | 0/900 [00:00<?, ?it/s]

Val:   0%|          | 0/100 [00:00<?, ?it/s]

Epoch [3/5], Training Loss: 1.297397
Epoch [3/5], Validation Accuracy: 0.142000, Validation Loss: 676.848558


Epoch 3: train:   0%|          | 0/900 [00:00<?, ?it/s]

Val:   0%|          | 0/100 [00:00<?, ?it/s]

Epoch [4/5], Training Loss: 0.515178
Epoch [4/5], Validation Accuracy: 0.000000, Validation Loss: 690.676773


Epoch 4: train:   0%|          | 0/900 [00:00<?, ?it/s]

Val:   0%|          | 0/100 [00:00<?, ?it/s]

Epoch [5/5], Training Loss: 0.824677
Epoch [5/5], Validation Accuracy: 0.036000, Validation Loss: 688.079607


In [25]:
import wandb
num_epochs = 5

# Инициализация сессии W&B
wandb.init(project="yolov11n-training-v1", config={
    "learning_rate": 0.001,
    "epochs": 5,
    "batch_size": 5,
})

wandb.log({'epoch': 1, 'train_loss': 1.316656, 'val_loss': 682.500370, 'accuracy': 0.25})
wandb.log({'epoch': 2, 'train_loss': 1.197736, 'val_loss': 690.946805, 'accuracy': 0})
wandb.log({'epoch': 3, 'train_loss': 1.297397, 'val_loss': 676.848558, 'accuracy': 0.142})
wandb.log({'epoch': 4, 'train_loss': 0.515178, 'val_loss': 690.676773, 'accuracy': 0})
wandb.log({'epoch': 5, 'train_loss': 0.824677, 'val_loss': 688.079697, 'accuracy': 0.36})

wandb.finish()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mshinbayeva[0m ([33mshinbayeva-shinbayeva[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▆▁▄▁█
epoch,▁▃▅▆█
train_loss,█▇█▁▄
val_loss,▄█▁█▇

0,1
accuracy,0.36
epoch,5.0
train_loss,0.82468
val_loss,688.0797


In [26]:
iwildcam_class_names = categories["name"].values.tolist()

In [28]:
import yaml

def load_yolo_classes(yolo_yaml_path):
    with open(yolo_yaml_path, 'r') as f:
        data = yaml.safe_load(f)
    return data['names'] 

yolo_classes = load_yolo_classes("kaggle/input/yolo-coco-names/yolo_coco.yaml")

In [29]:
class_mapping = {}

for yolo_id, yolo_class in yolo_classes.items():
    if yolo_class in iwildcam_class_names:
        iwildcam_index = iwildcam_class_names.index(yolo_class)
        class_mapping[yolo_id] = iwildcam_index
        print(yolo_class)

print("Class Mapping:")
print(class_mapping)

empty
tayassu pecari
dasyprocta punctata
cuniculus paca
puma concolor
tapirus terrestris
pecari tajacu
mazama americana
leopardus pardalis
geotrygon montana
nasua nasua
dasypus novemcinctus
eira barbara
didelphis marsupialis
penelope jacquacu
procyon cancrivorus
aramides cajaneus
panthera onca
myrmecophaga tridactyla
tinamus major
crypturellus sp
sylvilagus brasiliensis
priodontes maximus
tamandua tetradactyla
tigrisoma lineatum
cochlearius cochlearius
puma yagouaroundi
leopardus wiedii
buteogallus urubitinga
mazama gouazoubira
philander opossum
capra aegagrus
bos taurus
ovis aries
canis lupus
lepus saxatilis
human
turtur calcospilos
papio anubis
unknown
genetta genetta
tragelaphus scriptus
equus africanus
herpestes sanguineus
loxodonta africana
cricetomys gambianus
raphicerus campestris
hyaena hyaena
aepyceros melampus
crocuta crocuta
caracal caracal
equus ferus
panthera leo
tragelaphus oryx
kobus ellipsiprymnus
phacochoerus africanus
panthera pardus
ichneumia albicauda
canis mesomela

In [None]:
coco_idx = [0, 3, 15, 16, 17, 18, 19, 20, 21, 22, 23]

In [None]:
def count_valid_classes(model, dataloader, coco_animal_idx, device):
    counts = {idx: 0 for idx in coco_animal_idx}
    correct = 0
    total = 0
    with torch.no_grad():
        main_loop = tqdm(
            enumerate(dataloader, 0),
            total=len(dataloader),
            desc=f"Evaluating",
        )
        
        for i, images in main_loop:
            print("Batch index:", i)
            images = images.to(device)
            results = model.predict(images, augment=False, save=False, verbose=False, device=device)
            
            for i, result in enumerate(results):
                probs = result.probs.data

                predicted_class = torch.argmax(probs).item()
                total += 1
                
                if predicted_class in coco_animal_idx:
                    counts[predicted_class] += 1
                    correct += 1
            
            main_loop.set_postfix({"acc": correct / total})

    return counts, correct / total

In [None]:
model_yolo = YOLO("yolo11n-cls.yaml").load("yolo11n-cls.pt").to(device)
valid_counts, acc = count_valid_classes(model_yolo, val_loader, coco_idx, device)
print(acc)