# Introduction

J'ai commencé par uploader le dataset sur [Hugging Face](https://huggingface.co/datasets/Alanox/stanford-dogs), à la fois pour apprendre à uploader un dataset et pour le partager facilement avec la communauté.

Testons que cela fonctionne bien.

In [None]:
import datasets

# Load the "full" split of the Stanford Dogs dataset published on Hugging Face.
dataset = datasets.load_dataset("Alanox/stanford-dogs", split="full")
# Bare expression: the notebook displays the dataset summary.
dataset

In [None]:
# Display the image of the first example.
dataset[0]["image"]

L'avantage est que le dataset entier n'est pas chargé ! On charge uniquement ce dont on a besoin

# Données

Regardons un peu les données.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Apply the built-in "ggplot" style, then a custom style sheet referenced by
# URL (raw file hosted on GitHub).
plt.style.use(['ggplot', 'https://raw.githubusercontent.com/AlanBlanchet/matplotlib_styles/master/vscode_blue.mplstyle'])

def add_shape(batch):
    """Record the image dimensions of an example under the ``"size"`` key.

    Reads ``batch["image"].size`` and stores it in ``batch["size"]``. The
    mapping is modified in place and returned.
    """
    image = batch["image"]
    batch["size"] = image.size
    return batch

# Add a "size" column via add_shape, then keep only the four listed columns
# (the image column itself is not selected).
ds = dataset.map(add_shape).select_columns(["name", "target", "annotations", "size"])

# Materialize the selected columns as a pandas DataFrame for exploration.
df = pd.DataFrame(ds.to_dict())
df.head()

In [None]:
# Number of rows in the exploration DataFrame.
len(df)

In [None]:
plt.figure(figsize=(24,9))
# Number of rows per target label (value_counts sorts by frequency, descending).
counts = df["target"].value_counts()

plt.title("Target distribution")
plt.bar(counts.index, counts)
# The trailing semicolon suppresses the notebook's echo of the return value.
plt.xticks(ha="right", rotation=45);

In [None]:
# One row per bounding box: explode the per-image box lists, then spread the
# four coordinates of each box into named columns.
_boxes = df["annotations"].explode().reset_index(drop=True)
df_annots = pd.DataFrame(
    _boxes.tolist(),
    columns=["xmin", "ymin", "xmax", "ymax"],
)
df_annots.head()

In [None]:
# Summary statistics of the four box coordinates.
df_annots.describe()

In [None]:
# 2x2 grid, one panel per box coordinate, all panels sharing both axes.
fig, axs = plt.subplots(2, 2, figsize=(16,9), sharex=True, sharey=True)
axs:list[plt.Axes] = np.array(axs).flatten()

fig.suptitle("Coordinate distributions")
# Walk the columns directly: each item is (column name, column values).
for ax, (name, coords) in zip(axs, df_annots.items()):
    ax.set_title(name)
    # Plot the values in ascending order to show the empirical distribution.
    ax.plot(coords.sort_values().values)

In [None]:
def pixel_area(box: pd.DataFrame):
    """Return the area of every bounding box found in ``box["annotations"]``.

    Each entry of the ``"annotations"`` column is a list of boxes; every box
    is indexed as ``[0]..[3]`` and its area is ``([2] - [0]) * ([3] - [1])``.
    The result has one value per box and keeps the (repeated) index of the
    row the box came from.
    """
    def _area(coords):
        width = coords[2] - coords[0]
        height = coords[3] - coords[1]
        return width * height

    per_box = box["annotations"].explode()
    return per_box.apply(_area)

In [None]:
# Box areas grouped by target: pixel_area is applied to each target's rows.
target_areas = df.groupby("target").apply(pixel_area)
# Mean box area for each target.
target_mean_areas = target_areas.groupby("target").apply(np.mean)
# Mean full-image area for each target: product of the two entries of "size".
target_mean_areas_full = df.groupby("target").apply(lambda x: np.mean(x["size"].apply(lambda y: y[0] * y[1])))
target_mean_areas.head(), target_mean_areas_full.head()

In [None]:
# Sort ascending so the bars grow from left to right; the name is rebound.
target_mean_areas = target_mean_areas.sort_values()

plt.figure(figsize=(24,9))

plt.title("Mean annotation box area per target")
plt.bar(target_mean_areas.index, target_mean_areas)
plt.ylabel("pixel**2")
# The trailing semicolon suppresses the notebook's echo of the return value.
plt.xticks(ha="right", rotation=45);

On remarque qu'il y a plus de pixels représentant un "Irish Water Spaniel" qu'un "English Foxhound".

In [None]:
# Keep the first row of every target. reset_index() moves the original row
# label into an "index" column before the frame is re-indexed by target.
first_targets = df.drop_duplicates(["target"], keep="first").reset_index().set_index("target")
first_targets.head()

In [None]:
# Original row label of the first example of each of the two breeds compared.
idx_english_foxhound = int(first_targets.loc["English Foxhound", "index"])
idx_irish_water_spaniel = int(first_targets.loc["Irish Water Spaniel", "index"])
idx_english_foxhound, idx_irish_water_spaniel

In [None]:
# First English Foxhound example: print its size, then display the image.
img = dataset[idx_english_foxhound]["image"]
print(img.size)
img

In [None]:
# First Irish Water Spaniel example: print its size, then display the image.
img = dataset[idx_irish_water_spaniel]["image"]
print(img.size)
img

Normalisons nos résultats pour obtenir un ratio par rapport à la taille des images. Certains chiens peuvent prendre beaucoup d'espace sur une image tandis que d'autres en prennent peut-être moins.

In [None]:
# Mean box area divided by mean full-image area, per target, sorted ascending.
target_mean_areas_ratio = (target_mean_areas / target_mean_areas_full).sort_values()

plt.figure(figsize=(24,9))

plt.title("Mean annotation box area per target normalized")
plt.bar(target_mean_areas_ratio.index, target_mean_areas_ratio)
plt.ylabel("Mean pixel ratio")
# The trailing semicolon suppresses the notebook's echo of the return value.
plt.xticks(ha="right", rotation=45);

In [None]:
# First row of every target, keyed by target name; the original row label is
# kept in the "index" column.
first_targets = (
    df.drop_duplicates(["target"], keep="first")
    .reset_index()
    .set_index("target")
)

idx_chesapeake_bay_retriever = int(first_targets.loc["Chesapeake Bay Retriever", "index"])
idx_irish_water_spaniel = int(first_targets.loc["Irish Water Spaniel", "index"])
idx_chesapeake_bay_retriever, idx_irish_water_spaniel

In [None]:
# First Chesapeake Bay Retriever example; only its size is displayed.
img = dataset[idx_chesapeake_bay_retriever]["image"]
img.size

# Data augmentation

In [None]:
import torchvision
import torchvision.transforms.functional as F
import torchvision.transforms.v2 as T

# Silence torchvision's warning about the beta transforms.v2 API.
torchvision.disable_beta_transforms_warning()

# First example, with columns returned in the "pytorch" format.
dog = dataset.with_format("pytorch")[0]
img = dog["image"]
img_name = dog["name"]
transforms = T.Compose([
    # assumes the image tensor is (H, W, C); moves channels first — TODO confirm
    lambda x: x.permute(2, 0, 1),
    # An int size resizes the smaller edge to 400, keeping the aspect ratio.
    T.Resize(400, antialias=True)
])
# Displayed as a PIL image by the notebook.
F.to_pil_image(transforms(img))

In [None]:
# Augmentations to preview. p=1 forces the random ones to always fire so that
# every panel shows the effect.
applies = [
    T.AugMix(),
    T.AutoAugment(),
    T.CenterCrop(200),
    T.ColorJitter(),
    T.ElasticTransform(50.0, 1.0),
    T.Grayscale(),
    T.GaussianBlur(5),
    T.Pad(30),
    T.RandomAdjustSharpness(2, p=1),
    T.RandomAutocontrast(p=1),
    # The crop size must be a single argument: the second positional
    # parameter of RandomCrop is `padding`, not the crop width.
    T.RandomCrop((200, 200)),
    T.RandomHorizontalFlip(p=1),
    T.RandomVerticalFlip(p=1),
    T.RandomInvert(p=1),
    T.RandomPerspective(p=1),
    T.RandomPhotometricDistort(p=1),
    T.RandomPosterize(4, p=1),
    T.RandomZoomOut(p=1),
    # The threshold is in pixel units (presumably 0-255 for the PIL image used
    # below); 0.5 would invert nearly every pixel, i.e. duplicate RandomInvert.
    T.RandomSolarize(128, p=1),
]

n = len(applies)

cols = 4
rows = -(-n // cols)  # ceil(n / cols)

# The resized base image is identical for every panel: build it once.
base_img = F.to_pil_image(transforms(img))

fig, axs = plt.subplots(rows, cols, sharex=True, sharey=True, figsize=(24, 26))
flat_axs = axs.flatten()
for ax, transform in zip(flat_axs, applies):
    ax.set_title(type(transform).__name__)
    ax.grid(False)
    ax.imshow(transform(base_img))

# Hide the grid cells left over when n is not a multiple of cols.
for ax in flat_axs[n:]:
    ax.axis("off")

# Bounding box

In [None]:
import cv2

# Annotations of the example loaded as `dog` (called "masks" in this notebook).
masks = dog["annotations"]
masks

### Border

In [None]:
# Draw on a copy so the original image tensor is left untouched.
img_annot = img.numpy().copy()

# Ratio of the first two image dimensions, used to scale the outline
# thickness (never thinner than 1 px). Loop-invariant, so computed once.
res = img.shape[0] / img.shape[1]
thickness = max(int(res * 3), 1)

for mask in masks:
    # Boxes are stored as corner coordinates (xmin, ymin, xmax, ymax) — the
    # same layout used for the area computation — not as (x, y, width,
    # height). Cast to plain ints, which OpenCV point arguments expect.
    xmin, ymin, xmax, ymax = (int(v) for v in mask.numpy())

    img_annot = cv2.rectangle(img_annot, (xmin, ymin), (xmax, ymax), (0,0,255), thickness=thickness)

F.to_pil_image(img_annot)

### Mask

In [None]:
# Work on a copy so the original image tensor is left untouched.
img_annot = img.numpy().copy()

# Paint every box, filled (thickness=-1), onto a single black overlay.
overlay = np.zeros([*img_annot.shape[:-1], 3], dtype=np.uint8)
for mask in masks:
    # Boxes are stored as corner coordinates (xmin, ymin, xmax, ymax), not as
    # (x, y, width, height). Cast to plain ints for OpenCV.
    xmin, ymin, xmax, ymax = (int(v) for v in mask.numpy())
    overlay = cv2.rectangle(overlay, (xmin, ymin), (xmax, ymax), (0,0,255), thickness=-1)

# Saturating add of the overlay: the third channel is pushed to its maximum
# inside every box, the rest of the image is unchanged.
img_annot = cv2.addWeighted(img_annot, 1, overlay, 1, 0)

print(img_annot.shape)
F.to_pil_image(img_annot)

# Normalisation

Some images don't have a normalized luminosity or contrast. The model could also be biased by the strong correlation between neighboring pixels.

A remedy for this is [PCA/ZCA Whitening](https://www-cs.stanford.edu/~acoates/papers/coatesng_nntot2012.pdf) that I will implement on an example for a dog image.

[Here](https://github.com/hadrienj/Preprocessing-for-deep-learning) is also a really good explanation of how ZCA works

In [None]:
import torch
import seaborn as sns
import warnings
# Ignore warnings whose message starts with these names (the second argument
# is a regular expression matched against the start of the warning message).
warnings.filterwarnings("ignore", "is_categorical_dtype")
warnings.filterwarnings("ignore", "use_inf_as_na")

def square_return_pt_PIL_from_PIL(img):
    """Resize a PIL image to a square and return it with its float tensor.

    The side of the square is the smaller entry of ``img.size``; the image is
    resized (not cropped) to ``(side, side)``.

    Returns:
        A pair ``(resized PIL image, F.pil_to_tensor(resized).float())``.
    """
    side = min(img.size)
    squared = img.resize((side, side))
    as_tensor = F.pil_to_tensor(squared).float()
    return squared, as_tensor


def plot_channels(row):
    """Plot one intensity histogram per colour channel of ``row["image"]``.

    The image is first squared with ``square_return_pt_PIL_from_PIL``. One
    panel is drawn per (channel plane, letter of ``img.mode``) pair, limited
    to the three axes of the figure.

    Args:
        row: mapping with an ``"image"`` entry (PIL image) and a ``"name"``
            entry used in the figure title.
    """
    img, img_t = square_return_pt_PIL_from_PIL(row["image"])

    # pil_to_tensor already returns channels first (C, H, W), so iterating the
    # tensor yields one plane per channel. Permuting here would make the loop
    # below walk over image slices instead of channels.
    img_data:torch.Tensor = img_t

    fig, axs = plt.subplots(1, 3, figsize=(21,7), sharey=True)
    axs:list[plt.Axes] = axs.flatten()

    fig.suptitle(f"Channels intensities count of {row['name']} {img.size}")
    # 257 edges -> 256 unit-wide bins, so that intensity 255 is counted too.
    bin_edges = range(257)
    # Colour intensities
    for chan, c, ax in zip(img_data, img.mode, axs):

        intensities = chan.flatten().numpy()

        ax.set_title(c)

        # The lower-cased mode letter doubles as the bar colour; this
        # presumably only makes sense for RGB images ("r", "g", "b").
        sns.histplot(intensities, ax=ax, color=c.lower(), bins=bin_edges)

# Channel histograms for the first example of the dataset.
plot_channels(dataset[0])

We need to compute the whitening per pixel, so we need to iterate through all the images.

The images all need to be of same size. We will save the values of the transformation for the specific size to apply it during training.

We will be using a sample of the data since our dataset is large.

First we resize the images

In [None]:
def resize(size, col_name):
    """Build a batched callback that adds square-resized copies of the images.

    The returned function reads ``batch["image"]``, resizes every image to
    ``(size, size)`` and stores the resulting list under ``batch[col_name]``.
    The batch is modified in place and returned.
    """
    target = (size, size)

    def resize_col(batch):
        pictures = batch["image"]
        batch[col_name] = [picture.resize(target) for picture in pictures]
        return batch

    return resize_col

# Side length, in pixels, of the resized images.
size = 32
# Name of the new column holding the resized images ("s32" for size 32).
col_size = f"s{size}"
# Keep images as arrow (on disk)
# Batched map over 8 worker processes; `dataset` is rebound to the result.
dataset = dataset.map(resize(size, col_size), batched=True, num_proc=8)
dataset

We get the mean per-pixel, covariance matrix and calculate the SVD.

Computing the SVD of a large covariance matrix is expensive in both memory and computation. We therefore only whiten images no larger than 32x32x3, which keeps this step tractable.

In [None]:
np.random.seed(0)
# Draw distinct rows: sampling with replacement would put duplicate images
# into the subset used to estimate the covariance. The sample is capped at the
# dataset length so the draw cannot fail on a smaller dataset.
n_samples = min(5000, len(dataset))
rand_idx = np.random.choice(len(dataset), size=n_samples, replace=False)
subset = dataset.select(rand_idx)

# Scale the pixel values (assumes 8-bit images, i.e. values in 0-255).
imgs = subset.with_format("pt")[col_size] / 255
# One row per image, one column per pixel component.
imgs_f = imgs.flatten(start_dim=1)
imgs_mean = imgs_f.mean(axis=0)
print("Mean/Cov...")
# NOTE(review): flatten() returns a view when it can, so this in-place
# subtraction presumably centres `imgs` as well — later cells that reuse
# `imgs` rely on that; confirm before changing it to an out-of-place op.
imgs_f -= imgs_mean
# cov() treats rows as variables, hence the transpose: the result is the
# feature-by-feature covariance matrix.
imgs_cov = imgs_f.T.cov()
# Vector decomposition
# NOTE: torch.svd is deprecated in favour of torch.linalg.svd.
print("SVD...")
U,S,V = torch.svd(imgs_cov)
e = 0.1

In [None]:
# Regularisation added to the singular values so that the inverse square root
# below stays bounded for near-zero components.
e = 0.1

# ZCA whitening matrix U · diag(1 / sqrt(S + e)) · Uᵀ, built entirely with
# torch so that it is a tensor like the data it is later multiplied with.
ZCA_mat = (U @ torch.diag(1.0 / torch.sqrt(S + e))) @ U.T
print("ZCA shape = ", ZCA_mat.shape)

def zca(imgs: torch.Tensor):
    """Apply the global ``ZCA_mat`` to a batch of images and rescale to [0, 1].

    Every image is flattened to one row, multiplied by ``ZCA_mat``, then the
    whole batch is min-max normalized (one global minimum and maximum) and
    reshaped back to the input shape.

    Args:
        imgs: tensor whose first dimension indexes the images; the remaining
            dimensions must flatten to the size ``ZCA_mat`` was built for.
            No mean is subtracted here — callers pass already-centred data if
            they need it.

    Returns:
        A tensor with the same shape as ``imgs``.
    """
    shape = imgs.shape
    flat = imgs.flatten(start_dim=1)
    whitened = (ZCA_mat @ flat.T).T
    # Global extrema, computed once each.
    lowest, highest = whitened.min(), whitened.max()
    whitened = (whitened - lowest) / (highest - lowest)
    # reshape (rather than view) also works if the layout is not viewable.
    return whitened.reshape(shape)

# Whiten the sampled images.
imgs_zca = zca(imgs)
print(imgs_zca.shape)
# Move the last axis to position 1 (channels first, assuming the batch is
# laid out as (N, H, W, C) — TODO confirm).
imgs_zca = imgs_zca.permute(0, 3, 1, 2)

In [None]:
num_shown = 8
num_offset = 8

# Top row: whitened images; bottom row: the matching images of the subset.
whitened_imgs = [
    F.to_pil_image(i).resize((size, size))
    for i in imgs_zca[num_offset:num_offset+num_shown]
]
original_imgs = subset.select(np.arange(num_offset, num_offset+num_shown))[col_size]

fig, axs = plt.subplots(2, num_shown, figsize=(16,4))

# The loop variable is not called `img`, so the global `img` is not clobbered.
for row_axs, row_imgs in zip(axs, (whitened_imgs, original_imgs)):
    for ax, shown_img in zip(row_axs, row_imgs):
        ax.imshow(shown_img)
        ax.axis("off")
        ax.grid(False)

# Applied to this figure once it is populated; calling plt.tight_layout()
# before the figure exists has no effect on it.
fig.tight_layout()

# Supervised

Here we randomly choose a few classes and run UMAP on their images.

In [None]:
targets = df["target"].unique()
num_classes = 5

np.random.seed(7)
# replace=False guarantees distinct classes; drawing with replacement could
# return the same class twice and silently yield fewer than num_classes.
rand_classes = np.random.choice(
    len(targets),
    size=min(num_classes, len(targets)),
    replace=False,
)
select_targets = targets[rand_classes]
# Row labels of every example belonging to one of the selected classes.
selected_idx = df.loc[df["target"].isin(select_targets)].index

print("Classes = ", select_targets)

mapped = dataset.select(selected_idx).with_format("pt")

In [None]:
from sklearn.preprocessing import StandardScaler

# Flatten every resized image of the selected classes into one feature row.
flats = mapped[col_size].flatten(start_dim=1)

print(flats.shape)
# Standardize each feature; `scaled` is the input of the UMAP cell.
scaled = StandardScaler().fit_transform(flats)
print(scaled.shape)

In [None]:
import umap
import seaborn as sns

# Fixed random_state so the embedding is reproducible between runs.
reducer = umap.UMAP(n_epochs=100, random_state=0)
# Project the standardized pixels; the first two columns are plotted below.
dd = reducer.fit_transform(scaled)
plt.figure(figsize=(16,9))
# One point per image, coloured by its target class.
sns.scatterplot(x=dd[:,0],y=dd[:,1], hue=mapped["target"])

With ZCA

In [None]:
# NOTE(review): unlike the data the ZCA matrix was fitted on, the input here
# is neither divided by 255 nor mean-centred. The min-max step inside zca and
# the StandardScaler below presumably absorb that difference — confirm.
mapped_zca = zca(mapped[col_size].float())

# `scaled` is rebound to the standardized, whitened features.
scaled = StandardScaler().fit_transform(mapped_zca.flatten(start_dim=1))

# Same UMAP settings as for the raw pixels, for a like-for-like comparison.
reducer = umap.UMAP(n_epochs=100, random_state=0)
dd = reducer.fit_transform(scaled)
plt.figure(figsize=(16,9))
sns.scatterplot(x=dd[:,0],y=dd[:,1], hue=mapped["target"])