In [1]:
from radiocovid.core import RadioCovidDataset, RadioCovidDataModule
from radiocovid.core.data import RadioCovidSubset
from torch.utils.data import DataLoader, WeightedRandomSampler
import torch
from hashlib import sha256
from pathlib import Path
from radiocovid.core.utils import (
    seed_worker,
    get_seeded_generator,
    worker_balanced_n_samples,
)
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
import numpy as np
from collections import defaultdict, Counter
import hydra
from hydra import initialize, compose
import torch.nn.functional as F

In [2]:
initialize(
    version_base=None,
    config_path="../radiocovid-core/src/radiocovid/core/configs",
    job_name="test_app",
)
cfg = compose(config_name="train")

In [3]:
cfg.datamodule.dataset.root = "../data/02_input"

In [5]:
datamodule = hydra.utils.instantiate(cfg.datamodule)

In [6]:
dataset = hydra.utils.instantiate(cfg.datamodule.dataset)
train_transform = hydra.utils.instantiate(cfg.datamodule.train_transform)
eval_transform = hydra.utils.instantiate(cfg.datamodule.eval_transform)
eval_loader = hydra.utils.instantiate(cfg.datamodule.eval_loader)
train_loader = hydra.utils.instantiate(cfg.datamodule.train_loader)
class_retriever = hydra.utils.instantiate(cfg.datamodule.class_retriever)

In [7]:
fit_idx, test_idx = train_test_split(
    range(len(dataset)),
    test_size=datamodule.test_set,
    stratify=dataset.targets,
    random_state=cfg.seed,
)

train_idx, val_idx = train_test_split(
    fit_idx,
    test_size=datamodule.val_size,
    stratify=np.array(dataset.targets)[np.setdiff1d(range(len(dataset)), test_idx)],
    random_state=cfg.seed,
)

In [8]:
val_set = RadioCovidSubset(dataset, indices=val_idx, transform=eval_transform)

In [9]:
if class_retriever:
    targets = []
    meta_labels = []
    meta_set = defaultdict(lambda: set())
    for c, t in dataset.samples:
        meta_labels.append(class_retriever(path=c))
        meta_set[t].add(Path(c).stem.split("-")[0])
        targets.append(t)
else:
    targets = dataset.targets

if train_idx is None:
    train_set = RadioCovidSubset(
        dataset, indices=range(len(dataset)), transform=train_transform
    )
    counter = Counter(meta_labels if class_retriever else targets)
else:
    targets = np.array(targets)[train_idx].tolist()
    meta_labels = np.array(meta_labels)[train_idx].tolist()
    train_set = RadioCovidSubset(dataset, indices=train_idx, transform=train_transform)
    counter = Counter(meta_labels if class_retriever else targets)

class_weights = {c: 1.0 / n for c, n in counter.items()}

if class_retriever:
    meta_sample_weights = [class_weights[int(t)] for t in meta_labels]
    sample_weights = torch.tensor(
        [
            (1 / len(meta_set[t])) * meta_sample_weights[i]
            for i, t in enumerate(targets)
        ],
        dtype=torch.double,
    )
else:
    sample_weights = torch.tensor(
        [class_weights[int(t)] for t in targets], dtype=torch.double
    )

In [10]:
test_set = RadioCovidSubset(dataset, indices=test_idx, transform=eval_transform)

In [11]:
Counter(sample_weights.numpy().tolist())

Counter({0.00019201228878648233: 5208,
         0.00010410160316468873: 3202,
         0.00018185124568103288: 1833,
         0.0005035246727089627: 662})

In [12]:
print(f"Train set size : {len(train_set)}")
if sample_weights is None:
    sample_weights = torch.ones(len(train_set))  # type: ignore[arg-type]
n = worker_balanced_n_samples(len(train_set), train_loader.keywords["batch_size"], 1)  # type: ignore[arg-type]
print(f"Train set size after padding : {n}")
sampler = WeightedRandomSampler(
    weights=sample_weights,
    num_samples=n,
    replacement=True,
    generator=get_seeded_generator(datamodule.seed),
)
tloader = train_loader(
    dataset=train_set,
    sampler=sampler,
    worker_init_fn=seed_worker,
    generator=get_seeded_generator(datamodule.seed),
)

Train set size : 10905
Train set size after padding : 10925


In [13]:
paths = [path for path, _ in dataset.samples]
len(paths)

18177

In [14]:
d = {
    int(sha256(path.encode()).hexdigest()[:8], 16): Path(path).stem.split("-")[0]
    for path in paths
}

In [15]:
# Dataset d'origine avant nettoyage
Counter(d.values())

Counter({'Normal': 8620,
         'Lung_Opacity': 5409,
         'COVID': 3069,
         'Viral Pneumonia': 1079})

In [16]:
s = [d[train_set[i]["id"]] for i in tqdm(range(len(train_set)))]

  0%|          | 0/10905 [00:00<?, ?it/s]

In [19]:
# Composition du trainset
Counter(s)

Counter({'Normal': 5208,
         'Lung_Opacity': 3202,
         'COVID': 1833,
         'Viral Pneumonia': 662})

In [18]:
tloader_iter = iter(tloader)

In [20]:
tloader_idx = []
target = []

In [21]:
for i in tqdm(range(len(tloader_iter))):
    batch = next(tloader_iter)
    tloader_idx.extend(batch["id"].numpy().tolist())
    target.extend(batch["target"].numpy().tolist())

  0%|          | 0/437 [00:00<?, ?it/s]

In [22]:
# Taille du train set après échantillongae
len(tloader_idx)

10925

In [23]:
# Exhantillons unique du dataset après echantillonnage
len(set(tloader_idx))

6626

In [24]:
classes = [d[i] for i in tloader_idx]

In [25]:
# Composition du dataset après échantillon (Meta classes)
Counter(classes)

Counter({'Normal': 5486,
         'Lung_Opacity': 1825,
         'COVID': 1812,
         'Viral Pneumonia': 1802})

La méthode d'échantillonnage réalise à la fois du Oversampling et Undersampling intra méta classes et inter classes selon le nombre d'échantillon.

en intra VP très largement oversamplé (x3), covid (=) et LO (x2/3),  en inter Normal oversamplé et Malade undersamplé