In [1]:
from radiocovid.core import RadioCovidDataset, RadioCovidDataModule
from radiocovid.core.data import RadioCovidSubset
from torch.utils.data import DataLoader, WeightedRandomSampler
import torch
from hashlib import sha256
from pathlib import Path
from radiocovid.core.utils import (
    seed_worker,
    get_seeded_generator,
    worker_balanced_n_samples,
)
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
import numpy as np
from collections import defaultdict, Counter
import hydra
from hydra import initialize, compose
import torch.nn.functional as F
import cv2

In [2]:
initialize(
    version_base=None,
    config_path="../radiocovid-core/src/radiocovid/core/configs",
    job_name="test_app",
)
cfg = compose(config_name="train")

In [3]:
cfg.datamodule.dataset.root = "../data/03_inputs"

In [4]:
datamodule = hydra.utils.instantiate(cfg.datamodule)

In [5]:
dataset = hydra.utils.instantiate(cfg.datamodule.dataset)
train_transform = hydra.utils.instantiate(cfg.datamodule.train_transform)
eval_transform = hydra.utils.instantiate(cfg.datamodule.eval_transform)
eval_loader = hydra.utils.instantiate(cfg.datamodule.eval_loader)
train_loader = hydra.utils.instantiate(cfg.datamodule.train_loader)
class_retriever = hydra.utils.instantiate(cfg.datamodule.class_retriever)

In [6]:
fit_idx, test_idx = train_test_split(
    range(len(dataset)),
    test_size=datamodule.test_set,
    stratify=dataset.targets,
    random_state=cfg.seed,
)

train_idx, val_idx = train_test_split(
    fit_idx,
    test_size=datamodule.val_size,
    stratify=np.array(dataset.targets)[np.setdiff1d(range(len(dataset)), test_idx)],
    random_state=cfg.seed,
)

In [7]:
val_set = RadioCovidSubset(dataset, indices=val_idx, transform=eval_transform)

In [8]:
if class_retriever:
    targets = []
    meta_labels = []
    meta_set = defaultdict(lambda: set())
    for c, t in dataset.samples:
        meta_labels.append(class_retriever(path=c))
        meta_set[t].add(Path(c).stem.split("-")[0])
        targets.append(t)
else:
    targets = dataset.targets

if train_idx is None:
    train_set = RadioCovidSubset(
        dataset, indices=range(len(dataset)), transform=train_transform
    )
    counter = Counter(meta_labels if class_retriever else targets)
else:
    targets = np.array(targets)[train_idx].tolist()
    meta_labels = np.array(meta_labels)[train_idx].tolist()
    train_set = RadioCovidSubset(dataset, indices=train_idx, transform=train_transform)
    counter = Counter(meta_labels if class_retriever else targets)

class_weights = {c: 1.0 / n for c, n in counter.items()}

if class_retriever:
    meta_sample_weights = [class_weights[int(t)] for t in meta_labels]
    sample_weights = torch.tensor(
        [
            (1 / len(meta_set[t])) * meta_sample_weights[i]
            for i, t in enumerate(targets)
        ],
        dtype=torch.double,
    )
else:
    sample_weights = torch.tensor(
        [class_weights[int(t)] for t in targets], dtype=torch.double
    )

In [9]:
test_set = RadioCovidSubset(dataset, indices=test_idx, transform=eval_transform)

In [10]:
Counter(sample_weights.numpy().tolist())

Counter({0.00018099547511312217: 5525,
         9.97406742469579e-05: 3342,
         0.00017590149516270886: 1895,
         0.00048590864917395527: 686})

In [11]:
print(f"Train set size : {len(train_set)}")
if sample_weights is None:
    sample_weights = torch.ones(len(train_set))  # type: ignore[arg-type]
n = worker_balanced_n_samples(len(train_set), train_loader.keywords["batch_size"], 1)  # type: ignore[arg-type]
print(f"Train set size after padding : {n}")
sampler = WeightedRandomSampler(
    weights=sample_weights,
    num_samples=n,
    replacement=True,
    generator=get_seeded_generator(datamodule.seed),
)
tloader = train_loader(
    dataset=train_set,
    sampler=sampler,
    worker_init_fn=seed_worker,
    generator=get_seeded_generator(datamodule.seed),
)

Train set size : 11448
Train set size after padding : 11450




In [12]:
paths = [path for path, _ in dataset.samples]
len(paths)

19082

In [13]:
d = {
    int(sha256(path.encode()).hexdigest()[:8], 16): Path(path).stem.split("-")[0]
    for path in paths
}

In [14]:
p = {
    int(sha256(path.encode()).hexdigest()[:8], 16): Path(path)
    for path in paths
}

In [15]:
# Dataset d'origine avant nettoyage
Counter(d.values())

Counter({'Normal': 9228,
         'Lung_Opacity': 5569,
         'COVID': 3147,
         'Viral Pneumonia': 1138})

In [16]:
s = [d[train_set[i]["id"]] for i in tqdm(range(len(train_set)))]

  0%|          | 0/11448 [00:00<?, ?it/s]

In [17]:
# Composition du trainset
Counter(s)

Counter({'Normal': 5525,
         'Lung_Opacity': 3342,
         'COVID': 1895,
         'Viral Pneumonia': 686})

In [18]:
tloader_iter = iter(tloader)

In [19]:
tloader_idx = []
target = []

In [20]:
for i in tqdm(range(len(tloader_iter))):
    batch = next(tloader_iter)
    tloader_idx.extend(batch["id"].numpy().tolist())
    target.extend(batch["target"].numpy().tolist())

  0%|          | 0/458 [00:00<?, ?it/s]

In [21]:
image_list = [cv2.resize(cv2.imread(p[i], cv2.IMREAD_GRAYSCALE), (256, 256)) for i in tloader_idx]

In [22]:
mean = np.mean(image_list)
print("Moyenne des intensités du trainset : ", mean)

Moyenne des intensités du trainset :  127.95057013898958


In [23]:
std = np.std(image_list)
print("Std intensités du trainset : ", std)

Std intensités du trainset :  64.67947974314482


In [24]:
# Taille du train set après échantillongae
len(tloader_idx)

11450

In [25]:
# Exhantillons unique du dataset après echantillonnage
len(set(tloader_idx))

6989

In [26]:
classes = [d[i] for i in tloader_idx]

In [27]:
# Composition du dataset après échantillon (Meta classes)
Counter(classes)

Counter({'Normal': 5730,
         'Lung_Opacity': 1988,
         'Viral Pneumonia': 1939,
         'COVID': 1793})

La méthode d'échantillonnage réalise à la fois du Oversampling et Undersampling intra méta classes et inter classes selon le nombre d'échantillon.

en intra VP très largement oversamplé (x3), covid (=) et LO (x2/3),  en inter Normal oversamplé et Malade undersamplé