# Analyis Rerun on K-fold Cross Validation

In this notebook, we rerun the analysis that are to be included in the final report, but this time using K-fold cross validation instead of a simple train-test split. This allows us to better assess the performance and robustness of our models across different subsets of the data.

The following will be covered:
1. dataset manipulations
2. augumentations comparisons
3. model score distributions analysis
    - score distributions (during ensemble opt)
4. ensemble strategies analysis
    - wbf vs nms (during ensemble opt)
5. one-class-only datasets analysis
	

## Helper functions

This is just a setup for all experiments

In [6]:
import os
import random
from pathlib import Path
from copy import deepcopy
import matplotlib.pyplot as plt

from ml_carbucks.utils.conversions import convert_yolo_to_coco
from ml_carbucks.utils.logger import setup_logger

logger = setup_logger(__name__)

def create_counter(images_dir_root: str | Path, splits: list = ["train", "val"], normalize: bool = False):
    counter = {"all": {}, "img_counts": {}}
    for split in splits:
        # load each file and count how many classes there are in each split and what is their distribution
        split_dir = os.path.join(images_dir_root, split)
        counter[split] = {}
        for root, dirs, files in os.walk(split_dir):
            for file in files:
                if not file.endswith(".jpg"):
                    continue

                file_path = os.path.join(root, file)
                label_path = file_path.replace("images", "labels").replace(".jpg", ".txt")

                counter["img_counts"][split] = counter["img_counts"].get(split, 0) + 1
                counter["img_counts"]['all'] = counter["img_counts"].get('all', 0) + 1
                if not os.path.exists(label_path):
                    counter['all']['no_label'] = counter['all'].get("no_label", 0) + 1
                    counter[split]["no_label"] = counter[split].get("no_label", 0) + 1
                    continue

                with open(label_path, "r") as f:
                    if len(f.read().strip()) == 0:
                        counter['all']['no_label'] = counter['all'].get("no_label", 0) + 1
                        counter[split]["no_label"] = counter[split].get("no_label", 0) + 1
                        continue
                    f.seek(0)
                    for line in f:
                        class_id = line.strip().split()[0]
                        counter[split][class_id] = counter[split].get(class_id, 0) + 1
                        counter['all'][class_id] = counter['all'].get(class_id, 0) + 1

    if normalize:
        counter_normalized = deepcopy(counter)
        for split in splits + ['all']:
            total = sum(counter_normalized[split].values())
            for class_id in counter_normalized[split]:
                counter_normalized[split][class_id] = round(counter_normalized[split][class_id] / total * 100, 4) 

            counter_normalized[split] = dict(sorted(counter_normalized[split].items(), key=lambda item: (item[0] != "no_label", int(item[0]) if item[0] != "no_label" else -1)))
        return counter_normalized
    else:
        for split in splits + ['all']:
            counter[split] = dict(sorted(counter[split].items(), key=lambda item: (item[0] != "no_label", int(item[0]) if item[0] != "no_label" else -1)))
    return counter

def visualize_counter(counter:dict, split: str = 'all', counter_name: str = ""):
    plt.bar(
        x=list(counter[split].keys()),
        height=[v for v in counter[split].values()]
    )
    plt.xlabel("Class ID")
    plt.ylabel("Proportion")
    # write actual numbers on top of bars
    for i, v in enumerate(counter[split].values()):
        plt.text(i, v + 0.5, str(v), ha='center')
    plt.title(f"Class Distribution in '{split}' Split for {counter_name} Dataset")
    plt.show()

def display_dataset_analysis(images_dir_root: str | Path, splits: list = ["train", "val"], counter_name: str = "", normalize: bool = False, visualize_splits: list = ['all']):
    counter = create_counter(images_dir_root, splits, normalize)
    print(f"Dataset Analysis for {counter_name} Dataset:")
    print(counter)
    for split in visualize_splits:
        visualize_counter(counter, split, counter_name)

def clean_up_empty_labels(dataset_dir: str | Path, splits: list):
    print(f"Cleaning up empty labels in dataset at: {dataset_dir}")
    for split in splits:
        for root, dirs, files in os.walk(Path(dataset_dir) / "images" / split):
            for file in files:
                if not file.endswith(".jpg"):
                    continue
                
                img_file_path = os.path.join(root, file)
                label_file_path = img_file_path.replace(".jpg", ".txt").replace("images", "labels")
                img_name = file
                label_name = img_name.replace(".jpg", ".txt")

                if not os.path.exists(label_file_path):
                    print(f"Found image with no corresponding label file: {img_file_path}")
                    os.makedirs(os.path.join(dataset_dir, "images", "empty", split), exist_ok=True)
                    # move image file
                    new_img_path = os.path.join(dataset_dir, "images", "empty", split, file)
                    os.rename(img_file_path, new_img_path)
                    continue

                with open(label_file_path, "r") as f:
                    lines = f.readlines()
                
                if len(lines) == 0:
                    print(f"Found empty label file: {label_file_path}")
                    os.makedirs(os.path.join(dataset_dir, "images", "empty", split), exist_ok=True)
                    os.makedirs(os.path.join(dataset_dir, "labels", "empty", split), exist_ok=True)
                    # move label file
                    new_label_path = os.path.join(dataset_dir, "labels", "empty", split, label_name)
                    os.rename(label_file_path, new_label_path)
                    # move image file
                    new_img_path = os.path.join(dataset_dir, "images", "empty", split, img_name)
                    os.rename(img_file_path, new_img_path)

    convert_yolo_to_coco(
        base_dir=dataset_dir,
        splits=splits,
    )
                    
def balance_dataset(dataset_dir: str | Path, splits: list, remove_class_probabilities: dict[str, float] | None = None):
    for split in splits:
        files_moved_cnt = {class_id: 0 for class_id in remove_class_probabilities.keys()} if remove_class_probabilities else {}
        for root, dirs, files in os.walk(Path(dataset_dir) / "labels" / split):
            for file in files:
                if not file.endswith(".txt"):
                    continue
                file_path = os.path.join(root, file)
                with open(file_path, "r") as f:
                    lines = f.readlines()
                
                class_labels = set()
                for line in lines:
                    class_id = line.strip().split()[0]
                    class_labels.add(class_id)

                # we want to move the files only that have pure one class labels, not mixed
                if len(class_labels) != 1:
                    continue

                class_id = class_labels.pop()
                move_file = False
                if remove_class_probabilities and class_id in remove_class_probabilities:
                    prob = remove_class_probabilities[class_id]
                    if random.random() <= prob:
                        move_file = True

                if move_file:
                    files_moved_cnt[class_id] += 1
                    print(f"Moving pure class {class_id} label file: {file_path}")
                    os.makedirs(os.path.join(dataset_dir, "images", "balancing", split), exist_ok=True)
                    os.makedirs(os.path.join(dataset_dir, "labels", "balancing", split), exist_ok=True)
                    # move label file
                    new_label_path = os.path.join(dataset_dir, "labels", "balancing", split, file)
                    os.rename(file_path, new_label_path)
                    # move image file
                    img_file = file.replace(".txt", ".jpg")
                    img_path = os.path.join(dataset_dir, "images", split, img_file)
                    if os.path.exists(img_path):
                        new_img_path = os.path.join(dataset_dir, "images", "balancing", split, img_file)
                        os.rename(img_path, new_img_path)
                    else:
                        print(f"Corresponding image file not found for label: {file_path}")
        if files_moved_cnt:
            print(f"Moved files for {split}: {files_moved_cnt}")


    convert_yolo_to_coco(
        base_dir=dataset_dir,
        splits=splits,
    )
       

In [None]:
raise ValueError("Please make sure that the functions above were executed as the analysis rely that the datasets were correclty prepared. You may comment this out once done.")

### 1. Dataset manipulations
We will analyze how different dataset manipulations affect model performance across the K folds.
- raw data
- cleaned data (without empty annotation images)
- balanced class
- combined datasets

Thos manipulations will be evaluated using cross-validation on the baseline datasets meaning the manipulations will only be applied to the training folds while the validation fold remains unchanged. This ensures that we can accurately assess the impact of each manipulation on model performance.

In [None]:
import os
from ml_carbucks.utils.DatasetsPathManager import DatasetsPathManager
from ml_carbucks.utils.conversions import convert_coco_to_yolo

manipulated_datasets_dir = []
for train_fold in DatasetsPathManager.CARBUCKS_TRAIN_CV:
    original_train_img_dir = train_fold[0][0]
    fold_dir = original_train_img_dir.parent.parent # type: ignore
    print(f"Processing fold directory: {fold_dir}")


    cleaned_dir = fold_dir.parent / f"{fold_dir.name}_cleaned"
    balanced_dir = fold_dir.parent / f"{fold_dir.name}_balanced"

    os.system(f"rm -rf {cleaned_dir}")
    os.system(f"rm -rf {balanced_dir}")
 
    os.makedirs(cleaned_dir / "images" / "train", exist_ok=True)
    os.makedirs(balanced_dir / "images" / "train", exist_ok=True)
    
    # #copy original fold to new location
    os.system(f"cp -r {fold_dir}/images/train/ {cleaned_dir}/images/")
    os.system(f"cp -r {fold_dir}/instances_train_curated.json {cleaned_dir}")
    os.system(f"cp -r {fold_dir}/images/train/ {balanced_dir}/images/")
    os.system(f"cp -r {fold_dir}/instances_train_curated.json {balanced_dir}")
    
    convert_coco_to_yolo(
        img_dir=cleaned_dir / "images" / "train",
        ann_file=cleaned_dir / "instances_train_curated.json",
    )
    
    convert_coco_to_yolo(
        img_dir=balanced_dir / "images" / "train",
        ann_file=balanced_dir / "instances_train_curated.json",
    )

    clean_up_empty_labels(dataset_dir=cleaned_dir, splits=["train"])
    clean_up_empty_labels(dataset_dir=balanced_dir, splits=["train"])

    balance_dataset(
        dataset_dir=balanced_dir,
        splits=["train"],
        remove_class_probabilities={
            "0": 0.9, # <- this can be adjusted to remove some, it removes the % of pure class images
            "1": 0.0, # <-
            "2": 0.0, # <-
        }
    )

    convert_yolo_to_coco(
        base_dir=cleaned_dir,
        splits=["train"],
    )

    convert_yolo_to_coco(
        base_dir=balanced_dir,
        splits=["train"],
    )

    display_dataset_analysis(
        images_dir_root=cleaned_dir / "images",
        splits=["train"],
        counter_name=f"Cleaned Dataset Fold {fold_dir.name}",
        normalize=True,
        visualize_splits=["train"]
    )
    display_dataset_analysis(
        images_dir_root=balanced_dir / "images",
        splits=["train"],
        counter_name=f"Balanced Dataset Fold {fold_dir.name}",
        normalize=True,
        visualize_splits=["train"]
    )

    
    manipulated_datasets_dir.append(cleaned_dir)
    manipulated_datasets_dir.append(balanced_dir)

In [None]:
from typing import Any

from ml_carbucks import RESULTS_DIR
from ml_carbucks.adapters.BaseDetectionAdapter import BaseDetectionAdapter
from ml_carbucks.utils.DatasetsPathManager import DatasetsPathManager
from ml_carbucks.utils.result_saver import ResultSaver
from ml_carbucks.adapters import YoloUltralyticsAdapter, RtdetrUltralyticsAdapter, FasterRcnnAdapter, EfficientDetAdapter


METRIC = "map_50"
BASE_PARAMS :dict[str, Any]= {
    "epochs": 20,
    "batch_size": 16,
    "accumulation_steps": 2,
    "img_size": 320,
    "verbose": True
}

saver1 = ResultSaver(
    path=RESULTS_DIR,
    name="saver1_crossval_data_manipulations",
    metadata=BASE_PARAMS,
    append=True
)

logger.info(f"Starting cross-validation for models on singular datasets manipulations")
singular_models: list[BaseDetectionAdapter] = [
    YoloUltralyticsAdapter(**BASE_PARAMS),
    RtdetrUltralyticsAdapter(**BASE_PARAMS),
    FasterRcnnAdapter(**BASE_PARAMS),
    EfficientDetAdapter(**BASE_PARAMS),
]
for train_idx, (train, val) in enumerate(zip(DatasetsPathManager.CARBUCKS_TRAIN_CV[:3], DatasetsPathManager.CARBUCKS_VAL_CV[:3], strict=True)):
    for model in singular_models:
        
        saver1.save(
            model_name=model.__class__.__name__,
            dataset_name="carbucks_standard",
            fold=train_idx,
            metric_name=METRIC,
            metric_value=model.clone().fit(train).evaluate(val)[METRIC]
        )

        saver1.save(
            model_name=model.__class__.__name__,
            dataset_name="cardd",
            fold=train_idx,
            metric_name=METRIC,
            metric_value=model.clone().fit(DatasetsPathManager.CARDD_TRAIN).evaluate(val)[METRIC]
        )

        cleaned_train = [
            (str(train[0][0]).replace(f"fold_{train_idx}", f"fold_{train_idx}_cleaned"), 
             str(train[0][1]).replace(f"fold_{train_idx}", f"fold_{train_idx}_cleaned"))]
        saver1.save(
            model_name=model.__class__.__name__,
            dataset_name="carbucks_cleaned",
            fold=train_idx,
            metric_name=METRIC,
            metric_value=model.clone().fit(cleaned_train).evaluate(val)[METRIC] # type: ignore
        )
        
        balanced_train = [
            (str(train[0][0]).replace(f"fold_{train_idx}", f"fold_{train_idx}_balanced"), 
             str(train[0][1]).replace(f"fold_{train_idx}", f"fold_{train_idx}_balanced"))]
        saver1.save(
            model_name=model.__class__.__name__,
            dataset_name="carbucks_balanced",
            fold=train_idx,
            metric_name=METRIC,
            metric_value=model.clone().fit(balanced_train).evaluate(val)[METRIC] # type: ignore
        )

logger.info(f"Starting cross-validation for models on combined datasets manipulations")
combined_models: list[BaseDetectionAdapter] = [
    FasterRcnnAdapter(**BASE_PARAMS),
    EfficientDetAdapter(**BASE_PARAMS),
]
for train_idx, (train, val) in enumerate(zip(DatasetsPathManager.CARBUCKS_TRAIN_CV[:3], DatasetsPathManager.CARBUCKS_VAL_CV[:3], strict=True)):
    for model in combined_models:
        
        saver1.save(
            model_name=model.__class__.__name__,
            dataset_name="cardd+carbucks_standard",
            fold=train_idx,
            metric_name=METRIC,
            metric_value=model.clone().fit([DatasetsPathManager.CARDD_TRAIN[0], DatasetsPathManager.CARBUCKS_TRAIN_CV[train_idx][0]]).evaluate(val)[METRIC] # type: ignore
        )

        cleaned_train = [
            (str(train[0][0]).replace(f"fold_{train_idx}", f"fold_{train_idx}_cleaned"), 
             str(train[0][1]).replace(f"fold_{train_idx}", f"fold_{train_idx}_cleaned"))]
        saver1.save(
            model_name=model.__class__.__name__,
            dataset_name="cardd+carbucks_cleaned",
            fold=train_idx,
            metric_name=METRIC,
            metric_value=model.clone().fit([DatasetsPathManager.CARDD_TRAIN[0], cleaned_train[0]]).evaluate(val)[METRIC] # type: ignore
        )

        balanced_train = [
            (str(train[0][0]).replace(f"fold_{train_idx}", f"fold_{train_idx}_balanced"), 
             str(train[0][1]).replace(f"fold_{train_idx}", f"fold_{train_idx}_balanced"))]
        saver1.save(
            model_name=model.__class__.__name__,
            dataset_name="cardd+carbucks_balanced",
            fold=train_idx,
            metric_name=METRIC,
            metric_value=model.clone().fit([DatasetsPathManager.CARDD_TRAIN[0], balanced_train[0]]).evaluate(val)[METRIC] # type: ignore
        )
    
logger.info("Cross-validation for models on dataset manipulations completed.")
logger.info(saver1.data)
    

### Cleaning up

In [None]:
for man_dir in manipulated_datasets_dir:
    os.system(f"rm -rf {man_dir}")

### Results and Discussion

## 2. Augmentations comparisons
We will compare different augmentation strengths and techniques using cross-validation to determine their effect on model robustness and accuracy.

In [None]:
from typing import Any

from ml_carbucks import RESULTS_DIR
from ml_carbucks.adapters.BaseDetectionAdapter import BaseDetectionAdapter
from ml_carbucks.utils.DatasetsPathManager import DatasetsPathManager
from ml_carbucks.utils.result_saver import ResultSaver
from ml_carbucks.adapters import YoloUltralyticsAdapter, RtdetrUltralyticsAdapter, FasterRcnnAdapter, EfficientDetAdapter


AUG_METRIC = "map_50"
AUG_BASE_PARAMS :dict[str, Any]= {
    "batch_size": 8,
    "accumulation_steps": 4,
    "img_size": 320,
}

from dataclasses import dataclass
from typing import Literal
import torch
import gc
@dataclass
class EfficientDetAdapterCustomLoader(EfficientDetAdapter):
    loader: Literal["inbuilt", "custom"] = "custom"

model_clsasses: list[type[BaseDetectionAdapter]] = [
    YoloUltralyticsAdapter,
    RtdetrUltralyticsAdapter,
    FasterRcnnAdapter,
    EfficientDetAdapter,
    EfficientDetAdapterCustomLoader,
]

saver2 = ResultSaver(
    path=RESULTS_DIR,
    name="saver2_crossval_augmentations_comparisons",
    metadata=AUG_BASE_PARAMS,
)

EPOCHS_TO_TEST = [30]

logger.info("Starting cross-validation for models augmentations comparisons")
for epoch_count in EPOCHS_TO_TEST:
    for train_idx, (train, val) in enumerate(
        zip(
            DatasetsPathManager.CARBUCKS_TRAIN_CV[:3],
            DatasetsPathManager.CARBUCKS_VAL_CV[:3],
            strict=True,
        )
    ):
        for model_cls in model_clsasses:

            model_aug = model_cls(**AUG_BASE_PARAMS, training_augmentations=True, epochs=epoch_count)  # type: ignore
            
            res_aug = model_aug.debug(train, val, results_path=RESULTS_DIR / "augmentation_comparison", results_name=f"fold_{train_idx}_{model_cls.__name__}_aug_")
            saver2.save(
                model_name=model_aug.__class__.__name__,
                augmentation=True,
                fold=train_idx,
                metric_name=AUG_METRIC,
                epochs=epoch_count,
                metric_value=res_aug[AUG_METRIC],
            )
            del model_aug
            del res_aug
            gc.collect()
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()

            model_noaug = model_cls(**AUG_BASE_PARAMS, training_augmentations=False, epochs=epoch_count)  # type: ignore
            res_noaug = model_noaug.debug(train, val, results_path=RESULTS_DIR / "augmentation_comparison", results_name=f"fold_{train_idx}_{model_cls.__name__}_noaug_")

            saver2.save(
                model_name=model_noaug.__class__.__name__,
                augmentation=False,
                fold=train_idx,
                metric_name=AUG_METRIC,
                epochs=epoch_count,
                metric_value=res_noaug[AUG_METRIC],
            )
            del model_noaug
            del res_noaug
            gc.collect()
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()

logger.info("Cross-validation for models augmentations comparisons completed.")
logger.info(saver2.data)

### Results and Discussion

## 3. Model score distributions analysis
We will analyze the score distributions of different models using cross-validation to understand how they affect ensemble performance

For that we will analyse the distributions of the final ensemble that were already obtained during the `Ensemble Optimization` step using cross-validation.

In [None]:
raise ValueError("Please make sure that the prestep_hash variable is set to the correct hash of the ensemble optimization prestep that was run with cross-validation. You may comment this out once done.")

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def plot_histograms(scores_dict, xlim=None, info_title=''):
    # make seeded color from title
    np.random.seed(abs(hash(info_title)) % (2**32))
    colors = np.random.rand(len(scores_dict), 3)


    fig, axs = plt.subplots(2, 2, figsize=(10, 5))
    axs = axs.flatten()
    for i, scores in enumerate(scores_dict.values()):
        axs[i].hist(scores, bins=50, color=colors[0])
        if xlim is not None:
            axs[i].set_xlim(xlim)
        axs[i].set_title(f'Adapter {list(scores_dict.keys())[i]} prediction scores distribution {info_title}')
    plt.tight_layout()
    plt.show()

In [None]:
import pickle as pkl

from ml_carbucks import OPTUNA_DIR

PRESTEP_HASH = "your_prestep_hash_here"  # the hash of the ensemble optimization prestep

PRESTEP_PATH = OPTUNA_DIR / "ensemble" / f"prestep_{PRESTEP_HASH}"


(
    adapters_predictions,
    ground_truths,
    distributions,
    adapters_crossval_metrics,
    adapters_dataset_metrics,
    adapter_hashes,
    adapter_names,
)  = pkl.load(open(PRESTEP_PATH, "rb"))


all_scores = [
    [] for _ in range(len(adapters_predictions))
]

for i, preds in enumerate(adapters_predictions):
    all_scores[i] = []
    for p in preds:
        for scores in p['scores']:
            all_scores[i].append(scores.item())

from torch import erf
import torch


minmax_normalized_scores = []
for scores in all_scores:
    min_score = min(scores)
    max_score = max(scores)
    norm_scores = [(s - min_score) / (max_score - min_score) for s in scores]
    minmax_normalized_scores.append(norm_scores)

zscore_normalized_scores = []
for scores in all_scores:
    mean_score = sum(scores) / len(scores)
    std_score = (sum((s - mean_score) ** 2 for s in scores) / len(scores)) ** 0.5
    norm_scores = [(s - mean_score) / std_score for s in scores]
    zscore_normalized_scores.append(norm_scores)

zscore_probability_normalized_scores = []
for scores in all_scores:
    mean_score = sum(scores) / len(scores)
    std_score = (sum((s - mean_score) ** 2 for s in scores) / len(scores)) ** 0.5
    zscore_scores = [(s - mean_score) / std_score for s in scores]
    prob_scores = [0.5 * (1 + erf(torch.tensor(s) / (2 ** 0.5))) for s in zscore_scores]
    zscore_probability_normalized_scores.append(prob_scores)

quantile_normalized_scores = []
for scores in all_scores:
    sorted_scores = sorted(scores)
    norm_scores = [sorted_scores.index(s) / len(scores) for s in scores]
    quantile_normalized_scores.append(norm_scores)

plot_histograms(all_scores, info_title='raw')
plot_histograms(minmax_normalized_scores, xlim=(0, 1), info_title='min-max normalized')
plot_histograms(zscore_normalized_scores, xlim=(-6, 6), info_title='z-score normalized')
plot_histograms(zscore_probability_normalized_scores, xlim=(0, 1), info_title='z-score probability normalized')
plot_histograms(quantile_normalized_scores, xlim=(0, 1), info_title='quantile normalized')

### Results and Discussion

## 4. Ensemble strategies analysis
We will evaluate different ensemble strategies using cross-validation to understand their impact on detection performance.
    - wbf vs nms (during ensemble opt)

For that we will analyse the results of the `Ensemble Optimization` step that was already performed using cross-validation.

Topics:
- strategies (nms, wbf)
- score normalization

### Optuna pictures here and results I guess

### Results and Discussion

## 5. One-class-only datasets analysis
We will investigate the performance of models trained on datasets containing only one class using cross-validation. 

We will focus only on `Crack` class as it is the most `under-represented` class in the dataset and thus the most challenging one.

In [10]:
import os
import yaml
from pathlib import Path

from ml_carbucks.utils.conversions import convert_yolo_to_coco


def filter_dataset(data_dir: str | Path, splits:list[str], class_to_keep:str="2", negative_sample_ratio:float=-1):

    dataset_yaml_path = os.path.join(data_dir, "dataset_single.yaml")

    if not os.path.exists(dataset_yaml_path):
        logger.info(f"Dataset yaml file does not exist. Proceeding...")
    else:
        loaded_yaml = None
        with open(dataset_yaml_path, "r") as f:
            loaded_yaml = yaml.safe_load(f)
        if loaded_yaml['nc'] == 1 and loaded_yaml['names'] == [f'class_to_keep_{class_to_keep}']:
            logger.info(f"Dataset already filtered to class {class_to_keep}. Exiting...")
            return


    for split in splits:
        obj = {}
        for file in os.listdir(os.path.join(data_dir, "labels", split)):
            if file.endswith(".txt"):
                with open(os.path.join(data_dir, "labels", split, file), "r") as f:
                    lines = f.readlines()
                    file_classes = set()
                    for line in lines:
                        class_id = line.split()[0]
                        file_classes.add(class_id)
                    obj[file.split(".")[0]] = list(file_classes)

        files_to_keep = []
        for key, value in obj.items():
            if class_to_keep in value:
                files_to_keep.append(key)

        files_neg_to_keep = []
        for key, value in obj.items():
            if class_to_keep not in value:
                if negative_sample_ratio < 0:
                    files_neg_to_keep.append(key)
                elif len(files_neg_to_keep) < len(files_to_keep) and random.random() < negative_sample_ratio:
                    files_neg_to_keep.append(key)
                else:
                    pass
                
        
        print(f"{len(files_to_keep)}, {len(files_neg_to_keep)}")

        for key, value in obj.items():
            if key in files_to_keep or key in files_neg_to_keep:
                # clean up other classes from the labels
                with open(os.path.join(data_dir, "labels", split, f"{key}.txt"), "r") as f:
                    lines = f.readlines()
                with open(os.path.join(data_dir, "labels", split, f"{key}.txt"), "w") as f:
                    for line in lines:
                        class_id = line.split()[0]
                        if class_id == class_to_keep:
                            f.write(line.replace(f"{class_to_keep} ", "0 "))
            else:
                os.remove(os.path.join(data_dir, "labels", split, f"{key}.txt"))
                os.remove(os.path.join(data_dir, "images", split, f"{key}.jpg"))   
    
    # write dataset.yaml file
    dataset_yaml = {
        'nc': 1,
        'names': [f'class_to_keep_{class_to_keep}']
    }

    dataset_yaml["train"] = f"images/train"
    dataset_yaml["val"] = f"images/val"
    for split in splits:
        dataset_yaml[split] = f"images/{split}"

    with open(dataset_yaml_path, "w") as f:
        yaml.dump(dataset_yaml, f)

    convert_yolo_to_coco(data_dir, splits)

In [14]:
import os
from ml_carbucks.utils.DatasetsPathManager import DatasetsPathManager
from ml_carbucks.utils.conversions import convert_coco_to_yolo, convert_yolo_to_coco

single_class_created_dirs = []
for train_fold in DatasetsPathManager.CARBUCKS_TRAIN_CV:
    original_train_img_dir = train_fold[0][0]
    fold_dir = original_train_img_dir.parent.parent # type: ignore
    print(f"Processing fold directory: {fold_dir}")
    filtered_dir = fold_dir.parent / f"{fold_dir.name}_crack"
    
    os.system(f"rm -rf {filtered_dir}")
    #copy original fold to new location
    os.makedirs(filtered_dir / "images" / "train", exist_ok=True)
    os.system(f"cp -r {fold_dir}/images/train/ {filtered_dir}/images/")
    os.system(f"cp -r {fold_dir}/images/val/ {filtered_dir}/images/")
    os.system(f"cp -r {fold_dir}/instances_train_curated.json {filtered_dir}")
    os.system(f"cp -r {fold_dir}/instances_val_curated.json {filtered_dir}")
    
    convert_coco_to_yolo(
        img_dir=filtered_dir / "images" / "train",
        ann_file=filtered_dir / "instances_train_curated.json",
    )
    convert_coco_to_yolo(
        img_dir=filtered_dir / "images" / "train",
        ann_file=filtered_dir / "instances_val_curated.json",
    )
    filter_dataset(
        data_dir=filtered_dir,
        splits=["train"],
        class_to_keep="2",  # assuming '2' is the class ID for 'Crack'
        negative_sample_ratio=-1
    )

    convert_yolo_to_coco(
        base_dir=filtered_dir,
        splits=["train", "val"],
    )

    single_class_created_dirs.append(filtered_dir)


Processing fold directory: /home/bachelor/ml-carbucks/data/final_carbucks/crossval/fold_1
INFO ml_carbucks.utils.conversions 14:58:21 | COCO to YOLO conversion completed in 0.08 seconds
INFO ml_carbucks.utils.conversions 14:58:21 | COCO to YOLO conversion completed in 0.02 seconds
INFO __main__ 14:58:21 | Dataset yaml file does not exist. Proceeding...
424, 2870
INFO ml_carbucks.utils.conversions 14:58:21 | Saved /home/bachelor/ml-carbucks/data/final_carbucks/crossval/fold_1_crack/instances_train_curated.json with 3258 images and 424 annotations.
INFO ml_carbucks.utils.conversions 14:58:21 | Saved /home/bachelor/ml-carbucks/data/final_carbucks/crossval/fold_1_crack/instances_train_curated.json with 3258 images and 424 annotations.
INFO ml_carbucks.utils.conversions 14:58:21 | Saved /home/bachelor/ml-carbucks/data/final_carbucks/crossval/fold_1_crack/instances_val_curated.json with 814 images and 0 annotations.
Processing fold directory: /home/bachelor/ml-carbucks/data/final_carbucks/cr

In [None]:
from ml_carbucks import RESULTS_DIR
from ml_carbucks.utils.DatasetsPathManager import DatasetsPathManager
from ml_carbucks.adapters.BaseDetectionAdapter import BaseDetectionAdapter
from ml_carbucks.adapters import YoloUltralyticsAdapter, RtdetrUltralyticsAdapter, FasterRcnnAdapter, EfficientDetAdapter
from ml_carbucks.utils.result_saver import ResultSaver

ONE_METRIC = "map_50"
ONE_BASE_PARAMS :dict[str, Any]= {
    "epochs": 10,
    "batch_size": 8,
    "accumulation_steps": 4,
    "img_size": 320,
}

saver5 = ResultSaver(
    path=RESULTS_DIR,
    name="saver5_crossval_one_class_datasets",
    metadata=ONE_BASE_PARAMS
)

one_models: list[BaseDetectionAdapter] = [
    YoloUltralyticsAdapter(**ONE_BASE_PARAMS),
    RtdetrUltralyticsAdapter(**ONE_BASE_PARAMS),
    FasterRcnnAdapter(**ONE_BASE_PARAMS),
    EfficientDetAdapter(**ONE_BASE_PARAMS),
]

for train_idx, (train, val) in enumerate(zip(DatasetsPathManager.CARBUCKS_TRAIN_CV, DatasetsPathManager.CARBUCKS_VAL_CV, strict=True)):
    for model in one_models:
        
        # NOTE: this will produce BOX PR with each class MAP@50, and we will need to manually look it up later
        model.debug(
            train, val, 
            results_path=RESULTS_DIR / "debug_one_class_datasets" ,
            results_name = f"fold_{train_idx}_{model.__class__.__name__}"
        )

        one_class_crack = [
            (str(train[0][0]).replace(f"fold_{train_idx}", f"fold_{train_idx}_crack"), 
                str(train[0][1]).replace(f"fold_{train_idx}", f"fold_{train_idx}_crack"))
        ]
        saver5.save(
            model_name=model.__class__.__name__,
            dataset_name="carbucks_one_class_crack",
            fold=train_idx,
            metric_name=ONE_METRIC,
            metric_value=model.clone().fit(one_class_crack).evaluate(val)[ONE_METRIC] # type: ignore
        )

### Cleaning up

In [None]:
import os
for single_class_dir in single_class_created_dirs:
    os.system(f"rm -rf {single_class_dir}")

### Results and Discussion