In [23]:
from pathlib import Path
import yaml
import pandas as pd
from sklearn.model_selection import KFold
import shutil

In [24]:
dataset_path = Path("../full_dataset")  # replace with 'path/to/dataset' for your custom data
labels = sorted(dataset_path.rglob("GoodLabels2/*.txt"))  # all data in 'labels'

In [63]:
print(labels)

[PosixPath('../full_dataset/GoodLabels2/Fito1.jpg.txt'), PosixPath('../full_dataset/GoodLabels2/Fito10.jpg.txt'), PosixPath('../full_dataset/GoodLabels2/Fito100.jpg.txt'), PosixPath('../full_dataset/GoodLabels2/Fito101.jpg.txt'), PosixPath('../full_dataset/GoodLabels2/Fito102.jpg.txt'), PosixPath('../full_dataset/GoodLabels2/Fito103.jpg.txt'), PosixPath('../full_dataset/GoodLabels2/Fito104.jpg.txt'), PosixPath('../full_dataset/GoodLabels2/Fito105.jpg.txt'), PosixPath('../full_dataset/GoodLabels2/Fito106.jpg.txt'), PosixPath('../full_dataset/GoodLabels2/Fito107.jpg.txt'), PosixPath('../full_dataset/GoodLabels2/Fito11.jpg.txt'), PosixPath('../full_dataset/GoodLabels2/Fito12.jpg.txt'), PosixPath('../full_dataset/GoodLabels2/Fito13.jpg.txt'), PosixPath('../full_dataset/GoodLabels2/Fito14.jpg.txt'), PosixPath('../full_dataset/GoodLabels2/Fito15.jpg.txt'), PosixPath('../full_dataset/GoodLabels2/Fito16.jpg.txt'), PosixPath('../full_dataset/GoodLabels2/Fito17.jpg.txt'), PosixPath('../full_data

In [27]:
yaml_file = "../datasets/cocoa_diseases/cocoa_dataset.yaml"  # your data YAML with data directories and names dictionary
with open(yaml_file, "r", encoding="utf8") as y:
    classes = yaml.safe_load(y)["names"]
cls_idx = sorted(classes.keys())

In [67]:
index = [label.stem for label in labels]
index=[name.split(".")[0] for name in index]

# uses base filename as ID (no extension)
# Remove the file extension from index
#
labels_df = pd.DataFrame([], columns=cls_idx, index=index)

In [57]:
print(len(index))

312


## Count the instances of each class-label present in the annotation files.

In [68]:
from collections import Counter

for label in labels:
    stem=label.stem.split(".")[0]
    lbl_counter = Counter()

    with open(label, "r") as lf:
        lines = lf.readlines()

    for line in lines:
        # classes for YOLO label uses integer at first position of each line
        lbl_counter[int(line.split(" ")[0])] += 1

    labels_df.loc[stem] = lbl_counter

labels_df = labels_df.fillna(0.0)  # replace `nan` values with `0.0`

In [69]:
labels_df.head(20)

Unnamed: 0,0,1,2
Fito1,3.0,0.0,8.0
Fito10,1.0,0.0,0.0
Fito100,1.0,0.0,5.0
Fito101,1.0,0.0,1.0
Fito102,1.0,1.0,2.0
Fito103,1.0,0.0,1.0
Fito104,1.0,0.0,5.0
Fito105,1.0,0.0,7.0
Fito106,2.0,0.0,7.0
Fito107,1.0,0.0,1.0


In [70]:
ksplit = 5
kf = KFold(n_splits=ksplit, shuffle=True, random_state=20)  # setting random_state for repeatable results

kfolds = list(kf.split(labels_df))

## The dataset has now been split into k folds, each having a list of train and val indices. We will construct a DataFrame to display these results more clearly.

In [71]:
folds = [f"split_{n}" for n in range(1, ksplit + 1)]
folds_df = pd.DataFrame(index=index, columns=folds)

for i, (train, val) in enumerate(kfolds, start=1):
    folds_df[f"split_{i}"].loc[labels_df.iloc[train].index] = "train"
    folds_df[f"split_{i}"].loc[labels_df.iloc[val].index] = "val"

In [72]:
 

# Create folds_df with modified index
folds = [f"split_{n}" for n in range(1, ksplit + 1)]
folds_df = pd.DataFrame(index=index, columns=folds)

for i, (train, val) in enumerate(kfolds, start=1):
    folds_df[f"split_{i}"].loc[labels_df.iloc[train].index.str.replace(r'\.\w+$', '', regex=True)] = "train"
    folds_df[f"split_{i}"].loc[labels_df.iloc[val].index.str.replace(r'\.\w+$', '', regex=True)] = "val"


In [73]:
folds_df.head()

Unnamed: 0,split_1,split_2,split_3,split_4,split_5
Fito1,val,train,train,train,train
Fito10,train,val,train,train,train
Fito100,train,train,val,train,train
Fito101,train,train,train,train,val
Fito102,train,train,train,train,val


## Now we will calculate the distribution of class labels for each fold as a ratio of the classes present in val to those present in train.

In [74]:
fold_lbl_distrb = pd.DataFrame(index=folds, columns=cls_idx)

for n, (train_indices, val_indices) in enumerate(kfolds, start=1):
    train_totals = labels_df.iloc[train_indices].sum()
    val_totals = labels_df.iloc[val_indices].sum()

    # To avoid division by zero, we add a small value (1E-7) to the denominator
    ratio = val_totals / (train_totals + 1e-7)
    fold_lbl_distrb.loc[f"split_{n}"] = ratio

In [75]:
fold_lbl_distrb

Unnamed: 0,0,1,2
split_1,0.227027,0.27193,0.269792
split_2,0.304598,0.239316,0.203356
split_3,0.304598,0.228814,0.24261
split_4,0.207447,0.27193,0.196271
split_5,0.213904,0.239316,0.349945


## Next, we create the directories and dataset YAML files for each split.

In [38]:
import datetime

supported_extensions = [".jpg", ".jpeg", ".png"]

# Initialize an empty list to store image file paths
images = []
kfold_dataset_path=Path("../kfold_datasets")

# Loop through supported extensions and gather image files
for ext in supported_extensions:
    images.extend(
        sorted(
            f for f in (dataset_path / "img").rglob(f"*{ext}")
            if ".ipynb_checkpoints" not in f.parts  # Exclude checkpoint files
        )
    )

# Create the necessary directories and dataset YAML files (unchanged)
save_path = Path(kfold_dataset_path / f"{datetime.date.today().isoformat()}_{ksplit}-Fold_Cross-val")
save_path.mkdir(parents=True, exist_ok=True)
ds_yamls = []

for split in folds_df.columns:
    # Create directories
    split_dir = save_path / split
    split_dir.mkdir(parents=True, exist_ok=True)
    (split_dir / "train" / "images").mkdir(parents=True, exist_ok=True)
    (split_dir / "train" / "labels").mkdir(parents=True, exist_ok=True)
    (split_dir / "val" / "images").mkdir(parents=True, exist_ok=True)
    (split_dir / "val" / "labels").mkdir(parents=True, exist_ok=True)

    # Create dataset YAML files
    dataset_yaml = split_dir / f"{split}_dataset.yaml"
    ds_yamls.append(dataset_yaml)

    with open(dataset_yaml, "w") as ds_y:
        yaml.safe_dump(
            {
                "path": split_dir.as_posix(),
                "train": "train",
                "val": "val",
                "names": classes,
            },
            ds_y,
        )

In [42]:
print(images) 

[PosixPath('../full_dataset/img/Fito1.jpg'), PosixPath('../full_dataset/img/Fito10.jpg'), PosixPath('../full_dataset/img/Fito100.jpg'), PosixPath('../full_dataset/img/Fito101.jpg'), PosixPath('../full_dataset/img/Fito102.jpg'), PosixPath('../full_dataset/img/Fito103.jpg'), PosixPath('../full_dataset/img/Fito104.jpg'), PosixPath('../full_dataset/img/Fito105.jpg'), PosixPath('../full_dataset/img/Fito106.jpg'), PosixPath('../full_dataset/img/Fito107.jpg'), PosixPath('../full_dataset/img/Fito11.jpg'), PosixPath('../full_dataset/img/Fito12.jpg'), PosixPath('../full_dataset/img/Fito13.jpg'), PosixPath('../full_dataset/img/Fito14.jpg'), PosixPath('../full_dataset/img/Fito15.jpg'), PosixPath('../full_dataset/img/Fito16.jpg'), PosixPath('../full_dataset/img/Fito17.jpg'), PosixPath('../full_dataset/img/Fito18.jpg'), PosixPath('../full_dataset/img/Fito19.jpg'), PosixPath('../full_dataset/img/Fito2.jpg'), PosixPath('../full_dataset/img/Fito20.jpg'), PosixPath('../full_dataset/img/Fito21.jpg'), Pos

In [47]:
folds_df

Unnamed: 0,split_1,split_2,split_3,split_4,split_5
fito1.jpg,val,train,train,train,train
fito10.jpg,train,val,train,train,train
fito100.jpg,train,train,val,train,train
fito101.jpg,train,train,train,train,val
fito102.jpg,train,train,train,train,val
...,...,...,...,...,...
sana95.jpg,train,val,train,train,train
sana96.jpg,train,train,train,val,train
sana97.jpg,train,train,train,val,train
sana98.jpg,train,train,train,val,train


In [76]:
# Ensure index is clean
folds_df.index = folds_df.index.str.strip().str.lower()

for image, label in zip(images, labels):
    img_stem = image.stem.strip().lower()  # Normalize file name
    print(f"Checking for: {img_stem} in folds_df")

    if img_stem in folds_df.index:
        for split, k_split in folds_df.loc[img_stem].items():
            img_to_path = save_path / split / k_split / "images"
            lbl_to_path = save_path / split / k_split / "labels"

            shutil.copy(image, img_to_path / image.name)
            shutil.copy(label, lbl_to_path / label.name)
    else:
        print(f"Warning: {img_stem} not found in folds_df!")


Checking for: fito1 in folds_df
Checking for: fito10 in folds_df
Checking for: fito100 in folds_df
Checking for: fito101 in folds_df
Checking for: fito102 in folds_df
Checking for: fito103 in folds_df
Checking for: fito104 in folds_df
Checking for: fito105 in folds_df
Checking for: fito106 in folds_df
Checking for: fito107 in folds_df
Checking for: fito11 in folds_df
Checking for: fito12 in folds_df
Checking for: fito13 in folds_df
Checking for: fito14 in folds_df
Checking for: fito15 in folds_df
Checking for: fito16 in folds_df
Checking for: fito17 in folds_df
Checking for: fito18 in folds_df
Checking for: fito19 in folds_df
Checking for: fito2 in folds_df
Checking for: fito20 in folds_df
Checking for: fito21 in folds_df
Checking for: fito22 in folds_df
Checking for: fito23 in folds_df
Checking for: fito24 in folds_df
Checking for: fito25 in folds_df
Checking for: fito26 in folds_df
Checking for: fito27 in folds_df
Checking for: fito28 in folds_df
Checking for: fito29 in folds_df
Chec

# Cross validate one model

In [1]:
from ultralytics import YOLO
from ultralytics import settings


In [2]:

settings.update({"datasets_dir": "/home/jovyan/ML2/Sparsification"})

print(settings)

JSONDict("/home/jovyan/.config/Ultralytics/settings.json"):
{
  "settings_version": "0.0.6",
  "datasets_dir": "/home/jovyan/ML2/Sparsification",
  "weights_dir": "weights",
  "runs_dir": "runs",
  "uuid": "93458b231be90a548d04c63a0c95116cfd63e3c32b7079f22fa8b532fb8aae75",
  "sync": true,
  "api_key": "",
  "openai_api_key": "",
  "clearml": true,
  "comet": true,
  "dvc": true,
  "hub": true,
  "mlflow": true,
  "neptune": true,
  "raytune": true,
  "tensorboard": true,
  "wandb": false,
  "vscode_msg": true
}


In [3]:
for k in range(5):
    print(k+1)

1
2
3
4
5


In [3]:
results = {}

# Define your additional arguments here

model=YOLO("models/yolo11x_trained_pruned_local_structured_50_channel.pt")

 

for k in range(5):
    k+=1
    dataset_yaml = f"../kfold_datasets/2025-01-31_5-Fold_Cross-val/split_{k}/split_{k}_dataset.yaml"
    train_results = model.train(
        data = dataset_yaml,
        epochs=100,
        project="../lottery_ticket_iterative_pruning/1stround",
        name=f"split_{k}_train",
        plots=True,
        imgsz=640,
        lr0=0.001,   # default lr is 0.01
        batch=2,  # Small batch size
        device=[7],  # GPU
        workers=4  # Minimal number of workers
    )
    results[k] = model.metrics  # save output metrics for further analysis

New https://pypi.org/project/ultralytics/8.3.70 available 😃 Update with 'pip install -U ultralytics'
Ultralytics 8.3.58 🚀 Python-3.8.10 torch-1.13.1+cu116 CUDA:7 (NVIDIA A16, 15000MiB)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=models/yolo11x_trained_pruned_local_structured_50_channel.pt, data=../kfold_datasets/2025-01-31_5-Fold_Cross-val/split_1/split_1_dataset.yaml, epochs=100, time=None, patience=100, batch=2, imgsz=640, save=True, save_period=-1, cache=False, device=[7], workers=4, project=../lottery_ticket_iterative_pruning/1stround, name=split_1_train7, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=No

[34m[1mtrain: [0mScanning /home/jovyan/ML2/kfold_datasets/2025-01-31_5-Fold_Cross-val/split_1/train/labels... 0 images, 249 backgrounds, 0 corrupt: 100%|██████████| 249/249 [00:00<00:00, 699.89it/s]






[34m[1mtrain: [0mNew cache created: /home/jovyan/ML2/kfold_datasets/2025-01-31_5-Fold_Cross-val/split_1/train/labels.cache


[34m[1mval: [0mScanning /home/jovyan/ML2/kfold_datasets/2025-01-31_5-Fold_Cross-val/split_1/val/labels... 0 images, 63 backgrounds, 0 corrupt: 100%|██████████| 63/63 [00:00<00:00, 656.29it/s]

[34m[1mval: [0mNew cache created: /home/jovyan/ML2/kfold_datasets/2025-01-31_5-Fold_Cross-val/split_1/val/labels.cache





Plotting labels to ../lottery_ticket_iterative_pruning/1stround/split_1_train7/labels.jpg... 
zero-size array to reduction operation maximum which has no identity
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.001' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.001429, momentum=0.9) with parameter groups 167 weight(decay=0.0), 174 weight(decay=0.0005), 173 bias(decay=0.0)
[34m[1mTensorBoard: [0mmodel graph visualization added ✅
Image sizes 640 train, 640 val
Using 4 dataloader workers
Logging results to [1m../lottery_ticket_iterative_pruning/1stround/split_1_train7[0m
Starting training for 100 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      1/100      3.91G          0      5.878          0          0        640: 100%|██████████| 125/125 [00:36<00:00,  3.45it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 16/16 [00:02<00:00,  7.66it/s]

                   all         63          0          0          0          0          0






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      2/100      3.27G          0     0.9729          0          0        640: 100%|██████████| 125/125 [00:33<00:00,  3.69it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 16/16 [00:02<00:00,  7.78it/s]


RuntimeError: torch.cat(): expected a non-empty list of Tensors