# TomatoMAP data fetcher

In [1]:
import glob
import os
from pathlib import Path

def scan_all_images(img_root_dir="img"):

    img_root = Path(img_root_dir)
    all_image_paths = []

    print(f"scan data root: {img_root}")
    print("="*50)

    # check root
    if not img_root.exists():
        print(f"error: root not exist {img_root}!")
        return []

    # check cam1-4
    camera_dirs = [d for d in img_root.iterdir() if d.is_dir()]
    print(f"found {len(camera_dirs)} cams: {[d.name for d in camera_dirs]}")

    for camera_dir in camera_dirs:
        print(f"solving cam: {camera_dir.name}")

        # iter
        group_dirs = [d for d in camera_dir.iterdir() if d.is_dir() and d.name.startswith('Group')]
        print(f"found {len(group_dirs)} groups: {[d.name for d in group_dirs]}")

        for group_dir in group_dirs:
            print(f"solving group: {group_dir.name}")

            # iter plants
            plant_dirs = [d for d in group_dir.iterdir() if d.is_dir() and d.name.startswith('plant')]
            print(f" found {len(plant_dirs)} plant samples")

            for plant_dir in plant_dirs:
                # iter date
                date_dirs = [d for d in plant_dir.iterdir() if d.is_dir()]

                for date_dir in date_dirs:
                    # get all data
                    jpg_pattern = str(date_dir / "*.jpg")
                    jpg_files = glob.glob(jpg_pattern)

                    if jpg_files:
                        #print(f" {plant_dir.name}/{date_dir.name}: found {len(jpg_files)} image data")
                        all_image_paths.extend(jpg_files)

    print("\n" + "="*50)
    print(f"total:")
    print(f"found {len(all_image_paths)} images")

    if all_image_paths:
        print(f"\nfirst 5 image path example:")
        for i, path in enumerate(all_image_paths[:5]):
            print(f"  {i+1}. {path}")

        if len(all_image_paths) > 5:
            print(f"  ... (still {len(all_image_paths)-5} images)")

    return all_image_paths

image_paths = scan_all_images("img")
print(f"\ndone！found {len(image_paths)} images")

scan data root: img
found 4 cams: ['1', '2', '3', '4']
solving cam: 1
found 5 groups: ['Group4', 'Group5', 'Group1', 'Group2', 'Group3']
solving group: Group4
 found 20 plant samples
solving group: Group5
 found 20 plant samples
solving group: Group1
 found 21 plant samples
solving group: Group2
 found 20 plant samples
solving group: Group3
 found 20 plant samples
solving cam: 2
found 5 groups: ['Group1', 'Group2', 'Group3', 'Group4', 'Group5']
solving group: Group1
 found 21 plant samples
solving group: Group2
 found 20 plant samples
solving group: Group3
 found 20 plant samples
solving group: Group4
 found 20 plant samples
solving group: Group5
 found 20 plant samples
solving cam: 3
found 5 groups: ['Group5', 'Group1', 'Group2', 'Group3', 'Group4']
solving group: Group5
 found 20 plant samples
solving group: Group1
 found 21 plant samples
solving group: Group2
 found 20 plant samples
solving group: Group3
 found 20 plant samples
solving group: Group4
 found 20 plant samples
solving c

# TomatoMAP-Cls BBCH Label Loading

In [2]:
import pandas as pd
import re

def load_bbch_excel(excel_path="BBCH_classification.xlsx"):
    print("load BBCH label...")

    bbch_data = {}
    all_bbch_values = set()

    try:
        # read per group
        for group_num in range(1, 6):
            sheet_name = f"Group {group_num}"
            print(f"  loading {sheet_name}...")

            df = pd.read_excel(excel_path, sheet_name=sheet_name)

            # 1st col
            df = df.dropna(subset=[df.columns[0]])
            date_column = df.columns[0]
            df[date_column] = df[date_column].astype(str)

            # yyyymmdd format
            date_pattern = re.compile(r'^\d{8}$')
            df = df[df[date_column].str.match(date_pattern, na=False)]

            bbch_data[group_num] = df

            # fetch bbch label
            for col in df.columns:
                if col.startswith('BBCH plant'):
                    values = df[col].dropna()
                    all_bbch_values.update(values.astype(int))

            print(f"time points: {len(df)}")
            print(f"plant amount: {len([c for c in df.columns if c.startswith('BBCH plant')])}")

        unique_bbch_values = sorted(list(all_bbch_values))

        print(f"data overview:")
        print(f"total group: {len(bbch_data)}")
        print(f"BBCH classes: {len(unique_bbch_values)}")
        print(f"BBCH range: {min(unique_bbch_values)} - {max(unique_bbch_values)}")
        print(f"all BBCH: {unique_bbch_values}")

        return bbch_data, unique_bbch_values

    except Exception as e:
        print(f"failed to load label file: {e}")
        return {}, []

bbch_data, unique_bbch_values = load_bbch_excel("BBCH_classification.xlsx")
print(f"done！loaded {len(bbch_data)} groups，{len(unique_bbch_values)} BBCH classes")

load BBCH label...
  loading Group 1...
time points: 3
plant amount: 21
  loading Group 2...
time points: 15
plant amount: 20
  loading Group 3...
time points: 19
plant amount: 20
  loading Group 4...
time points: 20
plant amount: 20
  loading Group 5...
time points: 10
plant amount: 20
data overview:
total group: 5
BBCH classes: 50
BBCH range: 13 - 89
all BBCH: [13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 27, 28, 29, 51, 52, 53, 54, 55, 56, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89]
done！loaded 5 groups，50 BBCH classes


# TomatoMAP-Cls Label Matching to Images

In [3]:
from collections import Counter

def create_image_label_mapping_simple(image_paths, bbch_data):
    print("matching labels to images...")
    print(f"solving {len(image_paths)} images...")

    label_mapping = {}
    missing_labels = []
    success_count = 0

    for i, img_path in enumerate(image_paths):
        if i % 10000 == 0:  # process track
            print(f"processing: {i}/{len(image_paths)}")

        path_parts = Path(img_path).parts

        group_name = None
        plant_name = None
        date_str = None

        for part in path_parts:
            if part.lower().startswith('group'):
                group_name = part
            elif part.startswith('plant'):
                plant_name = part
            elif re.match(r'^\d{8}$', part):
                date_str = part

        if not all([group_name, plant_name, date_str]):
            missing_labels.append(img_path)
            continue

        try:
            group_num = int(group_name.lower().replace('group', ''))
            plant_num = int(plant_name.replace('plant', ''))

            # check bbch
            if group_num not in bbch_data:
                missing_labels.append(img_path)
                continue

            df = bbch_data[group_num]
            date_column = df.columns[0]
            plant_column = f"BBCH plant {plant_num}"

            if plant_column not in df.columns:
                missing_labels.append(img_path)
                continue

            matching_rows = df[df[date_column] == date_str]

            if len(matching_rows) == 0:
                missing_labels.append(img_path)
                continue

            bbch_value = matching_rows[plant_column].iloc[0]

            if pd.notna(bbch_value):
                try:
                    bbch_int = int(bbch_value)
                    label_mapping[img_path] = bbch_int
                    success_count += 1
                except (ValueError, TypeError):
                    missing_labels.append(img_path)
            else:
                missing_labels.append(img_path)

        except (ValueError, KeyError, IndexError) as e:
            missing_labels.append(img_path)
            continue

    print(f"TomatoMAP is now annotated for TomatoMAP-Cls!")
    print(f"matching result:")
    print(f"success: {len(label_mapping)} images")
    print(f"fail: {len(missing_labels)} images")

    total = len(label_mapping) + len(missing_labels)
    if total > 0:
        success_rate = len(label_mapping) / total * 100
        print(f"matching rate: {success_rate:.1f}%")

    return label_mapping, missing_labels

print("start matching")

data_found = False
if 'image_paths' in locals() and 'bbch_data' in locals():
    print(f"local var")
    data_found = True
elif 'image_paths' in globals() and 'bbch_data' in globals():
    print(f"global var")
    image_paths = globals()['image_paths']
    bbch_data = globals()['bbch_data']
    data_found = True

if data_found:
    print(f"data overview:")
    print(f"image num: {len(image_paths)}")
    print(f"BBCH groups: {len(bbch_data)}")

    label_mapping, missing_labels = create_image_label_mapping_simple(image_paths, bbch_data)

    if label_mapping:
        unique_bbch_values = sorted(list(set(label_mapping.values())))
        label_counts = Counter(label_mapping.values())

        print(f"BBCH-scale:")
        print(f"  class number: {len(unique_bbch_values)}")
        print(f"  scale range: {min(unique_bbch_values)} - {max(unique_bbch_values)}")
        print(f"  list: {unique_bbch_values}")

        print(f"distrubution (top 10):")
        for bbch_value, count in label_counts.most_common(10):
            print(f"  BBCH {bbch_value}: {count} images")

        globals()['label_mapping'] = label_mapping
        globals()['unique_bbch_values'] = unique_bbch_values
        globals()['missing_labels'] = missing_labels

        print(f" label_mapping: {len(label_mapping)} matches")
        print(f" unique_bbch_values: {len(unique_bbch_values)} classes")
        print(f" missing_labels: {len(missing_labels)} missing labels")

    else:
        print(f"no matching")
        print(f"please check data")

else:
    print("can't find data:")
    print(" run cells before first")


start matching
local var
data overview:
image num: 64464
BBCH groups: 5
matching labels to images...
solving 64464 images...
processing: 0/64464
processing: 10000/64464
processing: 20000/64464
processing: 30000/64464
processing: 40000/64464
processing: 50000/64464
processing: 60000/64464
TomatoMAP is now annotated for TomatoMAP-Cls!
matching result:
success: 64464 images
fail: 0 images
matching rate: 100.0%
BBCH-scale:
  class number: 50
  scale range: 13 - 89
  list: [13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 27, 28, 29, 51, 52, 53, 54, 55, 56, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89]
distrubution (top 10):
  BBCH 13: 5328 images
  BBCH 79: 4944 images
  BBCH 78: 3360 images
  BBCH 70: 3264 images
  BBCH 75: 2880 images
  BBCH 73: 2688 images
  BBCH 76: 2640 images
  BBCH 77: 2592 images
  BBCH 72: 2544 images
  BBCH 14: 2400 images
 label_mapping: 64464 matches
 unique_bbch_values: 50 classes
 missing_la

# TomatoMAP-Cls Generation (can take a while depend on your hardware)

In [4]:
import shutil
from pathlib import Path
from collections import defaultdict
import random

def create_pytorch_dataset(label_mapping, unique_bbch_values,
                          output_dir="TomatoMAP-Cls",
                          train_ratio=0.7, val_ratio=0.2, test_ratio=0.1):

    print("building TomatoMAP-Cls")

    random.seed(888)

    base_dir = Path(output_dir)
    if base_dir.exists():
        print(f"  cleaning existed dir: {base_dir}")
        shutil.rmtree(base_dir)

    dirs = {
        'train': base_dir / "train",
        'val': base_dir / "val",
        'test': base_dir / "test"
    }

    for split_name, split_dir in dirs.items():
        split_dir.mkdir(parents=True, exist_ok=True)
        for bbch_value in unique_bbch_values:
            class_dir = split_dir / f"bbch_{bbch_value:02d}"
            class_dir.mkdir(exist_ok=True)

    print(f"  built {len(unique_bbch_values)} bbch dirs")

    images_by_class = defaultdict(list)
    for img_path, bbch_value in label_mapping.items():
        images_by_class[bbch_value].append(img_path)

    print(f"data distrubution:")
    total_images = 0
    for bbch_value in sorted(images_by_class.keys()):
        count = len(images_by_class[bbch_value])
        total_images += count
        print(f"  BBCH {bbch_value:2d}: {count:4d} images")

    split_counts = {"train": 0, "val": 0, "test": 0}

    for bbch_value, img_paths in images_by_class.items():
        if len(img_paths) < 3:
            print(f"BBCH {bbch_value} only {len(img_paths)} images")

        img_paths_shuffled = img_paths.copy()
        random.shuffle(img_paths_shuffled)

        n_total = len(img_paths_shuffled)
        n_train = max(1, int(n_total * train_ratio))
        n_val = max(1, int(n_total * val_ratio)) if n_total > 2 else 0
        n_test = n_total - n_train - n_val

        splits = {
            'train': img_paths_shuffled[:n_train],
            'val': img_paths_shuffled[n_train:n_train + n_val],
            'test': img_paths_shuffled[n_train + n_val:]
        }

        class_name = f"bbch_{bbch_value:02d}"
        for split_name, split_paths in splits.items():
            if not split_paths:
                continue

            target_dir = dirs[split_name] / class_name

            for idx, img_path in enumerate(split_paths):
                if os.path.exists(img_path):

                    original_name = Path(img_path).stem
                    ext = Path(img_path).suffix
                    new_name = f"{original_name}_{idx:03d}{ext}"

                    target_path = target_dir / new_name
                    shutil.copy2(img_path, target_path)
                    split_counts[split_name] += 1
                else:
                    print(f"file not exist: {img_path}")

    print(f"TomatoMAP-Cls done!")
    print(f"  path: {base_dir}")
    print(f"  train: {split_counts['train']} images")
    print(f"  val: {split_counts['val']} images")
    print(f"  test: {split_counts['test']} images")
    print(f"  total: {sum(split_counts.values())} images")

    return str(base_dir), len(unique_bbch_values)

if 'label_mapping' in locals() and 'unique_bbch_values' in locals():
    dataset_dir, num_classes = create_pytorch_dataset(label_mapping, unique_bbch_values)
    print(f"TomatoMAP-Cls rdy!")
    print(f"   path: {dataset_dir}")
    print(f"   classes: {num_classes}")

    globals()['dataset_dir'] = dataset_dir
    globals()['num_classes'] = num_classes
else:
    print("please run cells before first")
    dataset_dir = "TomatoMAP-Cls"
    num_classes = 50
    print(f"default value as test: dataset_dir={dataset_dir}, num_classes={num_classes}")

building TomatoMAP-Cls
  built 50 bbch dirs
data distrubution:
  BBCH 13: 5328 images
  BBCH 14: 2400 images
  BBCH 15: 1056 images
  BBCH 16:  432 images
  BBCH 17:  144 images
  BBCH 19:   48 images
  BBCH 20:  528 images
  BBCH 21:  144 images
  BBCH 22:  144 images
  BBCH 23:   48 images
  BBCH 27:   48 images
  BBCH 28:   48 images
  BBCH 29:  432 images
  BBCH 51:  576 images
  BBCH 52: 1296 images
  BBCH 53: 1536 images
  BBCH 54:  624 images
  BBCH 55:  432 images
  BBCH 56:   48 images
  BBCH 59:  144 images
  BBCH 60: 1584 images
  BBCH 61:  816 images
  BBCH 62:  480 images
  BBCH 63:  624 images
  BBCH 64:  816 images
  BBCH 65: 1248 images
  BBCH 66:  720 images
  BBCH 67: 1104 images
  BBCH 68: 1536 images
  BBCH 69: 1632 images
  BBCH 70: 3264 images
  BBCH 71: 2352 images
  BBCH 72: 2544 images
  BBCH 73: 2688 images
  BBCH 74: 2064 images
  BBCH 75: 2880 images
  BBCH 76: 2640 images
  BBCH 77: 2592 images
  BBCH 78: 3360 images
  BBCH 79: 4944 images
  BBCH 80: 1392 i

# TomatoMAP-Det Generation

In [5]:
import os
import shutil
from pathlib import Path
from tqdm import tqdm
import random
import glob

def scan_all_images(img_root="img"):
    print("scaning TomatoMAP...")

    img_root = Path(img_root)
    if not img_root.exists():
        print(f"path not exist: {img_root}")
        return []

    all_images = []

    for file_path in img_root.rglob("*"):
        if file_path.is_file() and file_path.suffix.lower() in ['.jpg', '.jpeg']:
            all_images.append(str(file_path))

    print(f"found {len(all_images)} images")

    unique_paths = set(all_images)
    if len(unique_paths) != len(all_images):
        duplicate_count = len(all_images) - len(unique_paths)
        print(f"found duplicated {duplicate_count} path，cleaning...")
        all_images = list(unique_paths)
        print(f"after cleaning: {len(all_images)} images left")

    return all_images

def create_detection_dataset(img_root="img", label_root="labels", output_dir="TomatoMAP-Det"):
    print("building TomatoMAP-Det subset")
    print("="*50)
    print(f"image data: {img_root}")
    print(f"label data: {label_root}")
    print(f"output path: {output_dir}")

    image_paths = scan_all_images(img_root)
    if not image_paths:
        print("didn't find any images")
        return None

    base_dir = Path(output_dir)
    if base_dir.exists():
        print(f"cleaning path: {base_dir}")
        shutil.rmtree(base_dir)

    splits = ['train', 'val', 'test']
    for split in splits:
        (base_dir / "images" / split).mkdir(parents=True, exist_ok=True)
        (base_dir / "labels" / split).mkdir(parents=True, exist_ok=True)

    print(f"build structure:")
    print(f"   {base_dir}/images/{{train,val,test}}")
    print(f"   {base_dir}/labels/{{train,val,test}}")

    random.seed(888)

    valid_paths = [path for path in image_paths if os.path.exists(path)]
    print(f"valid images: {len(valid_paths)}")

    if not valid_paths:
        print("no valid images")
        return None

    shuffled_paths = valid_paths.copy()
    random.shuffle(shuffled_paths)

    n_total = len(shuffled_paths)
    n_train = int(n_total * 0.7)
    n_val = int(n_total * 0.2)

    split_data = {
        'train': shuffled_paths[:n_train],
        'val': shuffled_paths[n_train:n_train + n_val],
        'test': shuffled_paths[n_train + n_val:]
    }

    print(f"spilting:")
    for split, paths in split_data.items():
        print(f"  {split}: {len(paths)} images")

    total_stats = {'images': 0, 'labels': 0, 'missing': 0}
    img_root_path = Path(img_root)
    label_root_path = Path(label_root)

    filename_counts = {}

    for split, paths in split_data.items():
        print(f"sovling {split} data...")

        split_stats = {'images': 0, 'labels': 0, 'missing': 0}

        for img_path in tqdm(paths, desc=f"backuping{split}data"):
            img_path = Path(img_path)

            original_filename = img_path.name

            if original_filename in filename_counts:
                filename_counts[original_filename] += 1

                name_stem = img_path.stem
                name_ext = img_path.suffix
                final_filename = f"{name_stem}_{filename_counts[original_filename]}{name_ext}"
                print(f"conflict data found, renaming: {original_filename} -> {final_filename}")
            else:
                filename_counts[original_filename] = 0
                final_filename = original_filename

            target_img_path = base_dir / "images" / split / final_filename
            shutil.copy2(img_path, target_img_path)
            split_stats['images'] += 1

            # try:
            #     relative_to_img = img_path.relative_to(img_root_path)

            #     label_relative = relative_to_img.with_suffix('.txt')
            #     label_path = label_root_path / label_relative

            try:    
                label_filename = img_path.stem + '.txt'
                label_path = label_root_path / label_filename
                
                if label_path.exists():
                    target_label_name = final_filename.replace(img_path.suffix, '.txt')
                    target_label_path = base_dir / "labels" / split / target_label_name
                    shutil.copy2(label_path, target_label_path)
                    split_stats['labels'] += 1
                else:
                    print(f"missing label for: {img_path}")
                    split_stats['missing'] += 1

            except ValueError as e:
                print(f"missing label for: {img_path} (ValueError)")
                split_stats['missing'] += 1
            except Exception as e:
                print(f"label process error {img_path}: {e}")
                split_stats['missing'] += 1

        print(f" {split}: {split_stats['images']} images, {split_stats['labels']} labels, {split_stats['missing']} misiing")
        for key in total_stats:
            total_stats[key] += split_stats[key]

    conflicts = {name: count for name, count in filename_counts.items() if count > 0}
    if conflicts:
        print(f"found {len(conflicts)} conflicted files，processed")
        if len(conflicts) <= 10:
            print(f"conflict file: {list(conflicts.keys())}")
        else:
            print(f"conflict file: {list(conflicts.keys())[:10]}... (还有{len(conflicts)-10}个)")
    else:
        print(f"no conflict file!")

    print(f"TomatoMAP-Det finished!")
    print(f"output path: {base_dir.absolute()}")
    print(f"statics:")
    print(f"  images: {total_stats['images']}")
    print(f"  labels: {total_stats['labels']}")
    print(f"  missing labels: {total_stats['missing']}")
    if total_stats['images'] > 0:
        print(f" label matched: {total_stats['labels']/total_stats['images']*100:.1f}%")

    return str(base_dir.absolute()), total_stats

def main():
    print("TomatoMAP-Det builder")
    print("="*60)

    CONFIG = {
        'img_root': 'img',
        'label_root': 'labels',
        'output_dir': 'TomatoMAP-Det'
    }

    print(f"configuration:")
    for key, value in CONFIG.items():
        print(f"  {key}: {value}")

    if not os.path.exists(CONFIG['img_root']):
        print(f"image folder not exist: {CONFIG['img_root']}")
        return False

    if not os.path.exists(CONFIG['label_root']):
        print(f"label folder not exist: {CONFIG['label_root']}")
        return False

    result = create_detection_dataset(
        img_root=CONFIG['img_root'],
        label_root=CONFIG['label_root'],
        output_dir=CONFIG['output_dir']
    )

    if result:
        dataset_dir, stats = result

        globals()['detection_dataset_dir'] = dataset_dir
        globals()['detection_stats'] = stats

        print(f"  detection_dataset_dir = '{dataset_dir}'")
        print(f"  detection_stats = {stats}")

        return True
    else:
        print("TomatoMAP-Det dataset creation failed")
        return False

if __name__ == "__main__":
    success = main()

    if success:
        print(f"TomatoMAP-Det built successfully!")
    else:
        print(f"TomatoMAP-Det built failed! check readme for more details")

TomatoMAP-Det builder
configuration:
  img_root: img
  label_root: labels
  output_dir: TomatoMAP-Det
building TomatoMAP-Det subset
image data: img
label data: labels
output path: TomatoMAP-Det
scaning TomatoMAP...
found 64464 images
cleaning path: TomatoMAP-Det
build structure:
   TomatoMAP-Det/images/{train,val,test}
   TomatoMAP-Det/labels/{train,val,test}
valid images: 64464
spilting:
  train: 45124 images
  val: 12892 images
  test: 6448 images
sovling train data...


backupingtraindata:  51%|██████████████████▋                  | 22833/45124 [00:11<00:11, 2007.68it/s]

missing label for: img/4/Group3/plant54/20231004/pi4_01855_54_7_20231004091848.jpg


backupingtraindata: 100%|█████████████████████████████████████| 45124/45124 [00:22<00:00, 1996.49it/s]


 train: 45124 images, 45123 labels, 1 misiing
sovling val data...


backupingvaldata: 100%|███████████████████████████████████████| 12892/12892 [00:06<00:00, 1870.64it/s]


 val: 12892 images, 12892 labels, 0 misiing
sovling test data...


backupingtestdata: 100%|████████████████████████████████████████| 6448/6448 [00:03<00:00, 1914.54it/s]

 test: 6448 images, 6448 labels, 0 misiing
no conflict file!
TomatoMAP-Det finished!
output path: /home/ubuntu/project/bbch/TomatoMAP-Det
statics:
  images: 64464
  labels: 64463
  missing labels: 1
 label matched: 100.0%
  detection_dataset_dir = '/home/ubuntu/project/bbch/TomatoMAP-Det'
  detection_stats = {'images': 64464, 'labels': 64463, 'missing': 1}
TomatoMAP-Det built successfully!





# Congratulations! TomatoMAP-Cls, TomatomaMAP-Det is built successfully.