### Directory Setup and Clearing

In [None]:
import os

# ======================================
# STEP 1: ONLY DEFINE PATHS (NO OPERATIONS)
# ======================================

# Root of the generated dataset; every working directory lives beneath it.
base_dir = "/home/jovyan/__ANIMALS/datasets/ds8_auto_label_yolo_architectures"

# Logical name -> absolute path for each working directory.
working_dirs = {
    key: os.path.join(base_dir, subpath)
    for key, subpath in (
        ('train_images', "train/images"),
        ('train_labels', "train/labels"),
        ('val_images', "val/images"),
        ('val_labels', "val/labels"),
        ('augmented', "augmented"),
    )
}

# Source images; this cell only checks that the directory exists.
original_images_dir = "/home/jovyan/__ANIMALS/images"

# Existence checks only - nothing is created or removed in this cell.
print("Path verification:")
print(
    f"Original images: {original_images_dir} "
    f"(exists: {os.path.exists(original_images_dir)})"
)
print(
    f"Base directory: {base_dir} "
    f"(exists: {os.path.exists(base_dir)})"
)

# List where the working directories will live.
print("\nPlanned working directories:")
for name, path in working_dirs.items():
    print(f"{name}: {path}")

In [None]:
import shutil

# ======================================
# SAFE DIRECTORY PREPARATION
# ======================================

def clean_and_create_dirs(dirs_to_clean):
    """Reset the requested working directories to an empty state.

    Every name in ``dirs_to_clean`` is resolved through the module-level
    ``working_dirs`` mapping. The directory is deleted if it exists and is
    then recreated empty.

    Args:
        dirs_to_clean: iterable of logical directory names.

    Raises:
        ValueError: if any requested name is outside the fixed whitelist.
    """
    print("\nDirectory preparation:")

    # Whitelist: no other logical name may ever be wiped by this function.
    allowed_to_clean = {
        'train_images', 'train_labels',
        'val_images', 'val_labels',
        'augmented'
    }

    # Refuse the whole request if it contains anything unexpected.
    invalid = set(dirs_to_clean).difference(allowed_to_clean)
    if invalid:
        raise ValueError(f"Cannot clean these directories: {invalid}")

    for target in (working_dirs[key] for key in dirs_to_clean):
        # Remove any previous content first...
        if os.path.exists(target):
            shutil.rmtree(target)
            print(f"✓ Cleared: {target}")

        # ...then make sure an empty directory is in place.
        os.makedirs(target, exist_ok=True)
        print(f"✓ Created: {target}")

    print("\nOriginal images directory remains untouched:")
    still_there = os.path.exists(original_images_dir)
    print(f"{original_images_dir} (exists: {still_there})")

# Logical names of the directories to wipe and recreate.
# 'augmented' is intentionally left out - it is created later if needed.
directories_to_reset = [
    f"{split}_{content}"
    for split in ('train', 'val')
    for content in ('images', 'labels')
]

# Run the reset; report a failure instead of aborting the notebook cell.
try:
    clean_and_create_dirs(directories_to_reset)
    print("\nSUCCESS: Working directories ready")
except Exception as e:
    print(f"\nERROR: {e}")
    print("Please verify the directory list and try again")

### Data Validation and Cleaning

In [None]:
import json

# Location of the COCO export, relative to the notebook's working directory
json_file = "result.json"

# Read the whole export into memory
with open(json_file, "r") as f:
    coco_data = json.load(f)

# Lookup tables: image id -> bare file name, category id -> category name
images = {
    img["id"]: img["file_name"].rsplit("/", 1)[-1]
    for img in coco_data["images"]
}
annotations = coco_data["annotations"]
categories = {cat["id"]: cat["name"] for cat in coco_data["categories"]}

# Keep only the images referenced by at least one annotation
annotated_images = {ann["image_id"] for ann in annotations}
filtered_images = {
    img_id: file_name
    for img_id, file_name in images.items()
    if img_id in annotated_images
}

# For each category name, collect the distinct ids of images that contain it
category_to_image_ids = {name: set() for name in categories.values()}
for annotation in annotations:
    category_name = categories[annotation["category_id"]]
    category_to_image_ids[category_name].add(annotation["image_id"])

# Turn the sets into lists so they can be split later
category_to_image_ids = {
    name: list(image_ids) for name, image_ids in category_to_image_ids.items()
}

In [None]:
import json
from tqdm import tqdm
from PIL import Image
from collections import defaultdict

# ======================================
# STEP 3: SAFE DATA VALIDATION
# ======================================

def validate_dataset(annotation_path):
    """Validate the images referenced by a COCO annotation file.

    Each image entry is resolved against the module-level
    ``original_images_dir`` (using the part of ``file_name`` after the last
    ``'__'``). An image is accepted when the file exists, passes PIL's
    integrity check, and its size matches the ``width``/``height`` stored in
    the annotation file. Source images are only read, never modified.

    Args:
        annotation_path: path to the COCO JSON file.

    Returns:
        Tuple ``(coco_data, valid_images, valid_annotations)``:
        the parsed JSON, the image entries that passed validation, and the
        annotations whose ``image_id`` belongs to one of those images.

    Raises:
        ValueError: if the annotation file cannot be read or parsed, or has
            no usable ``images`` entry.
    """
    print("\nStarting validation...")

    # 1. Load annotations
    try:
        with open(annotation_path) as f:
            coco_data = json.load(f)
        print(f"Loaded annotations with {len(coco_data['images'])} images")
    except (OSError, ValueError, KeyError, TypeError) as e:
        # ValueError also covers json.JSONDecodeError and UnicodeDecodeError.
        raise ValueError(f"Error loading {annotation_path}: {str(e)}") from e

    # 2. Validate each image
    valid_images = []
    invalid_images = []
    corrupt_files = []

    for img in tqdm(coco_data['images'], desc="Validating"):
        # Bound before the try so the except handler can always report it.
        filename = None
        try:
            # Get actual filename (handles cases with prefixes)
            filename = img['file_name'].split('__')[-1]
            img_path = os.path.join(original_images_dir, filename)

            # Check existence
            if not os.path.exists(img_path):
                invalid_images.append(img['id'])
                continue

            # Verify image integrity. The size is read first because an
            # image object should not be relied on after verify().
            with Image.open(img_path) as im:
                actual_size = im.size
                im.verify()  # Verify without loading pixels

            # Check dimensions match metadata
            if actual_size != (img['width'], img['height']):
                print(f"Size mismatch: {filename} (metadata {img['width']}x{img['height']} vs actual {actual_size})")
                invalid_images.append(img['id'])
                continue

            valid_images.append(img)

        except Exception:
            # Deliberately broad: image decoders raise many exception types
            # for damaged files, and one bad image must not stop the scan.
            corrupt_files.append(filename if filename is not None else repr(img))
            invalid_images.append(img['id'])

    # 3. Filter annotations: keep only those attached to a validated image.
    # Set lookups avoid rescanning a list for every annotation, and
    # annotations pointing at unknown image ids are dropped as well.
    usable_ids = {img['id'] for img in valid_images} - set(invalid_images)
    valid_annotations = [
        ann for ann in coco_data['annotations']
        if ann['image_id'] in usable_ids
    ]

    # 4. Print report
    print("\nValidation Report:")
    print(f"Valid images: {len(valid_images)}")
    print(f"Invalid/corrupt: {len(invalid_images)}")
    if corrupt_files:
        print(f"Corrupt files (sample): {corrupt_files[:5]}")

    return coco_data, valid_images, valid_annotations

# Kick off validation; on failure print the likely causes instead of a traceback.
try:
    coco_data, valid_images, valid_annotations = validate_dataset("result.json")
    print("\nSUCCESS: Validation complete")
    print(f"Ready to process {len(valid_images)} valid images")
except Exception as e:
    print(f"\nERROR: {e}")
    print("\n".join((
        "Please check:",
        "1. result.json exists in current directory",
        "2. Original images are accessible",
    )))

### Create Balanced Dataset Split

In [None]:
import os
import shutil
from sklearn.model_selection import train_test_split

def create_balanced_split(coco_data, valid_images, valid_annotations, output_dir,
                          images_dir="/home/jovyan/__ANIMALS/images"):
    """Create a per-category 80/20 train/val split and copy the images.

    Images are grouped by the categories they contain. A category with more
    than four distinct images is split 80/20 with a fixed random seed; a
    rarer category is placed entirely in the training set. The result is
    then made disjoint: images of rare categories stay in train, and any
    other image selected for both splits is kept in val only. Source images
    are copied, never moved.

    Args:
        coco_data: parsed COCO data (unused here; kept for call compatibility).
        valid_images: COCO image entries that passed validation.
        valid_annotations: annotations belonging to ``valid_images``.
        output_dir: dataset root; ``train/`` and ``val/`` are created inside.
        images_dir: directory holding the source images.

    Returns:
        Tuple ``(train_ids, val_ids)`` of disjoint sets of image ids.
    """
    # 1. Create output directories
    for split_name in ("train", "val"):
        for content in ("images", "labels"):
            os.makedirs(os.path.join(output_dir, split_name, content), exist_ok=True)

    # One dictionary lookup per image instead of a scan of valid_images.
    images_by_id = {img['id']: img for img in valid_images}

    # Group *distinct* image ids by category. An image with several boxes
    # of one class must count once, otherwise its duplicates can be dealt
    # to both train and val.
    category_to_images = defaultdict(set)
    for ann in valid_annotations:
        if ann['image_id'] in images_by_id:
            category_to_images[ann['category_id']].add(ann['image_id'])

    # Create balanced split (80% train, 20% val)
    train_ids, val_ids, pinned_train = set(), set(), set()

    for cat_id, id_set in category_to_images.items():
        img_ids = sorted(id_set)  # stable order -> reproducible split
        if len(img_ids) > 4:  # Only split if we have enough samples
            train, val = train_test_split(img_ids, test_size=0.2, random_state=42)
            train_ids.update(train)
            val_ids.update(val)
        else:
            # For rare classes, put all in training and we'll augment later
            pinned_train.update(img_ids)

    # An image containing several categories may have been selected for
    # both splits. Resolve the overlap so no image leaks across splits.
    val_ids -= pinned_train
    train_ids |= pinned_train
    train_ids -= val_ids

    # Copy files to new directories (not moving - preserving originals)
    def copy_files(image_ids, split_name):
        copied = 0
        for img_id in tqdm(sorted(image_ids), desc=f"Copying {split_name} images"):
            img_info = images_by_id[img_id]
            src_path = os.path.join(images_dir,
                                    img_info['file_name'].split('__')[-1])
            dest_path = os.path.join(output_dir, split_name, "images",
                                     os.path.basename(src_path))

            # Copy image
            shutil.copyfile(src_path, dest_path)
            copied += 1
        return copied

    train_count = copy_files(train_ids, "train")
    val_count = copy_files(val_ids, "val")

    print(f"\nDataset split created at {output_dir}")
    print(f"Training images: {train_count}")
    print(f"Validation images: {val_count}")
    total = train_count + val_count
    if total:
        print(f"Split ratio: {train_count/total:.1%} train, {val_count/total:.1%} val")
    else:
        # Avoid a ZeroDivisionError when there was nothing to copy.
        print("Split ratio: n/a (no images were copied)")

    return train_ids, val_ids

# Usage:
output_dir = "/home/jovyan/__ANIMALS/datasets/ds8_auto_label_yolo_architectures/"
train_ids, val_ids = create_balanced_split(coco_data, valid_images, valid_annotations, output_dir)

### Create YOLO Format Labels and Verify Class Balance

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def create_yolo_labels(coco_data, valid_annotations, image_ids, output_dir, split_name):
    """Write YOLO label files for one split and plot its class distribution.

    For every image id a ``<stem>.txt`` file is written to
    ``<output_dir>/<split_name>/labels``. Each line is
    ``class x_center y_center width height``, normalised by the image size
    stored in the COCO data. Class indices are the positions of the
    categories when sorted by COCO id. Boxes are clipped to the image area
    first, and boxes left with no area are skipped.

    Args:
        coco_data: parsed COCO data (``images`` and ``categories`` are used).
        valid_annotations: annotations to convert.
        image_ids: ids of the images that belong to this split.
        output_dir: dataset root directory.
        split_name: split sub-directory name, e.g. ``"train"`` or ``"val"``.

    Returns:
        DataFrame with columns ``Class``, ``Count`` and ``Split``, sorted by
        descending count. A bar chart of the same data is shown.
    """
    # Prepare paths
    label_dir = os.path.join(output_dir, split_name, "labels")
    os.makedirs(label_dir, exist_ok=True)

    # Create category mapping
    cat_id_to_yolo = {cat['id']: idx for idx, cat in enumerate(sorted(coco_data['categories'], key=lambda x: x['id']))}

    # Index images and annotations once instead of scanning both lists for
    # every image. Using coco_data['images'] removes the hidden dependency
    # on a notebook-level variable.
    images_by_id = {img['id']: img for img in coco_data['images']}
    anns_by_image = defaultdict(list)
    for ann in valid_annotations:
        anns_by_image[ann['image_id']].append(ann)

    # Convert annotations and track class counts
    class_counts = defaultdict(int)

    for img_id in tqdm(image_ids, desc=f"Creating {split_name} labels"):
        img_info = images_by_id[img_id]
        img_w, img_h = img_info['width'], img_info['height']

        # Create label file
        stem = os.path.splitext(img_info['file_name'].split('__')[-1])[0]
        label_path = os.path.join(label_dir, stem + '.txt')

        with open(label_path, 'w') as f:
            for ann in anns_by_image.get(img_id, ()):
                # COCO bbox is [x_min, y_min, width, height] in pixels.
                x, y, w, h = ann['bbox']

                # Clip to the image so normalised values stay inside [0, 1].
                x_min, y_min = max(0.0, x), max(0.0, y)
                x_max, y_max = min(float(img_w), x + w), min(float(img_h), y + h)
                if x_max <= x_min or y_max <= y_min:
                    continue  # box lies outside the image or has no area

                x_center = (x_min + x_max) / 2 / img_w
                y_center = (y_min + y_max) / 2 / img_h
                width = (x_max - x_min) / img_w
                height = (y_max - y_min) / img_h

                # Write to file
                f.write(f"{cat_id_to_yolo[ann['category_id']]} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}\n")

                # Count classes
                class_counts[ann['category_id']] += 1

    # Analyze and visualize class distribution
    class_names = {cat['id']: cat['name'] for cat in coco_data['categories']}
    df = pd.DataFrame({
        'Class': [class_names[cat_id] for cat_id in class_counts],
        'Count': list(class_counts.values()),
        'Split': split_name
    }).sort_values('Count', ascending=False)

    plt.figure(figsize=(12, 6))
    plt.bar(df['Class'], df['Count'])
    plt.title(f'Class Distribution - {split_name} Set')
    plt.xticks(rotation=90)
    plt.ylabel('Number of Instances')
    plt.show()

    return df

# Write the YOLO label files for each split and keep the per-class tables
train_df = create_yolo_labels(coco_data, valid_annotations, train_ids, output_dir, "train")
val_df = create_yolo_labels(coco_data, valid_annotations, val_ids, output_dir, "val")

# Stack both tables, then total the instances per class (largest first)
combined_df = pd.concat([train_df, val_df])
per_class_totals = combined_df.groupby('Class')['Count'].sum()
class_balance = per_class_totals.sort_values(ascending=False)

# Extremes of the distribution
most_instances = class_balance.max()
fewest_instances = class_balance.min()

print("\nClass Balance Report:")
print(class_balance)
print(f"\nMost common class: {class_balance.idxmax()} ({most_instances} instances)")
print(f"Least common class: {class_balance.idxmin()} ({fewest_instances} instances)")
print(f"Imbalance ratio: {most_instances/fewest_instances:.1f}x")

### Targeted Augmentation for Rare Classes

In [None]:
!pip install albumentations

In [None]:
import albumentations as A
import numpy as np
import cv2

def augment_rare_classes(output_dir, class_counts, min_samples=50, categories=None):
    """Create augmented training samples for under-represented classes.

    A class is rare when its entry in ``class_counts`` is below
    ``min_samples``. For each rare class, the original training images that
    contain it are augmented in rotation until ``min_samples`` is reached or
    no further sample can be produced. Files are written to
    ``<output_dir>/train/images`` and ``<output_dir>/train/labels`` as
    ``aug_<class>_<n>.JPG`` / ``.txt``. Earlier ``aug_*`` files are never
    used as sources.

    Args:
        output_dir: dataset root containing ``train/images`` and ``train/labels``.
        class_counts: dict of COCO category id -> current count. Updated in
            place: one is added per augmented image written.
        min_samples: target count per class.
        categories: COCO category dicts (``id``, ``name``). Defaults to the
            notebook-level ``coco_data['categories']``.
    """
    if categories is None:
        # Same source the label-creation step uses for its class mapping.
        categories = coco_data['categories']

    # Category ids are not list positions: look names up by id, and map ids
    # to the class index written in the label files (categories sorted by
    # id, then enumerated).
    names_by_id = {cat['id']: cat['name'] for cat in categories}
    yolo_index_by_id = {
        cat['id']: idx
        for idx, cat in enumerate(sorted(categories, key=lambda c: c['id']))
    }

    # 1. Identify rare classes
    rare_classes = [cat_id for cat_id, count in class_counts.items() if count < min_samples]
    print(f"Rare classes to augment: {[names_by_id[cat_id] for cat_id in rare_classes]}")

    # Define augmentations
    transform = A.Compose([
        A.HorizontalFlip(p=0.5),
        A.RandomBrightnessContrast(p=0.5),
        A.Rotate(limit=15, p=0.5),
        A.HueSaturationValue(p=0.5),
    ], bbox_params=A.BboxParams(format='yolo'))

    train_label_dir = os.path.join(output_dir, "train", "labels")
    train_image_dir = os.path.join(output_dir, "train", "images")

    # Read the original samples once; reused for every rare class.
    sources = _load_augmentation_sources(train_label_dir, train_image_dir)

    # A transform can push the rare object out of frame, so allow a few
    # tries per sample before giving up on the class.
    max_tries_per_sample = 5

    # Process each rare class
    for cat_id in rare_classes:
        class_name = names_by_id[cat_id]
        yolo_id = yolo_index_by_id[cat_id]
        print(f"\nAugmenting {class_name} (current: {class_counts[cat_id]} samples)")

        # Original training samples that contain this class
        candidates = [
            stem for stem, (_, boxes) in sources.items()
            if any(box[4] == yolo_id for box in boxes)
        ]
        if not candidates:
            print(f"No training images contain {class_name}; nothing to augment")
            continue

        needed = min_samples - class_counts[cat_id]
        samples_created = 0
        cursor = 0  # rotates through the candidates across samples

        for _ in tqdm(range(needed), desc=f"Processing {class_name}"):
            saved = False
            for _attempt in range(max_tries_per_sample):
                if not candidates:
                    break
                stem = candidates[cursor % len(candidates)]
                cursor += 1
                img_path, bboxes = sources[stem]

                # cv2.imread returns None instead of raising on failure.
                image = cv2.imread(img_path)
                if image is None:
                    print(f"Skipping unreadable image: {img_path}")
                    candidates.remove(stem)
                    continue
                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

                # Apply augmentations
                augmented = transform(image=image, bboxes=bboxes)
                kept_boxes = augmented['bboxes']
                if not any(int(box[4]) == yolo_id for box in kept_boxes):
                    continue  # the rare object did not survive the transform

                # Save new sample
                base_name = f"aug_{class_name}_{samples_created}"
                cv2.imwrite(os.path.join(train_image_dir, base_name + ".JPG"),
                            cv2.cvtColor(augmented['image'], cv2.COLOR_RGB2BGR))

                # Save new labels
                with open(os.path.join(train_label_dir, base_name + ".txt"), 'w') as f:
                    for box in kept_boxes:
                        f.write(f"{int(box[4])} {box[0]:.6f} {box[1]:.6f} {box[2]:.6f} {box[3]:.6f}\n")

                saved = True
                break

            if not saved:
                break  # no usable source left for this class
            samples_created += 1

        print(f"Created {samples_created} new samples for {class_name}")
        class_counts[cat_id] += samples_created


def _load_augmentation_sources(label_dir, image_dir):
    """Return {stem: (image_path, boxes)} for each original training sample.

    ``boxes`` is a list of ``[x_center, y_center, width, height, class_index]``.
    Files whose name starts with ``aug_`` and labels without a matching image
    are skipped. The image is matched by file stem, whatever its extension.
    """
    image_by_stem = {}
    for file_name in sorted(os.listdir(image_dir)):
        if file_name.startswith("aug_"):
            continue
        stem = os.path.splitext(file_name)[0]
        image_by_stem.setdefault(stem, os.path.join(image_dir, file_name))

    sources = {}
    for label_file in sorted(os.listdir(label_dir)):
        stem, ext = os.path.splitext(label_file)
        if ext != '.txt' or stem.startswith("aug_") or stem not in image_by_stem:
            continue
        boxes = []
        with open(os.path.join(label_dir, label_file)) as f:
            for line in f:
                fields = line.split()
                if len(fields) != 5:
                    continue  # blank or malformed line
                class_id, xc, yc, w, h = map(float, fields)
                boxes.append([xc, yc, w, h, int(class_id)])
        sources[stem] = (image_by_stem[stem], boxes)
    return sources

# Count annotated instances per COCO category id.
# NOTE(review): this counts every valid annotation, including those of
# validation images, while augmentation only adds training samples -
# confirm that this is the intended baseline.
class_counts = {cat['id']: 0 for cat in coco_data['categories']}
for ann in valid_annotations:
    class_counts[ann['category_id']] += 1

# Run augmentation (target minimum 50 samples per class)
augment_rare_classes(output_dir, class_counts, min_samples=50)

# Verify new counts
print("\nUpdated class counts:")
# Category ids are not list positions, so names are resolved through an id
# lookup rather than by indexing the categories list.
category_names = {cat['id']: cat['name'] for cat in coco_data['categories']}
for cat_id, count in sorted(class_counts.items(), key=lambda x: x[1]):
    print(f"{category_names[cat_id]}: {count}")

### Optimized Training Configuration with Class Weighting

In [None]:
!pip install ultralytics

In [None]:
!pip install torch torchvision torchaudio

In [None]:
import os
import torch
import numpy as np
from ultralytics import YOLO
from datetime import datetime
from collections import defaultdict

In [None]:
# ======================
# 1. CPU Configuration
# ======================
# Thread-related environment variables, set together.
# NOTE(review): torch is imported in an earlier cell; confirm that setting
# these variables at this point still has an effect.
os.environ.update({
    "OMP_NUM_THREADS": "6",  # Use 6 cores
    "KMP_AFFINITY": "granularity=fine,compact,1,0",
})

# Cap PyTorch at 6 threads to prevent overallocation
torch.set_num_threads(6)

In [None]:
# ======================
# 2. Class Weight Calculation
# ======================
# Hard-coded per-class counts, keyed by class name.
# NOTE: this rebinds `class_counts`, which the augmentation step earlier in
# the notebook keyed by COCO category id.
class_counts = {
    'badger': 173,
    'boar': 172,
    'brown_bear': 6,
    'hare': 14,
    'lynx': 93,
    'musk_deer': 4,
    'otter': 4,
    'raccoon': 50,
    'red_fox': 50,
    'roe_deer_female': 314,
    'roe_deer_male': 80,
    'sable': 50,
    'sika_deer_female': 1002,
    'sika_deer_male': 214,
    'tiger': 499,
    'ussuri_bear': 8,
    'wild_cat': 27,
    'yellow_marten': 21,
}

def calculate_cpu_weights(counts, power=0.75, max_weight=5.0):
    """Compute normalised inverse-frequency class weights.

    The raw weight of a class is ``(median / (count + 1)) ** power``, capped
    at ``max_weight``. A class with a count of zero gets ``max_weight``
    directly. All weights are then divided by the largest one, so the
    rarest class ends up with weight 1.0.

    Args:
        counts: dict of class -> sample count.
        power: exponent applied to the frequency ratio.
        max_weight: upper bound for a raw weight before normalisation.

    Returns:
        Dict of class -> weight as plain floats; empty if ``counts`` is empty.
    """
    if not counts:
        # np.median([]) is NaN and max() of nothing raises; nothing to weigh.
        return {}

    # Plain float so the result does not carry numpy scalar types.
    median = float(np.median(list(counts.values())))

    weights = {}
    for cls, count in counts.items():
        if count == 0:
            weights[cls] = max_weight
        else:
            weights[cls] = min((median / (count + 1)) ** power, max_weight)

    # Normalize weights
    max_w = max(weights.values())
    if max_w == 0:
        # Only reachable with max_weight == 0; avoid dividing by zero.
        return {cls: 0.0 for cls in weights}
    return {cls: w / max_w for cls, w in weights.items()}

# Derive the weights and list them from the heaviest class to the lightest
class_weights = calculate_cpu_weights(class_counts)
print("CPU-Optimized Class Weights:")
for cls in sorted(class_weights, key=class_weights.get, reverse=True):
    print(f"{cls}: {class_weights[cls]:.2f}")

In [None]:


# ======================
# 3. Training Configuration (VALID PARAMS ONLY)
# ======================
output_dir = "/home/jovyan/__ANIMALS/datasets/ds8_auto_label_yolo_architectures/"

# Keyword arguments passed to `model.train(...)`.
# NOTE(review): no visible cell writes `dataset.yaml` - confirm it exists.
training_config = {
    "data": os.path.join(output_dir, "dataset.yaml"),
    "epochs": 250,
    "batch": 4,          # Optimized for CPU memory
    "imgsz": 512,        # Reduced from 640
    "device": "cpu",
    "workers": 6,        # 6 cores dedicated
    "optimizer": "Adam", # Better for CPU than AdamW
    "lr0": 0.001,
    "lrf": 0.01,
    "cls": 3.0,          # Class loss weight
    "box": 7.5,          # Box loss weight
    "augment": True,     # NOTE(review): may be a prediction-time option - confirm
    "fliplr": 0.5,       # Horizontal flip
    "mosaic": 0.3,       # Reduced for CPU
    "mixup": 0.1,        # Reduced for CPU
    "close_mosaic": 10,
    # A first run has nothing to resume from. Resuming is decided at
    # training time, once an existing checkpoint has been found.
    "resume": False,
    "save_period": 10,   # Save every 10 epochs
    "patience": 75,      # Longer patience for CPU
    "name": "wildlife_cpu",
    "overlap_mask": True,  # NOTE(review): looks segmentation-specific - confirm
    "iou": 0.6           # Slightly lower threshold
}

# The former pop('fl_gamma') / pop('copy_paste') calls were no-ops (neither
# key is ever set above) and have been dropped.


In [None]:
# ======================
# 4. Training Phases Manager
# ======================
class PhaseManager:
    """Epoch-based schedule for mosaic, mixup and learning rate.

    ``phases`` maps a starting epoch to the settings that apply from that
    epoch on. The callback writes the settings of the latest phase already
    reached onto the trainer.

    NOTE(review): this only assigns ``trainer.args.mosaic``,
    ``trainer.args.mixup`` and ``trainer.lr``. Whether the training loop
    reads those attributes after the callback runs is not visible here -
    confirm the schedule actually takes effect.
    """

    def __init__(self):
        # start epoch (0-based, as compared with trainer.epoch) -> settings
        self.phases = {
            0: {"mosaic": 0.0, "mixup": 0.0, "lr": 0.001},      # Epochs 1-75
            75: {"mosaic": 0.3, "mixup": 0.1, "lr": 0.0002},    # Epochs 76-150
            150: {"mosaic": 0.1, "mixup": 0.05, "lr": 0.00005}, # Epochs 151-250
        }

    def on_train_epoch_start(self, trainer):
        """Apply the settings of the phase that covers ``trainer.epoch``."""
        current_epoch = trainer.epoch

        # Phases whose starting epoch has been reached; the latest one wins.
        reached = [start for start in self.phases if current_epoch >= start]
        if not reached:
            return

        settings = self.phases[max(reached)]
        trainer.args.mosaic = settings["mosaic"]
        trainer.args.mixup = settings["mixup"]
        trainer.lr = settings["lr"]


In [None]:
# ======================
# 5. CPU Monitoring
# ======================
class CPUMonitor:
    """Track the best AP50 and per-class AP history across validations.

    NOTE(review): no visible code registers this class as a callback. The
    attributes read below (``metrics.ap50``, ``metrics.ap_class``,
    ``data['names']``, ``epoch``) are assumed to exist on the object passed
    to ``on_val_end`` - confirm against the Ultralytics callback API.
    """

    def __init__(self):
        self.best_ap = 0                        # highest ap50 seen so far
        self.class_history = defaultdict(list)  # class name -> AP per validation

    def on_val_end(self, trainer):
        """Record the latest metrics and print a short progress report."""
        # Track best AP
        current_ap = trainer.metrics.ap50
        if current_ap > self.best_ap:
            self.best_ap = current_ap

        # Track class-wise performance
        for i, ap in enumerate(trainer.metrics.ap_class):
            cls_name = trainer.data['names'][i]
            self.class_history[cls_name].append(ap)

        # Print diagnostics
        print(f"\n[Epoch {trainer.epoch}] Best mAP50: {self.best_ap:.3f}")
        print("Top 5 improving classes:")
        # Change since the previous validation, for classes seen at least twice
        improving = sorted([(k, v[-1]-v[-2]) for k,v in self.class_history.items()
                          if len(v) > 1], key=lambda x: x[1], reverse=True)[:5]
        for cls, gain in improving:
            # Signed format: a hard-coded "+" printed drops as "+-0.010".
            print(f"{cls}: {gain:+.3f}")

In [None]:
# ======================
# 6. Training Execution
# ======================
def train_model():
    """Train the detector, resuming an interrupted run when one exists.

    If ``runs/detect/<name>/weights/last.pt`` exists, that checkpoint is
    loaded and training resumes from it. Otherwise training starts from the
    pretrained ``yolov8s.pt`` weights. If training raises, a second attempt
    is made with a reduced configuration.

    NOTE(review): assumes Ultralytics' default run layout
    (``runs/detect/<name>/weights/last.pt``) - confirm for the installed
    version and for any custom ``project`` setting.

    Returns:
        The value returned by ``model.train``.
    """
    # Work on a copy so the shared training_config is not mutated.
    config = dict(training_config)

    # Check for an interrupted run. last.pt is the resume checkpoint; the
    # newest *.pt by modification time could just as well be best.pt.
    run_dir = os.path.join("runs", "detect", config["name"])
    last_checkpoint = os.path.join(run_dir, "weights", "last.pt")

    if os.path.isfile(last_checkpoint):
        # Resuming means loading the checkpoint itself as the model and
        # asking train() to resume it.
        print(f"Resuming from: {last_checkpoint}")
        model = YOLO(last_checkpoint)
        config["resume"] = True
    else:
        # Fresh run: pretrained weights hold no training state to resume.
        model = YOLO("yolov8s.pt")
        config["resume"] = False

    # Add callbacks
    phase_manager = PhaseManager()
    model.add_callback("on_train_epoch_start", phase_manager.on_train_epoch_start)

    # Start training with error handling
    try:
        results = model.train(**config)
    except Exception as e:
        # Deliberate best effort: report the failure and retry with a
        # smaller configuration rather than giving up.
        print(f"Training failed: {e}")
        print("Attempting minimal configuration...")
        minimal_config = {
            "data": config["data"],
            "epochs": config["epochs"],
            "batch": 2,
            "imgsz": 512,
            "device": "cpu",
            "workers": 4,
            "optimizer": "Adam",
            "lr0": 0.001,
            "name": config["name"] + "_minimal"
        }
        results = model.train(**minimal_config)

    return results

# Run training
if __name__ == "__main__":
    train_model()

In [None]:
def enable_late_tta(trainer):
    """Switch on validation-time augmentation late in a strong run.

    The original cell used ``epoch`` and ``trainer`` as bare top-level
    names, which are not defined anywhere, so running it raised NameError.
    The same condition is kept here as a function of the trainer.

    NOTE(review): ``trainer.metrics.ap50``, ``trainer.args.val_augment`` and
    ``trainer.args.conf`` are taken over from the original snippet. Confirm
    they exist in the installed Ultralytics version before registering this
    as a callback.

    Args:
        trainer: object exposing ``epoch``, ``metrics.ap50`` and ``args``.

    Returns:
        True if the settings were changed, False otherwise.
    """
    if trainer.epoch > 200 and trainer.metrics.ap50 > 0.8:
        trainer.args.val_augment = True  # TTA
        trainer.args.conf = 0.001        # Lower confidence threshold
        return True
    return False