# Phase 1-1: YOLOv8 Finetuning

Fine-tunes the last 16 layers of a YOLOv8 model.

In [None]:
# Library imports
import os
import yaml
from pathlib import Path
from datetime import datetime
from collections import defaultdict
import numpy as np
import pandas as pd
from scipy import stats
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
!pip install ultralytics -q
from ultralytics import YOLO
import torch

# Google Drive mounting (only possible when the google.colab package exists)
try:
    from google.colab import drive  # type: ignore
    drive.mount('/content/drive')
except ImportError:
    # google.colab cannot be imported here, so there is no Drive to mount.
    pass
except Exception as exc:
    # Mounting stays best-effort, but the failure is reported: later cells
    # read from and write to /content/drive paths and would otherwise fail
    # with a much less obvious error.
    print(f"Warning: could not mount Google Drive: {exc}")

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.dpi'] = 100

# Quick sanity report of the runtime the notebook is executing in
print("All packages imported successfully!")
print(f"Current directory: {os.getcwd()}")
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU device: {torch.cuda.get_device_name(0)}")

In [None]:
# =============================================================================
# Environment Detection
# =============================================================================

import sys

# Colab is recognised by the google.colab module already being loaded;
# every later cell branches on this flag.
IN_COLAB = 'google.colab' in sys.modules

print("Environment Detection:")
print("=" * 60)
print(" Google Colab environment detected" if IN_COLAB
      else " Local environment detected")

In [None]:
# =============================================================================
# Path Configuration - Update these for your computer
# =============================================================================

# Google Drive paths (for Colab)
DRIVE_ROOT = "/content/drive/MyDrive/Colab Notebooks/Data/production"

# Local machine paths
# (must not end in whitespace: this root is joined with names such as
# 'data.yaml', and a stray trailing space would address a different directory)
LOCAL_ROOT = "/Users/tyreecruse/Desktop/CS230/Project/Data/production"

# Colab VM temporary storage
COLAB_LOCAL_ROOT = "/content"

# Dataset configuration: the zipped dataset is expected inside DRIVE_ROOT
ZIP_FILE_NAME = "production.zip"
ZIP_FILE_PATH_IN_DRIVE = os.path.join(DRIVE_ROOT, ZIP_FILE_NAME)

# Show only the paths relevant to the detected environment
print("\nPath Configuration:")
print("-" * 40)
if IN_COLAB:
    print(f"Drive root:  {DRIVE_ROOT}")
    print(f"Colab local: {COLAB_LOCAL_ROOT}")
    print(f"Zip file:    {ZIP_FILE_NAME}")
else:
    print(f"Local root:  {LOCAL_ROOT}")

In [None]:
# =============================================================================
# Data Staging for Colab
# =============================================================================
# In Colab, the dataset zip is copied from Drive to the VM's local disk and
# unpacked there; locally, the data is used in place. Either way this cell
# defines PROJECT_ROOT and OUTPUT_ROOT.
# NOTE(review): PROJECT_ROOT / OUTPUT_ROOT are not referenced by the cells
# visible in this part of the notebook - confirm they are still needed.

if IN_COLAB:
    print("\nData Staging:")
    print("-" * 40)

    # Define local paths in Colab VM
    local_zip_path = os.path.join(COLAB_LOCAL_ROOT, ZIP_FILE_NAME)
    # The archive is expected to unpack into a top-level "production" folder;
    # its presence is what marks the dataset as already staged.
    local_dataset_path = os.path.join(COLAB_LOCAL_ROOT, "production")

    # Only copy and unzip if dataset doesn't exist locally
    if not os.path.exists(local_dataset_path):
        print(f"Step 1: Copying {ZIP_FILE_NAME} from Drive...")
        # IPython shell escape; {name} is expanded from the Python variable.
        # NOTE(review): the command's exit status is not checked - a failed
        # copy only shows up through the existence check further down.
        !cp "{ZIP_FILE_PATH_IN_DRIVE}" "{COLAB_LOCAL_ROOT}/"

        print("Step 2: Unzipping dataset...")
        # IPython shell escape; extracts quietly (-q) into COLAB_LOCAL_ROOT.
        !unzip -q "{local_zip_path}" -d "{COLAB_LOCAL_ROOT}"

        # Verify extraction by checking that the expected folder now exists
        if os.path.exists(local_dataset_path):
            file_count = len(os.listdir(local_dataset_path))
            print(f" Data staging complete ({file_count} items)")
        else:
            # Only reported, not raised: later cells will still run
            print(" Error: Dataset extraction failed")
    else:
        print(" Dataset already staged locally")

    # Set paths for Colab
    PROJECT_ROOT = COLAB_LOCAL_ROOT  # Read from fast local disk
    OUTPUT_ROOT = DRIVE_ROOT          # Write results to permanent Drive

else:
    # Set paths for local machine: data is read and results written in place
    PROJECT_ROOT = LOCAL_ROOT
    OUTPUT_ROOT = LOCAL_ROOT
    print("\nUsing local paths - no staging required")

In [None]:
# =============================================================================
# Configuration - Update based on staging and actual data structure
# =============================================================================

# Colab reads the staged copy on the VM disk and writes results to Drive;
# a local run uses the local dataset folder and project directory.
DATASET_ROOT = (
    os.path.join(COLAB_LOCAL_ROOT, "production") if IN_COLAB else LOCAL_ROOT
)
OUTPUT_BASE = (
    "/content/drive/MyDrive/CS230/Colab Notebooks/Data/training results"
    if IN_COLAB
    else "/Users/tyreecruse/Desktop/CS230/Project"
)

# In both environments data.yaml sits at the dataset root
data_yaml_path = os.path.join(DATASET_ROOT, 'data.yaml')

if IN_COLAB:
    # Results folder on Drive may not exist on a first run
    os.makedirs(OUTPUT_BASE, exist_ok=True)

    # data.yaml is generated here because it must carry the absolute path
    # of the staged dataset
    data_yaml_content = dict(
        path=DATASET_ROOT,
        train='train/images',
        val='val/images',
        test='test/images',
        nc=1,
        names=['tank'],
    )

    with open(data_yaml_path, 'w') as f:
        yaml.dump(data_yaml_content, f)

# Central experiment configuration. create_training_args() translates these
# keys into YOLO training arguments, save_training_config() dumps the dict to
# YAML, and the test-evaluation callback reads the dataset/output entries.
# NOTE(review): the notebook header mentions fine-tuning only the last 16
# layers, but no layer-freezing setting is defined here or forwarded by
# create_training_args() - confirm whether freezing is intended.
CONFIG = {
    # Dataset paths - using staged local data in Colab
    "dataset_yaml": data_yaml_path,
    "dataset_path": DATASET_ROOT,

    # Output paths - all results go under OUTPUT_BASE (Drive when in Colab)
    "output_path": OUTPUT_BASE,  # Base directory for all outputs (YOLO `project`)
    "experiment_name": "yolov8x_tank_baseline",  # Run sub-folder (YOLO `name`)

    # Model configuration
    "model_size": "yolov8x.pt",  # 68.2M parameters

    # Training parameters
    "epochs": 150,
    "batch_size": 8,  # Adjust based on GPU memory
    "imgsz": 640,
    "patience": 30,
    "save_period": 5,

    # Optimizer settings
    "optimizer": "AdamW",
    "lr0": 0.001,
    "lrf": 0.01,
    "momentum": 0.937,
    "weight_decay": 0.0005,
    "warmup_epochs": 3.0,

    # Loss weights (forwarded as YOLO `box`, `cls`, `dfl`)
    # NOTE(review): 7.5 / 0.5 / 1.5 appear to equal the Ultralytics default
    # gains - confirm before describing them as tuned for this task.
    "box_weight": 7.5,  # Box regression loss gain
    "cls_weight": 0.5,  # Classification loss gain
    "dfl_weight": 1.5,  # Distribution focal loss gain

    # Augmentation - minimal for clean baseline
    "hsv_h": 0.015,
    "hsv_s": 0.4,
    "hsv_v": 0.2,
    "degrees": 5.0,
    "translate": 0.1,
    "scale": 0.3,
    "mosaic": 0.25,  # Minimal mosaic
    "mixup": 0.0,  # No mixup for baseline

    # Monitoring
    "test_eval_period": 5,  # Test-set evaluation runs when epoch % period == 0
    "plot_results": True,  # Forwarded as YOLO `plots`
    "save_json": True,
}

# Display configuration
experiment_dir = os.path.join(CONFIG['output_path'], CONFIG['experiment_name'])

print("\nConfiguration:")
print("=" * 60)
print(f"Dataset YAML:    {CONFIG['dataset_yaml']}")
print(f"Dataset path:    {CONFIG['dataset_path']}")
print(f"Output base:     {CONFIG['output_path']}")
print(f"Experiment dir:  {experiment_dir}")
print(f"Model:           {CONFIG['model_size']}")
print(f"Epochs:          {CONFIG['epochs']}")
print(f"Batch size:      {CONFIG['batch_size']}")
print(f"Image size:      {CONFIG['imgsz']}")

# Verify dataset structure (only checked for the staged Colab copy)
if IN_COLAB:
    print("\nDataset Structure Verification:")
    print("-" * 40)
    for split in ['train', 'val', 'test']:
        img_path = os.path.join(DATASET_ROOT, split, 'images')
        lbl_path = os.path.join(DATASET_ROOT, split, 'labels')
        # A split counts as present only when both folders exist
        if not (os.path.exists(img_path) and os.path.exists(lbl_path)):
            print(f"{split:5s}: NOT FOUND")
            continue
        img_count = len(os.listdir(img_path))
        lbl_count = len(os.listdir(lbl_path))
        print(f"{split:5s}: {img_count:4d} images, {lbl_count:4d} labels")

# Verify output directory, creating it when it is missing
if not os.path.exists(CONFIG["output_path"]):
    print(f"\nCreating output directory: {CONFIG['output_path']}")
    os.makedirs(CONFIG["output_path"], exist_ok=True)
else:
    print(f"\nOutput directory verified: {CONFIG['output_path']}")

# Verify data.yaml exists and echo the class information it declares
if not os.path.exists(CONFIG["dataset_yaml"]):
    print(f"\nDataset YAML not found at: {CONFIG['dataset_yaml']}")
else:
    print(f"\nDataset YAML created/found at: {CONFIG['dataset_yaml']}")
    with open(CONFIG["dataset_yaml"], 'r') as f:
        data_config = yaml.safe_load(f)
    print(f"Classes: {data_config.get('nc', 'unknown')}")
    print(f"Names: {data_config.get('names', 'unknown')}")

In [None]:
def verify_dataset_structure(dataset_yaml_path):
    """
    Verify dataset structure and paths.

    Reads the dataset YAML, resolves the image folder of each split
    ('train', 'val', 'test') relative to the YAML's ``path`` entry (or the
    YAML's own folder when ``path`` is absent) and counts image and label
    files. Splits that are not declared, or whose image folder does not
    exist, are left out of the result.

    Parameters
    ----------
    dataset_yaml_path : str or Path
        Path to dataset YAML file

    Returns
    -------
    verification : dict
        Verification results with keys ``yaml_exists`` (bool), ``splits``
        (per-split dict of ``images``, ``labels`` and ``path``),
        ``total_images`` and ``total_labels``.
    """
    # File suffixes counted as images, compared case-insensitively so that
    # e.g. ".JPG" and ".jpeg" files are not silently missed.
    image_suffixes = {'.bmp', '.jpeg', '.jpg', '.png', '.tif', '.tiff', '.webp'}

    yaml_path = Path(dataset_yaml_path)

    verification = {
        'yaml_exists': False,
        'splits': {},
        'total_images': 0,
        'total_labels': 0
    }

    if not yaml_path.exists():
        return verification

    verification['yaml_exists'] = True

    with open(yaml_path, 'r') as f:
        # An empty YAML file parses to None; treat it as "no splits declared"
        data_config = yaml.safe_load(f) or {}

    base_path = Path(data_config.get('path', yaml_path.parent))

    for split in ['train', 'val', 'test']:
        split_path = data_config.get(split, '')
        if not split_path:
            continue

        images_dir = base_path / split_path
        if not images_dir.exists():
            continue

        # Labels are looked up in a 'labels' folder next to the image folder
        labels_dir = images_dir.parent / 'labels'

        image_count = sum(
            1 for entry in images_dir.glob('*')
            if entry.suffix.lower() in image_suffixes
        )
        label_count = len(list(labels_dir.glob('*.txt'))) if labels_dir.exists() else 0

        verification['splits'][split] = {
            'images': image_count,
            'labels': label_count,
            'path': str(images_dir)
        }

        verification['total_images'] += image_count
        verification['total_labels'] += label_count

    return verification

In [None]:
def display_dataset_info(verification):
    """
    Print a human-readable summary of a dataset verification result.

    Shows the overall image/label totals followed by one line per split
    with its counts and the label-to-image coverage percentage. When the
    dataset YAML was not found, only a notice is printed.

    Parameters
    ----------
    verification : dict
        Verification results from verify_dataset_structure
    """
    banner = "=" * 60
    print("\n" + banner)
    print("DATASET STRUCTURE VERIFICATION")
    print(banner)

    # Nothing else to report without a dataset YAML
    if not verification['yaml_exists']:
        print("Dataset YAML not found!")
        return

    print(f"\nTotal images: {verification['total_images']:,}")
    print(f"Total labels: {verification['total_labels']:,}")

    print("\nSplit breakdown:")
    print("-" * 40)
    for split_name, counts in verification['splits'].items():
        n_images = counts['images']
        n_labels = counts['labels']
        # Coverage is reported as 0 for a split without images
        if n_images > 0:
            coverage = n_labels / n_images * 100
        else:
            coverage = 0
        print(f"{split_name:5s}: {n_images:5,} images, {n_labels:5,} labels ({coverage:.1f}% coverage)")

In [None]:
def analyze_split_statistics(dataset_yaml_path, dense_threshold=10):
    """
    Analyze statistics for each split to detect outliers and imbalances.

    For every split declared in the dataset YAML ('train', 'val', 'test'),
    each ``*.txt`` label file is read and its non-blank lines are counted as
    boxes.

    Parameters
    ----------
    dataset_yaml_path : str or Path
        Path to dataset YAML file
    dense_threshold : int, optional
        A label file with more boxes than this is counted as "dense"
        (default 10).

    Returns
    -------
    analysis : dict
        Statistics for each split: ``total_images`` (number of label files
        read), ``avg_boxes``, ``std_boxes``, ``min_boxes``, ``max_boxes``,
        ``empty_files``, ``dense_files`` and ``box_distribution`` (list of
        per-file box counts).
    """
    yaml_path = Path(dataset_yaml_path)

    with open(yaml_path, 'r') as f:
        # An empty YAML file parses to None; treat it as "no splits declared"
        data_config = yaml.safe_load(f) or {}

    base_path = Path(data_config.get('path', yaml_path.parent))
    analysis = {}

    for split in ['train', 'val', 'test']:
        split_path = data_config.get(split, '')
        if not split_path:
            continue

        # Derive the label folder by swapping only the LAST path component
        # named 'images' for 'labels'. A plain str.replace would also rewrite
        # earlier components and substrings (e.g. 'my_images/train/images').
        parts = list(Path(split_path).parts)
        if 'images' in parts:
            last_images = len(parts) - 1 - parts[::-1].index('images')
            parts[last_images] = 'labels'
            labels_dir = base_path.joinpath(*parts)
        else:
            # No 'images' component: fall back to a sibling 'labels' folder,
            # the same rule verify_dataset_structure applies.
            labels_dir = (base_path / split_path).parent / 'labels'

        # One entry per label file: the number of non-blank lines (boxes)
        box_counts = []
        for label_file in labels_dir.glob('*.txt'):
            with open(label_file, 'r') as f:
                box_counts.append(sum(1 for line in f if line.strip()))

        analysis[split] = {
            'total_images': len(box_counts),
            'avg_boxes': np.mean(box_counts) if box_counts else 0,
            'std_boxes': np.std(box_counts) if box_counts else 0,
            'min_boxes': min(box_counts) if box_counts else 0,
            'max_boxes': max(box_counts) if box_counts else 0,
            'empty_files': sum(1 for count in box_counts if count == 0),
            'dense_files': sum(1 for count in box_counts if count > dense_threshold),
            'box_distribution': box_counts
        }

    return analysis

In [None]:
def compare_split_distributions(analysis):
    """
    Compare distributions between splits to detect inconsistencies.

    Runs a two-sample Kolmogorov-Smirnov test on the per-file box counts of
    val vs test and of train vs val. A pair is skipped (its key is absent
    from the result) when either split is missing from ``analysis`` or has
    an empty box distribution, since the test needs data on both sides.

    Parameters
    ----------
    analysis : dict
        Split statistics from analyze_split_statistics

    Returns
    -------
    comparison : dict
        Comparison metrics between splits, keyed ``'val_vs_test'`` and
        ``'train_vs_val'``. Each entry holds ``ks_statistic``,
        ``ks_pvalue``, ``similar_distribution`` (p-value above 0.05) and
        ``avg_box_diff`` (absolute difference of the mean box counts).
    """
    from scipy import stats

    # (result key, first split, second split), in the order results are added
    split_pairs = (
        ('val_vs_test', 'val', 'test'),
        ('train_vs_val', 'train', 'val'),
    )

    comparison = {}

    for key, first, second in split_pairs:
        if first not in analysis or second not in analysis:
            continue

        first_dist = analysis[first]['box_distribution']
        second_dist = analysis[second]['box_distribution']

        # A split without label files yields an empty sample; there is
        # nothing to compare, so skip instead of handing it to ks_2samp.
        if len(first_dist) == 0 or len(second_dist) == 0:
            continue

        # Kolmogorov-Smirnov test
        ks_statistic, ks_pvalue = stats.ks_2samp(first_dist, second_dist)

        comparison[key] = {
            'ks_statistic': ks_statistic,
            'ks_pvalue': ks_pvalue,
            'similar_distribution': ks_pvalue > 0.05,
            'avg_box_diff': abs(analysis[first]['avg_boxes'] - analysis[second]['avg_boxes'])
        }

    return comparison

In [None]:
def plot_split_distributions(analysis):
    """
    Draw one box-count histogram per split, side by side.

    Each panel shows the distribution of boxes per image for a split and
    marks its mean with a dashed red line. A split absent from ``analysis``
    leaves its panel untouched.

    Parameters
    ----------
    analysis : dict
        Split statistics from analyze_split_statistics
    """
    _, axes = plt.subplots(1, 3, figsize=(15, 4))

    for ax, split_name in zip(axes, ('train', 'val', 'test')):
        if split_name not in analysis:
            continue

        split_stats = analysis[split_name]
        counts = split_stats['box_distribution']
        mean_boxes = split_stats['avg_boxes']

        ax.hist(counts, bins=20, edgecolor='black', alpha=0.7)
        ax.axvline(
            mean_boxes,
            color='red',
            linestyle='--',
            label=f'Mean: {mean_boxes:.2f}',
        )

        ax.set_title(f'{split_name.capitalize()} Split (n={len(counts)})')
        ax.set_xlabel('Boxes per Image')
        ax.set_ylabel('Frequency')
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    # Title is placed slightly above the panels after the layout pass
    plt.suptitle('Box Count Distribution Across Splits', y=1.02)
    plt.show()

In [None]:
def create_training_args(config):
    """
    Create training arguments dictionary from config.

    Maps the notebook's configuration keys onto the keyword arguments passed
    to ``model.train(**args)``. Augmentations not present in the config
    (shear, perspective, flips, copy-paste) are set to fixed values here.

    Parameters
    ----------
    config : dict
        Configuration dictionary. Besides the required keys it may contain:

        - ``device``: training device; when absent, CUDA device 0 is used
          if available, otherwise ``'cpu'``.
        - ``workers``: number of dataloader workers (default 2).
        - ``freeze``: forwarded unchanged as the ``freeze`` training
          argument when present; omitted otherwise.

    Returns
    -------
    args : dict
        Training arguments for YOLO
    """
    # A hard-coded device 0 fails on machines without CUDA (the notebook also
    # supports local runs), so only default to it when a GPU is present.
    if 'device' in config:
        device = config['device']
    else:
        device = 0 if torch.cuda.is_available() else 'cpu'

    args = {
        'data': config['dataset_yaml'],
        'epochs': config['epochs'],
        'batch': config['batch_size'],
        'imgsz': config['imgsz'],
        'patience': config['patience'],
        'save': True,
        'save_period': config['save_period'],
        'device': device,
        'workers': config.get('workers', 2),
        'project': config['output_path'],
        'name': config['experiment_name'],
        'exist_ok': True,
        'pretrained': True,
        'optimizer': config['optimizer'],
        'lr0': config['lr0'],
        'lrf': config['lrf'],
        'momentum': config['momentum'],
        'weight_decay': config['weight_decay'],
        'warmup_epochs': config['warmup_epochs'],
        'box': config['box_weight'],
        'cls': config['cls_weight'],
        'dfl': config['dfl_weight'],
        'hsv_h': config['hsv_h'],
        'hsv_s': config['hsv_s'],
        'hsv_v': config['hsv_v'],
        'degrees': config['degrees'],
        'translate': config['translate'],
        'scale': config['scale'],
        'shear': 0.0,
        'perspective': 0.0,
        'flipud': 0.0,
        'fliplr': 0.5,
        'mosaic': config['mosaic'],
        'mixup': config['mixup'],
        'copy_paste': 0.0,
        'close_mosaic': 100,
        'amp': True,
        'val': True,
        'plots': config['plot_results'],
        'save_json': config['save_json']
    }

    # Optional layer freezing: only forwarded when explicitly configured so
    # existing configurations train exactly as before.
    if 'freeze' in config:
        args['freeze'] = config['freeze']

    return args

In [None]:
def create_test_evaluation_callback(config, test_results):
    """
    Create callback for periodic test set evaluation.

    The returned callback evaluates a saved checkpoint on the test split
    every ``config['test_eval_period']`` epochs and on the final epoch,
    appends the metrics to ``test_results`` and rewrites
    ``<output_path>/<experiment_name>/test_metrics.csv`` with all results
    collected so far.

    Parameters
    ----------
    config : dict
        Configuration dictionary
    test_results : list
        List to store test results (mutated in place by the callback)

    Returns
    -------
    callback : function
        Callback function taking the trainer as its only argument
    """
    def evaluate_on_test(trainer):
        is_scheduled = trainer.epoch % config['test_eval_period'] == 0
        is_final_epoch = trainer.epoch == trainer.epochs - 1
        if not (is_scheduled or is_final_epoch):
            return

        # Evaluate the checkpoint of the epoch that just finished
        # (`trainer.last`) so the row logged for this epoch describes this
        # epoch's weights. `trainer.best` may stem from an earlier epoch and
        # is only a fallback when no per-epoch checkpoint is on disk.
        weights = None
        for candidate in (getattr(trainer, 'last', None), getattr(trainer, 'best', None)):
            if candidate is not None and Path(candidate).exists():
                weights = candidate
                break

        if weights is None:
            print(f"Epoch {trainer.epoch}: no checkpoint available, skipping test evaluation")
            return

        model = YOLO(weights)
        metrics = model.val(
            data=config['dataset_yaml'],
            split='test',
            batch=config['batch_size']
        )

        # Note: Loss values are not available from validation metrics,
        # so only performance metrics are recorded; the loss columns are
        # kept as empty placeholders.
        test_results.append({
            'epoch': trainer.epoch,
            'test_map50': float(metrics.box.map50),
            'test_map': float(metrics.box.map),
            'test_precision': float(metrics.box.p[0]) if len(metrics.box.p) > 0 else 0,
            'test_recall': float(metrics.box.r[0]) if len(metrics.box.r) > 0 else 0,
            'test_f1': float(metrics.box.f1[0]) if len(metrics.box.f1) > 0 else 0,
            'test_box_loss': None,  # Not available from validation
            'test_cls_loss': None,  # Not available from validation
            'test_dfl_loss': None,  # Not available from validation
            'test_total_loss': None  # Not available from validation
        })

        # Save intermediate results so progress survives an interrupted run
        results_path = Path(config['output_path']) / config['experiment_name'] / 'test_metrics.csv'
        results_path.parent.mkdir(parents=True, exist_ok=True)
        pd.DataFrame(test_results).to_csv(results_path, index=False)

        print(f"Epoch {trainer.epoch}: Test mAP50={metrics.box.map50:.3f}, mAP={metrics.box.map:.3f}")

    return evaluate_on_test

In [None]:
def save_training_config(config):
    """
    Write the configuration dictionary to the experiment folder as YAML.

    The file is saved as ``training_config.yaml`` under
    ``<output_path>/<experiment_name>``; the folder is created if needed.

    Parameters
    ----------
    config : dict
        Configuration dictionary
    """
    experiment_dir = Path(config['output_path'], config['experiment_name'])
    experiment_dir.mkdir(parents=True, exist_ok=True)

    config_path = experiment_dir / 'training_config.yaml'
    with config_path.open('w') as stream:
        yaml.dump(config, stream, default_flow_style=False)

    print(f"Configuration saved to: {config_path}")

In [None]:
def train_yolov8x(config, test_results):
    """
    Run a full YOLOv8x training session described by ``config``.

    Loads the weights named in ``config['model_size']``, registers the
    periodic test-evaluation callback, saves the configuration next to the
    run outputs and starts training.

    Parameters
    ----------
    config : dict
        Configuration dictionary
    test_results : list
        List to store test results

    Returns
    -------
    model : YOLO
        Trained model
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 60

    print("\n" + heavy_rule)
    print("STARTING YOLOV8X TRAINING")
    print(heavy_rule)

    # Initialize model
    weights_name = config['model_size']
    model = YOLO(weights_name)
    print(f"Model loaded: {weights_name}")

    # Test-set evaluation hooks in at the end of each fit epoch
    model.add_callback(
        "on_fit_epoch_end",
        create_test_evaluation_callback(config, test_results),
    )

    # Build the training arguments, then persist the configuration
    train_kwargs = create_training_args(config)
    save_training_config(config)

    print("\nTraining started...")
    print(light_rule)
    model.train(**train_kwargs)

    print(light_rule)
    print("Training completed!")

    return model

In [None]:
def load_training_metrics(experiment_path):
    """
    Load training metrics from results files.

    Looks for ``results.csv`` (training/validation history) and
    ``test_metrics.csv`` (periodic test evaluation) in the experiment
    directory. Files that do not exist are skipped.

    Parameters
    ----------
    experiment_path : str or Path
        Path to experiment directory

    Returns
    -------
    metrics : dict
        Dictionary with training and test metrics: key ``'train_val'``
        and/or ``'test'``, each a DataFrame, present only when the
        corresponding file exists.
    """
    exp_path = Path(experiment_path)

    # (result key, file name), in the order entries are added
    sources = (
        ('train_val', 'results.csv'),
        ('test', 'test_metrics.csv'),
    )

    metrics = {}

    for key, filename in sources:
        csv_path = exp_path / filename
        if not csv_path.exists():
            continue

        frame = pd.read_csv(csv_path)
        # Header names can carry padding whitespace (reportedly the case for
        # results.csv from some Ultralytics releases); strip it so that
        # lookups such as df['train/box_loss'] work either way.
        frame.columns = [str(column).strip() for column in frame.columns]
        metrics[key] = frame

    return metrics

In [None]:
def _finish_axis(ax, xlabel, ylabel, title, legend=True):
    """Apply the labels, optional legend and light grid shared by all panels."""
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    if legend:
        ax.legend()
    ax.grid(True, alpha=0.3)


def plot_loss_curves(metrics):
    """
    Plot comprehensive loss curves.

    Builds a 2x3 grid: total train/val loss, mAP curves, the val-minus-train
    loss gap, the training loss components, precision/recall and the
    learning-rate schedule. Test curves are overlaid where test data exists.

    Parameters
    ----------
    metrics : dict
        Dictionary with training metrics: ``'train_val'`` (required) and
        optionally ``'test'``, both DataFrames. The DataFrames are not
        modified.

    Returns
    -------
    fig : matplotlib.figure
        Figure with plots, or None when no ``'train_val'`` metrics exist
    """
    if 'train_val' not in metrics:
        print("No training metrics found")
        return None

    # Work on a copy so the caller's DataFrame does not gain helper columns
    df = metrics['train_val'].copy()
    test_df = metrics.get('test', pd.DataFrame())

    # Calculate total losses
    df['train_total'] = df['train/box_loss'] + df['train/cls_loss'] + df['train/dfl_loss']
    df['val_total'] = df['val/box_loss'] + df['val/cls_loss'] + df['val/dfl_loss']

    # Create figure
    fig, axes = plt.subplots(2, 3, figsize=(15, 8))

    # Total loss
    ax = axes[0, 0]
    ax.plot(df.index, df['train_total'], label='Train', alpha=0.7)
    ax.plot(df.index, df['val_total'], label='Val', alpha=0.7)
    # The test-evaluation callback records its loss fields as None, so the
    # test series is drawn only when the columns exist and hold real numbers;
    # otherwise it would add an empty 'Test' legend entry or raise.
    test_loss_columns = ['test_box_loss', 'test_cls_loss', 'test_dfl_loss']
    if not test_df.empty and set(test_loss_columns) <= set(test_df.columns):
        test_total = sum(
            pd.to_numeric(test_df[column], errors='coerce')
            for column in test_loss_columns
        )
        if test_total.notna().any():
            ax.plot(test_df['epoch'], test_total, 'o-', label='Test', markersize=4)
    _finish_axis(ax, 'Epoch', 'Total Loss', 'Total Loss Comparison')

    # mAP performance
    ax = axes[0, 1]
    ax.plot(df.index, df['metrics/mAP50(B)'], label='Val mAP50', alpha=0.7)
    ax.plot(df.index, df['metrics/mAP50-95(B)'], label='Val mAP50-95', alpha=0.7)
    if not test_df.empty:
        ax.plot(test_df['epoch'], test_df['test_map50'], 'o-', label='Test mAP50', markersize=4)
        ax.plot(test_df['epoch'], test_df['test_map'], 's-', label='Test mAP50-95', markersize=4)
    _finish_axis(ax, 'Epoch', 'mAP', 'mAP Performance')

    # Overfitting gap: shaded wherever validation loss exceeds training loss
    ax = axes[0, 2]
    gap = df['val_total'] - df['train_total']
    ax.plot(df.index, gap, color='red', alpha=0.7)
    ax.axhline(y=0, color='black', linestyle='--', alpha=0.5)
    ax.fill_between(df.index, 0, gap, where=(gap > 0), color='red', alpha=0.3)
    _finish_axis(ax, 'Epoch', 'Val - Train Loss', 'Overfitting Detection', legend=False)

    # Component losses
    ax = axes[1, 0]
    ax.plot(df.index, df['train/box_loss'], label='Box', alpha=0.7)
    ax.plot(df.index, df['train/cls_loss'], label='Class', alpha=0.7)
    ax.plot(df.index, df['train/dfl_loss'], label='DFL', alpha=0.7)
    _finish_axis(ax, 'Epoch', 'Loss', 'Training Loss Components')

    # Precision/Recall
    ax = axes[1, 1]
    ax.plot(df.index, df['metrics/precision(B)'], label='Precision', alpha=0.7)
    ax.plot(df.index, df['metrics/recall(B)'], label='Recall', alpha=0.7)
    _finish_axis(ax, 'Epoch', 'Score', 'Precision & Recall')

    # Learning rate (log scale)
    ax = axes[1, 2]
    ax.plot(df.index, df['lr/pg0'], alpha=0.7)
    ax.set_yscale('log')
    _finish_axis(ax, 'Epoch', 'Learning Rate', 'Learning Rate Schedule', legend=False)

    plt.tight_layout()
    return fig

In [None]:
def compare_val_test_performance(metrics):
    """
    Pair each test evaluation with the validation metrics of the same epoch.

    For every row of the test metrics whose epoch falls inside the training
    history, the validation mAP values at that row position are compared
    with the test mAP values.

    Parameters
    ----------
    metrics : dict
        Dictionary with training metrics (``'train_val'`` and ``'test'``
        DataFrames)

    Returns
    -------
    comparison : dict
        Val vs Test comparison metrics keyed by epoch, or None when either
        DataFrame is missing or the test DataFrame is empty
    """
    if 'train_val' not in metrics or 'test' not in metrics:
        return None

    val_history = metrics['train_val']
    test_history = metrics['test']

    if test_history.empty:
        return None

    n_epochs = len(val_history)
    comparison = {}

    for _, test_row in test_history.iterrows():
        epoch = int(test_row['epoch'])

        # Test rows beyond the recorded training history have no counterpart
        if epoch >= n_epochs:
            continue

        val_row = val_history.iloc[epoch]
        val_map50 = val_row['metrics/mAP50(B)']
        val_map = val_row['metrics/mAP50-95(B)']
        test_map50 = test_row['test_map50']
        test_map = test_row['test_map']

        map50_diff = abs(val_map50 - test_map50)
        map_diff = abs(val_map - test_map)

        # Relative gap in percent of the validation mAP50 (0 when that is 0)
        if val_map50 > 0:
            performance_gap = (map50_diff / val_map50) * 100
        else:
            performance_gap = 0

        comparison[epoch] = {
            'val_map50': val_map50,
            'test_map50': test_map50,
            'map50_diff': map50_diff,
            'val_map': val_map,
            'test_map': test_map,
            'map_diff': map_diff,
            'performance_gap': performance_gap,
        }

    return comparison

In [None]:
def plot_val_test_comparison(comparison):
    """
    Plot validation against test performance over the evaluated epochs.

    Draws a 2x2 grid: mAP50 and mAP50-95 for both sets, the absolute mAP50
    difference, and the relative performance gap with 5% / 10% guide lines.
    Prints a notice and returns when there is nothing to plot.

    Parameters
    ----------
    comparison : dict
        Val vs Test comparison from compare_val_test_performance
    """
    if not comparison:
        print("No val vs test comparison data available")
        return

    epochs = sorted(comparison.keys())

    def series(field):
        # Values of one comparison field, ordered by epoch
        return [comparison[epoch][field] for epoch in epochs]

    def decorate(ax, ylabel, title, legend=True):
        ax.set_xlabel('Epoch')
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        if legend:
            ax.legend()
        ax.grid(True, alpha=0.3)

    fig, axes = plt.subplots(2, 2, figsize=(12, 8))

    # mAP50 comparison
    ax = axes[0, 0]
    ax.plot(epochs, series('val_map50'), 'o-', label='Validation', markersize=6)
    ax.plot(epochs, series('test_map50'), 's-', label='Test', markersize=6)
    decorate(ax, 'mAP50', 'Val vs Test mAP50')

    # mAP50-95 comparison
    ax = axes[0, 1]
    ax.plot(epochs, series('val_map'), 'o-', label='Validation', markersize=6)
    ax.plot(epochs, series('test_map'), 's-', label='Test', markersize=6)
    decorate(ax, 'mAP50-95', 'Val vs Test mAP50-95')

    # Absolute mAP50 difference
    ax = axes[1, 0]
    ax.plot(epochs, series('map50_diff'), 'o-', color='red', markersize=6)
    ax.axhline(y=0, color='black', linestyle='--', alpha=0.5)
    decorate(ax, '|Val - Test| mAP50', 'Absolute mAP50 Difference', legend=False)

    # Relative gap with guide lines at 5% and 10%
    ax = axes[1, 1]
    ax.plot(epochs, series('performance_gap'), 'o-', color='purple', markersize=6)
    ax.axhline(y=5, color='orange', linestyle='--', alpha=0.5, label='5% threshold')
    ax.axhline(y=10, color='red', linestyle='--', alpha=0.5, label='10% threshold')
    decorate(ax, 'Performance Gap (%)', 'Val-Test Performance Gap')

    plt.tight_layout()
    plt.show()

In [None]:
def diagnose_val_test_consistency(comparison):
    """
    Rate how closely validation and test performance agree.

    Averages the absolute mAP50 difference and the relative performance gap
    over all compared epochs, then grades the average gap: below 5% is
    'excellent', below 10% is 'good', anything else is 'concerning' (with
    an issue and recommendations attached).

    Parameters
    ----------
    comparison : dict
        Val vs Test comparison metrics

    Returns
    -------
    diagnosis : dict
        Diagnosis of val/test consistency; a 'no_data' status dict when
        ``comparison`` is empty or None
    """
    if not comparison:
        return {'status': 'no_data', 'message': 'No comparison data available'}

    per_epoch = list(comparison.values())
    mean_map50_diff = np.mean([entry['map50_diff'] for entry in per_epoch])
    mean_gap = np.mean([entry['performance_gap'] for entry in per_epoch])

    diagnosis = {
        'avg_map_difference': mean_map50_diff,
        'avg_performance_gap': mean_gap,
        'issues': [],
        'recommendations': [],
    }

    # Grade the average relative gap
    if mean_gap < 5:
        status = 'excellent'
        message = 'Validation and test sets are well-aligned'
    elif mean_gap < 10:
        status = 'good'
        message = 'Minor differences between validation and test sets'
    else:
        status = 'concerning'
        message = 'Significant differences between validation and test sets'

    diagnosis['status'] = status
    diagnosis['message'] = message

    if status == 'concerning':
        diagnosis['issues'].append(f'Performance gap of {mean_gap:.1f}% detected')
        diagnosis['recommendations'].extend([
            'Consider re-splitting data for better distribution',
            'Check for data leakage or distribution shift',
        ])

    return diagnosis

In [None]:
def generate_confusion_matrix_for_split(model_path, data_yaml, split='val'):
    """
    Evaluate a trained model on one split and collect its key metrics.

    Parameters
    ----------
    model_path : str or Path
        Path to trained model
    data_yaml : str or Path
        Path to data configuration
    split : str
        Split to evaluate ('train', 'val', 'test')

    Returns
    -------
    results : dict
        ``split``, ``map50``, ``map50_95``, ``precision``, ``recall``,
        ``confusion_matrix`` (None when the metrics object has none) and
        ``f1``
    """
    # Run validation on the requested split
    metrics = YOLO(model_path).val(data=str(data_yaml), split=split, save_json=True)
    box = metrics.box

    map50 = float(box.map50)
    map50_95 = float(box.map)
    # Precision/recall are taken from the first entry; 0 when none exist
    precision = float(box.p[0]) if len(box.p) > 0 else 0
    recall = float(box.r[0]) if len(box.r) > 0 else 0

    if hasattr(metrics, 'confusion_matrix'):
        matrix = metrics.confusion_matrix.matrix
    else:
        matrix = None

    # F1 is the harmonic mean, defined as 0 when either component is 0
    if precision > 0 and recall > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0

    return {
        'split': split,
        'map50': map50,
        'map50_95': map50_95,
        'precision': precision,
        'recall': recall,
        'confusion_matrix': matrix,
        'f1': f1,
    }

In [None]:
def compare_split_performance(model_path, data_yaml):
    """
    Evaluate the model on the train, val and test splits in turn.

    Parameters
    ----------
    model_path : str or Path
        Path to trained model
    data_yaml : str or Path
        Path to data configuration

    Returns
    -------
    comparison : dict
        Per-split results from generate_confusion_matrix_for_split, keyed
        by split name
    """
    results_by_split = {}

    for split_name in ('train', 'val', 'test'):
        print(f"Evaluating {split_name} split...")
        split_results = generate_confusion_matrix_for_split(
            model_path, data_yaml, split_name
        )
        results_by_split[split_name] = split_results

    return results_by_split

In [None]:
def plot_split_performance_comparison(comparison):
    """
    Show a grouped bar chart and a printed table of per-split metrics.

    For each metric (mAP50, mAP50-95, precision, recall, F1) one bar per
    split is drawn; the same values are then printed together with their
    standard deviation across splits. Missing metrics count as 0.

    Parameters
    ----------
    comparison : dict
        Performance metrics for each split
    """
    splits = list(comparison.keys())
    metrics_names = ['map50', 'map50_95', 'precision', 'recall', 'f1']

    # metric -> list of values, one per split (in split order)
    values_by_metric = {
        metric: [comparison[split].get(metric, 0) for split in splits]
        for metric in metrics_names
    }

    # Grouped bar plot: five bars per split, centred on the split's tick
    fig, ax = plt.subplots(figsize=(12, 6))

    positions = np.arange(len(splits))
    bar_width = 0.15

    for position_index, metric in enumerate(metrics_names):
        shift = bar_width * (position_index - 2)
        ax.bar(positions + shift, values_by_metric[metric], bar_width, label=metric.upper())

    ax.set_title('Performance Comparison Across Splits')
    ax.set_xlabel('Split')
    ax.set_ylabel('Score')
    ax.set_xticks(positions)
    ax.set_xticklabels([name.capitalize() for name in splits])
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()
    plt.show()

    # Print detailed comparison
    print("\n" + "="*60)
    print("DETAILED SPLIT COMPARISON")
    print("="*60)

    for metric in metrics_names:
        metric_values = values_by_metric[metric]

        print(f"\n{metric.upper()}:")
        for split_name, value in zip(splits, metric_values):
            print(f"  {split_name:5s}: {value:.3f}")

        # Spread across splits (standard deviation) needs at least two splits
        if len(metric_values) > 1:
            spread = np.std(metric_values)
            print(f"  Std Dev: {spread:.3f}")

In [None]:
def analyze_training_health(metrics):
    """
    Analyze training health and detect issues.

    The summary statistics are averaged over the last 20% of the logged
    epochs (at least one epoch). The input DataFrame is not modified.

    Parameters
    ----------
    metrics : dict
        Dictionary with training metrics. The 'train_val' entry must be a
        DataFrame with the columns 'train/box_loss', 'train/cls_loss',
        'train/dfl_loss', 'val/box_loss', 'val/cls_loss', 'val/dfl_loss',
        'metrics/mAP50(B)' and 'metrics/mAP50-95(B)'.

    Returns
    -------
    diagnosis : dict
        On success: 'status' ('healthy', 'overfitting' or 'underperforming'),
        'issues' (list of str), 'recommendations' (list of str) and
        'metrics' (the summary statistics, including 'val_train_gap' in
        percent). When 'train_val' is missing or empty, only 'status'
        ('error') and 'message' are present.
    """
    if 'train_val' not in metrics:
        return {'status': 'error', 'message': 'No metrics found'}

    df = metrics['train_val']
    if len(df) == 0:
        # Nothing logged yet: every statistic below would be NaN
        return {'status': 'error', 'message': 'Training metrics are empty'}

    # Total losses are kept as local Series so the caller's DataFrame
    # does not silently gain extra columns.
    train_total = df['train/box_loss'] + df['train/cls_loss'] + df['train/dfl_loss']
    val_total = df['val/box_loss'] + df['val/cls_loss'] + df['val/dfl_loss']

    # Analyze last 20% of training
    window = max(1, int(len(df) * 0.2))

    recent_stats = {
        'train_loss': train_total.iloc[-window:].mean(),
        'val_loss': val_total.iloc[-window:].mean(),
        'map50': df['metrics/mAP50(B)'].iloc[-window:].mean(),
        'map50_95': df['metrics/mAP50-95(B)'].iloc[-window:].mean(),
        'best_map50': df['metrics/mAP50(B)'].max(),
        'best_map50_95': df['metrics/mAP50-95(B)'].max(),
    }

    # Relative gap between validation and training loss, in percent
    train_loss = recent_stats['train_loss']
    val_loss = recent_stats['val_loss']
    if train_loss != 0:
        recent_stats['val_train_gap'] = (val_loss - train_loss) / train_loss * 100
    else:
        # Avoid dividing by zero: any positive val loss over a zero train
        # loss is an unbounded gap; zero over zero is no gap at all.
        recent_stats['val_train_gap'] = float('inf') if val_loss > 0 else 0.0

    # Diagnose
    diagnosis = {
        'status': 'healthy',
        'issues': [],
        'recommendations': [],
        'metrics': recent_stats
    }

    # Check overfitting
    if recent_stats['val_train_gap'] > 20:
        diagnosis['status'] = 'overfitting'
        diagnosis['issues'].append(f"Val loss {recent_stats['val_train_gap']:.1f}% higher than train")
        diagnosis['recommendations'].append("Consider more augmentation or regularization")

    # Check performance (an earlier 'overfitting' status takes precedence)
    if recent_stats['map50'] < 0.7:
        if diagnosis['status'] == 'healthy':
            diagnosis['status'] = 'underperforming'
        diagnosis['issues'].append(f"Low mAP50: {recent_stats['map50']:.3f}")
        diagnosis['recommendations'].append("Consider training longer or adjusting hyperparameters")

    # Check convergence: np.gradient needs at least two samples, so the check
    # is skipped for very short runs where the window holds a single epoch.
    recent_val = val_total.iloc[-window:].to_numpy()
    if recent_val.size >= 2:
        recent_gradient = np.gradient(recent_val)
        if abs(recent_gradient.mean()) < 0.001:
            diagnosis['issues'].append("Model has converged")
            diagnosis['recommendations'].append("Consider early stopping")

    return diagnosis

In [None]:
def print_diagnosis(diagnosis):
    """
    Print training diagnosis in formatted way.

    Parameters
    ----------
    diagnosis : dict
        Diagnosis from analyze_training_health. Either a full diagnosis
        ('status', 'issues', 'recommendations', 'metrics') or an error
        result that carries only 'status' and 'message'.
    """
    print("\n" + "="*60)
    print("TRAINING DIAGNOSIS")
    print("="*60)

    print(f"\nStatus: {diagnosis['status'].upper()}")

    metrics = diagnosis.get('metrics')
    if metrics is None:
        # Error-style result: there are no issues/metrics to report, so show
        # the explanation instead of failing with a KeyError.
        print(f"\n{diagnosis.get('message', 'No diagnosis metrics available')}")
        return

    issues = diagnosis.get('issues', [])
    if issues:
        print("\nIssues detected:")
        for issue in issues:
            print(f"  - {issue}")
    else:
        print("\nNo issues detected")

    recommendations = diagnosis.get('recommendations', [])
    if recommendations:
        print("\nRecommendations:")
        for rec in recommendations:
            print(f"  - {rec}")

    print("\nMetrics (last 20% of training):")
    print("-"*40)
    print(f"Train loss:      {metrics['train_loss']:.4f}")
    print(f"Val loss:        {metrics['val_loss']:.4f}")
    print(f"Val-Train gap:   {metrics['val_train_gap']:.1f}%")
    print(f"Current mAP50:   {metrics['map50']:.3f}")
    print(f"Best mAP50:      {metrics['best_map50']:.3f}")
    print(f"Current mAP50-95: {metrics['map50_95']:.3f}")
    print(f"Best mAP50-95:   {metrics['best_map50_95']:.3f}")

In [None]:
# Main execution, step 1: reset the results store and verify the dataset layout
print("", "=" * 80, "YOLOV8X TANK DETECTION TRAINING", "=" * 80, sep="\n")

# Every later step records its output in this dict
results = {}

print("\nStep 1: Verifying dataset structure...")
verification = verify_dataset_structure(CONFIG['dataset_yaml'])
display_dataset_info(verification)

# Record the verification only when the dataset YAML is actually present;
# otherwise stop the pipeline here.
if verification['yaml_exists']:
    results['dataset_info'] = verification
    print(" Dataset verification complete")
else:
    raise FileNotFoundError(f"Dataset YAML not found: {CONFIG['dataset_yaml']}")

In [None]:
# Step 2: Analyze data quality across the dataset splits
print("\nStep 2: Analyzing data quality across splits...")
split_analysis = analyze_split_statistics(CONFIG['dataset_yaml'])

print("\nSplit Statistics:")
print("-"*40)
# NOTE: the loop variable is deliberately not called `stats`. This cell runs at
# notebook top level, so that name would rebind the `stats` module imported
# from scipy at the top of the notebook and break any later code that uses it.
for split, split_stats in split_analysis.items():
    print(f"{split:5s}: {split_stats['total_images']:4d} images, "
          f"avg {split_stats['avg_boxes']:.2f} boxes/image, "
          f"{split_stats['dense_files']} dense files")

# Compare distributions
distribution_comparison = compare_split_distributions(split_analysis)

if 'val_vs_test' in distribution_comparison:
    val_test = distribution_comparison['val_vs_test']
    print(f"\nVal vs Test distribution similarity: "
          f"{'SIMILAR' if val_test['similar_distribution'] else 'DIFFERENT'} "
          f"(p-value: {val_test['ks_pvalue']:.3f})")

# Plot distributions
plot_split_distributions(split_analysis)

# Keep both the raw statistics and the comparison for the final summary
results['split_analysis'] = split_analysis
results['distribution_comparison'] = distribution_comparison
print(" Data quality analysis complete")

In [None]:
# Step 3: run the (long) training job
print("\nStep 3: Training YOLOv8x model...")
print("-" * 60, "This step will take significant time...", "-" * 60, sep="\n")

# The list is handed to train_yolov8x and stored afterwards as the test metrics
# (presumably the trainer fills it in place -- confirm against train_yolov8x)
test_results = []
model = train_yolov8x(CONFIG, test_results)
results['test_metrics'] = test_results
print(" Model training complete")

In [None]:
# Step 4: load the logged training metrics and plot the loss curves
print("\nStep 4: Analyzing training results...")
experiment_path = Path(CONFIG['output_path'], CONFIG['experiment_name'])

# Keep the metrics both as a notebook variable and in the results store
metrics = results['metrics'] = load_training_metrics(experiment_path)

# plot_loss_curves may hand back a falsy value; only save/show a real figure
fig = plot_loss_curves(metrics)
if fig:
    plot_path = experiment_path.joinpath('loss_analysis.png')
    fig.savefig(plot_path, dpi=150, bbox_inches='tight')
    plt.show()
    print(f"Loss plots saved to: {plot_path}")

print(" Training results analysis complete")

In [None]:
# Step 5: Compare validation vs test performance
print("\nStep 5: Comparing validation vs test performance...")
val_test_comparison = compare_val_test_performance(metrics)

if val_test_comparison:
    plot_val_test_comparison(val_test_comparison)
    val_test_diagnosis = diagnose_val_test_consistency(val_test_comparison)

    print("\n" + "-"*40)
    print(f"Val/Test Consistency: {val_test_diagnosis['status'].upper()}")
    print(f"{val_test_diagnosis['message']}")
    # Test for presence rather than truthiness: a gap of exactly 0.0 is a
    # valid (and noteworthy) result and must still be reported.
    if val_test_diagnosis.get('avg_performance_gap') is not None:
        print(f"Average performance gap: {val_test_diagnosis['avg_performance_gap']:.2f}%")

    results['val_test_comparison'] = val_test_comparison
    results['val_test_diagnosis'] = val_test_diagnosis
    print(" Val/Test comparison complete")
else:
    print("No val/test comparison data available")

In [None]:
# Step 6: evaluate the best checkpoint on every split
print("\nStep 6: Generating confusion matrices for all splits...")
experiment_path = Path(CONFIG['output_path'], CONFIG['experiment_name'])
best_model = experiment_path.joinpath('weights', 'best.pt')

# Without a best.pt checkpoint there is nothing to evaluate, so the step is skipped
if not best_model.exists():
    print(f"Best model not found at: {best_model}")
    print("Skipping confusion matrix analysis")
else:
    print(f"Using model: {best_model}")
    split_comparison = compare_split_performance(best_model, CONFIG['dataset_yaml'])
    plot_split_performance_comparison(split_comparison)
    results['split_performance'] = split_comparison
    print(" Confusion matrix analysis complete")

In [None]:
# Step 7: diagnose the run from the logged training metrics
print("\nStep 7: Overall training diagnosis...")
# Keep the diagnosis as a notebook variable and in the results store
diagnosis = results['diagnosis'] = analyze_training_health(metrics)
print_diagnosis(diagnosis)
print(" Training diagnosis complete")

In [None]:
# Final summary: report which pipeline steps left a result behind
print("\n" + "="*80)
print("TRAINING PIPELINE SUMMARY")
print("="*80)

# (key written to `results`, human-readable step name), in pipeline order
pipeline_steps = [
    ('dataset_info', "Dataset Verification"),
    ('split_analysis', "Data Quality Analysis"),
    ('test_metrics', "Model Training"),
    ('metrics', "Results Analysis"),
    ('val_test_comparison', "Val/Test Comparison"),
    ('split_performance', "Confusion Matrix Analysis"),
    ('diagnosis', "Training Diagnosis"),
]

# Check what was completed
completed_steps = [label for key, label in pipeline_steps if key in results]
missing_steps = [label for key, label in pipeline_steps if key not in results]

print("\nCompleted Steps:")
for i, step in enumerate(completed_steps, 1):
    print(f"  {i}. {step}")

# Only claim full success when every step stored a result; steps that were
# skipped (e.g. no best.pt, no val/test data) are listed explicitly.
if 'error' in results:
    print(f"\n Pipeline failed with error: {results['error']}")
elif missing_steps:
    print("\n Pipeline incomplete - steps skipped or not run:")
    for step in missing_steps:
        print(f"  - {step}")
else:
    print("\n ALL STEPS COMPLETED SUCCESSFULLY")

print("\nResults dictionary contains:", list(results.keys()))

In [None]:
# Save a JSON summary of the run next to the experiment outputs (optional)
import json
from datetime import datetime

summary_path = Path(CONFIG['output_path']) / CONFIG['experiment_name'] / 'training_summary.json'

# The experiment directory may be missing if an earlier step was skipped or
# failed; create it so the summary can still be written.
summary_path.parent.mkdir(parents=True, exist_ok=True)

# Only the compact pieces of `results` are kept in the summary
results_summary = {
    'timestamp': datetime.now().isoformat(),
    'config': CONFIG,
    'dataset_info': results.get('dataset_info', {}),
    'distribution_comparison': results.get('distribution_comparison', {}),
    'val_test_diagnosis': results.get('val_test_diagnosis', {}),
    'final_diagnosis': results.get('diagnosis', {})
}

# default=str converts values json cannot serialize natively into strings
with open(summary_path, 'w', encoding='utf-8') as f:
    json.dump(results_summary, f, indent=2, default=str)

print(f"Training summary saved to: {summary_path}")

In [None]:
# Results verification: list the artifacts found in the experiment directory
print("\n" + "="*60)
print("RESULTS VERIFICATION")
print("="*60)

experiment_path = Path(CONFIG['output_path']) / CONFIG['experiment_name']

# Check model weights
weights_dir = experiment_path / 'weights'
if weights_dir.exists():
    # Sorted so the listing is stable between runs (glob order is arbitrary)
    weights = sorted(weights_dir.glob('*.pt'))
    print(f"\nModel weights: {len(weights)} files")
    for weight_file in weights:
        size_mb = weight_file.stat().st_size / (1024 * 1024)
        print(f"  - {weight_file.name}: {size_mb:.1f} MB")
else:
    print("\nNo weights directory found")

# Check output files
expected_files = [
    'results.csv',
    'results.png',
    'confusion_matrix.png',
    'test_metrics.csv',
    'training_config.yaml'
]

print("\nOutput files:")
for filename in expected_files:
    filepath = experiment_path / filename
    if filepath.exists():
        size_kb = filepath.stat().st_size / 1024
        print(f"  [{filename}]: {size_kb:.1f} KB")
    else:
        print(f"  [{filename}]: not found")

# Best model path. A Path object is always truthy, so existence on disk is
# the only meaningful check here.
best_model = weights_dir / 'best.pt'
if best_model.exists():
    print("\nBest model path:")
    print(f"  {best_model}")
else:
    print(f"\nBest model not found at: {best_model}")

In [None]:
# Summary: condensed report assembled from whatever the pipeline stored in `results`
print("\n" + "="*60)
print("SUMMARY")
print("="*60)

if 'results' in locals():
    # Dataset info
    if 'dataset_info' in results:
        info = results['dataset_info']
        print("\nDataset:")
        print(f"  Total images: {info['total_images']:,}")
        print(f"  Total labels: {info['total_labels']:,}")
        for split, data in info['splits'].items():
            print(f"  {split}: {data['images']:,} images")

    # Data quality analysis
    if 'distribution_comparison' in results:
        comp = results['distribution_comparison']
        if 'val_vs_test' in comp:
            print("\nData Quality:")
            val_test = comp['val_vs_test']
            print(f"  Val/Test similarity: {'Similar' if val_test['similar_distribution'] else 'Different'}")
            print(f"  KS p-value: {val_test['ks_pvalue']:.3f}")

    # Val vs Test consistency
    if 'val_test_diagnosis' in results:
        diag = results['val_test_diagnosis']
        print("\nVal/Test Consistency:")
        print(f"  Status: {diag['status'].upper()}")
        # Guard against a missing or None gap before formatting it
        if diag.get('avg_performance_gap') is not None:
            print(f"  Average gap: {diag['avg_performance_gap']:.2f}%")

    # Split performance
    if 'split_performance' in results:
        perf = results['split_performance']
        print("\nPerformance Across Splits:")
        for split in ['train', 'val', 'test']:
            if split in perf:
                # Missing metrics default to 0, as in plot_split_performance_comparison
                print(f"  {split:5s}: mAP50={perf[split].get('map50', 0):.3f}, "
                      f"F1={perf[split].get('f1', 0):.3f}")

    # Training results
    if 'diagnosis' in results:
        # Use cell-specific names: rebinding the notebook-level `metrics`
        # (the loaded training metrics) here would break a re-run of the
        # training diagnosis step.
        training_diag = results['diagnosis']
        diag_metrics = training_diag.get('metrics')
        print("\nTraining Performance:")
        print(f"  Status: {training_diag['status'].upper()}")
        if diag_metrics:
            print(f"  Best mAP50: {diag_metrics['best_map50']:.3f}")
            print(f"  Best mAP50-95: {diag_metrics['best_map50_95']:.3f}")
            print(f"  Final mAP50: {diag_metrics['map50']:.3f}")
            print(f"  Val-Train gap: {diag_metrics['val_train_gap']:.1f}%")
        elif 'message' in training_diag:
            # Error-style diagnosis carries only a status and a message
            print(f"  {training_diag['message']}")

print(f"\nModel location: {CONFIG['output_path']}/{CONFIG['experiment_name']}/weights/best.pt")

# YOLOv8x Tank Detection - Baseline Training on Cleaned Dataset

This notebook trains YOLOv8x on the pre-cleaned and clustered tank dataset found in the production folder.

## Table of Contents
- [1 - Packages](#1)
- [2 - Configuration](#2)
- [3 - Dataset Verification Functions](#3)
- [4 - Data Quality Analysis Functions](#4)
- [5 - Training Setup Functions](#5)
- [6 - Loss Monitoring Functions](#6)
- [7 - Training Execution Functions](#7)
- [8 - Performance Analysis Functions](#8)
- [9 - Validation vs Test Comparison Functions](#9)
- [10 - Confusion Matrix Analysis Functions](#10)
- [11 - Diagnostic Functions](#11)
- [12 - Main Execution](#12)
- [13 - Results Verification](#13)
- [14 - Summary](#14)

<a name='1'></a>
## 1 - Packages

<a name='2'></a>
## 2 - Configuration

<a name='3'></a>
## 3 - Dataset Verification Functions

<a name='4'></a>
## 4 - Data Quality Analysis Functions

<a name='5'></a>
## 5 - Training Setup Functions

<a name='6'></a>
## 6 - Loss Monitoring Functions

<a name='7'></a>
## 7 - Training Execution Functions

<a name='8'></a>
## 8 - Performance Analysis Functions

<a name='9'></a>
## 9 - Validation vs Test Comparison Functions

<a name='10'></a>
## 10 - Confusion Matrix Analysis Functions

<a name='11'></a>
## 11 - Diagnostic Functions

<a name='12'></a>
## 12 - Main Execution

<a name='13'></a>
## 13 - Results Verification

<a name='14'></a>
## 14 - Summary