# Comprehensive Handwashing Detection Training Pipeline

**Complete training pipeline built from reusable Python modules**

This notebook demonstrates:
1. Dataset download (Kaggle WHO6)
2. Data preprocessing (frame extraction)
3. Model training (MobileNetV2)
4. Evaluation and visualization
5. Model comparison

**Runtime**: GPU (recommended for training)

**Expected Duration**: 2-3 hours for complete pipeline

**Author**: Generated with AdaL (https://github.com/sylphai/adal-cli)

**Date**: 2025-12-31

## 1. Setup & Dependencies

In [None]:
# Detect the runtime: a successful import of google.colab is treated as
# "we are on Colab"; an ImportError means a local/other Jupyter environment.
try:
    import google.colab
except ImportError:
    IN_COLAB = False
    print("Running locally")
else:
    IN_COLAB = True
    print("Running on Google Colab")

In [None]:
# Mount Google Drive (Colab only)
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    
    # Set working directory
    import os
    WORK_DIR = '/content/drive/MyDrive/handwash_training'
    os.makedirs(WORK_DIR, exist_ok=True)
    %cd {WORK_DIR}
else:
    WORK_DIR = '.'
    print(f"Working directory: {WORK_DIR}")

In [None]:
# Install dependencies (TensorFlow preinstalled on Colab)
!pip install -q scikit-learn pandas numpy opencv-python-headless
!pip install -q matplotlib seaborn tqdm requests nbformat

print("Dependencies installed!")

In [None]:
# Report the TensorFlow version and the GPUs TensorFlow can see
import tensorflow as tf

# Query the device list once and reuse it for both report lines
gpu_devices = tf.config.list_physical_devices('GPU')

print(f"TensorFlow version: {tf.__version__}")
print(f"GPU available: {len(gpu_devices) > 0}")
print(f"GPU devices: {gpu_devices}")

In [None]:
# Import standard libraries
import sys
import json
import logging
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

# Set style
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*'.
# Fall back to the pre-3.6 name so this cell also runs on older installs.
try:
    plt.style.use('seaborn-v0_8-darkgrid')
except OSError:
    plt.style.use('seaborn-darkgrid')
sns.set_palette("husl")

# Set random seeds
# One constant feeds every RNG. Python's built-in `random` is seeded as well,
# in case any of the training modules draw from it (not visible from here).
import random

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

print("Libraries imported successfully!")

## 2. Clone Training Modules

Clone the repository that contains the Python training modules. To train from a fork, change `REPO_URL` in the next cell.

In [None]:
# Clone repository (if not already cloned) and set repo root
import os, sys
from pathlib import Path

REPO_URL = 'https://github.com/AliNikkhah2001/edgeWash.git'
# Try common locations under Drive to avoid nested paths
PREFERRED_DIR = Path('/content/drive/MyDrive/edgeWash')
if 'WORK_DIR' in globals():
    PREFERRED_DIR = Path(WORK_DIR) / 'edgeWash'
PREFERRED_DIR.mkdir(parents=True, exist_ok=True)
os.chdir(PREFERRED_DIR)

REPO_DIR = Path('edgeWash')
if not REPO_DIR.exists():
    print(f'Cloning repository from {REPO_URL}...')
    !git clone {REPO_URL}
else:
    print(f'Repository already exists: {REPO_DIR}')
    print('Pulling latest changes...')
    !cd {REPO_DIR} && git pull

# Enter repo root
os.chdir(REPO_DIR)
repo_root = Path.cwd()
print('Repo root:', repo_root)

# Add training modules to Python path
training_dir = repo_root / 'training'
if str(training_dir) not in sys.path:
    sys.path.insert(0, str(training_dir))
print('Training modules path:', training_dir)


In [None]:
# Import training modules with robust path resolution
import sys, os
from pathlib import Path

def ensure_repo_root(extra_candidates=()):
    """Locate the repository checkout, switch into it and expose its modules.

    A directory counts as the repo root when it contains ``training/config.py``.
    The first matching candidate becomes the working directory and its
    ``training`` folder is inserted at the front of ``sys.path`` (unless it is
    already listed there).

    Args:
        extra_candidates: Optional iterable of additional directories (str or
            Path) to try before the built-in locations.

    Returns:
        Tuple ``(repo_root, training_dir)`` of ``Path`` objects.

    Raises:
        FileNotFoundError: If no candidate contains ``training/config.py``.
    """
    cwd = Path.cwd()
    drive = Path('/content/drive/MyDrive')
    candidates = [
        *(Path(p) for p in extra_candidates),
        cwd,
        Path('/content/edgeWash'),
        drive / 'edgeWash',
        drive / 'handwash_training',
        drive / 'handwash_training_colab' / 'edgeWash',
        drive / 'handwash_training_colab' / 'edgeWash' / 'edgeWash',
        # Layout produced by the clone cell on Colab: <WORK_DIR>/edgeWash/edgeWash
        drive / 'handwash_training' / 'edgeWash' / 'edgeWash',
        # Same layout relative to the current directory (local runs, or when
        # the clone cell was skipped after a kernel restart)
        cwd / 'edgeWash',
        cwd / 'edgeWash' / 'edgeWash',
    ]
    for c in candidates:
        if (c / 'training' / 'config.py').exists():
            os.chdir(c)
            td = c / 'training'
            if str(td) not in sys.path:
                sys.path.insert(0, str(td))
            print('Using repo root:', c)
            print('Added to sys.path:', td)
            return c, td
    searched = ', '.join(str(c) for c in candidates)
    raise FileNotFoundError(
        'Cannot locate training/config.py; check your repo path. '
        f'Searched: {searched}'
    )

# Locate the checkout, make it the working directory and add its `training/`
# folder to sys.path (all done inside ensure_repo_root).
repo_root, training_dir = ensure_repo_root()

# Project modules imported by bare name; they resolve through the `training/`
# directory that ensure_repo_root() added to sys.path.
import config
import download_datasets
import preprocess_data
import data_generators
import models
import train as train_module  # aliased to `train_module` for use in this notebook
import evaluate

print('Training modules imported successfully!')


## 3. Configuration & Google Drive Paths

View and customize training hyperparameters.

**Important**: On Colab, datasets, checkpoints, logs, models, and results are saved to Google Drive so they persist across sessions. Local runs keep the default paths from `config`.

In [None]:
# Redirect the config module's storage locations to Google Drive (Colab only)
if not IN_COLAB:
    print("Running locally - using default paths")
else:
    drive_root = Path(WORK_DIR)
    datasets_root = drive_root / 'datasets'

    # Attribute name on `config` -> new location under the Drive working dir
    path_overrides = {
        'WORK_DIR': drive_root,
        'DATA_DIR': datasets_root,
        'RAW_DIR': datasets_root / 'raw',
        'PROCESSED_DIR': datasets_root / 'processed',
        'MODELS_DIR': drive_root / 'models',
        'CHECKPOINTS_DIR': drive_root / 'checkpoints',
        'LOGS_DIR': drive_root / 'logs',
        'RESULTS_DIR': drive_root / 'results',
    }
    for attr_name, new_path in path_overrides.items():
        setattr(config, attr_name, new_path)

    # Create every directory except WORK_DIR itself, which comes into
    # existence as a parent of the others
    for attr_name, dir_path in path_overrides.items():
        if attr_name == 'WORK_DIR':
            continue
        dir_path.mkdir(parents=True, exist_ok=True)

    print("✓ Paths configured to save to Google Drive:")
    print(f"  Data: {config.DATA_DIR}")
    print(f"  Models: {config.MODELS_DIR}")
    print(f"  Checkpoints: {config.CHECKPOINTS_DIR}")
    print(f"  Logs: {config.LOGS_DIR}")
    print(f"  Results: {config.RESULTS_DIR}")

In [None]:
# Print a summary of the hyperparameters held by the config module
banner = "=" * 80
print("\n" + banner)
print("TRAINING CONFIGURATION")
print(banner)

# (label, config attribute) pairs; a leading "\n" starts a new group.
# Attributes are looked up lazily, one per printed line.
scalar_settings = (
    ("\nImage size", "IMG_SIZE"),
    ("Sequence length", "SEQUENCE_LENGTH"),
    ("Number of classes", "NUM_CLASSES"),
    ("Class names", "CLASS_NAMES"),
    ("\nBatch size", "BATCH_SIZE"),
    ("Epochs", "EPOCHS"),
    ("Learning rate", "LEARNING_RATE"),
    ("Early stopping patience", "PATIENCE"),
)
for setting_label, setting_attr in scalar_settings:
    print(f"{setting_label}: {getattr(config, setting_attr)}")

print("\nData split:")
# Labels are padded so the percentages line up
for split_label, ratio_attr in (("Train:", "TRAIN_RATIO"), ("Val:  ", "VAL_RATIO"), ("Test: ", "TEST_RATIO")):
    print(f"  {split_label} {getattr(config, ratio_attr)*100:.0f}%")

print("\nAugmentation:")
for key, value in config.AUGMENTATION_CONFIG.items():
    print(f"  {key}: {value}")

print("\nModel architectures available:")
for model_name, model_config in config.MODEL_CONFIGS.items():
    print(f"  - {model_name}: {model_config['name']}")

## 4. Dataset Download

Download Kaggle WHO6 dataset (~1 GB, quick start).

For the full pipeline, also download PSKUS (18 GB) and METC (2 GB) — see the commented-out code below.

In [None]:
# Fetch the Kaggle WHO6 dataset and report whether the helper succeeded
print("Downloading Kaggle WHO6 dataset...")
success = download_datasets.download_kaggle_dataset()

# The helper's return value is used as a truthy/falsy success flag
print("\n✓ Kaggle dataset ready!" if success else "\n✗ Kaggle dataset download failed!")

In [None]:
# Optional: Download PSKUS and METC datasets (large, requires zenodo-get)
# Uncomment to download:

# # Install zenodo-get
# !pip install zenodo-get

# # Download PSKUS (18 GB, ~30-60 minutes)
# print("Downloading PSKUS Hospital dataset (18 GB)...")
# download_datasets.download_pskus_dataset()

# # Download METC (2 GB, ~5-10 minutes)
# print("Downloading METC Lab dataset (2 GB)...")
# download_datasets.download_metc_dataset()

In [None]:
# Ask the download module which datasets are present, then print one line each
status = download_datasets.verify_datasets()

rule = "=" * 80
print("\n" + rule)
print("DATASET VERIFICATION")
print(rule)

for dataset_name, info in status.items():
    # Check mark when the entry's 'exists' flag is truthy, cross otherwise
    if info['exists']:
        status_icon = "✓"
    else:
        status_icon = "✗"
    print(f"{status_icon} {info['name']}: {info['num_files']} files")

## 5. Data Preprocessing

Extract frames from videos and create train/val/test splits.

In [None]:
# Run preprocessing for the selected datasets and list the files it reports
print("Preprocessing Kaggle dataset...")
print("This may take 5-10 minutes...\n")

dataset_selection = dict(
    use_kaggle=True,
    use_pskus=False,  # Set True if PSKUS downloaded
    use_metc=False,   # Set True if METC downloaded
)
result = preprocess_data.preprocess_all_datasets(**dataset_selection)

# A falsy result is reported as a failure
if not result:
    print("\n✗ Preprocessing failed!")
else:
    print("\n✓ Preprocessing complete!")
    print("\nProcessed files:")
    for key, path in result.items():
        print(f"  {key}: {path}")

## 6. Data Exploration

Visualize dataset statistics and sample frames.

In [None]:
# Read the three split CSVs written to config.PROCESSED_DIR
processed_dir = config.PROCESSED_DIR
train_df = pd.read_csv(processed_dir / 'train.csv')
val_df = pd.read_csv(processed_dir / 'val.csv')
test_df = pd.read_csv(processed_dir / 'test.csv')

# One summary line per split: row count plus number of distinct video_id values
print("Dataset sizes:")
for split_label, split_frame in (("Train:", train_df), ("Val:  ", val_df), ("Test: ", test_df)):
    n_videos = len(split_frame['video_id'].unique())
    print(f"  {split_label} {len(split_frame)} frames ({n_videos} videos)")

In [None]:
# Class distribution
# Use one fixed class order for all three panels. value_counts() on its own
# sorts by frequency, which would place different classes at the same x
# position in each split and make the panels hard to compare. Classes absent
# from a split are shown as zero-height bars rather than dropped.
class_order = sorted(
    pd.concat([train_df['class_name'], val_df['class_name'], test_df['class_name']])
    .dropna()
    .unique()
)

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for idx, (df, split_name) in enumerate([(train_df, 'Train'), (val_df, 'Val'), (test_df, 'Test')]):
    class_counts = df['class_name'].value_counts().reindex(class_order, fill_value=0)

    axes[idx].bar(range(len(class_counts)), class_counts.values)
    axes[idx].set_title(f'{split_name} Set - Class Distribution', fontsize=12)
    axes[idx].set_xlabel('Class', fontsize=10)
    axes[idx].set_ylabel('Number of Frames', fontsize=10)
    axes[idx].set_xticks(range(len(class_counts)))
    # Tick label is the part of the class name after the last underscore
    axes[idx].set_xticklabels([cn.split('_')[-1] for cn in class_counts.index], rotation=45, ha='right')
    axes[idx].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('class_distribution.png', dpi=150, bbox_inches='tight')
plt.show()