# Comprehensive Handwashing Detection Training Pipeline

**Complete training pipeline using modular Python modules**

This notebook demonstrates:
1. Dataset download (Kaggle WHO6)
2. Data preprocessing (frame extraction)
3. Model training (MobileNetV2, LSTM, GRU)
4. Evaluation and visualization
5. Model comparison

**Runtime**: GPU (recommended for training)

**Expected Duration**: 2-3 hours for complete pipeline

**Author**: Generated with AdaL (https://github.com/sylphai/adal-cli)

**Date**: 2025-12-31

## 1. Setup & Dependencies

In [None]:
# Detect the runtime: the `google.colab` package is only importable on Colab.
try:
    import google.colab  # noqa: F401  (imported purely as an availability probe)
except ImportError:
    IN_COLAB = False
    print("Running locally")
else:
    IN_COLAB = True
    print("Running on Google Colab")

In [None]:
# Mount Google Drive (Colab only)
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    
    # Set working directory
    import os
    WORK_DIR = '/content/drive/MyDrive/handwash_training'
    os.makedirs(WORK_DIR, exist_ok=True)
    %cd {WORK_DIR}
else:
    WORK_DIR = '.'
    print(f"Working directory: {WORK_DIR}")

In [None]:
# Install dependencies
!pip install -q tensorflow==2.15.0
!pip install -q scikit-learn pandas numpy opencv-python-headless
!pip install -q matplotlib seaborn tqdm requests

print("Dependencies installed!")

In [None]:
# Report the TensorFlow version and the GPUs it can see
import tensorflow as tf

# Query the physical device list once and reuse it for both messages.
gpu_devices = tf.config.list_physical_devices('GPU')

print(f"TensorFlow version: {tf.__version__}")
print(f"GPU available: {len(gpu_devices) > 0}")
print(f"GPU devices: {gpu_devices}")

In [None]:
# Import standard libraries
import sys
import json
import logging
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

# Set plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Set random seeds for every RNG this notebook can reach directly.
# Python's built-in `random` is seeded as well as NumPy and TensorFlow.
# NOTE(review): whether the training modules draw from `random` is not
# visible here — seeding it is harmless either way.
import random

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

print("Libraries imported successfully!")

## 2. Clone Training Modules

Clone the modular Python training modules from your repository.

In [None]:
# Clone repository (if not already cloned)
REPO_URL = "https://github.com/AliNikkhah2001/edgeWash.git"
REPO_DIR = Path("edgeWash")

if not REPO_DIR.exists():
    print(f"Cloning repository from {REPO_URL}...")
    !git clone {REPO_URL}
else:
    print(f"Repository already exists: {REPO_DIR}")
    print("Pulling latest changes...")
    !cd {REPO_DIR} && git pull

# Add training modules to Python path
training_dir = REPO_DIR / "training"
if str(training_dir) not in sys.path:
    sys.path.insert(0, str(training_dir))

print(f"Training modules path: {training_dir}")

In [None]:
# Import training modules
import config
import download_datasets
import preprocess_data
import data_generators
import models
import train as train_module
import evaluate

print("Training modules imported successfully!")

## 3. Configuration & Google Drive Paths

View and customize training hyperparameters.

**Important**: Checkpoints, logs, and models will be saved to Google Drive for persistence.

In [None]:
# Override config paths to save to Google Drive (Colab only).
# NOTE(review): this rebinds attributes on the already-imported `config` module;
# any training module that copied these values at import time would not see the
# override — confirm against the training modules.
if IN_COLAB:
    drive_root = Path(WORK_DIR)

    # Update paths to Google Drive
    config.MODELS_DIR = drive_root / 'models'
    config.CHECKPOINTS_DIR = drive_root / 'checkpoints'
    config.LOGS_DIR = drive_root / 'logs'
    config.RESULTS_DIR = drive_root / 'results'

    # Label -> directory, used both for creation and for the report below.
    output_dirs = {
        "Models": config.MODELS_DIR,
        "Checkpoints": config.CHECKPOINTS_DIR,
        "Logs": config.LOGS_DIR,
        "Results": config.RESULTS_DIR,
    }

    # Create directories
    for dir_path in output_dirs.values():
        dir_path.mkdir(parents=True, exist_ok=True)

    # The check mark was mis-encoded ("‚úì") in the original message.
    print("✓ Paths configured to save to Google Drive:")
    for label, dir_path in output_dirs.items():
        print(f"  {label}: {dir_path}")
else:
    print("Running locally - using default paths")

In [None]:
# Print the hyperparameters currently held by the imported `config` module
banner = "=" * 80
print("\n" + banner)
print("TRAINING CONFIGURATION")
print(banner)

# Input / label space
print(f"\nImage size: {config.IMG_SIZE}")
print(f"Sequence length: {config.SEQUENCE_LENGTH}")
print(f"Number of classes: {config.NUM_CLASSES}")
print(f"Class names: {config.CLASS_NAMES}")

# Optimisation settings
print(f"\nBatch size: {config.BATCH_SIZE}")
print(f"Epochs: {config.EPOCHS}")
print(f"Learning rate: {config.LEARNING_RATE}")
print(f"Early stopping patience: {config.PATIENCE}")

# Split ratios, shown as whole percentages
print("\nData split:")
print(f"  Train: {config.TRAIN_RATIO*100:.0f}%")
print(f"  Val:   {config.VAL_RATIO*100:.0f}%")
print(f"  Test:  {config.TEST_RATIO*100:.0f}%")

print("\nAugmentation:")
for option, setting in config.AUGMENTATION_CONFIG.items():
    print(f"  {option}: {setting}")

print("\nModel architectures available:")
for arch_key, arch_cfg in config.MODEL_CONFIGS.items():
    print(f"  - {arch_key}: {arch_cfg['name']}")

## 4. Dataset Download

Download Kaggle WHO6 dataset (~1 GB, quick start).

For full pipeline, also download PSKUS (18 GB) and METC (2 GB) - see commented code below.

In [None]:
# Download Kaggle WHO6 dataset and report whether the helper signalled success.
print("Downloading Kaggle WHO6 dataset...")
success = download_datasets.download_kaggle_dataset()

# The status glyphs were mis-encoded ("‚úì" / "‚úó") in the original messages.
if success:
    print("\n✓ Kaggle dataset ready!")
else:
    print("\n✗ Kaggle dataset download failed!")

In [None]:
# Optional: Download PSKUS and METC datasets (large, requires zenodo-get)
# Uncomment to download:

# # Install zenodo-get
# !pip install zenodo-get

# # Download PSKUS (18 GB, ~30-60 minutes)
# print("Downloading PSKUS Hospital dataset (18 GB)...")
# download_datasets.download_pskus_dataset()

# # Download METC (2 GB, ~5-10 minutes)
# print("Downloading METC Lab dataset (2 GB)...")
# download_datasets.download_metc_dataset()

In [None]:
# Verify datasets: print one line per dataset entry returned by the helper.
status = download_datasets.verify_datasets()

print("\n" + "=" * 80)
print("DATASET VERIFICATION")
print("=" * 80)

# Only the per-dataset info dicts are needed; the mapping keys are not displayed.
for info in status.values():
    # The status glyphs were mis-encoded ("‚úì" / "‚úó") in the original cell.
    status_icon = "✓" if info['exists'] else "✗"
    print(f"{status_icon} {info['name']}: {info['num_files']} files")

## 5. Data Preprocessing

Extract frames from videos and create train/val/test splits.

In [None]:
# Preprocess Kaggle dataset
print("Preprocessing Kaggle dataset...")
print("This may take 5-10 minutes...\n")

result = preprocess_data.preprocess_all_datasets(
    use_kaggle=True,
    use_pskus=False,  # Set True if PSKUS downloaded
    use_metc=False    # Set True if METC downloaded
)

# A truthy result is treated as a mapping of output name -> path.
# The status glyphs were mis-encoded ("‚úì" / "‚úó") in the original messages.
if result:
    print("\n✓ Preprocessing complete!")
    print("\nProcessed files:")
    for key, path in result.items():
        print(f"  {key}: {path}")
else:
    print("\n✗ Preprocessing failed!")

## 6. Data Exploration

Visualize dataset statistics and sample frames.

In [None]:
# Read the three split CSVs written by preprocessing
splits_dir = config.PROCESSED_DIR
train_df = pd.read_csv(splits_dir / 'train.csv')
val_df = pd.read_csv(splits_dir / 'val.csv')
test_df = pd.read_csv(splits_dir / 'test.csv')

# Each CSV row is one frame; `video_id` groups frames by source video.
n_train_videos = train_df['video_id'].nunique(dropna=False)
n_val_videos = val_df['video_id'].nunique(dropna=False)
n_test_videos = test_df['video_id'].nunique(dropna=False)

print("Dataset sizes:")
print(f"  Train: {len(train_df)} frames ({n_train_videos} videos)")
print(f"  Val:   {len(val_df)} frames ({n_val_videos} videos)")
print(f"  Test:  {len(test_df)} frames ({n_test_videos} videos)")

In [None]:
# Bar chart of frames per class, one panel per split
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

split_frames = [(train_df, 'Train'), (val_df, 'Val'), (test_df, 'Test')]

for ax, (df, split_name) in zip(axes, split_frames):
    class_counts = df['class_name'].value_counts()
    positions = range(len(class_counts))

    # Tick labels keep only the text after the last underscore of each class name.
    short_labels = [cn.split('_')[-1] for cn in class_counts.index]

    ax.bar(positions, class_counts.values)
    ax.set_title(f'{split_name} Set - Class Distribution', fontsize=12)
    ax.set_xlabel('Class', fontsize=10)
    ax.set_ylabel('Number of Frames', fontsize=10)
    ax.set_xticks(positions)
    ax.set_xticklabels(short_labels, rotation=45, ha='right')
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('class_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Train All Models

Train all 3 model architectures sequentially:
1. **MobileNetV2**: Frame-based classifier (fast inference)
2. **LSTM**: Temporal sequence model (context-aware)
3. **GRU**: Alternative temporal model (faster than LSTM)

Best weights for each model will be saved to Google Drive.

In [None]:
# Training configuration
EPOCHS = 20  # Increase to 50 for production
MODELS_TO_TRAIN = ['mobilenetv2', 'lstm', 'gru']

print("=" * 80)
# The model count is derived from the list so the banner stays correct
# when MODELS_TO_TRAIN is edited.
print(f"TRAINING PIPELINE: All {len(MODELS_TO_TRAIN)} Models")
print("=" * 80)
print(f"\nModels: {', '.join(m.upper() for m in MODELS_TO_TRAIN)}")
print(f"Epochs: {EPOCHS}")
print(f"Checkpoints will be saved to: {config.CHECKPOINTS_DIR}")
print(f"Final models will be saved to: {config.MODELS_DIR}")
print("\n" + "=" * 80)

In [None]:
# Train all models listed in MODELS_TO_TRAIN, one after another.
# Results are collected in `training_results`, keyed by model type.
training_results = {}
total_models = len(MODELS_TO_TRAIN)

# enumerate() supplies the progress counter directly; the previous
# list.index() lookup would report the wrong position for duplicate entries,
# and the total was hard-coded as 3.
for position, model_type in enumerate(MODELS_TO_TRAIN, start=1):
    print(f"\n\n{'='*80}")
    print(f"TRAINING MODEL {position}/{total_models}: {model_type.upper()}")
    print('='*80)

    # Adjust batch size for sequence models
    batch_size = 32 if model_type == 'mobilenetv2' else 16

    # Train model
    result = train_module.train_model(
        model_type=model_type,
        train_csv=config.PROCESSED_DIR / 'train.csv',
        val_csv=config.PROCESSED_DIR / 'val.csv',
        batch_size=batch_size,
        epochs=EPOCHS,
        learning_rate=config.LEARNING_RATE
    )

    training_results[model_type] = result

    # Display summary: `best_epoch` is used as a 0-based index into the history lists.
    best_epoch = result['best_epoch']
    best_val_acc = result['history']['val_accuracy'][best_epoch]
    best_val_loss = result['history']['val_loss'][best_epoch]

    # The check mark was mis-encoded ("‚úì") in the original message.
    print(f"\n✓ {model_type.upper()} training complete!")
    print(f"  Best epoch: {best_epoch + 1}")
    print(f"  Best val accuracy: {best_val_acc:.4f}")
    print(f"  Best val loss: {best_val_loss:.4f}")
    print(f"  Final model saved: {result['final_model_path']}")

print(f"\n\n{'='*80}")
print("ALL MODELS TRAINED SUCCESSFULLY")
print('='*80)

## 8. Training Visualization

Compare training curves across all models. This section has no plotting cell; the curves can be inspected with TensorBoard (Section 10).

## 9. Evaluate All Models

Evaluate all trained models on test set.

In [None]:
# Evaluate every trained model on the test split.
# Results are collected in `evaluation_results`, keyed by model type.
evaluation_results = {}

print("=" * 80)
print("EVALUATING ALL MODELS ON TEST SET")
print("=" * 80)

for model_type in MODELS_TO_TRAIN:
    print(f"\nEvaluating {model_type.upper()}...")

    # Same batch-size rule as the training cell.
    batch_size = 32 if model_type == 'mobilenetv2' else 16

    eval_results = evaluate.evaluate_model(
        model_path=training_results[model_type]['final_model_path'],
        test_csv=config.PROCESSED_DIR / 'test.csv',
        model_type=model_type,
        batch_size=batch_size,
        save_results=True
    )

    evaluation_results[model_type] = eval_results

    # The check mark was mis-encoded ("‚úì") in the original message.
    print(f"✓ {model_type.upper()} evaluation complete!")
    print(f"  Accuracy: {eval_results['accuracy']:.4f}")
    print(f"  F1-Score: {eval_results['f1_score']:.4f}")

print("\n" + "=" * 80)
print("ALL EVALUATIONS COMPLETE")
print("=" * 80)

In [None]:
# Print overall and per-class test metrics for every evaluated model
rule = "=" * 80

for model_type in MODELS_TO_TRAIN:
    model_metrics = evaluation_results[model_type]

    print("\n" + rule)
    print(f"{model_type.upper()} - TEST SET METRICS")
    print(rule)

    print("\nOverall Metrics:")
    print(f"  Accuracy:       {model_metrics['accuracy']:.4f}")
    print(f"  Top-2 Accuracy: {model_metrics['top2_accuracy']:.4f}")
    print(f"  Precision:      {model_metrics['precision']:.4f}")
    print(f"  Recall:         {model_metrics['recall']:.4f}")
    print(f"  F1-Score:       {model_metrics['f1_score']:.4f}")

    # Per-class entries are looked up by class name and read via the 'f1-score' key.
    print("\nPer-Class F1-Scores:")
    per_class = model_metrics['per_class_metrics']
    for class_name in config.CLASS_NAMES:
        class_f1 = per_class[class_name]['f1-score']
        print(f"  {class_name}: {class_f1:.4f}")

## 10. TensorBoard

Launch TensorBoard to view training logs.

In [None]:
# Load TensorBoard extension (Jupyter/Colab).
# This registers the `%tensorboard` magic used by the launch cells.
%load_ext tensorboard

In [None]:
# Launch TensorBoard inline, pointed at the log directory from `config`.
# `{config.LOGS_DIR}` is expanded by IPython before the magic runs.
%tensorboard --logdir {config.LOGS_DIR}

## 11. Model Comparison

Compare all 3 models with comprehensive visualizations.

In [None]:
# Create model comparison visualization
print("=" * 80)
print("GENERATING MODEL COMPARISON PLOTS")
print("=" * 80)

# Call compare_models from evaluate module, passing the per-model evaluation
# results and the path the plot should be saved to.
comparison_path = config.RESULTS_DIR / 'model_comparison.png'
evaluate.compare_models(
    evaluation_results,
    save_path=comparison_path
)

# The check mark was mis-encoded ("‚úì") in the original message.
print(f"\n✓ Comparison plot saved: {comparison_path}")

In [None]:
# Show the saved comparison figure inline, if it was written
from IPython.display import Image, display

if not comparison_path.exists():
    print("Comparison plot not found!")
else:
    display(Image(filename=str(comparison_path)))

In [None]:
# Create summary table: one row per model, metrics formatted to 4 decimals.
# (pandas is already imported as `pd` in the notebook's import cell.)
summary_data = [
    {
        'Model': model_type.upper(),
        'Accuracy': f"{evaluation_results[model_type]['accuracy']:.4f}",
        'Top-2 Acc': f"{evaluation_results[model_type]['top2_accuracy']:.4f}",
        'Precision': f"{evaluation_results[model_type]['precision']:.4f}",
        'Recall': f"{evaluation_results[model_type]['recall']:.4f}",
        'F1-Score': f"{evaluation_results[model_type]['f1_score']:.4f}"
    }
    for model_type in MODELS_TO_TRAIN
]

summary_df = pd.DataFrame(summary_data)

print("\n" + "=" * 80)
print("MODEL COMPARISON SUMMARY")
print("=" * 80)
print("\n" + summary_df.to_string(index=False))

# Save summary
summary_path = config.RESULTS_DIR / 'model_comparison_summary.csv'
summary_df.to_csv(summary_path, index=False)
# The check mark was mis-encoded ("‚úì") in the original message.
print(f"\n✓ Summary saved: {summary_path}")

In [None]:
# Identify best model: the entry of `evaluation_results` with the highest F1-score.
# `best_model` is a (model_type, metrics) pair.
best_model = max(evaluation_results.items(), key=lambda x: x[1]['f1_score'])
best_model_name = best_model[0]
best_f1 = best_model[1]['f1_score']

print("\n" + "=" * 80)
print("BEST MODEL")
print("=" * 80)
# The trophy emoji was mis-encoded ("üèÜ") in the original message.
print(f"\n🏆 {best_model_name.upper()} achieved the highest F1-Score: {best_f1:.4f}")
print(f"\nAll metrics for {best_model_name.upper()}:")
for metric, value in best_model[1].items():
    # Only scalar float metrics are printed; other entries are skipped.
    if isinstance(value, float):
        print(f"  {metric}: {value:.4f}")

## 12. Saved Models & Checkpoints

Summary of all saved model weights and checkpoints on Google Drive.

In [None]:
# List where each model's final weights and best checkpoint were written
rule = "=" * 80
print(rule)
print("SAVED MODEL WEIGHTS (Google Drive)")
print(rule)

print(f"\nModels directory: {config.MODELS_DIR}")
print(f"Checkpoints directory: {config.CHECKPOINTS_DIR}")

print("\nFinal Model Weights:")
for model_type in MODELS_TO_TRAIN:
    artifacts = training_results[model_type]
    model_path = artifacts['final_model_path']
    checkpoint_path = artifacts['best_checkpoint_path']

    print(f"\n{model_type.upper()}:")
    print(f"  Final model: {model_path}")
    print(f"  Best checkpoint: {checkpoint_path}")

    # Report the on-disk size only when the final model file is present.
    model_file = Path(model_path)
    if model_file.exists():
        size_mb = model_file.stat().st_size / (1024 * 1024)
        print(f"  Model size: {size_mb:.2f} MB")

print("\n" + rule)
print("All model weights are saved to Google Drive!")
print("They will persist even if Colab runtime disconnects.")
print(rule)

## 13. Summary & Next Steps

Complete training pipeline finished!

In [None]:
# Final recap: what was trained, which model won, and where the outputs live.
print("=" * 80)
print("TRAINING PIPELINE COMPLETE")
print("=" * 80)

# The check marks were mis-encoded ("‚úì") in the original messages.
print(f"\n✓ Trained {len(MODELS_TO_TRAIN)} models: {', '.join(m.upper() for m in MODELS_TO_TRAIN)}")
print("✓ All models evaluated on test set")
print(f"✓ Best model: {best_model_name.upper()} (F1: {best_f1:.4f})")

print("\nResults saved to Google Drive:")
print(f"  - Models: {config.MODELS_DIR}")
print(f"  - Checkpoints: {config.CHECKPOINTS_DIR}")
print(f"  - Logs: {config.LOGS_DIR}")
print(f"  - Evaluation results: {config.RESULTS_DIR}")
print(f"  - Comparison plot: {comparison_path}")
print(f"  - Summary CSV: {summary_path}")

print("\nNext steps:")
print("  1. Fine-tune best model with more epochs (50+)")
print("  2. Download larger datasets (PSKUS, METC) for better accuracy")
print("  3. Experiment with different augmentation strategies")
print("  4. Analyze per-class performance and address weak classes")
print("  5. Export best model to TFLite for mobile deployment")
print("  6. Create inference demo with real-time video")

In [None]:
# Launch TensorBoard on the log directory from `config`.
# NOTE(review): this repeats the launch command from the TensorBoard section
# earlier in the notebook; it relies on `%load_ext tensorboard` having run.
%tensorboard --logdir {config.LOGS_DIR}

## 14. Optional: Train Additional Models

Retrain a single LSTM or GRU model on its own (both are already trained in Section 7). These temporal models require sequence data.

In [None]:
# Uncomment to train LSTM model

# lstm_result = train_module.train_model(
#     model_type='lstm',
#     train_csv=config.PROCESSED_DIR / 'train.csv',
#     val_csv=config.PROCESSED_DIR / 'val.csv',
#     batch_size=16,  # Reduce batch size for sequence models
#     epochs=20,
#     learning_rate=config.LEARNING_RATE
# )

# print("\n✓ LSTM training complete!")

In [None]:
# Uncomment to train GRU model

# gru_result = train_module.train_model(
#     model_type='gru',
#     train_csv=config.PROCESSED_DIR / 'train.csv',
#     val_csv=config.PROCESSED_DIR / 'val.csv',
#     batch_size=16,
#     epochs=20,
#     learning_rate=config.LEARNING_RATE
# )

# print("\n✓ GRU training complete!")

## 15. Model Comparison

Compare multiple models (if trained).

In [None]:
# Example: Compare MobileNetV2, LSTM, GRU
# Uncomment if you trained multiple models

# model_results = {
#     'MobileNetV2': evaluation_results['mobilenetv2'],
#     'LSTM': evaluate.evaluate_model(
#         model_path=str(config.MODELS_DIR / 'lstm_final.keras'),
#         test_csv=config.PROCESSED_DIR / 'test.csv',
#         model_type='lstm',
#         batch_size=16,
#         save_results=True
#     ),
#     'GRU': evaluate.evaluate_model(
#         model_path=str(config.MODELS_DIR / 'gru_final.keras'),
#         test_csv=config.PROCESSED_DIR / 'test.csv',
#         model_type='gru',
#         batch_size=16,
#         save_results=True
#     )
# }

# # Create comparison plot
# evaluate.compare_models(
#     model_results,
#     save_path=config.RESULTS_DIR / 'model_comparison.png'
# )

# display(Image(filename=str(config.RESULTS_DIR / 'model_comparison.png')))

## 16. Summary & Next Steps

Training pipeline complete!

In [None]:
# Per-model recap of saved artifacts and headline test metrics.
# The previous version of this cell referenced `MODEL_TYPE`, which is never
# defined in this notebook (NameError on Run All), and read the leftover loop
# variables `result` / `eval_results`, which only hold the last model. It now
# reports every model from `training_results` / `evaluation_results`.
print("=" * 80)
print("TRAINING PIPELINE COMPLETE")
print("=" * 80)

for model_type in MODELS_TO_TRAIN:
    model_eval = evaluation_results[model_type]
    # NOTE(review): the per-model results layout (RESULTS_DIR/<model_type>/...)
    # is carried over from the original cell — confirm against evaluate.py.
    model_results_dir = config.RESULTS_DIR / model_type

    print(f"\nTrained model: {model_type}")
    print(f"Model saved: {training_results[model_type]['final_model_path']}")
    print(f"\nTest Accuracy: {model_eval['accuracy']:.4f}")
    print(f"Test F1-Score: {model_eval['f1_score']:.4f}")

    print("\nResults saved to:")
    print(f"  - Confusion matrix: {model_results_dir / 'confusion_matrix.png'}")
    print(f"  - Classification report: {model_results_dir / 'classification_report.txt'}")
    print(f"  - Metrics CSV: {model_results_dir / 'metrics.csv'}")

print(f"\nTensorBoard logs: {config.LOGS_DIR}")
print(f"Checkpoints: {config.CHECKPOINTS_DIR}")

print("\nNext steps:")
print("  1. Fine-tune model with more epochs (50+)")
print("  2. Train temporal models (LSTM/GRU) for sequence modeling")
print("  3. Download larger datasets (PSKUS, METC) for better accuracy")
print("  4. Experiment with different augmentation strategies")
print("  5. Export model to TFLite for mobile deployment")