In [None]:
!pip install kagglehub tqdm h5py scikit-learn tensorflow opencv-python matplotlib 


In [None]:
# ==== Cell 2: Import libraries ====
import os
import sys
import glob
import zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Dropout, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TensorBoard
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.vgg16 import preprocess_input
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import cv2
import time
import datetime
import h5py
import shutil
from tqdm import tqdm
import requests
import kagglehub
import tensorflow.keras.backend as K
import re

In [None]:
# ==== New Cell: GPU Configuration ====
def configure_gpus():
    """
    Configure GPUs and set up distributed training strategy.

    Memory growth is enabled on every visible GPU on a best-effort basis: a
    failure is reported (with its reason) and configuration continues.

    Returns:
        tf.distribute.Strategy: ``MirroredStrategy`` when more than one GPU is
        visible, otherwise TensorFlow's current default strategy.
    """
    print("=" * 50)
    print("Configuring GPUs for distributed training...")

    # Check available GPUs
    gpus = tf.config.list_physical_devices('GPU')
    print(f"Number of GPUs available: {len(gpus)}")

    for gpu in gpus:
        print(f"Name: {gpu.name}, Type: {gpu.device_type}")

        # Configure memory growth to avoid allocating all memory at once.
        # Only the documented failure modes are caught so that interrupts and
        # unexpected errors are not silently swallowed.
        try:
            tf.config.experimental.set_memory_growth(gpu, True)
            print(f"Memory growth set to True for {gpu.name}")
        except (RuntimeError, ValueError) as e:
            print(f"Failed to set memory growth for {gpu.name}: {e}")

    # Choose the appropriate distribution strategy
    if len(gpus) > 1:
        # MirroredStrategy for multiple GPUs on a single machine
        strategy = tf.distribute.MirroredStrategy()
        print(f"Using MirroredStrategy with {strategy.num_replicas_in_sync} GPUs")
    else:
        # Default strategy for a single GPU or a CPU-only session
        strategy = tf.distribute.get_strategy()
        if gpus:
            print("Using default strategy (single GPU)")
        else:
            print("Using default strategy (no GPU detected)")

    # Log device placement for debugging
    tf.debugging.set_log_device_placement(False)  # Set to True for detailed device placement logs

    return strategy


In [None]:


# Configuration
# Reproducibility: NumPy and TensorFlow are seeded with the same value.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Directory structure for Kaggle (everything lives under the writable working dir)
DATA_DIR = '/kaggle/working/vggface2_data'
TRAIN_DIR = os.path.join(DATA_DIR, 'train')                      # raw training images
VAL_DIR = os.path.join(DATA_DIR, 'val')                          # raw validation images
PROCESSED_TRAIN_DIR = os.path.join(DATA_DIR, 'processed_train')  # resized training split
PROCESSED_VAL_DIR = os.path.join(DATA_DIR, 'processed_val')      # resized validation split

# VGG model paths - the pretrained feature model comes from an attached input dataset
VGG_MODEL_FOLDER = '/kaggle/input/vgg_model/tensorflow2/default/1'
VGG_FEATURE_MODEL_PATH = os.path.join(VGG_MODEL_FOLDER, 'vggface_features.h5')

# Fine-tuned model paths (outputs of this notebook)
FINETUNED_MODEL_FOLDER = '/kaggle/working/vgg_finetuned'
FINETUNED_FULL_MODEL_PATH = os.path.join(FINETUNED_MODEL_FOLDER, 'vggface_finetuned_full.h5')
FINETUNED_FEATURE_MODEL_PATH = os.path.join(FINETUNED_MODEL_FOLDER, 'vggface_finetuned_features.h5')

# Training parameters
BATCH_SIZE = 64            # per-replica batch size
EPOCHS = 100
LEARNING_RATE = 1e-4
IMG_SIZE = (224, 224)
VALIDATION_SPLIT = 0.2     # fraction of each class held out for validation

# Create directories if they don't exist
for _required_dir in (DATA_DIR, FINETUNED_MODEL_FOLDER, PROCESSED_TRAIN_DIR, PROCESSED_VAL_DIR):
    os.makedirs(_required_dir, exist_ok=True)

# Confirm paths
print(f"Using VGG model from: {VGG_FEATURE_MODEL_PATH}")


In [None]:
# ==== Cell 5: Modified dataset function for Kaggle ====
def _link_dataset_dir(source_dir, target_dir, label):
    """
    Best-effort: expose ``source_dir`` at ``target_dir`` through a symbolic link.

    An already populated ``target_dir`` is left untouched. An empty placeholder
    directory or a stale link is removed first, because ``os.symlink`` refuses
    to overwrite an existing path.

    Args:
        source_dir (str): Existing directory holding the data.
        target_dir (str): Path at which the data should become visible.
        label (str): Human-readable name used in log messages.

    Returns:
        bool: True if ``target_dir`` is usable afterwards, False otherwise.
    """
    try:
        if os.path.lexists(target_dir):
            if os.path.isdir(target_dir) and os.listdir(target_dir):
                print(f"{target_dir} already contains data. Leaving it untouched.")
                return True
            if os.path.islink(target_dir):
                os.unlink(target_dir)
            else:
                os.rmdir(target_dir)

        parent_dir = os.path.dirname(target_dir)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        os.symlink(source_dir, target_dir)
    except OSError as e:
        # No copy fallback on purpose: copying the whole dataset is very slow.
        print(f"Symbolic link creation failed for {label} directory: {e}")
        print("You may need to copy files manually (consider using only a subset of the data for testing).")
        return False

    print(f"Created symbolic link for {label} directory")
    return True


def download_vggface2_dataset(manual_path=None):
    """
    Set up the VGGFace2 dataset in Kaggle environment.

    The dataset location is resolved in this order: ``manual_path`` (when it
    exists), the first ``/kaggle/input/*vggface2*`` match, and finally an
    interactive prompt. The data is then linked (not copied) into
    ``TRAIN_DIR`` / ``VAL_DIR``.

    Args:
        manual_path (str, optional): Path to manually added dataset in Kaggle.
            Takes precedence over auto-discovered datasets.

    Returns:
        str: Path to the dataset files (``DATA_DIR``).

    Raises:
        SystemExit: If no valid dataset path can be determined.
    """
    print("=" * 50)
    print("Setting up VGGFace2 dataset...")

    # Check if dataset directories already exist
    if os.path.exists(TRAIN_DIR) and os.path.exists(VAL_DIR):
        print("Dataset directories already exist. Skipping setup.")
        return DATA_DIR

    # In Kaggle, datasets are typically in /kaggle/input/[dataset-name]
    kaggle_dataset_paths = glob.glob('/kaggle/input/*vggface2*')

    if manual_path and os.path.exists(manual_path):
        # An explicit choice by the caller wins over auto-discovery.
        print(f"Using manually specified dataset at: {manual_path}")
        dataset_path = manual_path
    elif kaggle_dataset_paths:
        dataset_path = kaggle_dataset_paths[0]
        print(f"Found VGGFace2 dataset at: {dataset_path}")
    else:
        print("VGGFace2 dataset not found in Kaggle inputs.")
        print("Please add the dataset to your notebook using the 'Add Data' button.")
        print("For VGGFace2, search for 'hearfool/vggface2' or add your own dataset.")

        # Provide instructions for adding data in Kaggle
        print("\nInstructions:")
        print("1. Click '+Add data' at the top right of your notebook")
        print("2. Search for 'hearfool/vggface2' or the dataset you want to use")
        print("3. Click 'Add' and then run this cell again")

        # Ask for manual path as fallback
        manual_input = input("Or enter path to the dataset if already added: ")
        if manual_input and os.path.exists(manual_input):
            dataset_path = manual_input
        else:
            print("Valid dataset path not provided. Exiting.")
            sys.exit(1)

    if os.path.isdir(dataset_path):
        possible_train_dir = os.path.join(dataset_path, 'train')
        possible_val_dir = os.path.join(dataset_path, 'val')

        # Layout 1: the dataset already ships train/ and val/ sub-directories.
        if os.path.exists(possible_train_dir) and os.path.exists(possible_val_dir):
            print("Dataset already contains train and val directories.")
            print(f"Creating symbolic links to {TRAIN_DIR} and {VAL_DIR}")
            _link_dataset_dir(possible_train_dir, TRAIN_DIR, "training")
            _link_dataset_dir(possible_val_dir, VAL_DIR, "validation")
            return DATA_DIR

        # Layout 2: zip archives (extraction is not implemented).
        zip_files = glob.glob(os.path.join(dataset_path, "*.zip"))
        if zip_files:
            print(f"Found {len(zip_files)} zip files in the dataset directory.")
            print("Automatic extraction is not implemented; please extract the archives manually.")
        else:
            # Layout 3: class folders directly under the dataset root.
            print("Checking for direct dataset structure...")
            class_dirs = glob.glob(os.path.join(dataset_path, "*"))
            if class_dirs:
                print(f"Found {len(class_dirs)} potential class directories.")
                print(f"Linking dataset from {dataset_path} to {TRAIN_DIR}")
                # TRAIN_DIR must not be created beforehand: os.symlink fails
                # when the link path already exists.
                _link_dataset_dir(dataset_path, TRAIN_DIR, "training")
                return DATA_DIR
    else:
        # Single archive file (extraction is not implemented).
        print(f"Found single dataset file: {dataset_path}")
        print("Automatic extraction is not implemented; please extract the archive manually.")

    # Verify the folder structure
    for folder in (TRAIN_DIR, VAL_DIR):
        if not os.path.exists(folder):
            print(f"Warning: Expected folder {folder} not found after setup.")
        else:
            print(f"Verified: {folder} exists.")

    return DATA_DIR

In [None]:
# ==== Cell 6: Define dataset preprocessing function ====
_PROCESSABLE_EXTENSIONS = ('.png', '.jpg', '.jpeg')


def _collect_image_files(root_dir):
    """Return the paths of all image files found recursively under ``root_dir``."""
    found = []
    for root, _, files in os.walk(root_dir):
        for file in files:
            if file.lower().endswith(_PROCESSABLE_EXTENSIONS):
                found.append(os.path.join(root, file))
    return found


def _resize_and_save_images(image_paths, output_dir):
    """Resize each image to ``IMG_SIZE`` and write it as ``<index>.jpg`` in ``output_dir``.

    Unreadable images are skipped; per-image errors are reported and do not
    stop the run. Returns the list of files that were actually written.
    """
    saved_paths = []
    for idx, img_path in enumerate(image_paths):
        try:
            img = cv2.imread(img_path)
            if img is None:
                continue

            img_resized = cv2.resize(img, IMG_SIZE)

            new_img_path = os.path.join(output_dir, f"{idx}.jpg")
            # Only record the file when the write is reported as successful.
            if cv2.imwrite(new_img_path, img_resized):
                saved_paths.append(new_img_path)
            else:
                print(f"Error processing image {img_path}: could not write {new_img_path}")
        except Exception as e:
            print(f"Error processing image {img_path}: {e}")
    return saved_paths


def reconstruct_validation_set(train_dir, val_dir, processed_train_dir, processed_val_dir, validation_split=0.2):
    """
    Reconstruct the validation set to include examples from all classes in the training set.

    Every class folder in ``train_dir`` is split into a training and a
    validation part; the images are resized to ``IMG_SIZE`` and written to
    ``processed_train_dir`` / ``processed_val_dir``. If processed data already
    exists the user is asked whether to reuse it.

    Args:
        train_dir (str): Path to the original training directory
        val_dir (str): Path to the original validation directory (currently
            unused; the validation split is carved out of ``train_dir``)
        processed_train_dir (str): Path to save the new processed training data
        processed_val_dir (str): Path to save the new processed validation data
        validation_split (float): Fraction of training data to use for validation

    Returns:
        tuple: Lists of training and validation image paths
    """
    print("=" * 50)
    print("Checking for processed data...")

    # Check if processed directories exist and contain data
    processed_data_exists = (os.path.exists(processed_train_dir) and
                             os.path.exists(processed_val_dir) and
                             len(os.listdir(processed_train_dir)) > 0 and
                             len(os.listdir(processed_val_dir)) > 0)

    if processed_data_exists:
        print("Processed data already exists.")
        reuse_existing = input("Would you like to use existing processed data? (y/n): ").lower() == 'y'

        if reuse_existing:
            print("Using existing processed data.")
            processed_train_images = _collect_image_files(processed_train_dir)
            processed_val_images = _collect_image_files(processed_val_dir)
            print(f"Found {len(processed_train_images)} existing training images and {len(processed_val_images)} existing validation images")
            return processed_train_images, processed_val_images
        else:
            print("Reprocessing data...")
    else:
        print("No processed data found. Processing data...")

    print("Reconstructing validation set to include all classes...")

    # Start from empty output directories
    if os.path.exists(processed_train_dir):
        shutil.rmtree(processed_train_dir)
    if os.path.exists(processed_val_dir):
        shutil.rmtree(processed_val_dir)

    os.makedirs(processed_train_dir, exist_ok=True)
    os.makedirs(processed_val_dir, exist_ok=True)

    # Sorted so that the class order does not depend on the file system.
    class_dirs = sorted(d for d in os.listdir(train_dir) if os.path.isdir(os.path.join(train_dir, d)))
    print(f"Found {len(class_dirs)} identity classes in the training set")

    processed_train_images = []
    processed_val_images = []

    for class_name in tqdm(class_dirs, desc="Processing classes"):
        class_dir = os.path.join(train_dir, class_name)

        # Sorted so that the seeded split picks the same files on every run.
        images = sorted(os.path.join(class_dir, f) for f in os.listdir(class_dir)
                        if f.lower().endswith(_PROCESSABLE_EXTENSIONS))

        if not images:
            continue

        if len(images) < 2:
            # A single image cannot be split; keep it for training.
            train_imgs, val_imgs = images, []
        else:
            train_imgs, val_imgs = train_test_split(
                images, test_size=validation_split, random_state=RANDOM_SEED
            )

        # Both output trees get the class folder so their class lists match.
        class_train_dir = os.path.join(processed_train_dir, class_name)
        class_val_dir = os.path.join(processed_val_dir, class_name)
        os.makedirs(class_train_dir, exist_ok=True)
        os.makedirs(class_val_dir, exist_ok=True)

        processed_train_images.extend(_resize_and_save_images(train_imgs, class_train_dir))
        processed_val_images.extend(_resize_and_save_images(val_imgs, class_val_dir))

    print(f"Created new dataset with {len(processed_train_images)} training images and {len(processed_val_images)} validation images")
    print(f"Data split across {len(class_dirs)} identity classes")

    return processed_train_images, processed_val_images


In [None]:
# ==== Modified Cell: Update create_data_generators function ====
def create_data_generators(strategy):
    """
    Build the Keras directory iterators (and tf.data wrappers) for both splits.

    The per-replica ``BATCH_SIZE`` is multiplied by the number of replicas of
    ``strategy`` to obtain the batch size used by both iterators.

    Args:
        strategy: Distribution strategy for multi-GPU training.

    Returns:
        tuple: ``(train_generator, val_generator, train_dataset, val_dataset)``.
    """
    print("=" * 50)
    print("Setting up data generators for multi-GPU training...")

    # One global batch is later divided among the replicas.
    global_batch_size = BATCH_SIZE * strategy.num_replicas_in_sync
    print(f"Using global batch size: {global_batch_size}")

    def _directory_iterator(directory, shuffle):
        # Only the VGG16 input preprocessing is applied - no augmentation.
        datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
        return datagen.flow_from_directory(
            directory,
            target_size=IMG_SIZE,
            batch_size=global_batch_size,
            class_mode='categorical',
            shuffle=shuffle
        )

    train_generator = _directory_iterator(PROCESSED_TRAIN_DIR, True)
    # Validation order is kept fixed.
    val_generator = _directory_iterator(PROCESSED_VAL_DIR, False)

    print(f"Created data generators with {train_generator.num_classes} classes.")
    print(f"Training on {train_generator.samples} samples with global batch size of {global_batch_size}")
    print(f"Training steps per epoch: {len(train_generator)}")

    def _as_dataset(iterator):
        """Wrap a Keras iterator in an endlessly repeating, prefetching tf.data.Dataset."""
        def _batches():
            while True:
                images, labels = next(iterator)
                yield images, labels

        image_spec = tf.TensorSpec(shape=(None,) + iterator.image_shape, dtype=tf.float32)
        label_spec = tf.TensorSpec(shape=(None, iterator.num_classes), dtype=tf.float32)

        return (
            tf.data.Dataset.from_generator(_batches, output_signature=(image_spec, label_spec))
            .repeat()
            .prefetch(tf.data.AUTOTUNE)
        )

    train_dataset = _as_dataset(train_generator)
    val_dataset = _as_dataset(val_generator)

    print("Successfully created tf.data.Datasets from generators")

    return train_generator, val_generator, train_dataset, val_dataset


In [None]:

def load_vggface_model():
    """
    Load the pretrained VGGFace model from ``VGG_FEATURE_MODEL_PATH``.

    Returns:
        Model: Loaded VGGFace model.

    Raises:
        SystemExit: With exit code 1 when the file is missing or cannot be loaded.
    """
    print("=" * 50)
    print("Loading pretrained VGGFace model...")

    # Guard clause: stop right away when the model file is not there.
    if not os.path.exists(VGG_FEATURE_MODEL_PATH):
        print(f"Error: VGGFace model not found at {VGG_FEATURE_MODEL_PATH}")
        sys.exit(1)
    print(f"Found VGGFace model at: {VGG_FEATURE_MODEL_PATH}")

    try:
        pretrained_model = load_model(VGG_FEATURE_MODEL_PATH)
    except Exception as e:
        print(f"Error loading VGGFace model: {e}")
        sys.exit(1)

    print("VGGFace model loaded successfully.")
    return pretrained_model


In [None]:
import os
import glob
import re
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TensorBoard, Callback

# Create a custom callback to clean up checkpoints after saving
class CheckpointCleanupCallback(Callback):
    """
    Keras callback that keeps only the most recently written checkpoint file.

    At the end of every epoch all ``checkpoint_*.keras`` files in
    ``checkpoint_dir`` except the one with the newest modification time are
    deleted. Place it after ``ModelCheckpoint`` in the callbacks list so that a
    checkpoint written in the same epoch is already on disk.

    Args:
        checkpoint_dir (str): Directory containing the checkpoint files.
    """

    def __init__(self, checkpoint_dir):
        super().__init__()
        self.checkpoint_dir = checkpoint_dir

    def on_epoch_end(self, epoch, logs=None):
        """Delete every checkpoint except the newest one (by modification time)."""
        checkpoint_files = sorted(
            glob.glob(os.path.join(self.checkpoint_dir, 'checkpoint_*.keras')),
            key=os.path.getmtime
        )

        # Everything but the last (newest) entry is stale.
        for old_checkpoint in checkpoint_files[:-1]:
            print(f"Deleting old checkpoint: {old_checkpoint}")
            try:
                os.remove(old_checkpoint)
            except OSError as e:
                # Housekeeping must never abort a training run.
                print(f"Could not delete checkpoint {old_checkpoint}: {e}")

def build_and_finetune_model(vgg_model, train_generator, val_generator, strategy, resume_from_checkpoint=None):
    """
    Build and fine-tune the model for face recognition with multi-GPU support.
    Can resume training from a checkpoint.

    When building from scratch, the first 10 layers of ``vgg_model`` are frozen
    and a Dropout/Dense(1024, 'fc8')/Dropout/softmax head is added. When
    resuming, the epoch number and validation accuracy encoded in the
    checkpoint file name (``checkpoint_<epoch>_<val_accuracy>.keras``) are used
    to continue the epoch count and to seed the "best so far" value of the
    checkpoint callback, so a worse epoch cannot replace the resumed checkpoint.

    Args:
        vgg_model (Model): Loaded VGGFace model.
        train_generator: Training data generator.
        val_generator: Validation data generator.
        strategy: Distribution strategy for multi-GPU training.
        resume_from_checkpoint (str, optional): Path to checkpoint file to resume from.

    Returns:
        tuple: Fine-tuned model and training history.
    """
    print("=" * 50)
    print("Building and fine-tuning the model with multi-GPU support...")

    num_classes = train_generator.num_classes
    print(f"Building model with {num_classes} output classes")

    if train_generator.num_classes != val_generator.num_classes:
        print(f"WARNING: Training generator has {train_generator.num_classes} classes but validation generator has {val_generator.num_classes}")

    global_batch_size = BATCH_SIZE * strategy.num_replicas_in_sync
    print(f"Global batch size: {global_batch_size} (Base:{BATCH_SIZE} × {strategy.num_replicas_in_sync} GPUs)")

    # Create the model inside the strategy scope
    with strategy.scope():
        if resume_from_checkpoint:
            print(f"Loading model from checkpoint: {resume_from_checkpoint}")
            try:
                fine_tuned_model = load_model(resume_from_checkpoint)
                print("Successfully loaded model from checkpoint")

                # Ensure correct learning rate
                try:
                    if hasattr(fine_tuned_model.optimizer, 'learning_rate'):
                        K.set_value(fine_tuned_model.optimizer.learning_rate, LEARNING_RATE)
                    elif hasattr(fine_tuned_model.optimizer, 'lr'):
                        K.set_value(fine_tuned_model.optimizer.lr, LEARNING_RATE)
                    else:
                        print("Could not reset learning rate - optimizer structure unknown")
                except Exception as e:
                    print(f"Could not reset learning rate: {e}")
                    print("Continuing with loaded optimizer settings")

            except Exception as e:
                print(f"Error loading checkpoint: {e}")
                print("Building model from scratch instead...")
                # Falling back: from here on behave exactly like a fresh run.
                resume_from_checkpoint = None

        # If no checkpoint, build model from scratch
        if not resume_from_checkpoint:
            # Freeze the first 10 layers, fine-tune the rest.
            for layer in vgg_model.layers[:10]:
                layer.trainable = False
            for layer in vgg_model.layers[10:]:
                layer.trainable = True

            # Classification head; 'fc8' is the layer save_models() exports as features.
            x = vgg_model.output
            x = Dropout(0.5, name='dropout_ft1')(x)
            x = Dense(1024, activation='relu', name='fc8')(x)
            x = Dropout(0.5, name='dropout_ft2')(x)
            predictions = Dense(num_classes, activation='softmax', name='predictions')(x)

            fine_tuned_model = Model(inputs=vgg_model.input, outputs=predictions)
            fine_tuned_model.compile(
                optimizer=Adam(learning_rate=LEARNING_RATE),
                loss='categorical_crossentropy',
                metrics=['accuracy']
            )

    fine_tuned_model.summary()

    # Recover the epoch number and best validation accuracy if resuming
    initial_epoch = 0
    resumed_best_val_accuracy = None
    if resume_from_checkpoint:
        checkpoint_name = os.path.basename(resume_from_checkpoint)
        match = re.search(r'checkpoint_(\d+)_', checkpoint_name)
        if match:
            initial_epoch = int(match.group(1))
            print(f"Resuming from epoch {initial_epoch}")

        score_match = re.search(r'checkpoint_\d+_(\d+(?:\.\d+)?)\.keras$', checkpoint_name)
        if score_match:
            resumed_best_val_accuracy = float(score_match.group(1))
            print(f"Best validation accuracy so far: {resumed_best_val_accuracy:.4f}")

    # Clean up any existing checkpoints before starting training
    def cleanup_all_checkpoints():
        checkpoint_files = sorted(
            glob.glob(os.path.join(FINETUNED_MODEL_FOLDER, 'checkpoint_*.keras')),
            key=os.path.getmtime
        )

        # Keep only the checkpoint we're resuming from (if any)
        for checkpoint in checkpoint_files:
            if resume_from_checkpoint and os.path.basename(checkpoint) == os.path.basename(resume_from_checkpoint):
                continue
            print(f"Deleting old checkpoint: {checkpoint}")
            os.remove(checkpoint)

    cleanup_all_checkpoints()

    # Set up callbacks
    model_checkpoint_callback = ModelCheckpoint(
        os.path.join(FINETUNED_MODEL_FOLDER, 'checkpoint_{epoch:02d}_{val_accuracy:.4f}.keras'),
        monitor='val_accuracy',
        save_best_only=True,
        save_weights_only=False,
        mode='max',
        # Without this, the first resumed epoch would always count as "best" and the
        # cleanup callback would then delete the (possibly better) resumed checkpoint.
        initial_value_threshold=resumed_best_val_accuracy,
        verbose=1
    )

    # Removes superseded checkpoint files after each epoch
    cleanup_callback = CheckpointCleanupCallback(FINETUNED_MODEL_FOLDER)

    callbacks = [
        EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=10, min_lr=1e-7),
        model_checkpoint_callback,
        cleanup_callback,  # must come after ModelCheckpoint
        TensorBoard(log_dir=os.path.join(FINETUNED_MODEL_FOLDER, 'logs'))
    ]

    # Update generator batch sizes
    train_generator.batch_size = global_batch_size
    val_generator.batch_size = global_batch_size

    # Start fine-tuning
    print("\nStarting fine-tuning...")
    history = fine_tuned_model.fit(
        train_generator,
        steps_per_epoch=len(train_generator),
        epochs=EPOCHS,
        validation_data=val_generator,
        validation_steps=len(val_generator),
        callbacks=callbacks,
        verbose=1,
        initial_epoch=initial_epoch
    )

    return fine_tuned_model, history

In [None]:
# ==== New Cell: Memory monitoring function (optional but useful) ====

def monitor_gpu_memory():
    """
    Print the used / total memory of every GPU as reported by ``nvidia-smi``.

    Intended to be called around expensive steps. Any failure (for example
    ``nvidia-smi`` missing or unexpected output) is reported and swallowed.
    """
    import subprocess
    import re

    query_command = [
        'nvidia-smi',
        '--query-gpu=memory.used,memory.total',
        '--format=csv,noheader,nounits',
    ]

    try:
        raw_output = subprocess.check_output(query_command)
        gpu_lines = raw_output.decode('utf-8').strip().split('\n')

        print("\nGPU Memory Usage:")
        for gpu_index, gpu_line in enumerate(gpu_lines):
            # Each line is expected to carry exactly two integers: used and total.
            used_mb, total_mb = (int(token) for token in re.findall(r'\d+', gpu_line))
            percent_used = used_mb / total_mb * 100
            print(f"GPU {gpu_index}: {used_mb} MB / {total_mb} MB ({percent_used:.1f}%)")
        print()
    except Exception as e:
        print(f"Failed to monitor GPU memory: {e}")

In [None]:
def evaluate_model(model, val_generator):
    """
    Run ``model.evaluate`` over the whole validation generator and report the result.

    GPU memory usage is printed before and after the evaluation.

    Args:
        model (Model): Fine-tuned model.
        val_generator: Validation data generator.

    Returns:
        list: Evaluation results (loss first, accuracy second).
    """
    print("=" * 50)
    print("Evaluating the fine-tuned model...")

    monitor_gpu_memory()

    # Rewind the generator when it offers a callable reset().
    reset_fn = getattr(val_generator, 'reset', None)
    if callable(reset_fn):
        reset_fn()

    num_steps = len(val_generator)
    eval_results = model.evaluate(val_generator, steps=num_steps, verbose=1)

    loss_value = eval_results[0]
    accuracy_value = eval_results[1]
    print("Evaluation Results:")
    print(f"Loss: {loss_value:.4f}")
    print(f"Accuracy: {accuracy_value:.4f}")

    monitor_gpu_memory()

    return eval_results

In [None]:
# ==== Cell 11: Define model saving function ====
def save_models(fine_tuned_model):
    """
    Persist the fine-tuned classifier and a feature extractor derived from it.

    The feature extractor shares the classifier's input and ends at the layer
    named 'fc8'.

    Args:
        fine_tuned_model (Model): Fine-tuned model.
    """
    print("=" * 50)
    print("Saving the fine-tuned models...")

    # 1) Full classifier
    fine_tuned_model.save(FINETUNED_FULL_MODEL_PATH)
    print(f"Saved complete fine-tuned model to: {FINETUNED_FULL_MODEL_PATH}")

    # 2) Feature extractor (useful for extracting features for other tasks)
    fc8_output = fine_tuned_model.get_layer('fc8').output
    feature_model = Model(inputs=fine_tuned_model.input, outputs=fc8_output)
    feature_model.save(FINETUNED_FEATURE_MODEL_PATH)
    print(f"Saved feature extraction model to: {FINETUNED_FEATURE_MODEL_PATH}")


In [None]:
# ==== Cell 12: Define function to plot training history ====
def plot_training_history(history):
    """
    Draw accuracy and loss curves side by side and save them as a PNG.

    The image is written to ``training_history.png`` inside
    ``FINETUNED_MODEL_FOLDER``.

    Args:
        history: Training history object; its ``history`` mapping must contain
            'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
    """
    print("=" * 50)
    print("Plotting training history...")

    plt.figure(figsize=(12, 5))

    # (subplot position, training key, validation key, label) for each panel
    panels = (
        (1, 'accuracy', 'val_accuracy', 'Accuracy'),
        (2, 'loss', 'val_loss', 'Loss'),
    )
    for position, train_key, val_key, label in panels:
        plt.subplot(1, 2, position)
        plt.plot(history.history[train_key], label=f'Training {label}')
        plt.plot(history.history[val_key], label=f'Validation {label}')
        plt.title(f'Model {label}')
        plt.xlabel('Epoch')
        plt.ylabel(label)
        plt.legend()

    # Save the plot
    plt.tight_layout()
    output_path = os.path.join(FINETUNED_MODEL_FOLDER, 'training_history.png')
    plt.savefig(output_path)
    print(f"Saved training history plot to: {output_path}")


In [None]:
# ==== Cell 13: Define function to calculate detailed metrics ====
def calculate_detailed_metrics(model, val_generator):
    """
    Compute accuracy, weighted precision/recall/F1 and the confusion matrix.

    Predictions are made over the whole validation generator and compared with
    ``val_generator.classes``. The scalar metrics are printed and written to
    ``metrics.txt`` inside ``FINETUNED_MODEL_FOLDER``.

    Args:
        model (Model): Fine-tuned model.
        val_generator: Validation data generator.

    Returns:
        dict: Keys 'accuracy', 'precision', 'recall', 'f1' and 'confusion_matrix'.
    """
    print("=" * 50)
    print("Calculating detailed metrics...")

    # Ground truth and predicted class indices
    val_generator.reset()
    y_true = val_generator.classes
    class_probabilities = model.predict(val_generator, steps=len(val_generator), verbose=1)
    y_pred = np.argmax(class_probabilities, axis=1)

    # Scalar metrics (precision / recall / F1 are class-frequency weighted)
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')

    report_lines = [
        f"Accuracy: {accuracy:.4f}",
        f"Precision: {precision:.4f}",
        f"Recall: {recall:.4f}",
        f"F1 Score: {f1:.4f}",
    ]

    print("Detailed Metrics:")
    for report_line in report_lines:
        print(report_line)

    conf_matrix = confusion_matrix(y_true, y_pred)

    # Persist the same lines that were printed
    metrics_path = os.path.join(FINETUNED_MODEL_FOLDER, 'metrics.txt')
    with open(metrics_path, 'w') as f:
        f.writelines(report_line + "\n" for report_line in report_lines)

    print(f"Saved metrics to: {metrics_path}")

    return dict(
        accuracy=accuracy,
        precision=precision,
        recall=recall,
        f1=f1,
        confusion_matrix=conf_matrix,
    )

In [None]:


def cleanup_old_checkpoints(latest_checkpoint):
    """
    Deletes older checkpoints except the latest one.

    All ``checkpoint_*.keras`` files in ``FINETUNED_MODEL_FOLDER`` are removed
    except ``latest_checkpoint``. Paths are compared in normalised absolute
    form, so the checkpoint is kept regardless of how the caller spelled its
    path; a bare file name is resolved against ``FINETUNED_MODEL_FOLDER``.

    Args:
        latest_checkpoint (str or None): Checkpoint to keep. With ``None``
            every checkpoint file is deleted.
    """
    keep_path = None
    if latest_checkpoint:
        if os.path.dirname(latest_checkpoint):
            keep_path = os.path.abspath(latest_checkpoint)
        else:
            keep_path = os.path.abspath(os.path.join(FINETUNED_MODEL_FOLDER, latest_checkpoint))

    checkpoint_files = sorted(
        glob.glob(os.path.join(FINETUNED_MODEL_FOLDER, 'checkpoint_*.keras')),
        key=os.path.getmtime
    )

    # Keep only the latest checkpoint
    for chkpt in checkpoint_files:
        if keep_path is not None and os.path.abspath(chkpt) == keep_path:
            continue
        print(f"Deleting old checkpoint: {chkpt}")
        try:
            os.remove(chkpt)
        except OSError as e:
            # Best-effort housekeeping: report and carry on with the next file.
            print(f"Could not delete checkpoint {chkpt}: {e}")

def _parse_menu_choice(raw, option_count):
    """
    Convert a 1-based menu answer typed by the user into a 0-based index.

    Args:
        raw (str): Text entered at the prompt.
        option_count (int): Number of entries that were offered.

    Returns:
        int | None: The 0-based index, or None when ``raw`` is not a whole
        number in the range ``1..option_count``.
    """
    try:
        index = int(raw) - 1
    except (TypeError, ValueError):
        return None
    # Reject 0 and negative answers explicitly: Python's negative indexing
    # would otherwise silently pick an entry from the end of the list.
    if 0 <= index < option_count:
        return index
    return None


def main():
    """
    Main function to run the full fine-tuning pipeline on Kaggle with multi-GPU support.

    Steps, in order:
        1. Configure GPUs and obtain the distribution strategy.
        2. Look for VGGFace2 datasets under ``/kaggle/input`` and ask the user
           (via ``input``) which one to use, or for a manual path.
        3. Set up the dataset, rebuild the validation split and create the
           data generators.
        4. Load the pretrained VGGFace model inside the strategy scope.
        5. If checkpoints exist in FINETUNED_MODEL_FOLDER, ask which one to
           resume from (default: most recently modified) and delete the others.
        6. Fine-tune, evaluate, save the models, plot the history and compute
           detailed metrics.
        7. Print timing and a summary of the results.

    Note:
        The function is interactive: it blocks on ``input`` whenever datasets
        or checkpoints are found.
    """
    print("=" * 50)
    print("Starting VGGFace2 fine-tuning pipeline on Kaggle with multi-GPU support...")
    print("=" * 50)

    start_time = time.time()

    # Configure GPUs and get distribution strategy
    strategy = configure_gpus()

    # Kaggle dataset availability check
    print("Checking Kaggle dataset availability...")
    kaggle_dataset_paths = glob.glob('/kaggle/input/*vggface2*')
    manual_path = None

    if kaggle_dataset_paths:
        print("Found potential VGGFace2 datasets:")
        for number, path in enumerate(kaggle_dataset_paths, start=1):
            print(f"{number}. {path}")
        selection = input(f"Enter number to select dataset (1-{len(kaggle_dataset_paths)}), or 'other': ")

        if selection.strip().lower() == 'other':
            # Strip stray whitespace/newlines that would make the path invalid
            manual_path = input("Enter the path to the dataset: ").strip()
        else:
            dataset_index = _parse_menu_choice(selection, len(kaggle_dataset_paths))
            if dataset_index is None:
                print("Invalid selection, using first dataset.")
                dataset_index = 0
            manual_path = kaggle_dataset_paths[dataset_index]

    # Step 1: Set up the dataset (manual_path stays None when nothing was found)
    download_vggface2_dataset(manual_path)

    # Reconstruct and preprocess the dataset
    reconstruct_validation_set(TRAIN_DIR, VAL_DIR, PROCESSED_TRAIN_DIR, PROCESSED_VAL_DIR, VALIDATION_SPLIT)

    # Create data generators
    train_generator, val_generator, _, _ = create_data_generators(strategy)

    # Load pretrained VGGFace model
    with strategy.scope():
        vgg_model = load_vggface_model()

    # ======== Checkpoint Selection =========
    # Sorted oldest -> newest by modification time, so [-1] is the latest.
    checkpoint_files = sorted(
        glob.glob(os.path.join(FINETUNED_MODEL_FOLDER, 'checkpoint_*.keras')),
        key=os.path.getmtime
    )

    resume_from_checkpoint = None

    if checkpoint_files:
        print("\nAvailable checkpoints:")
        for number, chkpt in enumerate(checkpoint_files, start=1):
            print(f"{number}. {os.path.basename(chkpt)}")

        selection = input(f"\nEnter the checkpoint number to load (1-{len(checkpoint_files)}), or press Enter to use the latest: ")

        latest_checkpoint = checkpoint_files[-1]
        if not selection.strip():
            # Plain Enter: default to latest checkpoint
            resume_from_checkpoint = latest_checkpoint
        else:
            chkpt_index = _parse_menu_choice(selection, len(checkpoint_files))
            if chkpt_index is None:
                print("Invalid selection, using latest checkpoint.")
                resume_from_checkpoint = latest_checkpoint
            else:
                resume_from_checkpoint = checkpoint_files[chkpt_index]

        print(f"Loading model from checkpoint: {resume_from_checkpoint}")

        # Delete all other checkpoints before starting training. Note that this
        # also removes newer checkpoints when an older one was selected.
        cleanup_old_checkpoints(resume_from_checkpoint)

    # ======== Fine-Tuning =========
    fine_tuned_model, history = build_and_finetune_model(
        vgg_model,
        train_generator,
        val_generator,
        strategy,
        resume_from_checkpoint=resume_from_checkpoint
    )

    # Evaluate the model (its return value is not needed for the summary below)
    evaluate_model(fine_tuned_model, val_generator)

    # Save the model
    save_models(fine_tuned_model)

    # Plot training history
    plot_training_history(history)

    # Calculate detailed metrics
    metrics = calculate_detailed_metrics(fine_tuned_model, val_generator)

    # Calculate total time
    end_time = time.time()
    total_time = end_time - start_time

    # Log total time and results
    print("=" * 50)
    print(f"Multi-GPU fine-tuning completed in {total_time:.2f} seconds ({datetime.timedelta(seconds=total_time)})")
    print("=" * 50)

    # Summary of results
    print("Summary of results:")
    print(f"Model saved to: {FINETUNED_FULL_MODEL_PATH}")
    print(f"Feature extraction model saved to: {FINETUNED_FEATURE_MODEL_PATH}")
    print(f"Final validation accuracy: {metrics['accuracy']:.4f}")
    print(f"Final validation F1 score: {metrics['f1']:.4f}")
    print("=" * 50)

    # Kaggle-specific output information
    print("\nKaggle Environment Information:")
    print("- Models trained with multi-GPU strategy using", strategy.num_replicas_in_sync, "GPUs")
    print("- Outputs are saved in /kaggle/working directory")
    print("- To save your models permanently, use the 'Save Version' button")
    print("- Output artifacts are available in the 'Output' section")


In [None]:
# ==== Cell 15: Run the pipeline ====
# Entry point: start the pipeline only when this code runs as the top-level
# module; the guard keeps main() from firing if the code is imported instead.
if __name__ == "__main__":
    main()