# Face-Recognition Training
Este notebook tem como objetivo treinar os modelos presentes dentro do framework face-recognition.

## Import das bibliotecas e funções

In [1]:
import os
import sys
import traceback
from types import SimpleNamespace

import torch

sys.path.append('../../src/models/face-recognition')
from train import main, parse_arguments

# Select the compute device once: the GPU when PyTorch reports CUDA support,
# otherwise the CPU. The availability flag is cached so it is queried one time.
cuda_ok = torch.cuda.is_available()
device = torch.device("cuda") if cuda_ok else torch.device("cpu")

print(f"Using device: {device}")
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    # Report the name of the first visible GPU (index 0).
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Using device: cuda
CUDA available: True
GPU: NVIDIA GeForce RTX 4090


## Configurar parâmetros de treinamento

In [6]:
# Training configuration. Every value below is read by later cells when the
# argument namespace for main() is assembled, so the names must stay as-is.

# --- Dataset ---------------------------------------------------------------
root_path = "../../data/raw/vggface2_112x112/"  # Path to the training dataset
database = "VggFace2"  # Options: WebFace, VggFace2, MS1M

# --- Model -----------------------------------------------------------------
# Network options: sphere20, sphere36, sphere64, mobilenetv1, mobilenetv2,
# mobilenetv3_small, mobilenetv3_large
network = "mobilenetv2"
# Classifier options: ARC (ArcFace), MCP (MarginCosineProduct),
# AL (SphereFace), L (Linear)
classifier = "MCP"

# --- Optimisation hyperparameters --------------------------------------------
batch_size = 64
epochs = 1
learning_rate = 1e-3
momentum = 0.9
weight_decay = 0.0005

# --- Learning-rate scheduler -------------------------------------------------
lr_scheduler = "MultiStepLR"  # Options: StepLR, MultiStepLR
milestones = [10, 20, 25]  # For MultiStepLR
step_size = 10  # For StepLR
gamma = 0.1  # Decay factor

# --- Paths and runtime options -----------------------------------------------
save_path = "../../src/models/face-recognition/weights"
lfw_dataset_path = "../../data/raw/lfw"
num_workers = 8
print_freq = 10000
checkpoint_path = None  # Set to a checkpoint file to resume a previous run

# Echo the selection so the saved notebook records what was trained.
summary_lines = [
    "Training Configuration:",
    f"Dataset: {database} from {root_path}",
    f"Model: {network} with {classifier} classifier",
    f"Batch size: {batch_size}, Epochs: {epochs}",
    f"Learning rate: {learning_rate}, Scheduler: {lr_scheduler}",
    f"Save path: {save_path}",
    f"LFW path: {lfw_dataset_path}",
]
print("\n".join(summary_lines))

Training Configuration:
Dataset: VggFace2 from ../../data/raw/vggface2_112x112/
Model: mobilenetv2 with MCP classifier
Batch size: 64, Epochs: 1
Learning rate: 0.001, Scheduler: MultiStepLR
Save path: ../../src/models/face-recognition/weights
LFW path: ../../data/raw/lfw


## Opções compatíveis (apenas para referência)

In [7]:
# Supported options (for reference) plus a sanity check of the current
# selection, so a typo is reported here instead of surfacing later in training.
available_models = [
    "sphere20", "sphere36", "sphere64",
    "mobilenetv1", "mobilenetv2",
    "mobilenetv3_small", "mobilenetv3_large"
]

# Database info: number of classes for each supported training database.
db_info = {
    'WebFace': {'num_classes': 10572},
    'VggFace2': {'num_classes': 8631},
    'MS1M': {'num_classes': 85742}
}

# Derived from db_info so the two can never drift apart.
available_databases = list(db_info)
available_classifiers = ["ARC", "MCP", "AL", "L"]
available_schedulers = ["StepLR", "MultiStepLR"]

print("Available Models:", available_models)
print("Available Databases:", available_databases)
print("Available Classifiers:", available_classifiers)
print("Available Schedulers:", available_schedulers)

# Check every selected value against its list of supported options and report
# all mismatches at once with a readable message (instead of a bare KeyError
# from the db_info lookup below).
selections = {
    "database": (database, available_databases),
    "network": (network, available_models),
    "classifier": (classifier, available_classifiers),
    "lr_scheduler": (lr_scheduler, available_schedulers),
}
invalid = [
    f"{name}={value!r} (expected one of {options})"
    for name, (value, options) in selections.items()
    if value not in options
]
if invalid:
    raise ValueError("Unsupported training configuration: " + "; ".join(invalid))

print(f"\nSelected database '{database}' has {db_info[database]['num_classes']} classes")

Available Models: ['sphere20', 'sphere36', 'sphere64', 'mobilenetv1', 'mobilenetv2', 'mobilenetv3_small', 'mobilenetv3_large']
Available Databases: ['WebFace', 'VggFace2', 'MS1M']
Available Classifiers: ['ARC', 'MCP', 'AL', 'L']
Available Schedulers: ['StepLR', 'MultiStepLR']

Selected database 'VggFace2' has 8631 classes


## Criar args de treinamento

In [8]:
# Collect every option in one ordered mapping, grouped by concern, and expose
# it as an attribute-style namespace for main().
train_config = {
    # Dataset
    "root": root_path,
    "database": database,

    # Model
    "network": network,
    "classifier": classifier,

    # Training hyperparameters
    "batch_size": batch_size,
    "epochs": epochs,
    "lr": learning_rate,
    "momentum": momentum,
    "weight_decay": weight_decay,

    # Learning rate scheduler
    "lr_scheduler": lr_scheduler,
    "milestones": milestones,
    "step_size": step_size,
    "gamma": gamma,

    # Training configuration
    "save_path": save_path,
    "num_workers": num_workers,
    "print_freq": print_freq,
    "checkpoint": checkpoint_path,
    "lfw_root": lfw_dataset_path,

    # Distributed training (single GPU setup)
    "world_size": 1,
    "local_rank": 0,
    "distributed": False,

    # Additional options
    "use_deterministic_algorithms": False,
}
args = SimpleNamespace(**train_config)

print("Arguments created successfully!")
print(f"Training will save models to: {args.save_path}")

# Make sure the output directory exists before training tries to write to it;
# exist_ok keeps the cell safe to re-run.
os.makedirs(args.save_path, exist_ok=True)
print(f"Save directory confirmed: {args.save_path}")

Arguments created successfully!
Training will save models to: ../../src/models/face-recognition/weights
Save directory confirmed: ../../src/models/face-recognition/weights


## Iniciar treinamento

In [9]:
# Run training. The dataset path is checked first so a wrong path is reported
# immediately with a readable message.
if not os.path.exists(args.root):
    print(f"ERROR: Dataset path does not exist: {args.root}")
    print("Please verify the dataset path and try again.")
else:
    print(f"Dataset path verified: {args.root}")
    print("\n" + "="*50)
    print("STARTING TRAINING")
    print("="*50)

    try:
        # Start training; only this call is guarded so the handler cannot be
        # triggered by the reporting code itself.
        main(args)
    except Exception as e:
        # Stay best-effort (no re-raise), but report the exception type and the
        # full traceback: str(e) alone is often uninformative (a KeyError, for
        # example, prints only the missing key).
        print(f"\nTraining failed with error: {type(e).__name__}: {e}")
        traceback.print_exc()
        print("Please check your configuration and try again.")
    else:
        print("\n" + "="*50)
        print("TRAINING COMPLETED SUCCESSFULLY!")
        print("="*50)

2025-09-24 21:44:18 - Loading training data.


Dataset path verified: ../../data/raw/vggface2_112x112/

STARTING TRAINING
Distributed mode not enabled. Falling back to single process.


2025-09-24 21:44:22 - Training samples: 2827910, Validation samples: 309897
2025-09-24 21:44:22 - Length of training dataset: 2827910, Number of Identities: 8631
2025-09-24 21:44:22 - Training started for mobilenetv2, Classifier: MCP
2025-09-24 21:44:23 - Epoch: [0/1][00000/44187] Loss: 21.579, Accuracy: 0.00%, LR: 0.00100 Time: 0.909s
2025-09-24 21:51:27 - Epoch: [0/1][10000/44187] Loss: 18.146, Accuracy: 0.00%, LR: 0.00100 Time: 0.042s
2025-09-24 21:58:31 - Epoch: [0/1][20000/44187] Loss: 16.035, Accuracy: 0.00%, LR: 0.00100 Time: 0.042s
2025-09-24 22:05:35 - Epoch: [0/1][30000/44187] Loss: 14.514, Accuracy: 0.06%, LR: 0.00100 Time: 0.042s
2025-09-24 22:12:39 - Epoch: [0/1][40000/44187] Loss: 13.413, Accuracy: 0.23%, LR: 0.00100 Time: 0.042s
2025-09-24 22:15:37 - Epoch: [0/1][44186/44187] Loss: 13.041, Accuracy: 0.34%, LR: 0.00100 Time: 0.042s
2025-09-24 22:15:37 - Epoch [0/1] Summary: Loss: 13.041, Accuracy: 0.34%, Total Time: 1873.888s


LFW - Avaliacao Simplificada (Somente Pares Positivos):
Similaridade Media: 0.6279 | Desvio Padrao: 0.1285


2025-09-24 22:16:53 - Validation accuracy (VGGFace2 subset): 0.0186
2025-09-24 22:16:54 - New best LFW similarity: 0.6279.Model saved to ../../src/models/face-recognition/weights with `_best` postfix.
2025-09-24 22:16:54 - Training completed.



TRAINING COMPLETED SUCCESSFULLY!


## Retomar o treinamento (opcional)

In [None]:
# To resume training from a checkpoint, uncomment and modify:
# checkpoint_path = os.path.join(save_path, f"{network}_{classifier}_last.ckpt")
# args.checkpoint = checkpoint_path
# main(args)

# Example checkpoint locations, built from the configured save_path so they
# point at the directory this notebook trains into.
# NOTE(review): the "<network>_<classifier>_{last,best}.ckpt" naming is taken
# from this notebook's own hints — confirm against the training script.
last_ckpt_name = f"{network}_{classifier}_last.ckpt"
best_ckpt_name = f"{network}_{classifier}_best.ckpt"

print("To resume training:")
print("1. Set checkpoint_path to your .ckpt file")
print("2. Update args.checkpoint")
print("3. Run main(args)")
print("\nExample checkpoint naming:")
print(f"Last: {os.path.join(save_path, last_ckpt_name)}")
print(f"Best: {os.path.join(save_path, best_ckpt_name)}")