In [None]:
# Memory Efficient Data Generator - PyTorch
# Otimizado para datasets grandes (50k+ imagens)

import os
import gc
import psutil
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import torchvision.models as models
import numpy as np
import cv2
from PIL import Image
import random
from pathlib import Path
from collections import Counter
import time
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Configuração de reprodutibilidade
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

# Configuração otimizada de memória
torch.backends.cudnn.benchmark = True  # Otimiza convoluções
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("Memory Efficient Data Generator - Configurado")
print(f"PyTorch version: {torch.__version__}")
print(f"Device: {'CUDA' if torch.cuda.is_available() else 'CPU'}")

In [None]:
class AdvancedMemoryMonitor:
    """Monitor avançado de memória para datasets grandes"""
    
    def __init__(self):
        self.process = psutil.Process()
        self.initial_memory = self._get_memory_mb()
        self.peak_memory = self.initial_memory
        self.memory_history = []
        
    def _get_memory_mb(self):
        """Retorna uso de memória em MB"""
        return self.process.memory_info().rss / 1024 / 1024
    
    def _get_gpu_memory_mb(self):
        """Retorna uso de memória GPU em MB"""
        if torch.cuda.is_available():
            return torch.cuda.memory_allocated() / 1024 / 1024
        return 0
    
    def update(self):
        """Atualiza estatísticas de memória"""
        cpu_mem = self._get_memory_mb()
        gpu_mem = self._get_gpu_memory_mb()
        
        self.peak_memory = max(self.peak_memory, cpu_mem)
        self.memory_history.append({
            'timestamp': time.time(),
            'cpu_memory': cpu_mem,
            'gpu_memory': gpu_mem
        })
        
        return cpu_mem, gpu_mem
    
    def get_status(self):
        """Retorna status atual da memória"""
        cpu_mem, gpu_mem = self.update()
        return {
            'current_cpu_mb': cpu_mem,
            'current_gpu_mb': gpu_mem,
            'peak_cpu_mb': self.peak_memory,
            'memory_increase_mb': cpu_mem - self.initial_memory,
            'total_system_memory_gb': psutil.virtual_memory().total / 1024**3,
            'available_memory_gb': psutil.virtual_memory().available / 1024**3
        }
    
    def print_status(self):
        """Imprime status detalhado da memória"""
        status = self.get_status()
        print(f"Memory Status:")
        print(f"  CPU: {status['current_cpu_mb']:.1f} MB (Peak: {status['peak_cpu_mb']:.1f} MB)")
        print(f"  GPU: {status['current_gpu_mb']:.1f} MB")
        print(f"  Available: {status['available_memory_gb']:.1f} GB")
        print(f"  Increase: +{status['memory_increase_mb']:.1f} MB")

# Instancia monitor
memory_monitor = AdvancedMemoryMonitor()
memory_monitor.print_status()

In [None]:
class MemoryEfficientEmotionDataset(Dataset):
    """
    Dataset eficiente que carrega imagens sob demanda.
    Otimizado para datasets grandes (50k+ imagens).
    """
    
    def __init__(self, data_path, split='train', transform=None, 
                 cache_size=1000, img_size=224, lazy_load=True):
        """
        Args:
            data_path: Caminho para os dados
            split: 'train', 'val', ou 'test'
            transform: Transformações do PyTorch
            cache_size: Número de imagens em cache
            img_size: Tamanho da imagem
            lazy_load: Se True, carrega imagens sob demanda
        """
        self.data_path = Path(data_path)
        self.split = split
        self.transform = transform
        self.img_size = img_size
        self.lazy_load = lazy_load
        
        # Configuração de cache
        self.cache_size = cache_size
        self.image_cache = {}
        self.cache_access_count = {}
        
        # Mapeamento de emoções
        self.emotion_labels = {
            'Raiva': 0, 'Nojo': 1, 'Medo': 2, 'Felicidade': 3, 
            'Neutro': 4, 'Tristeza': 5, 'Surpresa': 6
        }
        
        # Carrega lista de arquivos (não as imagens)
        self.image_paths, self.labels = self._load_file_paths()
        
        print(f"Dataset {split} inicializado:")
        print(f"  Total de imagens: {len(self.image_paths)}")
        print(f"  Lazy loading: {'Ativado' if lazy_load else 'Desativado'}")
        print(f"  Cache size: {cache_size}")
        print(f"  Distribuição:", dict(Counter(self.labels)))
        
    def _load_file_paths(self):
        """Carrega apenas os caminhos dos arquivos, não as imagens"""
        split_path = self.data_path / self.split
        
        if not split_path.exists():
            raise ValueError(f"Caminho não encontrado: {split_path}")
        
        image_paths = []
        labels = []
        
        for emotion, label_id in self.emotion_labels.items():
            emotion_path = split_path / emotion
            
            if not emotion_path.exists():
                print(f"Aviso: {emotion_path} não encontrado")
                continue
            
            # Lista todos os arquivos de imagem
            image_files = list(emotion_path.glob('*.jpg')) + \
                         list(emotion_path.glob('*.jpeg')) + \
                         list(emotion_path.glob('*.png'))
            
            for img_path in image_files:
                image_paths.append(str(img_path))
                labels.append(label_id)
            
            print(f"  {emotion}: {len(image_files)} imagens encontradas")
        
        return image_paths, labels
    
    def _load_image(self, image_path):
        """Carrega uma única imagem de forma eficiente"""
        try:
            # Carrega com OpenCV (mais rápido para preprocessing)
            img = cv2.imread(image_path)
            if img is None:
                raise ValueError(f"Não foi possível carregar: {image_path}")
            
            # Converte BGR para RGB
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            
            # Redimensiona apenas se necessário
            if img.shape[:2] != (self.img_size, self.img_size):
                img = cv2.resize(img, (self.img_size, self.img_size), 
                               interpolation=cv2.INTER_AREA)
            
            return img
            
        except Exception as e:
            print(f"Erro ao carregar {image_path}: {e}")
            # Retorna imagem preta como fallback
            return np.zeros((self.img_size, self.img_size, 3), dtype=np.uint8)
    
    def _manage_cache(self, image_path, image):
        """Gerencia cache de imagens de forma inteligente"""
        if len(self.image_cache) >= self.cache_size:
            # Remove imagem menos acessada
            least_used = min(self.cache_access_count.items(), key=lambda x: x[1])
            least_used_path = least_used[0]
            
            del self.image_cache[least_used_path]
            del self.cache_access_count[least_used_path]
        
        # Adiciona nova imagem
        self.image_cache[image_path] = image
        self.cache_access_count[image_path] = 1
    
    def __len__(self):
        return len(self.image_paths)
    
    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        label = self.labels[idx]
        
        # Verifica cache primeiro
        if image_path in self.image_cache:
            image = self.image_cache[image_path]
            self.cache_access_count[image_path] += 1
        else:
            # Carrega imagem sob demanda
            image = self._load_image(image_path)
            
            # Adiciona ao cache se ativado
            if self.cache_size > 0:
                self._manage_cache(image_path, image)
        
        # Converte para PIL para transforms
        image = Image.fromarray(image)
        
        # Aplica transformações
        if self.transform:
            image = self.transform(image)
        
        return image, torch.tensor(label, dtype=torch.long)
    
    def get_cache_stats(self):
        """Retorna estatísticas do cache"""
        return {
            'cache_size': len(self.image_cache),
            'cache_limit': self.cache_size,
            'hit_rate': len(self.image_cache) / max(1, len(self.image_paths)),
            'most_accessed': max(self.cache_access_count.items(), 
                               key=lambda x: x[1]) if self.cache_access_count else None
        }

# Teste do dataset
print("Testando Dataset eficiente...")

In [None]:
# Configurações para o dataset grande
IMG_SIZE = 224
BATCH_SIZE = 32 if torch.cuda.is_available() else 16
BASE_PATH = "/home/leandro/Documents/TCC/emotion_recognition_tcc/data/augmented/raf_db_balanced"
CACHE_SIZE = 2000  # Número de imagens em cache
NUM_WORKERS = 4 if torch.cuda.is_available() else 2

# Transformações otimizadas
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

val_test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Cria datasets eficientes
print("Criando datasets eficientes...")
memory_monitor.print_status()

try:
    train_dataset = MemoryEfficientEmotionDataset(
        BASE_PATH, split='train', transform=train_transform, 
        cache_size=CACHE_SIZE, img_size=IMG_SIZE
    )
    
    test_dataset = MemoryEfficientEmotionDataset(
        BASE_PATH, split='test', transform=val_test_transform,
        cache_size=CACHE_SIZE//2, img_size=IMG_SIZE  # Cache menor para test
    )
    
    print("Datasets criados com sucesso!")
    memory_monitor.print_status()
    
except Exception as e:
    print(f"Erro ao criar datasets: {e}")
    import traceback
    traceback.print_exc()

In [None]:
class MemoryEfficientDataLoader:
    """DataLoader wrapper com controle de memória"""
    
    def __init__(self, dataset, batch_size, shuffle=True, num_workers=2, 
                 memory_threshold_gb=8.0):
        self.dataset = dataset
        self.batch_size = batch_size
        self.memory_threshold_gb = memory_threshold_gb
        
        # Ajusta batch_size baseado na memória disponível
        available_memory = psutil.virtual_memory().available / 1024**3
        if available_memory < memory_threshold_gb:
            adjusted_batch_size = max(8, batch_size // 2)
            print(f"Ajustando batch_size: {batch_size} -> {adjusted_batch_size} "
                  f"(memória disponível: {available_memory:.1f}GB)")
            self.batch_size = adjusted_batch_size
        
        self.dataloader = DataLoader(
            dataset, 
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=num_workers,
            pin_memory=torch.cuda.is_available(),
            persistent_workers=True if num_workers > 0 else False,
            prefetch_factor=2 if num_workers > 0 else 2
        )
    
    def __iter__(self):
        return iter(self.dataloader)
    
    def __len__(self):
        return len(self.dataloader)

# Cria DataLoaders otimizados
print("Criando DataLoaders otimizados...")

# Split treino/validação eficiente
train_size = int(0.7 * len(train_dataset))
val_size = len(train_dataset) - train_size

train_subset, val_subset = torch.utils.data.random_split(
    train_dataset, [train_size, val_size],
    generator=torch.Generator().manual_seed(42)
)

# DataLoaders com controle de memória
train_loader = MemoryEfficientDataLoader(
    train_subset, BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS
)

val_loader = MemoryEfficientDataLoader(
    val_subset, BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS
)

test_loader = MemoryEfficientDataLoader(
    test_dataset, BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS
)

print(f"DataLoaders criados:")
print(f"  Train batches: {len(train_loader.dataloader)}")
print(f"  Val batches: {len(val_loader.dataloader)}")
print(f"  Test batches: {len(test_loader.dataloader)}")
print(f"  Batch size efetivo: {train_loader.batch_size}")

memory_monitor.print_status()

In [None]:
def test_memory_efficiency():
    """Testa eficiência de memória carregando alguns batches"""
    print("Testando eficiência de memória...")
    
    # Memoria inicial
    initial_status = memory_monitor.get_status()
    print(f"Memória inicial: {initial_status['current_cpu_mb']:.1f} MB")
    
    # Carrega alguns batches para teste
    test_batches = 5
    batch_times = []
    
    for i, (images, labels) in enumerate(train_loader):
        if i >= test_batches:
            break
            
        start_time = time.time()
        
        # Simula processamento
        if torch.cuda.is_available():
            images = images.cuda()
            labels = labels.cuda()
        
        batch_time = time.time() - start_time
        batch_times.append(batch_time)
        
        # Monitora memória
        memory_monitor.update()
        
        print(f"Batch {i+1}: {images.shape}, "
              f"Time: {batch_time:.3f}s, "
              f"Memory: {memory_monitor.get_status()['current_cpu_mb']:.1f}MB")
        
        # Limpeza
        if torch.cuda.is_available():
            del images, labels
            torch.cuda.empty_cache()
        
        gc.collect()
    
    # Estatísticas finais
    final_status = memory_monitor.get_status()
    print(f"\nResultados do teste:")
    print(f"  Memória inicial: {initial_status['current_cpu_mb']:.1f} MB")
    print(f"  Memória final: {final_status['current_cpu_mb']:.1f} MB")
    print(f"  Aumento: {final_status['current_cpu_mb'] - initial_status['current_cpu_mb']:.1f} MB")
    print(f"  Tempo médio por batch: {np.mean(batch_times):.3f}s")
    print(f"  Cache stats train:", train_dataset.get_cache_stats())

# Executa teste
test_memory_efficiency()

In [None]:
def aggressive_memory_cleanup():
    """Limpeza agressiva de memória"""
    print("Executando limpeza de memória...")
    
    # Limpa cache dos datasets
    if 'train_dataset' in globals():
        train_dataset.image_cache.clear()
        train_dataset.cache_access_count.clear()
    
    if 'test_dataset' in globals():
        test_dataset.image_cache.clear()
        test_dataset.cache_access_count.clear()
    
    # Limpeza Python
    gc.collect()
    
    # Limpeza CUDA se disponível
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    
    print("Limpeza concluída!")
    memory_monitor.print_status()

def memory_checkpoint(checkpoint_name=""):
    """Cria checkpoint de memória"""
    status = memory_monitor.get_status()
    print(f"Checkpoint {checkpoint_name}:")
    print(f"  CPU Memory: {status['current_cpu_mb']:.1f} MB")
    print(f"  GPU Memory: {status['current_gpu_mb']:.1f} MB")
    print(f"  Available: {status['available_memory_gb']:.1f} GB")
    return status

# Checkpoint inicial
initial_checkpoint = memory_checkpoint("Inicial")

In [None]:
class MemoryOptimizedResNet50(nn.Module):
    """ResNet50 otimizado para uso eficiente de memória"""
    
    def __init__(self, num_classes=7, checkpoint_segments=4):
        super().__init__()
        
        # Carrega ResNet50 base
        self.backbone = models.resnet50(weights='IMAGENET1K_V2')
        
        # Remove classificador original
        self.backbone.fc = nn.Identity()
        
        # Classifier otimizado
        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(2048, 512),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(512),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes)
        )
        
        # Gradient checkpointing para economizar memória
        self.use_checkpointing = checkpoint_segments > 0
        if self.use_checkpointing:
            self._setup_checkpointing()
    
    def _setup_checkpointing(self):
        """Configura gradient checkpointing"""
        # Aplica checkpointing nas camadas pesadas
        for name, module in self.backbone.named_modules():
            if 'layer' in name and len(list(module.children())) > 0:
                module = torch.utils.checkpoint.checkpoint_sequential(
                    module, segments=2
                )
    
    def forward(self, x):
        # Backbone com checkpointing opcional
        if self.use_checkpointing and self.training:
            features = torch.utils.checkpoint.checkpoint(self.backbone, x)
        else:
            features = self.backbone(x)
        
        # Classifier
        return self.classifier(features)

# Cria modelo otimizado
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Criando modelo otimizado para {device}...")

model = MemoryOptimizedResNet50(num_classes=7, checkpoint_segments=4).to(device)

# Conta parâmetros
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Modelo criado:")
print(f"  Total parâmetros: {total_params:,}")
print(f"  Treináveis: {trainable_params:,}")
print(f"  Gradient checkpointing: Ativado")

memory_checkpoint("Modelo criado")

In [None]:
# Configurações de treinamento otimizadas
EPOCHS = 50  # Reduzido para teste
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.01

# Otimizador com configurações de memória
optimizer = optim.AdamW(
    model.parameters(), 
    lr=LEARNING_RATE, 
    weight_decay=WEIGHT_DECAY,
    eps=1e-8  # Estabilidade numérica
)

# Scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=5, verbose=True
)

# Loss function
criterion = nn.CrossEntropyLoss()

# Mixed Precision para economizar memória (se GPU disponível)
scaler = torch.cuda.amp.GradScaler() if torch.cuda.is_available() else None
use_amp = torch.cuda.is_available()

print(f"Configuração de treinamento:")
print(f"  Epochs: {EPOCHS}")
print(f"  Learning rate: {LEARNING_RATE}")
print(f"  Optimizer: AdamW")
print(f"  Mixed precision: {'Ativado' if use_amp else 'Desativado'}")
print(f"  Device: {device}")

memory_checkpoint("Treinamento configurado")

In [None]:
def train_with_memory_control():
    """Treinamento com controle rigoroso de memória"""
    
    print("Iniciando treinamento com controle de memória...")
    start_time = time.time()
    
    best_val_acc = 0.0
    train_history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}
    
    for epoch in range(EPOCHS):
        epoch_start = time.time()
        
        # === TREINO ===
        model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0
        
        for batch_idx, (data, target) in enumerate(train_loader):
            # Move para device
            data, target = data.to(device, non_blocking=True), target.to(device, non_blocking=True)
            
            optimizer.zero_grad()
            
            # Forward pass com mixed precision
            if use_amp:
                with torch.cuda.amp.autocast():
                    outputs = model(data)
                    loss = criterion(outputs, target)
                
                # Backward pass
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                outputs = model(data)
                loss = criterion(outputs, target)
                loss.backward()
                optimizer.step()
            
            # Estatísticas
            train_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            train_total += target.size(0)
            train_correct += (predicted == target).sum().item()
            
            # Limpeza de memória a cada N batches
            if batch_idx % 50 == 0:
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                
                # Print progresso
                print(f'Epoch {epoch+1}/{EPOCHS}, Batch {batch_idx}/{len(train_loader)}, '
                      f'Loss: {loss.item():.4f}, '
                      f'Acc: {100*train_correct/train_total:.1f}%, '
                      f'Mem: {memory_monitor.get_status()["current_cpu_mb"]:.0f}MB')
        
        # Médias do treino
        train_loss_avg = train_loss / len(train_loader)
        train_acc = 100 * train_correct / train_total
        
        # === VALIDAÇÃO ===
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        
        with torch.no_grad():
            for data, target in val_loader:
                data, target = data.to(device, non_blocking=True), target.to(device, non_blocking=True)
                
                if use_amp:
                    with torch.cuda.amp.autocast():
                        outputs = model(data)
                        loss = criterion(outputs, target)
                else:
                    outputs = model(data)
                    loss = criterion(outputs, target)
                
                val_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                val_total += target.size(0)
                val_correct += (predicted == target).sum().item()
        
        val_loss_avg = val_loss / len(val_loader)
        val_acc = 100 * val_correct / val_total
        
        # Atualiza scheduler
        scheduler.step(val_loss_avg)
        
        # Salva histórico
        train_history['train_loss'].append(train_loss_avg)
        train_history['train_acc'].append(train_acc)
        train_history['val_loss'].append(val_loss_avg)
        train_history['val_acc'].append(val_acc)
        
        # Época completa
        epoch_time = time.time() - epoch_start
        
        print(f'\nEpoch {epoch+1}/{EPOCHS} Summary:')
        print(f'  Train Loss: {train_loss_avg:.4f}, Train Acc: {train_acc:.2f}%')
        print(f'  Val Loss: {val_loss_avg:.4f}, Val Acc: {val_acc:.2f}%')
        print(f'  Time: {epoch_time:.1f}s, LR: {optimizer.param_groups[0]["lr"]:.6f}')
        
        # Salva melhor modelo
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_acc': val_acc,
                'train_history': train_history
            }, 'best_model_memory_efficient.pth')
            print(f'  ✓ Novo melhor modelo salvo: {val_acc:.2f}%')
        
        # Limpeza de memória
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()
        
        memory_checkpoint(f"Epoch {epoch+1}")
        print("-" * 60)
        
        # Early stopping
        if len(train_history['val_acc']) > 10:
            recent_accs = train_history['val_acc'][-5:]
            if max(recent_accs) - min(recent_accs) < 0.5:
                print("Early stopping: sem melhoria significativa")
                break
    
    total_time = time.time() - start_time
    print(f"\nTreinamento concluído em {total_time/60:.1f} minutos")
    print(f"Melhor validação accuracy: {best_val_acc:.2f}%")
    
    return train_history, best_val_acc

# Executa treinamento
print("=== INICIANDO TREINAMENTO COM CONTROLE DE MEMÓRIA ===")
history, best_acc = train_with_memory_control()