### Importing Packages

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR, StepLR
import matplotlib.pyplot as plt
import numpy as np
import time
import os

### GPU Checker

In [6]:
# Setting environment variables for optimal performance
# NOTE(review): per PyTorch's MPS environment-variable documentation, a
# high-watermark ratio of 0.0 removes the MPS allocator's upper memory limit,
# so a runaway allocation can exhaust system memory instead of raising an
# out-of-memory error — confirm this is intended.
# NOTE(review): this is assigned after `import torch` has already run in the
# import cell; presumably it still takes effect because no MPS allocation has
# happened yet — TODO confirm.
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'

# GPU Checker Function
def setup_device():
    """Print an accelerator availability report and choose a device.

    The first available backend wins, in this order: MPS, CUDA, CPU.

    Returns:
        torch.device: the selected device ("mps", "cuda" or "cpu").
    """
    rule = "=" * 60
    mps_backend = torch.backends.mps

    print(rule)
    print("GPU ACCELERATION CHECK")
    print(rule)

    print(f"PyTorch Version: {torch.__version__}")
    print(f"MPS Available: {mps_backend.is_available()}")
    print(f"MPS Built: {mps_backend.is_built()}")

    # Preference-ordered accelerators: (device name, availability probe, banner).
    accelerators = (
        ("mps", mps_backend.is_available,
         "✅ USING MPS (APPLE SILICON GPU ACCELERATION)"),
        ("cuda", torch.cuda.is_available,
         "✅ USING CUDA (NVIDIA GPU ACCELERATION)"),
    )
    cpu_fallback = ("cpu", "⚠️  USING CPU ONLY (NO GPU ACCELERATION)")

    # The generator is lazy, so CUDA is only probed when MPS is unavailable.
    device_name, banner = next(
        ((name, message) for name, probe, message in accelerators if probe()),
        cpu_fallback,
    )
    device = torch.device(device_name)
    print(banner)

    print(f"Selected Device: {device}")
    print(rule)
    return device

# Run the availability check when this cell executes and keep the selected
# torch.device in the notebook-level `device` variable.
device = setup_device()

GPU ACCELERATION CHECK
PyTorch Version: 2.8.0
MPS Available: True
MPS Built: True
✅ USING MPS (APPLE SILICON GPU ACCELERATION)
Selected Device: mps


### Enhanced Components & Layers

In [None]:
# NOTE(review): an identical `SELayer` class is defined again in a later cell
# of this notebook; after a top-to-bottom run that later definition is the one
# in effect. Consider keeping a single definition.
class SELayer(nn.Module):
    """Squeeze-and-Excitation channel attention.

    Each channel is global-average-pooled to a single value, the resulting
    per-sample channel vector goes through a two-layer bottleneck ending in a
    sigmoid, and the input is rescaled channel-wise by those gates.

    Args:
        channel: Number of channels of the incoming feature map.
        reduction: Bottleneck divisor. The hidden width is
            ``channel // reduction``, clamped to at least 1.
    """
    def __init__(self, channel, reduction=16):
        super().__init__()
        # Clamp to >= 1: when channel < reduction the floor division yields 0,
        # and a zero-width bottleneck makes every gate the constant
        # sigmoid(0) = 0.5 with nothing to learn.
        squeezed = max(1, channel // reduction)
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, squeezed, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(squeezed, channel, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return ``x`` rescaled per channel; expects a 4-D (b, c, h, w) input."""
        b, c, _, _ = x.size()
        y = self.avg_pool(x).view(b, c)
        y = self.fc(y).view(b, c, 1, 1)
        # (b, c, 1, 1) gates broadcast over the spatial dimensions.
        return x * y

# NOTE(review): an identical `StochasticDepth` class is defined again in a
# later cell of this notebook; after a top-to-bottom run that later definition
# is the one in effect. Consider keeping a single definition.
class StochasticDepth(nn.Module):
    """Per-sample stochastic depth (drop-path) regularization.

    In training mode each sample in the batch is zeroed with probability
    ``drop_prob``; surviving samples are scaled by ``1 / (1 - drop_prob)``.
    In eval mode, or when ``drop_prob`` is 0, the input is returned unchanged.

    Args:
        drop_prob: Probability of dropping a sample, in ``[0, 1]``.

    Raises:
        ValueError: If ``drop_prob`` is outside ``[0, 1]``.
    """
    def __init__(self, drop_prob):
        super().__init__()
        if not 0.0 <= drop_prob <= 1.0:
            raise ValueError(
                f"drop_prob must be in the range [0, 1], got {drop_prob}"
            )
        self.drop_prob = drop_prob

    def forward(self, x):
        if not self.training or self.drop_prob == 0.:
            return x

        keep_prob = 1 - self.drop_prob
        if keep_prob == 0:
            # Every sample is dropped; dividing by keep_prob would give NaN.
            return torch.zeros_like(x)

        # One random draw per sample, broadcast over all remaining dimensions.
        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
        keep_mask = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
        keep_mask.floor_()  # 1 with probability keep_prob, else 0
        return x / keep_prob * keep_mask

### Layer Integration with Enhanced Components

In [8]:
class EnhancedInvertedResidual(nn.Module):
    """Inverted residual block with optional SE attention and stochastic depth.

    Layer order: optional 1x1 expansion conv (skipped when
    ``expansion_ratio == 1``), 3x3 depthwise conv, optional
    Squeeze-and-Excitation, then a 1x1 projection conv without activation.
    A skip connection is used only when ``stride == 1`` and the input and
    output channel counts match; on that path the block output passes through
    stochastic depth before being added to the input.

    Args:
        in_channels: Channels of the input feature map.
        out_channels: Channels of the output feature map.
        stride: Stride of the depthwise convolution.
        expansion_ratio: Multiplier giving the hidden width
            ``int(in_channels * expansion_ratio)``.
        use_se: Insert an ``SELayer`` after the depthwise stage.
        use_swish: Use ``nn.SiLU`` activations; otherwise ``nn.ReLU6``.
        drop_prob: Stochastic-depth drop probability on the residual branch.
        se_reduction: Reduction factor passed to ``SELayer``.
    """
    def __init__(self, in_channels, out_channels, stride, expansion_ratio=6,
                 use_se=True, use_swish=True, drop_prob=0.1, se_reduction=16):
        super().__init__()
        hidden_dim = int(in_channels * expansion_ratio)
        self.use_residual = stride == 1 and in_channels == out_channels

        def activation():
            # A fresh module per call so no activation instance is shared.
            return nn.SiLU() if use_swish else nn.ReLU6(inplace=True)

        layers = []
        # Pointwise expansion; with a ratio of 1 hidden_dim equals in_channels,
        # so the expansion conv is omitted.
        if expansion_ratio != 1:
            layers.append(nn.Conv2d(in_channels, hidden_dim, 1, bias=False))
            layers.append(nn.BatchNorm2d(hidden_dim))
            layers.append(activation())

        # Depthwise convolution (groups == channels)
        layers.append(nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1,
                                groups=hidden_dim, bias=False))
        layers.append(nn.BatchNorm2d(hidden_dim))
        layers.append(activation())

        # Squeeze-and-Excitation attention on the expanded features
        if use_se:
            layers.append(SELayer(hidden_dim, reduction=se_reduction))

        # Pointwise projection; no activation follows it
        layers.append(nn.Conv2d(hidden_dim, out_channels, 1, bias=False))
        layers.append(nn.BatchNorm2d(out_channels))

        self.conv = nn.Sequential(*layers)
        # Stochastic depth only exists on blocks that have a skip connection.
        self.stochastic_depth = (
            StochasticDepth(drop_prob) if self.use_residual else None
        )

    def forward(self, x):
        if self.use_residual:
            return x + self.stochastic_depth(self.conv(x))
        return self.conv(x)


### Excitation Layers with Regularization

In [3]:
# NOTE(review): `SELayer` is also defined in an earlier cell of this notebook.
# This later definition is the one in effect after a top-to-bottom run;
# consider keeping a single definition.
class SELayer(nn.Module):
    """Squeeze-and-Excitation Layer.

    Pools every channel to one value, maps the channel vector through a
    bottleneck of two bias-free linear layers (ReLU, then sigmoid), and
    multiplies the input by the resulting per-channel gates.

    Args:
        channel: Channel count of the input feature map.
        reduction: Divisor for the bottleneck width, which is
            ``channel // reduction`` but never less than 1.
    """
    def __init__(self, channel, reduction=16):
        super().__init__()
        # Without the lower bound, channel < reduction gives a bottleneck of
        # width 0, whose output is all zeros and whose gates are a fixed 0.5.
        bottleneck = max(1, channel // reduction)
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, bottleneck, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(bottleneck, channel, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Apply channel gating to a 4-D (b, c, h, w) tensor."""
        b, c, _, _ = x.size()
        gates = self.fc(self.avg_pool(x).view(b, c)).view(b, c, 1, 1)
        # Broadcasting expands the gates across height and width.
        return x * gates

# NOTE(review): `StochasticDepth` is also defined in an earlier cell of this
# notebook. This later definition is the one in effect after a top-to-bottom
# run; consider keeping a single definition.
class StochasticDepth(nn.Module):
    """Stochastic Depth for Regularization.

    While training, zeroes each sample of the batch independently with
    probability ``drop_prob`` and scales the kept samples by
    ``1 / (1 - drop_prob)``. Outside training, or with ``drop_prob == 0``,
    the input tensor is returned as-is.

    Args:
        drop_prob: Drop probability in ``[0, 1]``.

    Raises:
        ValueError: If ``drop_prob`` lies outside ``[0, 1]``.
    """
    def __init__(self, drop_prob):
        super().__init__()
        if not 0.0 <= drop_prob <= 1.0:
            raise ValueError(
                f"drop_prob must be in the range [0, 1], got {drop_prob}"
            )
        self.drop_prob = drop_prob

    def forward(self, x):
        if not self.training or self.drop_prob == 0.:
            return x

        keep_prob = 1 - self.drop_prob
        if keep_prob == 0:
            # drop_prob == 1: nothing survives. Return zeros explicitly rather
            # than computing x / 0 * 0, which is NaN.
            return torch.zeros_like(x)

        # Shape (batch, 1, ..., 1): a single keep/drop decision per sample.
        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
        survivors = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
        survivors.floor_()  # becomes 1 with probability keep_prob, else 0
        return x / keep_prob * survivors

### Building the Model

In [9]:
class EnhancedMobileNetV4(nn.Module):
    """Image classifier assembled from inverted residual blocks.

    Structure, in order: a stride-2 3x3 conv stem, a stack of
    ``EnhancedInvertedResidual`` blocks, a 1x1 conv head followed by global
    average pooling, and a two-layer classifier with dropout.

    Args:
        num_classes: Size of the final linear layer's output.
        width_mult: Multiplier applied to the stem and block channel counts.
            The head width is scaled by ``max(1.0, width_mult)``, so it is
            never reduced below its base value of 1280.
        dropout_rate: Dropout before the first classifier layer; half this
            rate is used before the second.
        use_se: Forwarded to every ``EnhancedInvertedResidual`` block.
        use_swish: Use ``nn.SiLU`` activations; otherwise ``nn.ReLU6`` in the
            feature extractor and ``nn.ReLU`` in the classifier.
    """

    def __init__(self, num_classes=10, width_mult=1.2, dropout_rate=0.3,
                 use_se=True, use_swish=True):
        super(EnhancedMobileNetV4, self).__init__()

        def feature_activation():
            # New module on every call; activations are not shared.
            return nn.SiLU() if use_swish else nn.ReLU6(inplace=True)

        # One row per stage:
        # (expansion ratio, base output channels, block count, first stride)
        stage_specs = (
            (1, 24, 1, 1),
            (6, 32, 2, 2),
            (6, 48, 3, 2),
            (6, 80, 4, 2),
            (6, 112, 3, 1),
            (6, 192, 3, 2),
            (6, 384, 1, 1),
        )

        stem_width = int(32 * width_mult)
        self.last_channel = int(1280 * max(1.0, width_mult))

        # Stem
        stages = [nn.Sequential(
            nn.Conv2d(3, stem_width, 3, 2, 1, bias=False),
            nn.BatchNorm2d(stem_width),
            feature_activation()
        )]

        # Inverted residual stages; only the first block of a stage strides.
        current_width = stem_width
        for expansion, base_width, repeats, first_stride in stage_specs:
            stage_width = int(base_width * width_mult)
            for position in range(repeats):
                block_stride = first_stride if position == 0 else 1
                stages.append(EnhancedInvertedResidual(
                    current_width, stage_width, block_stride, expansion,
                    use_se, use_swish
                ))
                current_width = stage_width

        # Head: widen to last_channel, then pool each map to 1x1.
        stages.append(nn.Sequential(
            nn.Conv2d(current_width, self.last_channel, 1, bias=False),
            nn.BatchNorm2d(self.last_channel),
            feature_activation(),
            nn.AdaptiveAvgPool2d(1)
        ))

        self.features = nn.Sequential(*stages)

        # Classifier
        classifier_activation = nn.SiLU() if use_swish else nn.ReLU()
        self.classifier = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(self.last_channel, 512),
            classifier_activation,
            nn.Dropout(dropout_rate/2),
            nn.Linear(512, num_classes)
        )

        self._initialize_weights()

    def _initialize_weights(self):
        """Re-initialize conv, batch-norm and linear parameters in place.

        Convs use Kaiming-normal (fan-out), batch norms are set to
        weight 1 / bias 0, linears use Xavier-uniform; biases are zeroed.
        """
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out',
                                        nonlinearity='relu')
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
                continue
            if isinstance(module, nn.BatchNorm2d):
                nn.init.ones_(module.weight)
                nn.init.zeros_(module.bias)
                continue
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)

    def forward(self, x):
        """Return class logits for a batch of images."""
        pooled = self.features(x)
        flat = torch.flatten(pooled, 1)
        return self.classifier(flat)

### Training Model

In [None]:
def _evaluate_with_flip_tta(model, data_loader, device):
    """Return top-1 accuracy in percent, using horizontal-flip test-time augmentation.

    Each batch is run through the model twice (as-is and flipped along the
    last dimension) and the two outputs are averaged before taking the argmax.
    Returns NaN when the loader yields no samples.
    """
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for data, target in data_loader:
            data, target = data.to(device), target.to(device)

            # Dimension 3 is the width axis of a (b, c, h, w) batch.
            outputs = (model(data) + model(torch.flip(data, [3]))) / 2.0

            predicted = outputs.argmax(dim=1)
            total += target.size(0)
            correct += (predicted == target).sum().item()

    # Accuracy is undefined for an empty evaluation set.
    return 100 * correct / total if total else float('nan')


# Enhanced training function
def train_enhanced_model(model, train_loader, test_loader, device, epochs=20, lr=0.001):
    """Train ``model`` and evaluate it on ``test_loader`` after every epoch.

    Uses cross-entropy with label smoothing 0.1, AdamW (weight decay 0.05),
    cosine learning-rate annealing over ``epochs``, and gradient-norm clipping
    at 1.0. Progress is printed every 100 batches and once per epoch.

    Args:
        model: Module to train; expected to already live on ``device``.
        train_loader: Iterable of ``(data, target)`` batches supporting ``len()``.
        test_loader: Iterable of ``(data, target)`` batches used for evaluation.
        device: ``torch.device`` or device string the batches are moved to.
        epochs: Number of passes over ``train_loader``.
        lr: Initial learning rate for AdamW.

    Returns:
        tuple[list[float], list[float]]: per-epoch mean training loss and
        per-epoch test accuracy in percent. An entry is NaN when the
        corresponding loader was empty.
    """
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)  # Label smoothing
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.05)  # AdamW
    scheduler = CosineAnnealingLR(optimizer, T_max=epochs)

    # Report MPS memory only when training actually runs on MPS, not merely
    # because an MPS backend exists on this machine.
    report_mps_memory = torch.device(device).type == "mps"
    num_batches = len(train_loader)

    train_losses = []
    test_accuracies = []

    # Tracking time
    start_time = time.time()

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        epoch_start = time.time()

        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)

            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            # .item() is read once per batch and reused for logging.
            batch_loss = loss.item()
            running_loss += batch_loss

            if batch_idx % 100 == 0:
                print(f'Epoch: {epoch+1:2d}/{epochs} | '
                      f'Batch: {batch_idx:4d}/{num_batches} | '
                      f'Loss: {batch_loss:.4f}')

        scheduler.step()

        # Mean loss is undefined when no batch was seen.
        avg_loss = running_loss / num_batches if num_batches else float('nan')
        train_losses.append(avg_loss)

        # Enhanced evaluation with TTA
        accuracy = _evaluate_with_flip_tta(model, test_loader, device)
        test_accuracies.append(accuracy)

        epoch_time = time.time() - epoch_start

        summary = (f'Epoch {epoch+1:2d}/{epochs} | '
                   f'Loss: {avg_loss:.4f} | '
                   f'Accuracy: {accuracy:.2f}% | '
                   f'Time: {epoch_time:.1f}s')
        if report_mps_memory:
            memory_allocated = torch.mps.current_allocated_memory() / 1024**2
            summary += f' | GPU Mem: {memory_allocated:.1f}MB'
        print(summary)

    total_time = time.time() - start_time
    print(f'Total training time: {total_time/60:.1f} minutes')

    return train_losses, test_accuracies


### Data Preparation

In [None]:
def prepare_enhanced_data(batch_size=64, num_workers=2, data_root='./data'):
    """Build CIFAR-10 train and test DataLoaders.

    The training split is augmented (random crop with 4-pixel padding,
    horizontal flip, rotation up to 15 degrees, colour jitter); the test split
    is only converted to a tensor and normalized. The dataset is downloaded
    into ``data_root`` when it is not already there.

    Args:
        batch_size: Batch size for both loaders.
        num_workers: Worker processes per loader.
        data_root: Directory where CIFAR-10 is stored or downloaded.

    Returns:
        tuple[DataLoader, DataLoader]: ``(train_loader, test_loader)``; only
        the training loader shuffles.
    """
    # Shared by both pipelines so train and test are normalized identically.
    # NOTE(review): these per-channel statistics are hard-coded; confirm they
    # match the dataset if exact standardization matters.
    normalize = transforms.Normalize((0.4914, 0.4822, 0.4465),
                                     (0.2023, 0.1994, 0.2010))

    # Enhanced data augmentation
    train_transform = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(15),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
        transforms.ToTensor(),
        normalize
    ])

    test_transform = transforms.Compose([
        transforms.ToTensor(),
        normalize
    ])

    train_dataset = datasets.CIFAR10(root=data_root, train=True,
                                     download=True, transform=train_transform)
    test_dataset = datasets.CIFAR10(root=data_root, train=False,
                                    download=True, transform=test_transform)

    # Pinned host memory only helps host-to-CUDA copies; requesting it on an
    # MPS or CPU-only machine is expected to make DataLoader warn with no gain.
    loader_options = dict(batch_size=batch_size, num_workers=num_workers,
                          pin_memory=torch.cuda.is_available())

    train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
    test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

    return train_loader, test_loader


### Main Execution

In [None]:
if __name__ == "__main__":
    # Seed PyTorch's global RNG so weight initialization and the other
    # torch-driven randomness start from the same state on every run.
    # NOTE(review): bit-exact repeatability across runs on GPU backends is not
    # guaranteed by seeding alone.
    torch.manual_seed(42)

    # Setting up device with GPU checker
    device = setup_device()

    # Preparing enhanced data with same batch size
    train_loader, test_loader = prepare_enhanced_data(batch_size=64)

    # Creating and training the enhanced model
    print("\n" + "=" * 60)
    print("TRAINING ENHANCED MOBILENETV4 MODEL")
    print("=" * 60)

    enhanced_model = EnhancedMobileNetV4(num_classes=10).to(device)

    # Counting parameters
    total_params = sum(p.numel() for p in enhanced_model.parameters())
    print(f"Model Parameters: {total_params:,}")
    print("=" * 60)

    train_losses_enhanced, test_accuracies_enhanced = train_enhanced_model(
        enhanced_model, train_loader, test_loader, device, epochs=20, lr=0.001
    )

    print(f"\nFinal Test Accuracy (Enhanced): {test_accuracies_enhanced[-1]:.2f}%")

    # Showing final GPU memory usage. Keyed on the device actually selected,
    # and labelled as driver-allocated memory: driver_allocated_memory()
    # reports the current allocation held by the Metal driver, not a peak.
    if device.type == "mps":
        driver_memory = torch.mps.driver_allocated_memory() / 1024**3
        print(f"Driver-Allocated GPU Memory: {driver_memory:.2f} GB")

GPU ACCELERATION CHECK
PyTorch Version: 2.8.0
MPS Available: True
MPS Built: True
✅ USING MPS (APPLE SILICON GPU ACCELERATION)
Selected Device: mps

TRAINING ENHANCED MOBILENETV4 MODEL
Model Parameters: 6,413,864




Epoch:  1/20 | Batch:    0/782 | Loss: 2.5234
Epoch:  1/20 | Batch:  100/782 | Loss: 2.3426
Epoch:  1/20 | Batch:  200/782 | Loss: 1.7985
Epoch:  1/20 | Batch:  300/782 | Loss: 1.9857
Epoch:  1/20 | Batch:  400/782 | Loss: 1.8432
Epoch:  1/20 | Batch:  500/782 | Loss: 2.0651
Epoch:  1/20 | Batch:  600/782 | Loss: 1.9161
Epoch:  1/20 | Batch:  700/782 | Loss: 1.8794
Epoch  1/20 | Loss: 2.0309 | Accuracy: 39.30% | Time: 91.3s | GPU Mem: 186.2MB
Epoch:  2/20 | Batch:    0/782 | Loss: 1.7923
Epoch:  2/20 | Batch:  100/782 | Loss: 1.7336
Epoch:  2/20 | Batch:  200/782 | Loss: 1.7918
Epoch:  2/20 | Batch:  300/782 | Loss: 1.7717
Epoch:  2/20 | Batch:  400/782 | Loss: 1.6729
Epoch:  2/20 | Batch:  500/782 | Loss: 1.9002
Epoch:  2/20 | Batch:  600/782 | Loss: 1.6235
Epoch:  2/20 | Batch:  700/782 | Loss: 1.4110
Epoch  2/20 | Loss: 1.7364 | Accuracy: 47.36% | Time: 78.2s | GPU Mem: 195.1MB
Epoch:  3/20 | Batch:    0/782 | Loss: 1.5337
Epoch:  3/20 | Batch:  100/782 | Loss: 1.6695
Epoch:  3/20 |