# Imports

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import json
import os

In [2]:
from tqdm import tqdm

import numpy as np
import keras
from tensorflow import keras
from keras.datasets import cifar10
from __future__ import print_function
from keras.models import Sequential
from keras.models import save_model, load_model
from keras.layers import Dense, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D

import keras.backend as K
K.clear_session()

# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score
# from model import Mamba, ModelArgs  # Import your custom Mamba implementation
# Assuming the model classes are defined in `model.py`
from model import ImageMamba, ModelArgs




# CUDA

In [6]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Sep_12_02:55:00_Pacific_Daylight_Time_2024
Cuda compilation tools, release 12.6, V12.6.77
Build cuda_12.6.r12.6/compiler.34841621_0


In [7]:
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

# Initialization

In [8]:
# Set the device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [1]:
from data_loader import load_cifar10, get_class_names

# Load data consistently
train_loader, test_loader, X_train, X_test, Y_train, Y_test = load_cifar10(batch_size=64, seed=42)
class_names = get_class_names()

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170M/170M [00:14<00:00, 11.4MB/s] 


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


# CNN

In [8]:
class ComparableCNN(nn.Module):
    def __init__(self):
        super(ComparableCNN, self).__init__()
        # First conv block - similar to MAMBA's first conv
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(64)
        
        # Second conv block - similar to MAMBA's second conv
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(128)
        
        # Additional conv layers to match MAMBA's 4 layers
        self.conv3 = nn.Conv2d(128, 128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        
        self.conv4 = nn.Conv2d(128, 128, kernel_size=3, padding=1)
        self.bn4 = nn.BatchNorm2d(128)
        
        # Global average pooling and final dense layer
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(128, 10)
        
    def forward(self, x):
        # First block
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.max_pool2d(x, 2)
        
        # Second block
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.max_pool2d(x, 2)
        
        # Third block
        x = F.relu(self.bn3(self.conv3(x)))
        
        # Fourth block
        x = F.relu(self.bn4(self.conv4(x)))
        
        # Global average pooling
        x = self.avg_pool(x)
        x = x.view(x.size(0), -1)
        
        # Final classification
        logits = self.fc(x)
        probabilities = F.softmax(logits, dim=-1)
        
        return logits, probabilities

def load_training_metrics(metrics_path):
    """Load training metrics from JSON file"""
    with open(metrics_path, 'r') as f:
        metrics = json.load(f)
        # Convert lists back to numpy arrays where needed
        metrics['epoch_train_confidences'] = [np.array(arr) for arr in metrics['epoch_train_confidences']]
        metrics['epoch_test_confidences'] = [np.array(arr) for arr in metrics['epoch_test_confidences']]
    return metrics

def save_cnn_checkpoint(model, optimizer, epoch, metrics, path):
    """Save CNN model checkpoint"""
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'metrics': metrics
    }
    torch.save(checkpoint, path)
    print(f'Checkpoint saved: {path}')

def train_evaluate_cnn(model, train_loader, test_loader, num_epochs=100, device='cuda', 
                      checkpoint_dir='cnn_checkpoints'):
    """Train and evaluate CNN with checkpointing"""
    os.makedirs(checkpoint_dir, exist_ok=True)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    
    metrics = {
        'train_losses': [], 'test_losses': [],
        'train_accuracies': [], 'test_accuracies': [],
        'train_confidences': [], 'test_confidences': [],
        'epoch_train_confidences': [], 'epoch_test_confidences': []
    }
    
    for epoch in range(num_epochs):
        # Training
        model.train()
        running_loss = 0.0
        running_correct = 0
        total_samples = 0
        train_confidence_sum = 0
        train_epoch_confidences = []
        
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            logits, probabilities = model(inputs)
            loss = criterion(logits, labels.squeeze())
            loss.backward()
            optimizer.step()
            
            _, predicted = torch.max(logits, 1)
            confidence, _ = torch.max(probabilities, 1)
            running_correct += (predicted == labels.squeeze()).sum().item()
            total_samples += labels.size(0)
            train_confidence_sum += confidence.sum().item()
            train_epoch_confidences.extend(confidence.detach().cpu().numpy())
            running_loss += loss.item()
        
        train_loss = running_loss / len(train_loader)
        train_accuracy = (running_correct / total_samples) * 100
        train_avg_confidence = train_confidence_sum / total_samples
        
        # Testing
        model.eval()
        test_loss = 0.0
        test_correct = 0
        test_total = 0
        test_confidence_sum = 0
        test_epoch_confidences = []
        
        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                logits, probabilities = model(inputs)
                loss = criterion(logits, labels.squeeze())
                
                _, predicted = torch.max(logits, 1)
                confidence, _ = torch.max(probabilities, 1)
                test_correct += (predicted == labels.squeeze()).sum().item()
                test_total += labels.size(0)
                test_confidence_sum += confidence.sum().item()
                test_epoch_confidences.extend(confidence.detach().cpu().numpy())
                test_loss += loss.item()
        
        test_loss = test_loss / len(test_loader)
        test_accuracy = (test_correct / test_total) * 100
        test_avg_confidence = test_confidence_sum / test_total
        
        # Store metrics
        metrics['train_losses'].append(train_loss)
        metrics['test_losses'].append(test_loss)
        metrics['train_accuracies'].append(train_accuracy)
        metrics['test_accuracies'].append(test_accuracy)
        metrics['train_confidences'].append(train_avg_confidence)
        metrics['test_confidences'].append(test_avg_confidence)
        metrics['epoch_train_confidences'].append(train_epoch_confidences)
        metrics['epoch_test_confidences'].append(test_epoch_confidences)
        
        # Save metrics to JSON file after each epoch
        metrics_path = os.path.join(checkpoint_dir, 'training_metrics.json')
        with open(metrics_path, 'w') as f:
            # Convert to float with reduced precision (4 decimal places)
            json_metrics = {
                'train_losses': [round(float(x), 4) for x in metrics['train_losses']],
                'test_losses': [round(float(x), 4) for x in metrics['test_losses']],
                'train_accuracies': [round(float(x), 4) for x in metrics['train_accuracies']],
                'test_accuracies': [round(float(x), 4) for x in metrics['test_accuracies']],
                'train_confidences': [round(float(x), 4) for x in metrics['train_confidences']],
                'test_confidences': [round(float(x), 4) for x in metrics['test_confidences']],
                'epoch_train_confidences': [[round(float(x), 4) for x in arr] if isinstance(arr, (np.ndarray, list)) else round(float(arr), 4)
                                          for arr in metrics['epoch_train_confidences']],
                'epoch_test_confidences': [[round(float(x), 4) for x in arr] if isinstance(arr, (np.ndarray, list)) else round(float(arr), 4)
                                         for arr in metrics['epoch_test_confidences']],
                'current_epoch': epoch + 1
            }
            json.dump(json_metrics, f, indent=4)
        
        # Print progress every 10 epochs
        if epoch % 10 == 0:
            print(f'Epoch [{epoch}/{num_epochs}]')
            print(f'  Training: Loss: {train_loss:.4f}, Accuracy: {train_accuracy:.2f}%, Confidence: {train_avg_confidence:.4f}')
            print(f'  Testing:  Loss: {test_loss:.4f}, Accuracy: {test_accuracy:.2f}%, Confidence: {test_avg_confidence:.4f}')
        
        # Save checkpoint every 50 epochs
        if (epoch + 1) % 50 == 0:
            checkpoint_path = os.path.join(checkpoint_dir, f'cnn_model_epoch_{epoch+1}.pt')
            save_cnn_checkpoint(model, optimizer, epoch, metrics, checkpoint_path)
    
    return metrics

# Run training

In [None]:
# Initialize model and data loaders
cnn_model = ComparableCNN().to(device)
checkpoint_dir = 'cnn_checkpoints'

print(f"Model device: {next(cnn_model.parameters()).device}")
print(f"Is CUDA available? {torch.cuda.is_available()}")

# Train model with checkpointing
metrics = train_evaluate_cnn(
    model=cnn_model,
    train_loader=train_loader,
    test_loader=test_loader,
    num_epochs=400,  # matching your MAMBA training
    device=device,
    checkpoint_dir=checkpoint_dir
)

In [13]:
class SmallerComparableCNN(nn.Module):
    def __init__(self):
        super(SmallerComparableCNN, self).__init__()
        # Reduced initial channels and total layers
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)  # Reduced from 64 to 32
        self.bn1 = nn.BatchNorm2d(32)
        
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)  # Reduced from 128 to 64
        self.bn2 = nn.BatchNorm2d(64)
        
        # Removed two conv layers to reduce parameters
        
        # Global average pooling and final dense layer
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(64, 10)  # Changed input features to match last conv layer
        
    def forward(self, x):
        # First block
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.max_pool2d(x, 2)
        
        # Second block
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.max_pool2d(x, 2)
        
        # Global average pooling
        x = self.avg_pool(x)
        x = x.view(x.size(0), -1)
        
        # Final classification
        logits = self.fc(x)
        probabilities = F.softmax(logits, dim=-1)
        
        return logits, probabilities
    

# 400 to 1000

In [14]:
def continue_training_from_checkpoint(model, train_loader, test_loader, 
                                last_epoch=400, target_epochs=1000, 
                                checkpoint_dir='smaller_cnn_checkpoints', device='cuda'):
    """Continue training from the last checkpoint up to target epochs"""
    
    # Load the last checkpoint
    checkpoint_path = os.path.join(checkpoint_dir, f'cnn_model_epoch_{last_epoch}.pt')
    checkpoint = torch.load(checkpoint_path, map_location=device)
    
    # Load model and optimizer state
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    
    # Load existing metrics
    metrics_path = os.path.join(checkpoint_dir, 'training_metrics.json')
    with open(metrics_path, 'r') as f:
        metrics = json.load(f)
    
    # Convert metrics lists back to proper format
    metrics['epoch_train_confidences'] = [np.array(conf) for conf in metrics['epoch_train_confidences']]
    metrics['epoch_test_confidences'] = [np.array(conf) for conf in metrics['epoch_test_confidences']]
    
    criterion = nn.CrossEntropyLoss()
    
    # Continue training from last_epoch to target_epochs
    for epoch in range(last_epoch, target_epochs):
        # Training
        model.train()
        running_loss = 0.0
        running_correct = 0
        total_samples = 0
        train_confidence_sum = 0
        train_epoch_confidences = []
        
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            logits, probabilities = model(inputs)
            loss = criterion(logits, labels.squeeze())
            loss.backward()
            optimizer.step()
            
            _, predicted = torch.max(logits, 1)
            confidence, _ = torch.max(probabilities, 1)
            running_correct += (predicted == labels.squeeze()).sum().item()
            total_samples += labels.size(0)
            train_confidence_sum += confidence.sum().item()
            train_epoch_confidences.extend(confidence.detach().cpu().numpy())
            running_loss += loss.item()
        
        train_loss = running_loss / len(train_loader)
        train_accuracy = (running_correct / total_samples) * 100
        train_avg_confidence = train_confidence_sum / total_samples
        
        # Testing
        model.eval()
        test_loss = 0.0
        test_correct = 0
        test_total = 0
        test_confidence_sum = 0
        test_epoch_confidences = []
        
        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                logits, probabilities = model(inputs)
                loss = criterion(logits, labels.squeeze())
                
                _, predicted = torch.max(logits, 1)
                confidence, _ = torch.max(probabilities, 1)
                test_correct += (predicted == labels.squeeze()).sum().item()
                test_total += labels.size(0)
                test_confidence_sum += confidence.sum().item()
                test_epoch_confidences.extend(confidence.detach().cpu().numpy())
                test_loss += loss.item()
        
        test_loss = test_loss / len(test_loader)
        test_accuracy = (test_correct / test_total) * 100
        test_avg_confidence = test_confidence_sum / test_total
        
        # Store metrics
        metrics['train_losses'].append(train_loss)
        metrics['test_losses'].append(test_loss)
        metrics['train_accuracies'].append(train_accuracy)
        metrics['test_accuracies'].append(test_accuracy)
        metrics['train_confidences'].append(train_avg_confidence)
        metrics['test_confidences'].append(test_avg_confidence)
        metrics['epoch_train_confidences'].append(train_epoch_confidences)
        metrics['epoch_test_confidences'].append(test_epoch_confidences)
        
        # Save metrics to JSON file after each epoch
        with open(metrics_path, 'w') as f:
            json_metrics = {
                'train_losses': [round(float(x), 4) for x in metrics['train_losses']],
                'test_losses': [round(float(x), 4) for x in metrics['test_losses']],
                'train_accuracies': [round(float(x), 4) for x in metrics['train_accuracies']],
                'test_accuracies': [round(float(x), 4) for x in metrics['test_accuracies']],
                'train_confidences': [round(float(x), 4) for x in metrics['train_confidences']],
                'test_confidences': [round(float(x), 4) for x in metrics['test_confidences']],
                'epoch_train_confidences': [[round(float(x), 4) for x in arr] if isinstance(arr, (np.ndarray, list)) else round(float(arr), 4)
                                          for arr in metrics['epoch_train_confidences']],
                'epoch_test_confidences': [[round(float(x), 4) for x in arr] if isinstance(arr, (np.ndarray, list)) else round(float(arr), 4)
                                         for arr in metrics['epoch_test_confidences']],
                'current_epoch': epoch + 1
            }
            json.dump(json_metrics, f, indent=4)
        
        # Print progress every 10 epochs
        if epoch % 10 == 0:
            print(f'Epoch [{epoch}/{target_epochs}]')
            print(f'  Training: Loss: {train_loss:.4f}, Accuracy: {train_accuracy:.2f}%, Confidence: {train_avg_confidence:.4f}')
            print(f'  Testing:  Loss: {test_loss:.4f}, Accuracy: {test_accuracy:.2f}%, Confidence: {test_avg_confidence:.4f}')
        
        # Save checkpoint every 50 epochs
        if (epoch + 1) % 50 == 0:
            checkpoint_path = os.path.join(checkpoint_dir, f'cnn_model_epoch_{epoch+1}.pt')
            checkpoint = {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'metrics': metrics
            }
            torch.save(checkpoint, checkpoint_path)
            print(f'Checkpoint saved: {checkpoint_path}')
    
    return metrics

In [None]:
# Initialize the model (same as before)
smaller_cnn_model = SmallerComparableCNN().to(device)

# Continue training from epoch 400 to 1000
metrics = continue_training_from_checkpoint(
    model=smaller_cnn_model,
    train_loader=train_loader,
    test_loader=test_loader,
    last_epoch=400,
    target_epochs=1000,
    checkpoint_dir='smaller_cnn_checkpoints',
    device=device
)

  checkpoint = torch.load(checkpoint_path, map_location=device)


Epoch [400/1000]
  Training: Loss: 0.6960, Accuracy: 76.71%, Confidence: 0.6963
  Testing:  Loss: 0.8859, Accuracy: 69.49%, Confidence: 0.6957
Epoch [410/1000]
  Training: Loss: 0.6870, Accuracy: 76.91%, Confidence: 0.6975
  Testing:  Loss: 0.8848, Accuracy: 68.89%, Confidence: 0.6946
Epoch [420/1000]
  Training: Loss: 0.6837, Accuracy: 76.94%, Confidence: 0.7002
  Testing:  Loss: 0.8646, Accuracy: 69.93%, Confidence: 0.6880
Epoch [430/1000]
  Training: Loss: 0.6774, Accuracy: 77.22%, Confidence: 0.7030
  Testing:  Loss: 0.9861, Accuracy: 66.10%, Confidence: 0.6905
Epoch [440/1000]
  Training: Loss: 0.6740, Accuracy: 77.30%, Confidence: 0.7041
  Testing:  Loss: 0.8687, Accuracy: 69.70%, Confidence: 0.6998
Checkpoint saved: smaller_cnn_checkpoints\cnn_model_epoch_450.pt
Epoch [450/1000]
  Training: Loss: 0.6718, Accuracy: 77.29%, Confidence: 0.7064
  Testing:  Loss: 0.8864, Accuracy: 69.41%, Confidence: 0.7069
Epoch [460/1000]
  Training: Loss: 0.6656, Accuracy: 77.69%, Confidence: 0.70

# Continue training to epoch 1500

In [16]:
def find_last_checkpoint(checkpoint_dir='smaller_cnn_checkpoints'):
    """Find the most recent checkpoint file"""
    checkpoints = []
    for file in os.listdir(checkpoint_dir):
        if file.startswith('cnn_model_epoch_') and file.endswith('.pt'):
            epoch = int(file.split('_')[-1].replace('.pt', ''))
            checkpoints.append(epoch)
    
    if not checkpoints:
        raise FileNotFoundError("No checkpoints found in directory")
    
    last_epoch = max(checkpoints)
    return last_epoch, os.path.join(checkpoint_dir, f'cnn_model_epoch_{last_epoch}.pt')

def continue_training(model, train_loader, test_loader, 
                     target_epochs=1500, 
                     checkpoint_dir='smaller_cnn_checkpoints',
                     device='cuda'):
    """Continue training from last checkpoint to target epochs"""
    
    # Find and load the last checkpoint
    last_epoch, checkpoint_path = find_last_checkpoint(checkpoint_dir)
    print(f"Found last checkpoint at epoch {last_epoch}")
    
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    
    # Initialize optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    
    criterion = nn.CrossEntropyLoss()
    
    print(f"Continuing training from epoch {last_epoch} to {target_epochs}")
    
    # Training loop
    for epoch in range(last_epoch, target_epochs):
        # Training phase
        model.train()
        running_loss = 0.0
        running_correct = 0
        total_samples = 0
        
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            logits, probabilities = model(inputs)
            loss = criterion(logits, labels.squeeze())
            loss.backward()
            optimizer.step()
            
            _, predicted = torch.max(logits, 1)
            running_correct += (predicted == labels.squeeze()).sum().item()
            total_samples += labels.size(0)
            running_loss += loss.item()
        
        train_loss = running_loss / len(train_loader)
        train_accuracy = (running_correct / total_samples) * 100
        
        # Testing phase
        model.eval()
        test_loss = 0.0
        test_correct = 0
        test_total = 0
        
        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                logits, probabilities = model(inputs)
                loss = criterion(logits, labels.squeeze())
                
                _, predicted = torch.max(logits, 1)
                test_correct += (predicted == labels.squeeze()).sum().item()
                test_total += labels.size(0)
                test_loss += loss.item()
        
        test_loss = test_loss / len(test_loader)
        test_accuracy = (test_correct / test_total) * 100
        
        # Print progress every 10 epochs
        if epoch % 10 == 0:
            print(f'Epoch [{epoch}/{target_epochs}]')
            print(f'  Training: Loss: {train_loss:.4f}, Accuracy: {train_accuracy:.2f}%')
            print(f'  Testing:  Loss: {test_loss:.4f}, Accuracy: {test_accuracy:.2f}%')
        
        # Save checkpoint every 50 epochs
        if (epoch + 1) % 50 == 0:
            checkpoint_path = os.path.join(checkpoint_dir, f'cnn_model_epoch_{epoch+1}.pt')
            checkpoint = {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }
            torch.save(checkpoint, checkpoint_path)
            print(f'Checkpoint saved: {checkpoint_path}')
            
    return model

In [17]:
# Initialize the model
smaller_cnn_model = SmallerComparableCNN().to(device)

# Continue training
model = continue_training(
    model=smaller_cnn_model,
    train_loader=train_loader,
    test_loader=test_loader,
    target_epochs=1500,
    checkpoint_dir='smaller_cnn_checkpoints',
    device=device
)

Found last checkpoint at epoch 750


  checkpoint = torch.load(checkpoint_path, map_location=device)


Continuing training from epoch 750 to 1500
Epoch [750/1500]
  Training: Loss: 0.5724, Accuracy: 80.51%
  Testing:  Loss: 0.8450, Accuracy: 71.03%
Epoch [760/1500]
  Training: Loss: 0.5668, Accuracy: 81.00%
  Testing:  Loss: 0.8788, Accuracy: 69.55%
Epoch [770/1500]
  Training: Loss: 0.5668, Accuracy: 80.84%
  Testing:  Loss: 0.9401, Accuracy: 68.40%
Epoch [780/1500]
  Training: Loss: 0.5613, Accuracy: 80.88%
  Testing:  Loss: 0.8488, Accuracy: 70.98%
Epoch [790/1500]
  Training: Loss: 0.5614, Accuracy: 81.03%
  Testing:  Loss: 0.8679, Accuracy: 70.04%
Checkpoint saved: smaller_cnn_checkpoints\cnn_model_epoch_800.pt
Epoch [800/1500]
  Training: Loss: 0.5581, Accuracy: 81.08%
  Testing:  Loss: 0.8693, Accuracy: 70.34%
Epoch [810/1500]
  Training: Loss: 0.5564, Accuracy: 81.08%
  Testing:  Loss: 0.8695, Accuracy: 70.20%
Epoch [820/1500]
  Training: Loss: 0.5543, Accuracy: 81.17%
  Testing:  Loss: 0.9483, Accuracy: 68.39%
Epoch [830/1500]
  Training: Loss: 0.5524, Accuracy: 81.25%
  Testin