# Imports

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import json
import os

In [2]:
from tqdm import tqdm

import numpy as np
import keras
from tensorflow import keras
from keras.datasets import cifar10
from __future__ import print_function
from keras.models import Sequential
from keras.models import save_model, load_model
from keras.layers import Dense, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D

import keras.backend as K
K.clear_session()

# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score
# from model import Mamba, ModelArgs  # Import your custom Mamba implementation
# Assuming the model classes are defined in `model.py`
from model import ImageMamba, ModelArgs




# CUDA

In [3]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Sep_12_02:55:00_Pacific_Daylight_Time_2024
Cuda compilation tools, release 12.6, V12.6.77
Build cuda_12.6.r12.6/compiler.34841621_0


In [4]:
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

# Initialization

In [5]:
# Pick the compute device once: CUDA when PyTorch reports it usable, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [6]:
# Fetch the CIFAR-10 train/test splits through the Keras dataset helper
(X_train, Y_train), (X_test, Y_test) = cifar10.load_data()

# Reorder axes to (batch, channels, height, width) for PyTorch, cast to float32
# and divide by 255 (maps 0-255 pixel values into [0, 1])
X_train = X_train.transpose(0, 3, 1, 2).astype('float32') / 255.0
X_test = X_test.transpose(0, 3, 1, 2).astype('float32') / 255.0

# Copy images and labels into tensors; labels keep the shape returned by the
# dataset helper and are stored as int64 for the loss function
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
Y_train_tensor = torch.tensor(Y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
Y_test_tensor = torch.tensor(Y_test, dtype=torch.long)

# Pair images with labels
train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, Y_test_tensor)

# Mini-batch loaders: training batches are reshuffled every epoch, while the
# test set is always visited in the same order
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [7]:
# Human-readable names of the ten CIFAR-10 classes
# (order presumably matches the dataset's integer label ids 0-9 — verify before
# using it to decode predictions)
class_names = [
    'airplane',
    'automobile',
    'bird',
    'cat',
    'deer',
    'dog',
    'frog',
    'horse',
    'ship',
    'truck',
]


# CNN

In [8]:
class ComparableCNN(nn.Module):
    """Four-stage convolutional classifier with ten output classes.

    Each stage is a 3x3 convolution followed by batch normalisation and ReLU.
    The first two stages are each followed by 2x2 max pooling; the last two
    keep the spatial size. Features are then globally average-pooled and fed
    to a single linear layer.

    The original author's notes describe this network as a CNN counterpart to
    a 4-layer Mamba model; that model is not defined in this notebook.
    """

    def __init__(self):
        super().__init__()

        # Stage 1: 3 input channels -> 64 feature maps
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(num_features=64)

        # Stage 2: 64 -> 128 feature maps
        self.conv2 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(num_features=128)

        # Stages 3 and 4: width stays at 128 feature maps
        self.conv3 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(num_features=128)

        self.conv4 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding=1)
        self.bn4 = nn.BatchNorm2d(num_features=128)

        # Head: collapse each feature map to one value, then classify
        self.avg_pool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
        self.fc = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Return ``(logits, probabilities)`` for a batch of images.

        ``probabilities`` is the softmax of ``logits`` over the last dimension.
        """
        # Stages 1-2: conv -> batch norm -> ReLU, then halve height and width
        for conv, norm in ((self.conv1, self.bn1), (self.conv2, self.bn2)):
            x = F.max_pool2d(F.relu(norm(conv(x))), kernel_size=2)

        # Stages 3-4: conv -> batch norm -> ReLU without pooling
        for conv, norm in ((self.conv3, self.bn3), (self.conv4, self.bn4)):
            x = F.relu(norm(conv(x)))

        # Global average pooling, flattened to one feature vector per sample
        pooled = self.avg_pool(x)
        features = pooled.view(pooled.shape[0], -1)

        logits = self.fc(features)
        return logits, F.softmax(logits, dim=-1)

def load_training_metrics(metrics_path):
    """Read a training-metrics JSON file back into a dict.

    The per-epoch confidence entries (``epoch_train_confidences`` and
    ``epoch_test_confidences``) are converted from JSON lists into NumPy
    arrays, one array per epoch; every other entry is returned as parsed.

    Args:
        metrics_path: Path of the JSON file to read.

    Returns:
        The parsed metrics dict with the two per-epoch confidence entries
        replaced by lists of ``np.ndarray``.

    Raises:
        KeyError: If either per-epoch confidence key is missing from the file.
    """
    with open(metrics_path, 'r') as handle:
        loaded = json.load(handle)

    # JSON has no array type, so rebuild the arrays epoch by epoch
    for key in ('epoch_train_confidences', 'epoch_test_confidences'):
        loaded[key] = [np.array(epoch_values) for epoch_values in loaded[key]]

    return loaded

def save_cnn_checkpoint(model, optimizer, epoch, metrics, path):
    """Write a training checkpoint to ``path`` with ``torch.save``.

    Args:
        model: Module whose ``state_dict()`` is stored.
        optimizer: Optimizer whose ``state_dict()`` is stored.
        epoch: Epoch value recorded under the ``'epoch'`` key, stored as given.
        metrics: Metrics object stored as-is under the ``'metrics'`` key.
        path: Destination file path.

    Prints a confirmation line once the file has been written.
    """
    state = dict(
        epoch=epoch,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
        metrics=metrics,
    )
    torch.save(state, path)
    print(f'Checkpoint saved: {path}')

def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one full pass over ``loader`` and summarise it.

    The pass is a training pass (``train`` mode, gradients enabled, one
    optimizer step per batch) when ``optimizer`` is given, and an evaluation
    pass (``eval`` mode, gradients disabled) otherwise.

    Args:
        model: Module whose forward returns ``(logits, probabilities)``.
        loader: Iterable of ``(inputs, labels)`` batches that supports ``len()``.
        criterion: Loss callable applied to ``(logits, labels)``.
        device: Device each batch is moved to before the forward pass.
        optimizer: Optimizer stepped after each batch, or ``None`` to evaluate.

    Returns:
        Tuple ``(mean_batch_loss, accuracy_percent, mean_confidence,
        confidences)``. ``confidences`` holds one Python float per sample: the
        largest class probability the model assigned to that sample.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    training = optimizer is not None
    model.train(training)

    loss_sum = 0.0
    correct = 0
    seen = 0
    confidence_sum = 0.0
    confidences = []

    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs = inputs.to(device)
            # Flatten labels to 1-D. ``squeeze()`` would turn a batch holding a
            # single (1, 1) label into a 0-d tensor, which the loss rejects.
            labels = labels.to(device).reshape(-1)

            if training:
                optimizer.zero_grad()
            logits, probabilities = model(inputs)
            loss = criterion(logits, labels)
            if training:
                loss.backward()
                optimizer.step()

            # Statistics use the outputs computed before this batch's update
            _, predicted = torch.max(logits, 1)
            confidence, _ = torch.max(probabilities.detach(), 1)
            correct += (predicted == labels).sum().item()
            seen += labels.size(0)
            confidence_sum += confidence.sum().item()
            # Plain Python floats keep NumPy scalar objects out of the returned
            # metrics and therefore out of pickled checkpoints
            confidences.extend(confidence.cpu().tolist())
            loss_sum += loss.item()

    if seen == 0:
        raise ValueError('loader yielded no samples; cannot compute epoch metrics')

    # The loss is the mean of per-batch means, so a smaller final batch carries
    # the same weight as a full one
    mean_loss = loss_sum / len(loader)
    accuracy = (correct / seen) * 100
    return mean_loss, accuracy, confidence_sum / seen, confidences


def train_evaluate_cnn(model, train_loader, test_loader, num_epochs=100, device='cuda',
                      checkpoint_dir='cnn_checkpoints'):
    """Train ``model`` with Adam and evaluate it on ``test_loader`` after every epoch.

    After each epoch the full metric history, rounded to four decimals, is
    rewritten to ``<checkpoint_dir>/training_metrics.json``. Every 50th epoch a
    checkpoint is saved through ``save_cnn_checkpoint``. Progress is printed
    every 10 epochs.

    Args:
        model: Module returning ``(logits, probabilities)``. It is not moved
            here, so it must already live on ``device``.
        train_loader: Loader of ``(inputs, labels)`` training batches. Labels
            may be shaped ``(N,)`` or ``(N, 1)``.
        test_loader: Loader of ``(inputs, labels)`` evaluation batches.
        num_epochs: Number of passes over ``train_loader``.
        device: Device the batches are moved to.
        checkpoint_dir: Directory for the metrics file and checkpoints; it is
            created if missing.

    Returns:
        Dict of unrounded per-epoch lists: ``train_losses``, ``test_losses``,
        ``train_accuracies``, ``test_accuracies`` (percent),
        ``train_confidences``, ``test_confidences`` (mean top-class
        probability), and ``epoch_train_confidences`` /
        ``epoch_test_confidences`` (one list of per-sample Python floats per
        epoch).

    Raises:
        ValueError: If either loader yields no samples.
    """
    os.makedirs(checkpoint_dir, exist_ok=True)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    metrics_path = os.path.join(checkpoint_dir, 'training_metrics.json')

    metrics = {
        'train_losses': [], 'test_losses': [],
        'train_accuracies': [], 'test_accuracies': [],
        'train_confidences': [], 'test_confidences': [],
        'epoch_train_confidences': [], 'epoch_test_confidences': []
    }
    # Rounded mirror of ``metrics`` for the JSON file. Growing it one epoch at a
    # time avoids re-rounding the whole history on every write.
    json_metrics = {key: [] for key in metrics}

    for epoch in range(num_epochs):
        train_loss, train_accuracy, train_avg_confidence, train_epoch_confidences = _run_epoch(
            model, train_loader, criterion, device, optimizer=optimizer)
        test_loss, test_accuracy, test_avg_confidence, test_epoch_confidences = _run_epoch(
            model, test_loader, criterion, device)

        # Store metrics (full precision in memory, 4 decimals for the JSON copy)
        epoch_record = {
            'train_losses': train_loss,
            'test_losses': test_loss,
            'train_accuracies': train_accuracy,
            'test_accuracies': test_accuracy,
            'train_confidences': train_avg_confidence,
            'test_confidences': test_avg_confidence,
            'epoch_train_confidences': train_epoch_confidences,
            'epoch_test_confidences': test_epoch_confidences,
        }
        for key, value in epoch_record.items():
            metrics[key].append(value)
            if isinstance(value, list):
                json_metrics[key].append([round(v, 4) for v in value])
            else:
                json_metrics[key].append(round(float(value), 4))
        json_metrics['current_epoch'] = epoch + 1

        # Rewrite the metrics file after each epoch so progress survives interruption
        with open(metrics_path, 'w') as f:
            json.dump(json_metrics, f, indent=4)

        # Print progress every 10 epochs (the epoch number shown is zero-based)
        if epoch % 10 == 0:
            print(f'Epoch [{epoch}/{num_epochs}]')
            print(f'  Training: Loss: {train_loss:.4f}, Accuracy: {train_accuracy:.2f}%, Confidence: {train_avg_confidence:.4f}')
            print(f'  Testing:  Loss: {test_loss:.4f}, Accuracy: {test_accuracy:.2f}%, Confidence: {test_avg_confidence:.4f}')

        # Save checkpoint every 50 epochs. The file name uses the one-based
        # epoch count, while the stored 'epoch' field is the zero-based index.
        if (epoch + 1) % 50 == 0:
            checkpoint_path = os.path.join(checkpoint_dir, f'cnn_model_epoch_{epoch+1}.pt')
            save_cnn_checkpoint(model, optimizer, epoch, metrics, checkpoint_path)

    return metrics

# Run training

In [None]:
# Baseline CNN: pick the output folder, then build the network on the selected device
checkpoint_dir = 'cnn_checkpoints'
cnn_model = ComparableCNN().to(device)

# Sanity check: report where the weights live and whether CUDA is usable
print(f"Model device: {next(cnn_model.parameters()).device}")
print(f"Is CUDA available? {torch.cuda.is_available()}")

# Train for 400 epochs (the original note says this matches the Mamba run);
# the metrics file and periodic checkpoints are written to `checkpoint_dir`
metrics = train_evaluate_cnn(
    cnn_model,
    train_loader,
    test_loader,
    num_epochs=400,
    device=device,
    checkpoint_dir=checkpoint_dir,
)

In [9]:
class SmallerComparableCNN(nn.Module):
    """Two-stage convolutional classifier with ten output classes.

    A lighter variant of ``ComparableCNN``: only two conv -> batch norm ->
    ReLU -> 2x2 max-pool stages, with 32 and 64 feature maps, followed by
    global average pooling and one linear layer.
    """

    def __init__(self):
        super().__init__()

        # Stage 1: 3 input channels -> 32 feature maps
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(num_features=32)

        # Stage 2: 32 -> 64 feature maps
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(num_features=64)

        # Head: the linear layer's input width equals the last conv's channel count
        self.avg_pool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
        self.fc = nn.Linear(in_features=64, out_features=10)

    def forward(self, x):
        """Return ``(logits, probabilities)`` for a batch of images.

        ``probabilities`` is the softmax of ``logits`` over the last dimension.
        """
        # Both stages: conv -> batch norm -> ReLU, then halve height and width
        for conv, norm in ((self.conv1, self.bn1), (self.conv2, self.bn2)):
            x = F.max_pool2d(F.relu(norm(conv(x))), kernel_size=2)

        # Global average pooling, flattened to one feature vector per sample
        pooled = self.avg_pool(x)
        features = pooled.view(pooled.shape[0], -1)

        logits = self.fc(features)
        return logits, F.softmax(logits, dim=-1)

# Smaller CNN: use a separate output folder so the larger model's files are not overwritten
new_checkpoint_dir = 'smaller_cnn_checkpoints'
smaller_cnn_model = SmallerComparableCNN().to(device)

# Sanity check: report where the weights live and whether CUDA is usable
print(f"Model device: {next(smaller_cnn_model.parameters()).device}")
print(f"Is CUDA available? {torch.cuda.is_available()}")

# Train for 400 epochs; note that `metrics` is rebound to this run's history
metrics = train_evaluate_cnn(
    smaller_cnn_model,
    train_loader,
    test_loader,
    num_epochs=400,
    device=device,
    checkpoint_dir=new_checkpoint_dir,
)

Model device: cuda:0
Is CUDA available? True
Epoch [0/400]
  Training: Loss: 1.9716, Accuracy: 30.76%, Confidence: 0.2065
  Testing:  Loss: 1.8292, Accuracy: 38.22%, Confidence: 0.2355
Epoch [10/400]
  Training: Loss: 1.3993, Accuracy: 51.56%, Confidence: 0.4150
  Testing:  Loss: 1.4306, Accuracy: 48.93%, Confidence: 0.4302
Epoch [20/400]
  Training: Loss: 1.2921, Accuracy: 55.39%, Confidence: 0.4616
  Testing:  Loss: 1.3186, Accuracy: 53.02%, Confidence: 0.4654
Epoch [30/400]
  Training: Loss: 1.2209, Accuracy: 57.70%, Confidence: 0.4916
  Testing:  Loss: 1.2365, Accuracy: 57.20%, Confidence: 0.4861
Epoch [40/400]
  Training: Loss: 1.1728, Accuracy: 59.38%, Confidence: 0.5115
  Testing:  Loss: 1.1988, Accuracy: 57.94%, Confidence: 0.5187
Checkpoint saved: smaller_cnn_checkpoints\cnn_model_epoch_50.pt
Epoch [50/400]
  Training: Loss: 1.1275, Accuracy: 61.03%, Confidence: 0.5286
  Testing:  Loss: 1.1371, Accuracy: 60.48%, Confidence: 0.5240
Epoch [60/400]
  Training: Loss: 1.0950, Accur