In [1]:
import os
import urllib.request
import zipfile
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, random_split


torch.cuda.empty_cache()

# Step 1: Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

# Step 2: Download TinyImageNet dataset
url = "http://cs231n.stanford.edu/tiny-imagenet-200.zip"
data_dir = './data/tiny-imagenet-200'

if not os.path.exists(data_dir):
    print("Downloading TinyImageNet dataset...")
    urllib.request.urlretrieve(url, './tiny-imagenet-200.zip')

    # Extracting the dataset
    with zipfile.ZipFile('./tiny-imagenet-200.zip', 'r') as zip_ref:
        zip_ref.extractall('./data')
    print("Dataset downloaded and extracted.")

# Step 3: Define data transformations
input_size = 84
batch_size = 32

data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(input_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize(input_size),
        transforms.CenterCrop(input_size),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

# Step 4: Load datasets with class subset handling
train_dir = os.path.join(data_dir, 'train')

# TinyImageNet has no explicit train/val split; let's create one
full_dataset = datasets.ImageFolder(train_dir, transform=data_transforms['train'])

# Use only the first 80 classes for your use case
class_subset = 40
targets = torch.tensor([sample[1] for sample in full_dataset.samples])  # Extract class labels
indices = [i for i in range(len(targets)) if targets[i] < class_subset]  # Filter indices for class subset
subset_dataset = torch.utils.data.Subset(full_dataset, indices)  # Create subset dataset

# Split the subset into training and validation (80% train, 20% val)
train_size = int(0.8 * len(subset_dataset))
val_size = len(subset_dataset) - train_size
train_dataset, val_dataset = random_split(subset_dataset, [train_size, val_size])

# Data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

print(f'Train set size: {len(train_dataset)}, Validation set size: {len(val_dataset)}')

# Step 5: Define ResNet-50 model with WS + RPQ simulation
class WSResNet50WithRPQ(nn.Module):
    def __init__(self):
        super(WSResNet50WithRPQ, self).__init__()
        self.resnet = models.resnet50(pretrained=False)
        num_features = self.resnet.fc.in_features
        self.resnet.fc = nn.Linear(num_features, class_subset)  # Modify for 80 classes
        self.cached_weights = {}  # Cache for stationary weights
        self.mcache = {}  # Cache for RPQ outputs
        self.random_matrix = None  # Placeholder for RPQ random projection matrix

    def generate_signature(self, input_tensor):
        """Generate RPQ signature from input tensor."""
        input_vector = input_tensor.view(input_tensor.size(0), -1)  # Flatten input tensor
        flattened_size = input_vector.size(1)

        # Initialize random matrix dynamically if not already created or mismatched
        if self.random_matrix is None or self.random_matrix.size(0) != flattened_size:
            self.random_matrix = torch.randn(flattened_size, 512, device=input_tensor.device)

        # Perform random projection
        projected = torch.matmul(input_vector, self.random_matrix)  # Shape: (batch_size, 512)

        # Generate hashable RPQ signatures
        signatures = [tuple((row > 0).int().tolist()) for row in projected]
        return signatures

    def forward(self, x):
        for name, layer in self.resnet.named_children():
            if isinstance(layer, nn.Conv2d):
                # Generate RPQ signatures for the batch
                signatures = self.generate_signature(x)

                # Process each input in the batch
                outputs = []
                for i, signature in enumerate(signatures):
                    if signature in self.mcache:
                        outputs.append(self.mcache[signature])  # Cache hit
                    else:
                        # Cache miss: Compute the output
                        if name not in self.cached_weights:
                            self.cached_weights[name] = layer.weight.clone().detach().to(x.device)
                        output = nn.functional.conv2d(
                            x[i:i+1], self.cached_weights[name], bias=layer.bias,
                            stride=layer.stride, padding=layer.padding, dilation=layer.dilation, groups=layer.groups
                        )
                        self.mcache[signature] = output
                        outputs.append(output)

                # Combine outputs into a batch
                x = torch.cat(outputs, dim=0)
            elif name == "fc":
                x = torch.flatten(x, 1)  # Flatten before fc layer
                x = layer(x)
            else:
                x = layer(x)
        return x

model = WSResNet50WithRPQ().to(device)

# Step 6: Define loss function, optimizer, and scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

# Step 7: Training function with WS + RPQ profiling
def train_model_with_ws_and_rpq(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs=10):
    total_start_time = time.time()
    for epoch in range(num_epochs):
        print(f'Epoch {epoch + 1}/{num_epochs}')
        print('-' * 20)

        # Training phase
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        epoch_start_time = time.time()

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * inputs.size(0)
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

        epoch_duration = time.time() - epoch_start_time
        epoch_loss = running_loss / len(train_loader.dataset)
        epoch_acc = correct / total
        print(f'Train Loss: {epoch_loss:.4f}, Train Accuracy: {epoch_acc:.4f}')
        print(f'Epoch {epoch + 1} duration: {epoch_duration:.4f} seconds')

        # Validation phase
        print('Entering Val phase')
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)

                val_loss += loss.item() * inputs.size(0)
                _, preds = torch.max(outputs, 1)
                val_correct += (preds == labels).sum().item()
                val_total += labels.size(0)

        val_epoch_loss = val_loss / len(val_loader.dataset)
        val_epoch_acc = val_correct / val_total
        print(f'Val Loss: {val_epoch_loss:.4f}, Val Accuracy: {val_epoch_acc:.4f}')

        scheduler.step()

    total_duration = time.time() - total_start_time
    print(f'Total training time with WS + RPQ: {total_duration:.4f} seconds')
    return total_duration

# Step 8: Start training and measure time
total_training_time = train_model_with_ws_and_rpq(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs=10)

# Clock cycle profiling
cpu_clock_speed_ghz = 2.3  # Adjust based on your system
total_clock_cycles = total_training_time * cpu_clock_speed_ghz * 1e9
print(f"Estimated Total Clock Cycles with WS + RPQ: {total_clock_cycles:.2e} cycles")



Using device: cuda
Train set size: 16000, Validation set size: 4000


  f"The parameter '{pretrained_param}' is deprecated since 0.13 and may be removed in the future, "


Epoch 1/10
--------------------
Train Loss: 3.5310, Train Accuracy: 0.0984
Epoch 1 duration: 397.5417 seconds
Entering Val phase
Val Loss: 5.1698, Val Accuracy: 0.1400
Epoch 2/10
--------------------


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 8.00 GiB total capacity; 12.33 GiB already allocated; 0 bytes free; 14.31 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF