In [2]:
#!pip install pytorch-lightning


In [None]:
import torch
print(torch.__version__)

2.4.1+cu121


In [None]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST

To ensure that CuDNN is being used, you can verify by checking if PyTorch is built with CuDNN support and if it is utilizing GPU acceleration properly:Should return True if CuDNN is enabled

In [None]:
print(torch.backends.cudnn.enabled)

True


1. native Pytorch With NN  module

In [None]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from torchvision.datasets import MNIST

# Define the neural network using standard PyTorch
class SimpleNet(nn.Module):
    def __init__(self, input_size=784, hidden_size=500, num_classes=10):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()  # CuDNN-optimized ReLU
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

def train(model, device, train_loader, criterion, optimizer):
    model.train()
    for batch_idx, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)
        images = images.view(images.size(0), -1)  # Flatten images

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (batch_idx + 1) % 100 == 0:
            print(f'Batch [{batch_idx + 1}/{len(train_loader)}], Loss: {loss.item():.4f}')

def test(model, device, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            images = images.view(images.size(0), -1)  # Flatten images
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f'Accuracy of the model on the test images: {accuracy:.2f}%')

def main():
    # Hyperparameters
    input_size = 784  # 28x28 images
    hidden_size = 500
    num_classes = 10
    batch_size = 100
    learning_rate = 0.001
    num_epochs = 10

    # Transformations
    transform = transforms.Compose([
        transforms.ToTensor(),
    ])

    # MNIST datasets
    train_dataset = MNIST(root='./data', train=True, transform=transform, download=True)
    test_dataset = MNIST(root='./data', train=False, transform=transform, download=True)

    # Data loaders
    num_workers = 2  # Adjust based on your system
    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    # Initialize the network
    model = SimpleNet(input_size, hidden_size, num_classes)

    # Use GPU if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Start timing for training
    training_start_time = time.time()

    # Train the model
    for epoch in range(num_epochs):
        print(f'\nEpoch [{epoch + 1}/{num_epochs}]')
        train(model, device, train_loader, criterion, optimizer)

    # End timing for training
    training_time = time.time() - training_start_time
    print(f'\nTraining Time: {training_time:.2f} seconds')

    # Start timing for testing
    testing_start_time = time.time()

    # Test the model
    test(model, device, test_loader)

    # End timing for testing
    testing_time = time.time() - testing_start_time
    print(f'Testing Time: {testing_time:.2f} seconds')

if __name__ == '__main__':
    main()



Epoch [1/10]
Batch [100/600], Loss: 0.3195
Batch [200/600], Loss: 0.1786
Batch [300/600], Loss: 0.2914
Batch [400/600], Loss: 0.1957
Batch [500/600], Loss: 0.1359
Batch [600/600], Loss: 0.2207

Epoch [2/10]
Batch [100/600], Loss: 0.1524
Batch [200/600], Loss: 0.1678
Batch [300/600], Loss: 0.2206
Batch [400/600], Loss: 0.2007
Batch [500/600], Loss: 0.1100
Batch [600/600], Loss: 0.0926

Epoch [3/10]
Batch [100/600], Loss: 0.0396
Batch [200/600], Loss: 0.1530
Batch [300/600], Loss: 0.0622
Batch [400/600], Loss: 0.0476
Batch [500/600], Loss: 0.2064
Batch [600/600], Loss: 0.0775

Epoch [4/10]
Batch [100/600], Loss: 0.0274
Batch [200/600], Loss: 0.0543
Batch [300/600], Loss: 0.0312
Batch [400/600], Loss: 0.0440
Batch [500/600], Loss: 0.0672
Batch [600/600], Loss: 0.1565

Epoch [5/10]
Batch [100/600], Loss: 0.0488
Batch [200/600], Loss: 0.0743
Batch [300/600], Loss: 0.1070
Batch [400/600], Loss: 0.0355
Batch [500/600], Loss: 0.0488
Batch [600/600], Loss: 0.0613

Epoch [6/10]
Batch [100/600],

2.Native lightening , with CuDNN Optimized ReLU:just using nn.ReLU()

In [None]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST

# Define the neural network using PyTorch Lightning
class SimpleNet(pl.LightningModule):
    def __init__(self, input_size=784, hidden_size=500, num_classes=10, learning_rate=0.001):
        super(SimpleNet, self).__init__()
        self.save_hyperparameters()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()  # Using CuDNN-optimized ReLU from PyTorch
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.learning_rate = learning_rate
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

    def training_step(self, batch, batch_idx):
        images, labels = batch
        images = images.view(images.size(0), -1)  # Flatten images
        outputs = self(images)
        loss = self.criterion(outputs, labels)
        # Log training loss
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        images, labels = batch
        images = images.view(images.size(0), -1)  # Flatten images
        outputs = self(images)
        loss = self.criterion(outputs, labels)
        # Calculate accuracy
        preds = torch.argmax(outputs, dim=1)
        acc = (preds == labels).float().mean()
        # Log validation loss and accuracy
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)

    def test_step(self, batch, batch_idx):
        images, labels = batch
        images = images.view(images.size(0), -1)  # Flatten images
        outputs = self(images)
        # Calculate accuracy
        preds = torch.argmax(outputs, dim=1)
        acc = (preds == labels).float().mean()
        # Log test accuracy
        self.log('test_acc', acc, prog_bar=True)

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer

def main():
    # Hyperparameters
    input_size = 784  # 28x28 images
    hidden_size = 500
    num_classes = 10
    batch_size = 100
    learning_rate = 0.001
    num_epochs = 10

    # Transformations
    transform = transforms.Compose([
        transforms.ToTensor(),
    ])

    # MNIST datasets
    train_dataset = MNIST(root='./data', train=True, transform=transform, download=True)
    val_dataset = MNIST(root='./data', train=False, transform=transform, download=True)
    test_dataset = MNIST(root='./data', train=False, transform=transform, download=True)

    # Data loaders
    num_workers = 2  # Adjust based on your system
    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
    test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    # Initialize the network
    model = SimpleNet(input_size, hidden_size, num_classes, learning_rate)

    # Check if GPU is available
    accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'
    devices = [0] if torch.cuda.is_available() else None  # Use the first GPU if available

    # Initialize a trainer
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        accelerator=accelerator,
        devices=devices,
        log_every_n_steps=20,
    )

    # Start timing for training
    training_start_time = time.time()

    # Train the model
    trainer.fit(model, train_loader, val_loader)

    # End timing for training
    training_time = time.time() - training_start_time
    print(f'\nTraining Time: {training_time:.2f} seconds')

    # Start timing for testing
    testing_start_time = time.time()

    # Test the model
    trainer.test(model, dataloaders=test_loader)

    # End timing for testing
    testing_time = time.time() - testing_start_time
    print(f'Testing Time: {testing_time:.2f} seconds')

if __name__ == '__main__':
    main()


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | fc1       | Linear           | 392 K  | train
1 | relu      | ReLU             | 0      | train
2 | fc2       | Linear           | 5.0 K  | train
3 | criterion | CrossEntropyLoss | 0      | train
-------------------------------------------------------
397 K     Trainable params
0         Non-trainable params
397 K     Total params
1.590     Total estimated model params size (MB)
4         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



Training Time: 139.27 seconds


Testing: |          | 0/? [00:00<?, ?it/s]

Testing Time: 1.64 seconds


3.	Native lightening , with CuDNN Optimized ReLU:
And precision='16-mixed'


In [None]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, DeviceStatsMonitor

# Define the neural network using PyTorch Lightning
class SimpleNet(pl.LightningModule):
    def __init__(self, input_size=784, hidden_size=500, num_classes=10, learning_rate=0.001):
        super(SimpleNet, self).__init__()
        self.save_hyperparameters()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()  # Using CuDNN-optimized ReLU from PyTorch
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.learning_rate = learning_rate
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

    def training_step(self, batch, batch_idx):
        images, labels = batch
        images = images.view(images.size(0), -1)  # Flatten images
        outputs = self(images)
        loss = self.criterion(outputs, labels)
        # Log training loss
        self.log('train_loss', loss, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        images, labels = batch
        images = images.view(images.size(0), -1)  # Flatten images
        outputs = self(images)
        loss = self.criterion(outputs, labels)
        # Calculate accuracy
        preds = torch.argmax(outputs, dim=1)
        acc = (preds == labels).float().mean()
        # Log validation loss and accuracy
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)

    def test_step(self, batch, batch_idx):
        images, labels = batch
        images = images.view(images.size(0), -1)  # Flatten images
        outputs = self(images)
        # Calculate accuracy
        preds = torch.argmax(outputs, dim=1)
        acc = (preds == labels).float().mean()
        # Log test accuracy
        self.log('test_acc', acc, prog_bar=True)

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=self.learning_rate)
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
        return [optimizer], [scheduler]

def main():
    # Hyperparameters
    input_size = 784  # 28x28 images
    hidden_size = 500
    num_classes = 10
    batch_size = 256  # Increased batch size
    learning_rate = 0.001
    num_epochs = 10

    # Transformations
    transform = transforms.Compose([transforms.ToTensor()])

    # MNIST datasets
    train_dataset = MNIST(root='./data', train=True, transform=transform, download=True)
    val_dataset = MNIST(root='./data', train=False, transform=transform, download=True)
    test_dataset = MNIST(root='./data', train=False, transform=transform, download=True)

    # Data loaders
    num_workers = 2  # Reduced number of workers to prevent warning
    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, persistent_workers=True)
    val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, persistent_workers=True)
    test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    # Initialize the network
    model = SimpleNet(input_size, hidden_size, num_classes, learning_rate)

    # Check if GPU is available
    accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'
    devices = [0] if torch.cuda.is_available() else None

    # Initialize a trainer
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        accelerator=accelerator,
        devices=devices,
        log_every_n_steps=20,
        precision='16-mixed',  # Updated precision
        callbacks=[EarlyStopping(monitor='val_loss', patience=3), DeviceStatsMonitor()],
    )

    # Start timing for training
    training_start_time = time.time()

    # Train the model
    trainer.fit(model, train_loader, val_loader)

    # End timing for training
    training_time = time.time() - training_start_time
    print(f'\nTraining Time: {training_time:.2f} seconds')

    # Start timing for testing
    testing_start_time = time.time()

    # Test the model
    trainer.test(model, dataloaders=test_loader)

    # End timing for testing
    testing_time = time.time() - testing_start_time
    print(f'Testing Time: {testing_time:.2f} seconds')

if __name__ == '__main__':
    main()


INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | fc1       | Linear           | 392 K  | train
1 | relu      | ReLU             | 0      | train
2 | fc2       | Linear           | 5.0 K  | train
3 | criterion | CrossEntropyLoss | 0      | train
-------------------------------------------------------
397 K     Trainable params
0         Non-trainable params
397 K     Total params
1.590     Total estimated model params size (MB)
4         Modules in train mode
0         Mod

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



Training Time: 108.51 seconds


Testing: |          | 0/? [00:00<?, ?it/s]

Testing Time: 1.57 seconds


In [None]:
!pip install triton

Collecting triton
  Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (209.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.4/209.4 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: triton
Successfully installed triton-3.0.0


4.Native Triton with Custom Triton Kernel , Triton ReLU function., neural network using PyTorch nn_module/native  

In [None]:
import time  # Import time module for execution time measurement
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import triton
import triton.language as tl

# Triton ReLU kernel
@triton.jit
def relu_kernel(input_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    # Obtain program ID
    pid = tl.program_id(axis=0)
    # Calculate the starting index for this program
    block_start = pid * BLOCK_SIZE
    # Create an offset for each element within the block
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    # Create a mask for out-of-bounds indices
    mask = offsets < n_elements
    # Load input data
    x = tl.load(input_ptr + offsets, mask=mask)
    # Apply ReLU activation
    y = tl.maximum(x, 0.0)
    # Store the result
    tl.store(output_ptr + offsets, y, mask=mask)

def triton_relu(input_tensor):
    output = torch.empty_like(input_tensor)
    n_elements = input_tensor.numel()
    # Define the grid (number of blocks)
    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
    # Launch the kernel
    relu_kernel[grid](input_tensor, output, n_elements, BLOCK_SIZE=1024)
    return output

# Define the neural network
class SimpleNet(nn.Module):
    def __init__(self, input_size=784, hidden_size=500, num_classes=10):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        # We will replace the standard ReLU with our Triton ReLU
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        # Apply Triton ReLU
        out = triton_relu(out)
        out = self.fc2(out)
        return out

def main():
    # Device configuration
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Hyperparameters
    input_size = 784  # 28x28 images
    hidden_size = 500
    num_classes = 10
    num_epochs = 10
    batch_size = 100
    learning_rate = 0.001

    # MNIST dataset
    train_dataset = torchvision.datasets.MNIST(root='./data',
                                               train=True,
                                               transform=transforms.ToTensor(),
                                               download=True)

    test_dataset = torchvision.datasets.MNIST(root='./data',
                                              train=False,
                                              transform=transforms.ToTensor())

    # Data loaders
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True)

    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                              batch_size=batch_size,
                                              shuffle=False)

    # Initialize the network
    model = SimpleNet(input_size, hidden_size, num_classes).to(device)

    # Loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Start timing for training
    training_start_time = time.time()

    # Training loop
    total_step = len(train_loader)
    for epoch in range(num_epochs):
        for i, (images, labels) in enumerate(train_loader):
            # Flatten images to match input size
            images = images.reshape(-1, 28*28).to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Print training information
            if (i+1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], '
                      f'Step [{i+1}/{total_step}], '
                      f'Loss: {loss.item():.4f}')

    # End timing for training
    training_time = time.time() - training_start_time
    print(f'\nTraining Time: {training_time:.2f} seconds')

    # Start timing for testing
    testing_start_time = time.time()

    # Switch to evaluation mode
    model.eval()

    # Disable gradient calculation
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in test_loader:
            images = images.reshape(-1, 28*28).to(device)
            labels = labels.to(device)
            outputs = model(images)
            # Predicted class is the one with highest score
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        print(f'Test Accuracy of the model on the 10000 test images: '
              f'{100 * correct / total:.2f}%')

    # End timing for testing
    testing_time = time.time() - testing_start_time
    print(f'Testing Time: {testing_time:.2f} seconds')

if __name__ == '__main__':
    main()


Epoch [1/10], Step [100/600], Loss: 1.5435
Epoch [1/10], Step [200/600], Loss: 1.1367
Epoch [1/10], Step [300/600], Loss: 0.9468
Epoch [1/10], Step [400/600], Loss: 0.8825
Epoch [1/10], Step [500/600], Loss: 0.7548
Epoch [1/10], Step [600/600], Loss: 0.6175
Epoch [2/10], Step [100/600], Loss: 0.6881
Epoch [2/10], Step [200/600], Loss: 0.6182
Epoch [2/10], Step [300/600], Loss: 0.5048
Epoch [2/10], Step [400/600], Loss: 0.6491
Epoch [2/10], Step [500/600], Loss: 0.4639
Epoch [2/10], Step [600/600], Loss: 0.3004
Epoch [3/10], Step [100/600], Loss: 0.4010
Epoch [3/10], Step [200/600], Loss: 0.4157
Epoch [3/10], Step [300/600], Loss: 0.3248
Epoch [3/10], Step [400/600], Loss: 0.4256
Epoch [3/10], Step [500/600], Loss: 0.3757
Epoch [3/10], Step [600/600], Loss: 0.3985
Epoch [4/10], Step [100/600], Loss: 0.3598
Epoch [4/10], Step [200/600], Loss: 0.2965
Epoch [4/10], Step [300/600], Loss: 0.3637
Epoch [4/10], Step [400/600], Loss: 0.2327
Epoch [4/10], Step [500/600], Loss: 0.3814
Epoch [4/10

5.	Native Triton with Custom Triton Kernel , Triton ReLU function., neural network using PyTorch nn_module/native   with old Autocast and GradScaler (Mixed presession from Triton)
T


In [None]:
import time  # Import time module for execution time measurement
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import triton
import triton.language as tl
from torch.cuda.amp import autocast, GradScaler

# Triton ReLU kernel
@triton.jit
def relu_kernel(input_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    # Obtain program ID
    pid = tl.program_id(axis=0)
    # Calculate the starting index for this program
    block_start = pid * BLOCK_SIZE
    # Create an offset for each element within the block
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    # Create a mask for out-of-bounds indices
    mask = offsets < n_elements
    # Load input data
    x = tl.load(input_ptr + offsets, mask=mask)
    # Apply ReLU activation
    y = tl.maximum(x, 0.0)
    # Store the result
    tl.store(output_ptr + offsets, y, mask=mask)

def triton_relu(input_tensor):
    output = torch.empty_like(input_tensor)
    n_elements = input_tensor.numel()
    # Define the grid (number of blocks)
    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
    # Launch the kernel
    relu_kernel[grid](input_tensor, output, n_elements, BLOCK_SIZE=1024)
    return output

# Define the neural network
class SimpleNet(nn.Module):
    def __init__(self, input_size=784, hidden_size=500, num_classes=10):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        # We will replace the standard ReLU with our Triton ReLU
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        # Apply Triton ReLU
        out = triton_relu(out)
        out = self.fc2(out)
        return out

def main():
    # Device configuration
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Hyperparameters
    input_size = 784  # 28x28 images
    hidden_size = 500
    num_classes = 10
    num_epochs = 10
    batch_size = 100
    learning_rate = 0.001
    num_workers = 2  # Number of workers for data loading

    # MNIST dataset
    train_dataset = torchvision.datasets.MNIST(root='./data',
                                               train=True,
                                               transform=transforms.ToTensor(),
                                               download=True)

    test_dataset = torchvision.datasets.MNIST(root='./data',
                                              train=False,
                                              transform=transforms.ToTensor())

    # Data loaders
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=num_workers)

    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              num_workers=num_workers)

    # Initialize the network
    model = SimpleNet(input_size, hidden_size, num_classes).to(device)

    # Loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Mixed Precision Scaler
    scaler = GradScaler()

    # Start timing for training
    training_start_time = time.time()

    # Training loop
    total_step = len(train_loader)
    for epoch in range(num_epochs):
        model.train()  # Set the model to training mode
        for i, (images, labels) in enumerate(train_loader):
            # Flatten images to match input size
            images = images.reshape(-1, 28*28).to(device)
            labels = labels.to(device)

            # Forward and backward pass with mixed precision
            with autocast():  # Enable mixed precision
                outputs = model(images)
                loss = criterion(outputs, labels)

            optimizer.zero_grad()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            # Print training information
            if (i+1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], '
                      f'Step [{i+1}/{total_step}], '
                      f'Loss: {loss.item():.4f}')

    # End timing for training
    training_time = time.time() - training_start_time
    print(f'\nTraining Time: {training_time:.2f} seconds')

    # Start timing for testing
    testing_start_time = time.time()

    # Switch to evaluation mode
    model.eval()

    # Disable gradient calculation
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in test_loader:
            images = images.reshape(-1, 28*28).to(device)
            labels = labels.to(device)

            # Mixed precision inference
            with autocast():
                outputs = model(images)
                # Predicted class is the one with highest score
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        print(f'Test Accuracy of the model on the 10000 test images: '
              f'{100 * correct / total:.2f}%')

    # End timing for testing
    testing_time = time.time() - testing_start_time
    print(f'Testing Time: {testing_time:.2f} seconds')

if __name__ == '__main__':
    main()


  scaler = GradScaler()
  with autocast():  # Enable mixed precision
  with autocast():  # Enable mixed precision


Epoch [1/10], Step [100/600], Loss: 1.6054
Epoch [1/10], Step [200/600], Loss: 1.1688
Epoch [1/10], Step [300/600], Loss: 1.0150
Epoch [1/10], Step [400/600], Loss: 0.7707
Epoch [1/10], Step [500/600], Loss: 0.6278
Epoch [1/10], Step [600/600], Loss: 0.7433
Epoch [2/10], Step [100/600], Loss: 0.6039
Epoch [2/10], Step [200/600], Loss: 0.5750
Epoch [2/10], Step [300/600], Loss: 0.6101
Epoch [2/10], Step [400/600], Loss: 0.5917
Epoch [2/10], Step [500/600], Loss: 0.6455
Epoch [2/10], Step [600/600], Loss: 0.4352
Epoch [3/10], Step [100/600], Loss: 0.3941
Epoch [3/10], Step [200/600], Loss: 0.4324
Epoch [3/10], Step [300/600], Loss: 0.4357
Epoch [3/10], Step [400/600], Loss: 0.4754
Epoch [3/10], Step [500/600], Loss: 0.3079
Epoch [3/10], Step [600/600], Loss: 0.2827
Epoch [4/10], Step [100/600], Loss: 0.3528
Epoch [4/10], Step [200/600], Loss: 0.2963
Epoch [4/10], Step [300/600], Loss: 0.5231
Epoch [4/10], Step [400/600], Loss: 0.5822
Epoch [4/10], Step [500/600], Loss: 0.3819
Epoch [4/10

  with autocast():


Test Accuracy of the model on the 10000 test images: 92.84%
Testing Time: 2.18 seconds


6. Model Description: Native Triton with Custom Triton Kernel , Triton ReLU function., neural network using PyTorch nn_module/native   with new  Autocast 'cuda' and GradScaler 'cuda'(Mixed presession from Triton)

In [None]:
import time  # Import time module for execution time measurement
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import triton
import triton.language as tl
from torch.amp import autocast, GradScaler

# Triton ReLU kernel
@triton.jit
def relu_kernel(input_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    # Obtain program ID
    pid = tl.program_id(axis=0)
    # Calculate the starting index for this program
    block_start = pid * BLOCK_SIZE
    # Create an offset for each element within the block
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    # Create a mask for out-of-bounds indices
    mask = offsets < n_elements
    # Load input data
    x = tl.load(input_ptr + offsets, mask=mask)
    # Apply ReLU activation
    y = tl.maximum(x, 0.0)
    # Store the result
    tl.store(output_ptr + offsets, y, mask=mask)

def triton_relu(input_tensor):
    output = torch.empty_like(input_tensor)
    n_elements = input_tensor.numel()
    # Define the grid (number of blocks)
    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
    # Launch the kernel
    relu_kernel[grid](input_tensor, output, n_elements, BLOCK_SIZE=1024)
    return output

# Define the neural network
class SimpleNet(nn.Module):
    def __init__(self, input_size=784, hidden_size=500, num_classes=10):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        # We will replace the standard ReLU with our Triton ReLU
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        # Apply Triton ReLU
        out = triton_relu(out)
        out = self.fc2(out)
        return out

def main():
    # Device configuration
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Hyperparameters
    input_size = 784  # 28x28 images
    hidden_size = 500
    num_classes = 10
    num_epochs = 10
    batch_size = 100
    learning_rate = 0.001
    num_workers = 2  # Adjusted number of workers to prevent warnings

    # MNIST dataset
    train_dataset = torchvision.datasets.MNIST(root='./data',
                                               train=True,
                                               transform=transforms.ToTensor(),
                                               download=True)

    test_dataset = torchvision.datasets.MNIST(root='./data',
                                              train=False,
                                              transform=transforms.ToTensor())

    # Data loaders
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=num_workers)

    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              num_workers=num_workers)

    # Initialize the network
    model = SimpleNet(input_size, hidden_size, num_classes).to(device)

    # Loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Mixed Precision Scaler
    scaler = GradScaler('cuda')

    # Start timing for training
    training_start_time = time.time()

    # Training loop
    total_step = len(train_loader)
    for epoch in range(num_epochs):
        model.train()  # Set the model to training mode
        for i, (images, labels) in enumerate(train_loader):
            # Flatten images to match input size
            images = images.reshape(-1, 28*28).to(device)
            labels = labels.to(device)

            # Forward and backward pass with mixed precision
            with autocast('cuda'):  # Enable mixed precision
                outputs = model(images)
                loss = criterion(outputs, labels)

            optimizer.zero_grad()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            # Print training information
            if (i+1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], '
                      f'Step [{i+1}/{total_step}], '
                      f'Loss: {loss.item():.4f}')

    # End timing for training
    training_time = time.time() - training_start_time
    print(f'\nTraining Time: {training_time:.2f} seconds')

    # Start timing for testing
    testing_start_time = time.time()

    # Switch to evaluation mode
    model.eval()

    # Disable gradient calculation
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in test_loader:
            images = images.reshape(-1, 28*28).to(device)
            labels = labels.to(device)

            # Mixed precision inference
            with autocast('cuda'):
                outputs = model(images)
                # Predicted class is the one with highest score
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        print(f'Test Accuracy of the model on the 10000 test images: '
              f'{100 * correct / total:.2f}%')

    # End timing for testing
    testing_time = time.time() - testing_start_time
    print(f'Testing Time: {testing_time:.2f} seconds')

if __name__ == '__main__':
    main()


Epoch [1/10], Step [100/600], Loss: 1.5507
Epoch [1/10], Step [200/600], Loss: 1.1084
Epoch [1/10], Step [300/600], Loss: 0.9852
Epoch [1/10], Step [400/600], Loss: 0.7049
Epoch [1/10], Step [500/600], Loss: 0.7044
Epoch [1/10], Step [600/600], Loss: 0.5087
Epoch [2/10], Step [100/600], Loss: 0.6690
Epoch [2/10], Step [200/600], Loss: 0.5910
Epoch [2/10], Step [300/600], Loss: 0.5497
Epoch [2/10], Step [400/600], Loss: 0.5150
Epoch [2/10], Step [500/600], Loss: 0.5014
Epoch [2/10], Step [600/600], Loss: 0.3953
Epoch [3/10], Step [100/600], Loss: 0.3669
Epoch [3/10], Step [200/600], Loss: 0.4202
Epoch [3/10], Step [300/600], Loss: 0.4898
Epoch [3/10], Step [400/600], Loss: 0.4725
Epoch [3/10], Step [500/600], Loss: 0.3369
Epoch [3/10], Step [600/600], Loss: 0.4195
Epoch [4/10], Step [100/600], Loss: 0.3342
Epoch [4/10], Step [200/600], Loss: 0.3418
Epoch [4/10], Step [300/600], Loss: 0.5494
Epoch [4/10], Step [400/600], Loss: 0.3075
Epoch [4/10], Step [500/600], Loss: 0.3847
Epoch [4/10

7.	Native Triton with Custom Triton Kernel , Triton ReLU function., neural network using PyTorch pl.LightningModule

In [None]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import triton
import triton.language as tl
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST

# Triton ReLU kernel
@triton.jit
def relu_kernel(input_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    # Obtain program ID
    pid = tl.program_id(axis=0)
    # Calculate the starting index for this program
    block_start = pid * BLOCK_SIZE
    # Create an offset for each element within the block
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    # Create a mask for out-of-bounds indices
    mask = offsets < n_elements
    # Load input data
    x = tl.load(input_ptr + offsets, mask=mask)
    # Apply ReLU activation
    y = tl.maximum(x, 0.0)
    # Store the result
    tl.store(output_ptr + offsets, y, mask=mask)

def triton_relu(input_tensor):
    output = torch.empty_like(input_tensor)
    n_elements = input_tensor.numel()
    # Define the grid (number of blocks)
    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
    # Launch the kernel
    relu_kernel[grid](input_tensor, output, n_elements, BLOCK_SIZE=1024)
    return output

# Define the neural network using PyTorch Lightning
class SimpleNet(pl.LightningModule):
    def __init__(self, input_size=784, hidden_size=500, num_classes=10, learning_rate=0.001):
        super(SimpleNet, self).__init__()
        self.save_hyperparameters()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.learning_rate = learning_rate
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x):
        out = self.fc1(x)
        # Apply Triton ReLU
        out = triton_relu(out)
        out = self.fc2(out)
        return out

    def training_step(self, batch, batch_idx):
        images, labels = batch
        images = images.view(images.size(0), -1)  # Flatten images
        outputs = self(images)
        loss = self.criterion(outputs, labels)
        # Log training loss
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        images, labels = batch
        images = images.view(images.size(0), -1)  # Flatten images
        outputs = self(images)
        loss = self.criterion(outputs, labels)
        # Calculate accuracy
        preds = torch.argmax(outputs, dim=1)
        acc = (preds == labels).float().mean()
        # Log validation loss and accuracy
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)

    def test_step(self, batch, batch_idx):
        images, labels = batch
        images = images.view(images.size(0), -1)  # Flatten images
        outputs = self(images)
        # Calculate accuracy
        preds = torch.argmax(outputs, dim=1)
        acc = (preds == labels).float().mean()
        # Log test accuracy
        self.log('test_acc', acc, prog_bar=True)

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer

def main():
    # Hyperparameters
    input_size = 784  # 28x28 images
    hidden_size = 500
    num_classes = 10
    batch_size = 100
    learning_rate = 0.001
    num_epochs = 10

    # Transformations
    transform = transforms.Compose([
        transforms.ToTensor(),
    ])

    # MNIST datasets
    train_dataset = MNIST(root='./data', train=True, transform=transform, download=True)
    val_dataset = MNIST(root='./data', train=False, transform=transform, download=True)
    test_dataset = MNIST(root='./data', train=False, transform=transform, download=True)

    # Data loaders
    num_workers = 2  # Adjust based on your system
    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
    test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    # Initialize the network
    model = SimpleNet(input_size, hidden_size, num_classes, learning_rate)

    # Check if GPU is available
    accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'
    devices = [0] if torch.cuda.is_available() else None  # Use the first GPU if available

    # Initialize a trainer
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        accelerator=accelerator,
        devices=devices,
        log_every_n_steps=20,
    )

    # Start timing for training
    training_start_time = time.time()

    # Train the model
    trainer.fit(model, train_loader, val_loader)

    # End timing for training
    training_time = time.time() - training_start_time
    print(f'\nTraining Time: {training_time:.2f} seconds')

    # Start timing for testing
    testing_start_time = time.time()

    # Test the model
    trainer.test(model, dataloaders=test_loader)

    # End timing for testing
    testing_time = time.time() - testing_start_time
    print(f'Testing Time: {testing_time:.2f} seconds')

if __name__ == '__main__':
    main()


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | fc1       | Linear           | 392 K  | train
1 | fc2       | Linear           | 5.0 K  | train
2 | criterion | CrossEntropyLoss | 0      | train
-------------------------------------------------------
397 K     Trainable params
0         Non-trainable params
397 K     Total params
1.590     Total estimated model params size (MB)
3         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



Training Time: 143.75 seconds


Testing: |          | 0/? [00:00<?, ?it/s]

Testing Time: 1.70 seconds


8.	Native Triton with Custom Triton Kernel , Triton ReLU function., neural network using PyTorch pl.LightningModule, num_workers , Lightening precision='16-mixed'


10.	Model name : Native pytorch  with  Triton language & new Mixed precision  & number of workers 4


In [None]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import triton
import triton.language as tl
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
from torch.amp import autocast

# Triton ReLU kernel
@triton.jit
def relu_kernel(input_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)
    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(input_ptr + offsets, mask=mask)
    y = tl.maximum(x, 0.0)
    tl.store(output_ptr + offsets, y, mask=mask)

def triton_relu(input_tensor):
    output = torch.empty_like(input_tensor)
    n_elements = input_tensor.numel()
    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
    relu_kernel[grid](input_tensor, output, n_elements, BLOCK_SIZE=1024)
    return output

# Define the neural network using PyTorch Lightning
class SimpleNet(pl.LightningModule):
    def __init__(self, input_size=784, hidden_size=500, num_classes=10, learning_rate=0.001):
        super(SimpleNet, self).__init__()
        self.save_hyperparameters()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.learning_rate = learning_rate
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x):
        out = self.fc1(x)
        out = triton_relu(out)  # Use Triton ReLU
        out = self.fc2(out)
        return out

    def training_step(self, batch, batch_idx):
        images, labels = batch
        images = images.view(images.size(0), -1)  # Flatten images
        # Forward pass with mixed precision
        with autocast(device_type='cuda'):
            outputs = self(images)
            loss = self.criterion(outputs, labels)

        # Log training loss
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        images, labels = batch
        images = images.view(images.size(0), -1)  # Flatten images
        outputs = self(images)
        loss = self.criterion(outputs, labels)
        # Calculate accuracy
        preds = torch.argmax(outputs, dim=1)
        acc = (preds == labels).float().mean()
        # Log validation loss and accuracy
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)

    def test_step(self, batch, batch_idx):
        images, labels = batch
        images = images.view(images.size(0), -1)  # Flatten images
        outputs = self(images)
        # Calculate accuracy
        preds = torch.argmax(outputs, dim=1)
        acc = (preds == labels).float().mean()
        # Log test accuracy
        self.log('test_acc', acc, prog_bar=True)

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer

def main():
    # Hyperparameters
    input_size = 784  # 28x28 images
    hidden_size = 500
    num_classes = 10
    batch_size = 100
    learning_rate = 0.001
    num_epochs = 10
    num_workers = 2  # Adjust number of workers for optimal performance

    # Transformations
    transform = transforms.Compose([transforms.ToTensor()])

    # MNIST datasets
    train_dataset = MNIST(root='./data', train=True, transform=transform, download=True)
    val_dataset = MNIST(root='./data', train=False, transform=transform, download=True)
    test_dataset = MNIST(root='./data', train=False, transform=transform, download=True)

    # Data loaders
    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
    test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    # Initialize the network
    model = SimpleNet(input_size, hidden_size, num_classes, learning_rate)

    # Check if GPU is available
    accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'
    devices = [0] if torch.cuda.is_available() else None  # Use the first GPU if available

    # Initialize a trainer with mixed precision and GPU acceleration
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        accelerator=accelerator,
        devices=devices,
        precision='16-mixed',  # Enable mixed precision
        log_every_n_steps=20,
    )

    # Start timing for training
    training_start_time = time.time()

    # Train the model
    trainer.fit(model, train_loader, val_loader)

    # End timing for training
    training_time = time.time() - training_start_time
    print(f'\nTraining Time: {training_time:.2f} seconds')

    # Start timing for testing
    testing_start_time = time.time()

    # Test the model
    trainer.test(model, dataloaders=test_loader)

    # End timing for testing
    testing_time = time.time() - testing_start_time
    print(f'Testing Time: {testing_time:.2f} seconds')

if __name__ == '__main__':
    main()


INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | fc1       | Linear           | 392 K  | train
1 | fc2       | Linear           | 5.0 K  | train
2 | criterion | CrossEntropyLoss | 0      | train
-------------------------------------------------------
397 K     Trainable params
0         Non-trainable params
397 K     Total params
1.590     Total estimated model params size (MB)
3         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



Training Time: 141.71 seconds


Testing: |          | 0/? [00:00<?, ?it/s]

Testing Time: 1.70 seconds


9.	Native Triton with Custom Triton Kernel , Triton ReLU function., neural network using PyTorch pl.LightningModule, num_workers , Forward pass with mixed precision with triton and
Backward pass with Lightening precision='16-mixed'


In [None]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import triton
import triton.language as tl
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
from torch.amp import autocast

# Triton ReLU kernel
@triton.jit
def relu_kernel(input_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)
    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(input_ptr + offsets, mask=mask)
    y = tl.maximum(x, 0.0)
    tl.store(output_ptr + offsets, y, mask=mask)

def triton_relu(input_tensor):
    output = torch.empty_like(input_tensor)
    n_elements = input_tensor.numel()
    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
    relu_kernel[grid](input_tensor, output, n_elements, BLOCK_SIZE=1024)
    return output

# Define the neural network using PyTorch Lightning
class SimpleNet(pl.LightningModule):
    def __init__(self, input_size=784, hidden_size=500, num_classes=10, learning_rate=0.001):
        super(SimpleNet, self).__init__()
        self.save_hyperparameters()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.learning_rate = learning_rate
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x):
        out = self.fc1(x)
        out = triton_relu(out)  # Use Triton ReLU
        out = self.fc2(out)
        return out

    def training_step(self, batch, batch_idx):
        images, labels = batch
        images = images.view(images.size(0), -1)  # Flatten images

        # Forward pass with mixed precision
        with autocast(device_type='cuda'):
            outputs = self(images)
            loss = self.criterion(outputs, labels)

        # Log training loss
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        images, labels = batch
        images = images.view(images.size(0), -1)  # Flatten images

        # Forward pass with mixed precision
        with autocast(device_type='cuda'):
            outputs = self(images)
            loss = self.criterion(outputs, labels)

        # Calculate accuracy
        preds = torch.argmax(outputs, dim=1)
        acc = (preds == labels).float().mean()

        # Log validation loss and accuracy
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)

    def test_step(self, batch, batch_idx):
        images, labels = batch
        images = images.view(images.size(0), -1)  # Flatten images

        # Forward pass with mixed precision
        with autocast(device_type='cuda'):
            outputs = self(images)

        # Calculate accuracy
        preds = torch.argmax(outputs, dim=1)
        acc = (preds == labels).float().mean()

        # Log test accuracy
        self.log('test_acc', acc, prog_bar=True)

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer

def main():
    # Hyperparameters
    input_size = 784  # 28x28 images
    hidden_size = 500
    num_classes = 10
    batch_size = 100
    learning_rate = 0.001
    num_epochs = 10
    num_workers = 2  # Adjust number of workers for optimal performance

    # Transformations
    transform = transforms.Compose([transforms.ToTensor()])

    # MNIST datasets
    train_dataset = MNIST(root='./data', train=True, transform=transform, download=True)
    val_dataset = MNIST(root='./data', train=False, transform=transform, download=True)
    test_dataset = MNIST(root='./data', train=False, transform=transform, download=True)

    # Data loaders
    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
    test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    # Initialize the network
    model = SimpleNet(input_size, hidden_size, num_classes, learning_rate)

    # Check if GPU is available
    accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'
    devices = [0] if torch.cuda.is_available() else None  # Use the first GPU if available

    # Initialize a trainer with mixed precision and GPU acceleration
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        accelerator=accelerator,
        devices=devices,
        precision='16-mixed',  # Enable mixed precision
        log_every_n_steps=20,
    )

    # Start timing for training
    training_start_time = time.time()

    # Train the model
    trainer.fit(model, train_loader, val_loader)

    # End timing for training
    training_time = time.time() - training_start_time
    print(f'\nTraining Time: {training_time:.2f} seconds')

    # Start timing for testing
    testing_start_time = time.time()

    # Test the model
    trainer.test(model, dataloaders=test_loader)

    # End timing for testing
    testing_time = time.time() - testing_start_time
    print(f'Testing Time: {testing_time:.2f} seconds')

if __name__ == '__main__':
    main()


INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | fc1       | Linear           | 392 K  | train
1 | fc2       | Linear           | 5.0 K  | train
2 | criterion | CrossEntropyLoss | 0      | train
-------------------------------------------------------
397 K     Trainable params
0         Non-trainable params
397 K     Total params
1.590     Total estimated model params size (MB)
3         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



Training Time: 134.68 seconds


Testing: |          | 0/? [00:00<?, ?it/s]

Testing Time: 1.58 seconds


In [None]:
import time  # Import time module for execution time measurement
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import triton
import triton.language as tl
from torch.amp import autocast, GradScaler

# Triton ReLU kernel
@triton.jit
def relu_kernel(input_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    # Obtain program ID
    pid = tl.program_id(axis=0)
    # Calculate the starting index for this program
    block_start = pid * BLOCK_SIZE
    # Create an offset for each element within the block
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    # Create a mask for out-of-bounds indices
    mask = offsets < n_elements
    # Load input data
    x = tl.load(input_ptr + offsets, mask=mask)
    # Apply ReLU activation
    y = tl.maximum(x, 0.0)
    # Store the result
    tl.store(output_ptr + offsets, y, mask=mask)

def triton_relu(input_tensor):
    output = torch.empty_like(input_tensor)
    n_elements = input_tensor.numel()
    # Define the grid (number of blocks)
    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
    # Launch the kernel
    relu_kernel[grid](input_tensor, output, n_elements, BLOCK_SIZE=1024)
    return output

# Define the neural network
class SimpleNet(nn.Module):
    def __init__(self, input_size=784, hidden_size=500, num_classes=10):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        # We will replace the standard ReLU with our Triton ReLU
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        # Apply Triton ReLU
        out = triton_relu(out)
        out = self.fc2(out)
        return out

def main():
    # Device configuration
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Hyperparameters
    input_size = 784  # 28x28 images
    hidden_size = 500
    num_classes = 10
    num_epochs = 10
    batch_size = 100
    learning_rate = 0.001
    num_workers = 4  # Adjusted number of workers to prevent warnings

    # MNIST dataset
    train_dataset = torchvision.datasets.MNIST(root='./data',
                                               train=True,
                                               transform=transforms.ToTensor(),
                                               download=True)

    test_dataset = torchvision.datasets.MNIST(root='./data',
                                              train=False,
                                              transform=transforms.ToTensor())

    # Data loaders
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=num_workers)

    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              num_workers=num_workers)

    # Initialize the network
    model = SimpleNet(input_size, hidden_size, num_classes).to(device)

    # Loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Mixed Precision Scaler
    scaler = GradScaler('cuda')

    # Start timing for training
    training_start_time = time.time()

    # Training loop
    total_step = len(train_loader)
    for epoch in range(num_epochs):
        model.train()  # Set the model to training mode
        for i, (images, labels) in enumerate(train_loader):
            # Flatten images to match input size
            images = images.reshape(-1, 28*28).to(device)
            labels = labels.to(device)

            # Forward and backward pass with mixed precision
            with autocast('cuda'):  # Enable mixed precision
                outputs = model(images)
                loss = criterion(outputs, labels)

            optimizer.zero_grad()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            # Print training information
            if (i+1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], '
                      f'Step [{i+1}/{total_step}], '
                      f'Loss: {loss.item():.4f}')

    # End timing for training
    training_time = time.time() - training_start_time
    print(f'\nTraining Time: {training_time:.2f} seconds')

    # Start timing for testing
    testing_start_time = time.time()

    # Switch to evaluation mode
    model.eval()

    # Disable gradient calculation
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in test_loader:
            images = images.reshape(-1, 28*28).to(device)
            labels = labels.to(device)

            # Mixed precision inference
            with autocast('cuda'):
                outputs = model(images)
                # Predicted class is the one with highest score
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        print(f'Test Accuracy of the model on the 10000 test images: '
              f'{100 * correct / total:.2f}%')

    # End timing for testing
    testing_time = time.time() - testing_start_time
    print(f'Testing Time: {testing_time:.2f} seconds')

if __name__ == '__main__':
    main()


Epoch [1/10], Step [100/600], Loss: 1.5626
Epoch [1/10], Step [200/600], Loss: 1.2552
Epoch [1/10], Step [300/600], Loss: 0.9105
Epoch [1/10], Step [400/600], Loss: 0.7924
Epoch [1/10], Step [500/600], Loss: 0.7422
Epoch [1/10], Step [600/600], Loss: 0.6990
Epoch [2/10], Step [100/600], Loss: 0.5304
Epoch [2/10], Step [200/600], Loss: 0.5079
Epoch [2/10], Step [300/600], Loss: 0.5263
Epoch [2/10], Step [400/600], Loss: 0.4998
Epoch [2/10], Step [500/600], Loss: 0.3818
Epoch [2/10], Step [600/600], Loss: 0.3307
Epoch [3/10], Step [100/600], Loss: 0.5337
Epoch [3/10], Step [200/600], Loss: 0.3550
Epoch [3/10], Step [300/600], Loss: 0.4301
Epoch [3/10], Step [400/600], Loss: 0.4663
Epoch [3/10], Step [500/600], Loss: 0.4136
Epoch [3/10], Step [600/600], Loss: 0.4986
Epoch [4/10], Step [100/600], Loss: 0.3250
Epoch [4/10], Step [200/600], Loss: 0.3163
Epoch [4/10], Step [300/600], Loss: 0.3471
Epoch [4/10], Step [400/600], Loss: 0.3185
Epoch [4/10], Step [500/600], Loss: 0.4504
Epoch [4/10