In [None]:
!pip install torchmetrics



In [None]:
!pip install pytorch-lightning

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.4.2-py3-none-any.whl.metadata (19 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.11.7-py3-none-any.whl.metadata (5.2 kB)
Downloading pytorch_lightning-2.4.0-py3-none-any.whl (815 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m815.2/815.2 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.11.7-py3-none-any.whl (26 kB)
Downloading torchmetrics-1.4.2-py3-none-any.whl (869 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m869.2/869.2 kB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightning-utilities, torchmetrics, pytorch-lightning
Successfully installed lightning-utilities-0.11.7 pytorch-lightning-2.4.0 torchmetrics-1.4.2


In [None]:
!pip install triton

Collecting triton
  Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (209.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.4/209.4 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: triton
Successfully installed triton-3.0.0


Naive Lightning

In [None]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST

# Define the neural network using PyTorch Lightning
class SimpleNet(pl.LightningModule):
    """Two-layer fully connected classifier (Linear -> ReLU -> Linear).

    Batches are expected as ``(images, labels)`` pairs; images are flattened
    to ``(batch, -1)`` before being fed to the first linear layer. Training
    and validation use cross-entropy loss; validation and test additionally
    log the mean per-batch accuracy.

    Args:
        input_size: Number of input features of the first linear layer.
        hidden_size: Width of the hidden layer.
        num_classes: Number of output logits.
        learning_rate: Learning rate handed to the Adam optimizer.
    """

    def __init__(self, input_size=784, hidden_size=500, num_classes=10, learning_rate=0.001):
        super().__init__()
        self.save_hyperparameters()
        self.learning_rate = learning_rate
        # Sub-modules are registered in the same order as before (fc1, relu, fc2, criterion).
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x):
        """Return the class logits for an already flattened input ``x``."""
        return self.fc2(self.relu(self.fc1(x)))

    def _logits_and_labels(self, batch):
        """Flatten the images of ``batch`` and return ``(logits, labels)``."""
        images, labels = batch
        flat_images = images.view(images.size(0), -1)
        return self(flat_images), labels

    def training_step(self, batch, batch_idx):
        """Compute, log (per step and per epoch) and return the training loss."""
        logits, targets = self._logits_and_labels(batch)
        loss = self.criterion(logits, targets)
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the cross-entropy loss and the accuracy of one validation batch."""
        logits, targets = self._logits_and_labels(batch)
        loss = self.criterion(logits, targets)
        hits = logits.argmax(dim=1) == targets
        acc = hits.float().mean()
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)

    def test_step(self, batch, batch_idx):
        """Log the accuracy of one test batch."""
        logits, targets = self._logits_and_labels(batch)
        hits = logits.argmax(dim=1) == targets
        acc = hits.float().mean()
        self.log('test_acc', acc, prog_bar=True)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters of the module."""
        return optim.Adam(self.parameters(), lr=self.learning_rate)

def main():
    """Train SimpleNet on MNIST, then evaluate it, printing wall-clock times.

    Downloads MNIST into ``./data`` if needed, trains for ``num_epochs``
    epochs on the first GPU when CUDA is available (otherwise on CPU), and
    prints the elapsed time of the fit and of the test phase.
    """
    # Hyperparameters
    input_size = 784  # 28x28 images
    hidden_size = 500
    num_classes = 10
    batch_size = 100
    learning_rate = 0.001
    num_epochs = 10

    # Transformations
    transform = transforms.Compose([
        transforms.ToTensor(),
    ])

    # MNIST datasets
    # NOTE: validation and test both use the `train=False` split, so the
    # validation and test metrics are computed on the same images.
    train_dataset = MNIST(root='./data', train=True, transform=transform, download=True)
    val_dataset = MNIST(root='./data', train=False, transform=transform, download=True)
    test_dataset = MNIST(root='./data', train=False, transform=transform, download=True)

    # Data loaders
    num_workers = 2  # Adjust based on your system
    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
    test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    # Initialize the network
    model = SimpleNet(input_size, hidden_size, num_classes, learning_rate)

    # Select the device configuration. `devices=None` is not a valid value for
    # the CPU accelerator (it requires an int > 0 or 'auto'), so fall back to
    # 'auto' when no GPU is present instead of None.
    use_gpu = torch.cuda.is_available()
    accelerator = 'gpu' if use_gpu else 'cpu'
    devices = [0] if use_gpu else 'auto'  # Use the first GPU if available

    # Initialize a trainer
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        accelerator=accelerator,
        devices=devices,
        log_every_n_steps=20,
    )

    # Start timing for training
    training_start_time = time.time()

    # Train the model
    trainer.fit(model, train_loader, val_loader)

    # End timing for training
    training_time = time.time() - training_start_time
    print(f'\nTraining Time: {training_time:.2f} seconds')

    # Start timing for testing
    testing_start_time = time.time()

    # Test the model
    trainer.test(model, dataloaders=test_loader)

    # End timing for testing
    testing_time = time.time() - testing_start_time
    print(f'Testing Time: {testing_time:.2f} seconds')

# Entry-point guard: call main() only when this code runs as the top-level module.
if __name__ == '__main__':
    main()


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | fc1       | Linear           | 392 K  | train
1 | relu      | ReLU             | 0      | train
2 | fc2       | Linear           | 5.0 K  | train
3 | criterion | CrossEntropyLoss | 0      | train
-------------------------------------------------------
397 K     Trainable params
0         Non-trainable params
397 K     Total params
1.590     Total estimated model params size (MB)
4         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



Training Time: 114.35 seconds


Testing: |          | 0/? [00:00<?, ?it/s]

Testing Time: 1.42 seconds


Native PyTorch Lightning without Lambda

In [None]:
import time
import torch
from torchvision import transforms
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader
import pytorch_lightning as pl
import torch.nn.functional as F

class SimpleNet(pl.LightningModule):
    """MNIST-style classifier built from a ``Sequential`` of Linear -> ReLU -> Linear.

    ``forward`` flattens its input to ``(batch, -1)`` itself, so the step
    methods can pass image batches straight through.

    Args:
        input_size: Number of input features of the first linear layer.
        hidden_size: Width of the hidden layer.
        num_classes: Number of output logits.
        lr: Learning rate for Adam, read back from ``self.hparams.lr``.
    """

    def __init__(self, input_size=784, hidden_size=500, num_classes=10, lr=0.001):
        super().__init__()
        self.save_hyperparameters()
        layers = [
            torch.nn.Linear(input_size, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_size, num_classes),
        ]
        self.model = torch.nn.Sequential(*layers)

    def forward(self, x):
        """Flatten ``x`` per sample and return the class logits."""
        flat = x.view(x.size(0), -1)
        return self.model(flat)

    def _batch_accuracy(self, batch):
        """Return the fraction of correctly classified samples in ``batch``."""
        inputs, targets = batch
        predicted = self(inputs).argmax(dim=1)
        return (predicted == targets).float().mean()

    def training_step(self, batch, _):
        """Compute, log (per step and per epoch) and return the cross-entropy loss."""
        inputs, targets = batch
        logits = self(inputs)
        loss = F.cross_entropy(logits, targets)
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, _):
        """Log the accuracy of one validation batch."""
        self.log('val_acc', self._batch_accuracy(batch), prog_bar=True)

    def test_step(self, batch, _):
        """Log the accuracy of one test batch."""
        self.log('test_acc', self._batch_accuracy(batch), prog_bar=True)

    def configure_optimizers(self):
        """Return an Adam optimizer using the saved ``lr`` hyperparameter."""
        learning_rate = self.hparams.lr
        return torch.optim.Adam(self.parameters(), lr=learning_rate)

def main():
    """Fit SimpleNet on MNIST for 10 epochs and test it, printing the time of each phase.

    The ``train=False`` MNIST split serves both as the validation set during
    fitting and as the test set afterwards.
    """
    to_tensor = transforms.ToTensor()
    train_set = MNIST(root='./data', train=True, download=True, transform=to_tensor)
    held_out_set = MNIST(root='./data', train=False, download=True, transform=to_tensor)

    shared_loader_args = dict(batch_size=100, num_workers=2)
    train_loader = DataLoader(train_set, shuffle=True, **shared_loader_args)
    val_loader = DataLoader(held_out_set, **shared_loader_args)

    model = SimpleNet()
    trainer = pl.Trainer(max_epochs=10, accelerator='auto', devices='auto', log_every_n_steps=20)

    # Fit phase
    fit_began = time.time()
    trainer.fit(model, train_loader, val_loader)
    fit_elapsed = time.time() - fit_began
    print(f'\nTraining Time: {fit_elapsed:.2f} seconds')

    # Test phase
    test_began = time.time()
    trainer.test(model, val_loader)
    test_elapsed = time.time() - test_began
    print(f'Testing Time: {test_elapsed:.2f} seconds')

# Entry-point guard: call main() only when this code runs as the top-level module.
if __name__ == '__main__':
    main()


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 397 K  | train
---------------------------------------------
397 K     Trainable params
0         Non-trainable params
397 K     Total params
1.590     Total estimated model params size (MB)
4         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



Training Time: 112.91 seconds


Testing: |          | 0/? [00:00<?, ?it/s]

Testing Time: 1.41 seconds


With Lambda

In [None]:
import time
import torch
import torch.nn.functional as F
from torchvision import transforms
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader
import pytorch_lightning as pl

class SimpleNet(pl.LightningModule):
    """Linear -> ReLU -> Linear classifier whose step hooks share one helper.

    ``training_step`` logs and returns the cross-entropy loss; the validation
    and test steps log the batch accuracy. All three delegate to
    ``_shared_step``, which picks the metric from the log name.

    Args:
        input_size: Number of input features of the first linear layer.
        hidden_size: Width of the hidden layer.
        num_classes: Number of output logits.
        lr: Learning rate for Adam (also kept on ``self.lr``).
    """

    def __init__(self, input_size=784, hidden_size=500, num_classes=10, lr=0.001):
        super().__init__()
        self.save_hyperparameters()
        self.model = torch.nn.Sequential(
            torch.nn.Linear(input_size, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_size, num_classes)
        )
        self.lr = lr

    # The hooks are regular methods rather than class-level lambdas so that
    # they carry real names in tracebacks and can hold docstrings.
    def forward(self, x):
        """Flatten ``x`` per sample and return the class logits."""
        return self.model(x.view(x.size(0), -1))

    def training_step(self, batch, _):
        """Log and return the training loss; the second argument (batch index) is unused."""
        return self._shared_step(batch, 'train_loss')

    def validation_step(self, batch, _):
        """Log the validation accuracy; the second argument (batch index) is unused."""
        return self._shared_step(batch, 'val_acc')

    def test_step(self, batch, _):
        """Log the test accuracy; the second argument (batch index) is unused."""
        return self._shared_step(batch, 'test_acc')

    def _shared_step(self, batch, log_name):
        """Run one forward pass on ``batch`` and log the metric named by ``log_name``.

        Args:
            batch: ``(inputs, targets)`` pair.
            log_name: Key to log under. If it contains ``'loss'`` the
                cross-entropy loss is logged and returned; otherwise the
                batch accuracy is logged and ``None`` is returned.

        Returns:
            The loss tensor for loss-type log names, else ``None``.
        """
        x, y = batch
        # One forward pass serves both branches; previously the network was
        # evaluated twice on every training batch.
        logits = self(x)
        if 'loss' in log_name:
            metric = F.cross_entropy(logits, y)
            self.log(log_name, metric, prog_bar=True)
            return metric
        metric = (torch.argmax(logits, dim=1) == y).float().mean()
        self.log(log_name, metric, prog_bar=True)
        return None

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters using ``self.lr``."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

def main():
    """Train SimpleNet on MNIST for 10 epochs, test it, and print how long each phase took.

    The ``train=False`` MNIST split is used for validation during fitting
    and again for the final test run.
    """
    transform = transforms.ToTensor()

    def build_loader(is_train):
        """Create a batch-size-100 loader; only the training split is shuffled."""
        split = MNIST(root='./data', train=is_train, download=True, transform=transform)
        return DataLoader(split, batch_size=100, num_workers=2, shuffle=is_train)

    train_loader = build_loader(True)
    val_loader = build_loader(False)

    model = SimpleNet()
    trainer = pl.Trainer(max_epochs=10, accelerator='auto', devices='auto', log_every_n_steps=20)

    # Time the fit phase
    tick = time.time()
    trainer.fit(model, train_loader, val_loader)
    training_seconds = time.time() - tick
    print(f'\nTraining Time: {training_seconds:.2f} seconds')

    # Time the test phase
    tick = time.time()
    trainer.test(model, val_loader)
    testing_seconds = time.time() - tick
    print(f'Testing Time: {testing_seconds:.2f} seconds')

# Entry-point guard: call main() only when this code runs as the top-level module.
if __name__ == '__main__':
    main()


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 397 K  | train
---------------------------------------------
397 K     Trainable params
0         Non-trainable params
397 K     Total params
1.590     Total estimated model params size (MB)
4         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



Training Time: 116.01 seconds


Testing: |          | 0/? [00:00<?, ?it/s]

Testing Time: 1.38 seconds


With More Optimization

In [None]:
import time
import torch
from torchvision import transforms
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader
import pytorch_lightning as pl
import torch.nn.functional as F
from torchmetrics.functional import accuracy


class SimpleNet(pl.LightningModule):
    """Linear -> ReLU -> Linear classifier whose step hooks share ``shared_step``.

    ``training_step`` logs and returns the cross-entropy loss; the validation
    and test steps log multiclass accuracy computed with
    ``torchmetrics.functional.accuracy``.

    Args:
        input_size: Number of input features of the first linear layer.
        hidden_size: Width of the hidden layer.
        num_classes: Number of output logits (also used by the accuracy metric).
        lr: Learning rate for Adam, read back from ``self.hparams.lr``.
    """

    def __init__(self, input_size=784, hidden_size=500, num_classes=10, lr=0.001):
        super().__init__()
        self.save_hyperparameters()
        self.model = torch.nn.Sequential(
            torch.nn.Linear(input_size, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_size, num_classes)
        )

    def forward(self, x):
        """Flatten ``x`` per sample and return the class logits."""
        return self.model(x.view(x.size(0), -1))

    def shared_step(self, batch, log_name):
        """Run one forward pass on ``batch`` and log the metric named by ``log_name``.

        Args:
            batch: ``(inputs, targets)`` pair.
            log_name: Key to log under. If it contains ``'loss'`` the
                cross-entropy loss is logged and returned; otherwise the
                multiclass accuracy is logged and ``None`` is returned.

        Returns:
            The loss tensor for loss-type log names, else ``None``.
        """
        x, y = batch
        logits = self(x)
        if 'loss' in log_name:
            loss = F.cross_entropy(logits, y)
            self.log(log_name, loss, prog_bar=True)
            # The loss must be returned: the previous version returned the
            # result of self.log(...) (None), so training_step returned None
            # and Lightning skipped the optimizer step for every batch.
            return loss
        acc = accuracy(
            torch.argmax(logits, dim=1),
            y,
            task="multiclass",
            # Use the configured class count instead of a hard-coded 10.
            num_classes=self.hparams.num_classes,
        )
        self.log(log_name, acc, prog_bar=True)
        return None

    def training_step(self, batch, _):
        """Log and return the training loss; the second argument (batch index) is unused."""
        return self.shared_step(batch, 'train_loss')

    def validation_step(self, batch, _):
        """Log the validation accuracy; the second argument (batch index) is unused."""
        return self.shared_step(batch, 'val_acc')

    def test_step(self, batch, _):
        """Log the test accuracy; the second argument (batch index) is unused."""
        return self.shared_step(batch, 'test_acc')

    def configure_optimizers(self):
        """Return an Adam optimizer using the saved ``lr`` hyperparameter."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

def main():
    """Fit SimpleNet on MNIST with validation, then test it, printing the time of each stage.

    The ``train=False`` MNIST split is used both for validation during
    fitting and for the final test run.
    """
    transform = transforms.ToTensor()
    # One loader per split; only the training loader is shuffled.
    loaders = {name: DataLoader(MNIST('./data', train=(name == 'train'), download=True, transform=transform), batch_size=100, num_workers=2, shuffle=(name == 'train'))
               for name in ['train', 'val']}

    model = SimpleNet()
    trainer = pl.Trainer(max_epochs=10, accelerator='auto', devices='auto', log_every_n_steps=20)

    # Positional dataloader arguments per stage. `fit` now also receives the
    # validation loader; previously it got only the training loader, so
    # validation_step was never executed during training.
    stage_loaders = {
        'fit': (loaders['train'], loaders['val']),
        'test': (loaders['val'],),
    }

    for stage in ['fit', 'test']:
        start_time = time.time()
        getattr(trainer, stage)(model, *stage_loaders[stage])
        print(f'\n{stage.capitalize()} Time: {time.time() - start_time:.2f} seconds')

# Entry-point guard: call main() only when this code runs as the top-level module.
if __name__ == '__main__':
    main()


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 397 K  | train
---------------------------------------------
397 K     Trainable params
0         Non-trainable params
397 K     Total params
1.590     Total estimated model params size (MB)
4         Modules in train mode
0         Modules in eval mode


Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



Fit Time: 88.85 seconds


Testing: |          | 0/? [00:00<?, ?it/s]


Test Time: 2.01 seconds
