<a href="https://colab.research.google.com/github/AhmedEssam19/Graduation-Project/blob/Hyperparameter-Tuning/Hyperparameters_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gdown

In [2]:
!gdown https://drive.google.com/uc?id=1_bAXzdCRBjoPSkO_MrQ_FRoM_-npeJll

Downloading...
From: https://drive.google.com/uc?id=1_bAXzdCRBjoPSkO_MrQ_FRoM_-npeJll
To: /content/data.zip
100% 2.05G/2.05G [00:40<00:00, 50.8MB/s]


In [None]:
!unzip '/content/data.zip' -d '/content/'

In [None]:
pip install "ray[tune]" torch torchvision pytorch-lightning

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim  
import torchvision.transforms as transforms
import torchvision
import os
from torchvision.io import decode_jpeg
import pandas as pd
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets,models
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
from torchvision.io import read_image

In [6]:
NUM_CLASSES = 10
BATCH_SIZE = 32

In [7]:
class CreateDataset(Dataset):
    def __init__(self, df,transform=False):
        self.df = df
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):   
        img_path = self.df.iloc[index, 0]
        image = read_image(img_path) / 255.0
        label = self.df.iloc[index, 1]

        if self.transform:
            image = self.transform(image)

        return image, label

In [28]:
read_image('data/Camera 1/train/c0/1589.jpg')

tensor([[[ 63,  65,  67,  ..., 234, 234, 234],
         [ 63,  65,  67,  ..., 234, 234, 234],
         [ 63,  65,  67,  ..., 234, 234, 234],
         ...,
         [ 54,  56,  58,  ...,   0,   0,   0],
         [ 54,  56,  58,  ...,   0,   0,   0],
         [ 54,  56,  58,  ...,   0,   0,   0]],

        [[ 64,  66,  68,  ..., 235, 235, 235],
         [ 64,  66,  68,  ..., 235, 235, 235],
         [ 64,  66,  68,  ..., 235, 235, 235],
         ...,
         [ 27,  29,  31,  ...,   2,   2,   2],
         [ 27,  29,  31,  ...,   2,   2,   2],
         [ 27,  29,  31,  ...,   2,   2,   2]],

        [[ 46,  48,  50,  ..., 229, 229, 229],
         [ 46,  48,  50,  ..., 229, 229, 229],
         [ 46,  48,  50,  ..., 229, 229, 229],
         ...,
         [  0,   2,   4,  ...,   1,   1,   1],
         [  0,   2,   4,  ...,   1,   1,   1],
         [  0,   2,   4,  ...,   1,   1,   1]]], dtype=torch.uint8)

In [8]:
transformers = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

train_df = pd.read_csv("data/train.csv")
val_df = pd.read_csv("data/val.csv")
test_df = pd.read_csv("data/test.csv")

train_dataset=CreateDataset(train_df, transformers)
test_dataset=CreateDataset(test_df, transformers)
val_dataset=CreateDataset(val_df, transformers)

In [29]:
train_dataloader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=32,shuffle=False)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=32,shuffle=False)

In [48]:
import pytorch_lightning as pl
import torchmetrics
from torch import nn

class Model(pl.LightningModule):
   
    def __init__(self, output_units, config, freeze_base=False):
        super().__init__()
        self.base_model = torchvision.models.resnet50(pretrained=True)

        freezing_layers = [
            self.base_model.conv1,
            self.base_model.bn1,
            self.base_model.layer1,
            self.base_model.layer2,
        ]
        for layer in freezing_layers:
          for param in layer.parameters():
            param.requires_grad = False

        self.base_model.fc = torch.nn.Linear(in_features=self.base_model.fc.in_features, out_features=500)
        self.clf = torch.nn.Linear(in_features=500, out_features=output_units)
        self.lr = config["lr"]
        self.dropout = torch.nn.Dropout(p=config["dropout"])
        #self.batch_size = config["batch_size"]

        self.criterion = nn.CrossEntropyLoss()
        self.train_acc = torchmetrics.Accuracy()
        self.val_acc = torchmetrics.Accuracy()

        self.save_hyperparameters()

    def forward(self, input_data):
        features = self.base_model(input_data)
        features = self.dropout(features)
        return self.clf(features)

    def cross_entropy_loss(self, logits, labels):
        return F.nll_loss(logits, labels)

    def accuracy(self, logits, labels):
        _, predicted = torch.max(logits.data, 1)
        correct = (predicted == labels).sum().item()
        accuracy = correct / len(labels)
        return torch.tensor(accuracy)

    def training_step(self, batch, batch_nb):
        input_data, targets = batch
        preds = self(input_data)
        loss = self.criterion(preds, targets)
        self.log('train_loss', loss)
        self.train_acc(preds, targets)
        self.log('train_acc', self.train_acc, on_step=True, on_epoch=False, prog_bar=True)
        
        return loss

    def validation_step(self, batch, batch_nb):
        input_data, targets = batch
        preds = self(input_data)
        loss = self.criterion(preds, targets)
        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        self.val_acc(preds, targets)
        self.log('val_acc', self.val_acc, on_step=False, on_epoch=True, prog_bar=True)

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        avg_acc = torch.stack([x["val_accuracy"] for x in outputs]).mean()
        self.log("ptl/val_loss", avg_loss)
        self.log("ptl/val_accuracy", avg_acc)

    def test_step(self, batch, batch_nb):
        self.validation_step(batch, batch_nb)
        
    def predict_step(self, batch, batch_nb):
        input_data, targets = batch
        preds = self(input_data)
        return torch.argmax(preds, dim=1)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        return optimizer

In [47]:
def train_model(config):
    model = Model(NUM_CLASSES,config)
    trainer = pl.Trainer(max_epochs=10, show_progress_bar=False)

    trainer.fit(model,train_dataloader,val_dataloader)

In [13]:
import math
from pytorch_lightning.loggers import TensorBoardLogger
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler, PopulationBasedTraining
from ray.tune.integration.pytorch_lightning import TuneReportCallback, \
    TuneReportCheckpointCallback

In [46]:
def train_model_tune(config, num_epochs=10, num_gpus=1):
    model = Model(NUM_CLASSES,config)
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        # If fractional GPUs passed in, convert to int.
        gpus=math.ceil(num_gpus),
        logger=TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name="", version="."),
        progress_bar_refresh_rate=0,
        callbacks=[
            TuneReportCallback(
                {
                    "loss": "ptl/val_loss",
                    "mean_accuracy": "ptl/val_accuracy"
                },
                on="validation_end")
        ])

    trainer.fit(model,train_dataloader,val_dataloader)

In [44]:
def tune_model_asha(num_samples=10, num_epochs=10, gpus_per_trial=1):
    config = {
      "dropout": tune.choice([0.1, 0.15, 0.2, 0.25, 0.3]),
      "lr": tune.loguniform(1e-4, 1e-1)
      #"batch_size": tune.choice([32, 64, 128]),
    }

    scheduler = ASHAScheduler(
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=["dropout", "lr"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])

    train_fn_with_parameters = tune.with_parameters(train_model_tune,
                                                    num_epochs=num_epochs,
                                                    num_gpus=gpus_per_trial,
                                                    )
    resources_per_trial = {"cpu": 1, "gpu": gpus_per_trial}

    analysis = tune.run(train_fn_with_parameters,
        resources_per_trial=resources_per_trial,
        metric="loss",
        mode="min",
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name="tune_model_asha")

    best_result = analysis.best_config
    print("Best hyperparameters found were: ", best_result)

In [45]:
tune_model_asha()

== Status ==
Current time: 2022-02-14 23:10:06 (running for 00:00:00.29)
Memory usage on this node: 2.2/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/6.53 GiB heap, 0.0/3.27 GiB objects (0.0/1.0 accelerator_type:T4)
Result logdir: /root/ray_results/tune_model_asha
Number of trials: 10/10 (10 PENDING)
+------------------------------+----------+-------+-----------+-------------+
| Trial name                   | status   | loc   |   dropout |          lr |
|------------------------------+----------+-------+-----------+-------------|
| train_model_tune_3fae1_00000 | PENDING  |       |      0.1  | 0.0143832   |
| train_model_tune_3fae1_00001 | PENDING  |       |      0.2  | 0.000219493 |
| train_model_tune_3fae1_00002 | PENDING  |       |      0.1  | 0.0124832   |
| train_model_tune_3fae1_00003 | PENDING  |       |      0.25 | 0.000480705 |
| train_model_tune_3fae1_

[2m[36m(train_model_tune pid=1272)[0m   f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
[2m[36m(train_model_tune pid=1272)[0m GPU available: True, used: True
[2m[36m(train_model_tune pid=1272)[0m TPU available: False, using: 0 TPU cores
[2m[36m(train_model_tune pid=1272)[0m IPU available: False, using: 0 IPUs
[2m[36m(train_model_tune pid=1272)[0m   "The `on_keyboard_interrupt` callback hook was deprecated in v1.5 and will be removed in v1.7."
[2m[36m(train_model_tune pid=1272)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


== Status ==
Current time: 2022-02-14 23:10:12 (running for 00:00:05.38)
Memory usage on this node: 3.2/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/6.53 GiB heap, 0.0/3.27 GiB objects (0.0/1.0 accelerator_type:T4)
Result logdir: /root/ray_results/tune_model_asha
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+------------------------------+----------+-----------------+-----------+-------------+
| Trial name                   | status   | loc             |   dropout |          lr |
|------------------------------+----------+-----------------+-----------+-------------|
| train_model_tune_3fae1_00000 | RUNNING  | 172.28.0.2:1272 |      0.1  | 0.0143832   |
| train_model_tune_3fae1_00001 | PENDING  |                 |      0.2  | 0.000219493 |
| train_model_tune_3fae1_00002 | PENDING  |                 |      0.1  | 0.0124832   |
| train_model_tune_3fae1_0000

[2m[36m(train_model_tune pid=1272)[0m 
[2m[36m(train_model_tune pid=1272)[0m   | Name       | Type             | Params
[2m[36m(train_model_tune pid=1272)[0m ------------------------------------------------
[2m[36m(train_model_tune pid=1272)[0m 0 | base_model | ResNet           | 24.5 M
[2m[36m(train_model_tune pid=1272)[0m 1 | clf        | Linear           | 5.0 K 
[2m[36m(train_model_tune pid=1272)[0m 2 | dropout    | Dropout          | 0     
[2m[36m(train_model_tune pid=1272)[0m 3 | criterion  | CrossEntropyLoss | 0     
[2m[36m(train_model_tune pid=1272)[0m 4 | train_acc  | Accuracy         | 0     
[2m[36m(train_model_tune pid=1272)[0m 5 | val_acc    | Accuracy         | 0     
[2m[36m(train_model_tune pid=1272)[0m ------------------------------------------------
[2m[36m(train_model_tune pid=1272)[0m 23.1 M    Trainable params
[2m[36m(train_model_tune pid=1272)[0m 1.4 M     Non-trainable params
[2m[36m(train_model_tune pid=1272)[0m 24.5 M   

Result for train_model_tune_3fae1_00000:
  date: 2022-02-14_23-10-10
  experiment_id: 126e3badbad547dca60ebb0480f9f1f1
  hostname: a5da6c3c5d00
  node_ip: 172.28.0.2
  pid: 1272
  timestamp: 1644880210
  trial_id: 3fae1_00000
  
== Status ==
Current time: 2022-02-14 23:10:17 (running for 00:00:11.00)
Memory usage on this node: 4.6/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/6.53 GiB heap, 0.0/3.27 GiB objects (0.0/1.0 accelerator_type:T4)
Result logdir: /root/ray_results/tune_model_asha
Number of trials: 10/10 (1 ERROR, 9 PENDING)
+------------------------------+----------+-----------------+-----------+-------------+
| Trial name                   | status   | loc             |   dropout |          lr |
|------------------------------+----------+-----------------+-----------+-------------|
| train_model_tune_3fae1_00001 | PENDING  |                 |      0.2

[2m[36m(train_model_tune pid=1271)[0m   f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
[2m[36m(train_model_tune pid=1271)[0m GPU available: True, used: True
[2m[36m(train_model_tune pid=1271)[0m TPU available: False, using: 0 TPU cores
[2m[36m(train_model_tune pid=1271)[0m IPU available: False, using: 0 IPUs
[2m[36m(train_model_tune pid=1271)[0m   "The `on_keyboard_interrupt` callback hook was deprecated in v1.5 and will be removed in v1.7."
[2m[36m(train_model_tune pid=1271)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


== Status ==
Current time: 2022-02-14 23:10:23 (running for 00:00:16.97)
Memory usage on this node: 3.1/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/6.53 GiB heap, 0.0/3.27 GiB objects (0.0/1.0 accelerator_type:T4)
Result logdir: /root/ray_results/tune_model_asha
Number of trials: 10/10 (1 ERROR, 8 PENDING, 1 RUNNING)
+------------------------------+----------+-----------------+-----------+-------------+
| Trial name                   | status   | loc             |   dropout |          lr |
|------------------------------+----------+-----------------+-----------+-------------|
| train_model_tune_3fae1_00001 | RUNNING  | 172.28.0.2:1271 |      0.2  | 0.000219493 |
| train_model_tune_3fae1_00002 | PENDING  |                 |      0.1  | 0.0124832   |
| train_model_tune_3fae1_00003 | PENDING  |                 |      0.25 | 0.000480705 |
| train_model_tune_3

[2m[36m(train_model_tune pid=1271)[0m 
[2m[36m(train_model_tune pid=1271)[0m   | Name       | Type             | Params
[2m[36m(train_model_tune pid=1271)[0m ------------------------------------------------
[2m[36m(train_model_tune pid=1271)[0m 0 | base_model | ResNet           | 24.5 M
[2m[36m(train_model_tune pid=1271)[0m 1 | clf        | Linear           | 5.0 K 
[2m[36m(train_model_tune pid=1271)[0m 2 | dropout    | Dropout          | 0     
[2m[36m(train_model_tune pid=1271)[0m 3 | criterion  | CrossEntropyLoss | 0     
[2m[36m(train_model_tune pid=1271)[0m 4 | train_acc  | Accuracy         | 0     
[2m[36m(train_model_tune pid=1271)[0m 5 | val_acc    | Accuracy         | 0     
[2m[36m(train_model_tune pid=1271)[0m ------------------------------------------------
[2m[36m(train_model_tune pid=1271)[0m 23.1 M    Trainable params
[2m[36m(train_model_tune pid=1271)[0m 1.4 M     Non-trainable params
[2m[36m(train_model_tune pid=1271)[0m 24.5 M   

Result for train_model_tune_3fae1_00001:
  date: 2022-02-14_23-10-22
  experiment_id: 91df609a0b8549cfaa09557ff2a900fa
  hostname: a5da6c3c5d00
  node_ip: 172.28.0.2
  pid: 1271
  timestamp: 1644880222
  trial_id: 3fae1_00001
  
== Status ==
Current time: 2022-02-14 23:10:34 (running for 00:00:28.07)
Memory usage on this node: 2.7/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/6.53 GiB heap, 0.0/3.27 GiB objects (0.0/1.0 accelerator_type:T4)
Result logdir: /root/ray_results/tune_model_asha
Number of trials: 10/10 (2 ERROR, 7 PENDING, 1 RUNNING)
+------------------------------+----------+-----------------+-----------+-------------+
| Trial name                   | status   | loc             |   dropout |          lr |
|------------------------------+----------+-----------------+-----------+-------------|
| train_model_tune_3fae1_00002 | RUNNING  | 172.28.0.2:

[2m[36m(train_model_tune pid=1345)[0m   f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
[2m[36m(train_model_tune pid=1345)[0m GPU available: True, used: True
[2m[36m(train_model_tune pid=1345)[0m TPU available: False, using: 0 TPU cores
[2m[36m(train_model_tune pid=1345)[0m IPU available: False, using: 0 IPUs
[2m[36m(train_model_tune pid=1345)[0m   "The `on_keyboard_interrupt` callback hook was deprecated in v1.5 and will be removed in v1.7."
[2m[36m(train_model_tune pid=1345)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(train_model_tune pid=1345)[0m 2022-02-14 23:10:36,878	ERROR worker.py:432 -- SystemExit was raised from the worker.
[2m[36m(train_model_tune pid=1345)[0m Traceback (most recent call last):
[2m[36m(train_model_tune pid=1345)[0m   File "python/ray/_raylet.pyx", line 770, in ray._raylet.task_execution_handler
[2m[36m(train_model_tune pid=1345)[0m   File "python/ray/_raylet.pyx", li

== Status ==
Current time: 2022-02-14 23:10:36 (running for 00:00:30.17)
Memory usage on this node: 3.6/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/6.53 GiB heap, 0.0/3.27 GiB objects (0.0/1.0 accelerator_type:T4)
Result logdir: /root/ray_results/tune_model_asha
Number of trials: 10/10 (2 ERROR, 7 PENDING, 1 RUNNING)
+------------------------------+----------+-----------------+-----------+-------------+
| Trial name                   | status   | loc             |   dropout |          lr |
|------------------------------+----------+-----------------+-----------+-------------|
| train_model_tune_3fae1_00002 | RUNNING  | 172.28.0.2:1345 |      0.1  | 0.0124832   |
| train_model_tune_3fae1_00003 | PENDING  |                 |      0.25 | 0.000480705 |
| train_model_tune_3fae1_00004 | PENDING  |                 |      0.2  | 0.00302288  |
| train_model_tune_3

2022-02-14 23:10:37,101	ERROR tune.py:632 -- Trials did not complete: [train_model_tune_3fae1_00000, train_model_tune_3fae1_00001, train_model_tune_3fae1_00002, train_model_tune_3fae1_00003, train_model_tune_3fae1_00004, train_model_tune_3fae1_00005, train_model_tune_3fae1_00006, train_model_tune_3fae1_00007, train_model_tune_3fae1_00008, train_model_tune_3fae1_00009]
2022-02-14 23:10:37,103	INFO tune.py:636 -- Total run time: 30.45 seconds (30.16 seconds for the tuning loop).


Best hyperparameters found were:  None
