In [1]:
import os
os.chdir("../")
%pwd

'd:\\python-projects\\chest-cancer-classification'

In [2]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class LogConfig:
    root_dir: Path
    tensorboard: Path
    lightning: Path
    mlflow: Path


@dataclass
class TrainingConfig:
    root_dir: Path
    trained_model_path: Path
    updated_base_model_path: Path
    training_data: Path
    params_epochs: int
    params_batch_size: int
    params_is_augmentation: bool
    params_image_size: list
    params_learning_rate: float

In [3]:
from src.cnnClassifier.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from src.cnnClassifier.utils.common import read_yaml, create_directories

In [4]:
class ConfigurationManager:
    def __init__(
        self, 
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([Path(self.config.artifacts_root)])


    
    def get_log_config(self) -> LogConfig:
        logs = self.config.logs
        create_directories([Path(logs.root_dir)])

        logs_config = LogConfig(
            root_dir=Path(logs.root_dir),
            tensorboard=Path(logs.tensorboard),
            lightning=Path(logs.lightning),
            mlflow = Path(logs.mlflow)
        )

        return logs_config



    def get_training_config(self) -> TrainingConfig:
        training = self.config.training
        prepare_base_model = self.config.prepare_base_model
        params = self.params
        training_data = Path(self.config.data_ingestion.unzip_dir) / "Chest-CT-Scan-data"
        create_directories([Path(training.root_dir)])

        training_config = TrainingConfig(
            root_dir=Path(training.root_dir),
            trained_model_path=Path(training.trained_model_path),
            updated_base_model_path=Path(prepare_base_model.updated_base_model_path),
            training_data=Path(training_data),
            params_epochs=params.EPOCHS,
            params_batch_size=params.BATCH_SIZE,
            params_is_augmentation=params.AUGMENTATION,
            params_image_size=params.IMAGE_SIZE,
            params_learning_rate=params.LEARNING_RATE
        )

        return training_config

In [5]:
import torch
from torchvision import transforms, datasets
from torch.utils.data import DataLoader, random_split
from src.cnnClassifier.components.prepare_base_model import MyImageClassifier
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger, MLFlowLogger
from pytorch_lightning import Trainer

In [6]:
class TrainingDataModule(pl.LightningDataModule):
    def __init__(self, config: TrainingConfig):
        super().__init__()
        self.config = config


    def setup(self, stage=None):
        transform_list = [
            transforms.Resize(self.config.params_image_size[:-1]),
            transforms.ToTensor()
        ]
        if getattr(self.config, "params_is_augmentation", False):
            transform_list = [
                transforms.RandomHorizontalFlip(),
                transforms.RandomRotation(20),
                transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
            ] + transform_list

        transform = transforms.Compose(transform_list)
        dataset = datasets.ImageFolder(self.config.training_data, transform=transform)

        val_size = int(0.2 * len(dataset))
        train_size = len(dataset) - val_size

        generator = torch.Generator().manual_seed(42)
        self.train_dataset, self.val_dataset = random_split(dataset, [train_size, val_size], generator=generator)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.config.params_batch_size, shuffle=True, num_workers=4, persistent_workers=True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.config.params_batch_size, num_workers=4, persistent_workers=True)

In [12]:
class Training:
    def __init__(self, config: TrainingConfig, logs: LogConfig):
        self.config = config
        self.logs = logs
        self.model = None
        self.datamodule = None

    def get_base_model(self):
        base_model = torch.load(self.config.updated_base_model_path, weights_only=False)
        self.model = MyImageClassifier(model=base_model, config=self.config)

    def get_data_module(self):
        self.datamodule = TrainingDataModule(self.config)

    def train(self):
        self.get_base_model()
        self.get_data_module()

        tb_logger = TensorBoardLogger(self.logs.tensorboard)
        mlflow_logger = MLFlowLogger(
            experiment_name="ChestCancerClassification",
            tracking_uri= str(self.logs.mlflow)
            )
        
        trainer = Trainer(
            max_epochs=self.config.params_epochs,
            accelerator="auto",
            logger=[tb_logger, mlflow_logger],
            enable_progress_bar=True,
            log_every_n_steps=1
        )

        trainer.fit(self.model, datamodule=self.datamodule)

        torch.save(self.model.model, self.config.trained_model_path)

In [None]:
try:
    config = ConfigurationManager()
    training_config = config.get_training_config()
    logs_config = config.get_log_config()
    training = Training(config=training_config, logs= logs_config)
    training.train()
except Exception as e:
    raise e

# Hyper-parameter Tuning

In [7]:
from copy import deepcopy
import optuna
import mlflow
from pytorch_lightning.callbacks import ModelCheckpoint
import shutil
import glob

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
class Training:
    def __init__(self, config: TrainingConfig, logs: LogConfig):
        self.config = config
        self.logs = logs
        self.model = None
        self.datamodule = None
        self.best_val_loss_overall = float('inf')

    def get_base_model(self):
        base_model = torch.load(self.config.updated_base_model_path, weights_only=False)
        self.model = MyImageClassifier(model=base_model, config=self.config)

    def get_data_module(self):
        self.datamodule = TrainingDataModule(self.config)


    def get_loggers(self):
        tb_logger = TensorBoardLogger(self.logs.tensorboard)
        mlflow_logger = MLFlowLogger(
            experiment_name="ChestCancerClassification",
            tracking_uri= str(self.logs.mlflow)
            )
        
        return [tb_logger, mlflow_logger]
    


    def get_checkpoint_callback(self):
        checkpoint_dir = getattr(self.config, "checkpoint_dir", self.config.root_dir)
        checkpoint_callback = ModelCheckpoint(
            monitor="val_loss",
            mode="min",
            save_top_k=1,
            save_weights_only=False,
            dirpath=checkpoint_dir,
            filename="best-model-{epoch:02d}-{val_loss:.4f}"
        )

        return checkpoint_callback



    def train(self):
        self.get_base_model()
        self.get_data_module()


        trainer = Trainer(
            max_epochs=self.config.params_epochs,
            accelerator="auto",
            logger=self.get_loggers(),
            callbacks=[self.get_checkpoint_callback()],
            enable_progress_bar=True,
            log_every_n_steps=1
        )

        trainer.fit(self.model, datamodule=self.datamodule)
        best_model_path = None
        for cb in trainer.callbacks:
            if isinstance(cb, ModelCheckpoint):
                best_model_path = cb.best_model_path
                break
        val_loss = trainer.callback_metrics.get("val_loss", torch.tensor(1.0)).item()
        print(f"Best model saved at: {best_model_path}")


        return val_loss, best_model_path
    



    def suggest_hyperparameters(self, trial):
        config_new = deepcopy(self.config)
        config_new.params_learning_rate = trial.suggest_float("LEARNING_RATE", 1e-5, 1e-1)
        config_new.params_batch_size = trial.suggest_categorical("BATCH_SIZE", [8, 16, 32])

        trial_ckpt_dir = os.path.join(self.config.root_dir, f"checkpoints_trial_{trial.number}")
        os.makedirs(trial_ckpt_dir, exist_ok=True)
        config_new.checkpoint_dir = trial_ckpt_dir

        return config_new


    def save_best_model_if_needed(self, val_loss, best_model_path):
        if val_loss < self.best_val_loss_overall:
            self.best_val_loss_overall = val_loss
            os.makedirs(os.path.dirname(self.config.trained_model_path), exist_ok=True)
            shutil.copy(best_model_path, self.config.trained_model_path)
            print(f"New best model saved with val_loss={val_loss:.4f} at {self.config.trained_model_path}")




    def objective(self,trial):
        
        mlflow.set_tracking_uri(str(self.logs.mlflow))
        config_dict = self.suggest_hyperparameters(trial)

        with mlflow.start_run():
            trainer = Training(config_dict, self.logs)
            val_loss, best_model_path = trainer.train()
            self.save_best_model_if_needed(val_loss, best_model_path)
            print("\n\n")
            return val_loss
        


    def run_optuna_study(self, n_trials: int = 5):
        study = optuna.create_study(direction="minimize")
        study.optimize(self.objective, n_trials=n_trials)

        print("Best trial:")
        print(f"  Value: {study.best_trial.value}")
        print(f"  Params: {study.best_trial.params}")
        print(f"Best model checkpoint can be found in: {self.config.trained_model_path}")

        self.cleanup_trial_checkpoints()



    def cleanup_trial_checkpoints(self):
        print("Cleaning up temporary trial checkpoint directories...")
        trial_dirs = glob.glob(os.path.join(self.config.root_dir, "checkpoints_trial_*"))
        final_dir = os.path.dirname(self.config.trained_model_path)
        for d in trial_dirs:
            if os.path.abspath(d) != os.path.abspath(final_dir):
                shutil.rmtree(d, ignore_errors=True)
        print("Cleanup complete. Only the best model directory remains.")

In [13]:
try:
    config = ConfigurationManager()
    training_config = config.get_training_config()
    logs_config = config.get_log_config()
    training = Training(config=training_config, logs= logs_config)
    training.run_optuna_study()
except Exception as e:
    raise e

[2025-06-10 16:35:59,539] [13] [common] - INFO - YAML file loaded successfully: config\config.yaml
[2025-06-10 16:35:59,544] [13] [common] - INFO - YAML file loaded successfully: params.yaml
[2025-06-10 16:35:59,546] [26] [common] - INFO - Created directory at: artifacts
[2025-06-10 16:35:59,547] [26] [common] - INFO - Created directory at: artifacts\training
[2025-06-10 16:35:59,548] [26] [common] - INFO - Created directory at: artifacts\logs


[I 2025-06-10 16:35:59,551] A new study created in memory with name: no-name-8186a51c-d3bd-418a-9b8a-4a3edc66d97e


[2025-06-10 16:36:01,327] [156] [setup] - INFO - GPU available: True (cuda), used: True
[2025-06-10 16:36:01,329] [159] [setup] - INFO - TPU available: False, using: 0 TPU cores
[2025-06-10 16:36:01,330] [169] [setup] - INFO - HPU available: False, using: 0 HPUs
[2025-06-10 16:36:01,431] [61] [cuda] - INFO - LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2025-06-10 16:36:01,437] [104] [model_summary] - INFO - 
  | Name    | Type             | Params | Mode 
-----------------------------------------------------
0 | model   | VGG              | 134 M  | train
1 | loss_fn | CrossEntropyLoss | 0      | train
-----------------------------------------------------
119 M     Trainable params
14.7 M    Non-trainable params
134 M     Total params
537.075   Total estimated model params size (MB)
45        Modules in train mode
0         Modules in eval mode
Epoch 4: 100%|██████████| 18/18 [00:03<00:00,  5.48it/s, v_num=3714, val_loss=0.901, val_accuracy=0.412, train_loss=0.877][2025-06-10 16:36:41,05

[I 2025-06-10 16:36:46,473] Trial 0 finished with value: 0.9014971852302551 and parameters: {'LEARNING_RATE': 0.021987167793433484, 'BATCH_SIZE': 16}. Best is trial 0 with value: 0.9014971852302551.


New best model saved with val_loss=0.9015 at artifacts\training\model.pth



[2025-06-10 16:36:47,576] [156] [setup] - INFO - GPU available: True (cuda), used: True
[2025-06-10 16:36:47,577] [159] [setup] - INFO - TPU available: False, using: 0 TPU cores
[2025-06-10 16:36:47,577] [169] [setup] - INFO - HPU available: False, using: 0 HPUs
[2025-06-10 16:36:47,642] [61] [cuda] - INFO - LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2025-06-10 16:36:47,646] [104] [model_summary] - INFO - 
  | Name    | Type             | Params | Mode 
-----------------------------------------------------
0 | model   | VGG              | 134 M  | train
1 | loss_fn | CrossEntropyLoss | 0      | train
-----------------------------------------------------
119 M     Trainable params
14.7 M    Non-trainable params
134 M     Total params
537.075   Total estimated model params size (MB)
45        Modules in train mode
0         Modules in eval mode
Epoch 4: 100%|██████████| 18/18 [00:03<00:00,  4.53it/s, v_num=1eb7,

[I 2025-06-10 16:37:36,195] Trial 1 finished with value: 0.9014971852302551 and parameters: {'LEARNING_RATE': 0.04801877697893216, 'BATCH_SIZE': 16}. Best is trial 0 with value: 0.9014971852302551.


Best model saved at: D:\python-projects\chest-cancer-classification\artifacts\training\checkpoints_trial_1\best-model-epoch=00-val_loss=0.9015.ckpt



[2025-06-10 16:37:38,029] [156] [setup] - INFO - GPU available: True (cuda), used: True
[2025-06-10 16:37:38,032] [159] [setup] - INFO - TPU available: False, using: 0 TPU cores
[2025-06-10 16:37:38,034] [169] [setup] - INFO - HPU available: False, using: 0 HPUs
[2025-06-10 16:37:38,223] [61] [cuda] - INFO - LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2025-06-10 16:37:38,237] [104] [model_summary] - INFO - 
  | Name    | Type             | Params | Mode 
-----------------------------------------------------
0 | model   | VGG              | 134 M  | train
1 | loss_fn | CrossEntropyLoss | 0      | train
-----------------------------------------------------
119 M     Trainable params
14.7 M    Non-trainable params
134 M     Total params
537.075   Total estimated model params size (MB)
45        Modules in train mode
0         Modules in eval

[I 2025-06-10 16:38:35,465] Trial 2 finished with value: 0.9014971852302551 and parameters: {'LEARNING_RATE': 0.047168558176861004, 'BATCH_SIZE': 16}. Best is trial 0 with value: 0.9014971852302551.


Best model saved at: D:\python-projects\chest-cancer-classification\artifacts\training\checkpoints_trial_2\best-model-epoch=00-val_loss=0.9015.ckpt



[2025-06-10 16:38:37,197] [156] [setup] - INFO - GPU available: True (cuda), used: True
[2025-06-10 16:38:37,198] [159] [setup] - INFO - TPU available: False, using: 0 TPU cores
[2025-06-10 16:38:37,200] [169] [setup] - INFO - HPU available: False, using: 0 HPUs
[2025-06-10 16:38:37,388] [61] [cuda] - INFO - LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2025-06-10 16:38:37,404] [104] [model_summary] - INFO - 
  | Name    | Type             | Params | Mode 
-----------------------------------------------------
0 | model   | VGG              | 134 M  | train
1 | loss_fn | CrossEntropyLoss | 0      | train
-----------------------------------------------------
119 M     Trainable params
14.7 M    Non-trainable params
134 M     Total params
537.075   Total estimated model params size (MB)
45        Modules in train mode
0         Modules in eval

[I 2025-06-10 16:39:53,781] Trial 3 finished with value: 0.7103207111358643 and parameters: {'LEARNING_RATE': 0.0402563320857799, 'BATCH_SIZE': 8}. Best is trial 3 with value: 0.7103207111358643.


New best model saved with val_loss=0.7103 at artifacts\training\model.pth



[2025-06-10 16:39:55,573] [156] [setup] - INFO - GPU available: True (cuda), used: True
[2025-06-10 16:39:55,575] [159] [setup] - INFO - TPU available: False, using: 0 TPU cores
[2025-06-10 16:39:55,576] [169] [setup] - INFO - HPU available: False, using: 0 HPUs
[2025-06-10 16:39:55,698] [61] [cuda] - INFO - LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2025-06-10 16:39:55,711] [104] [model_summary] - INFO - 
  | Name    | Type             | Params | Mode 
-----------------------------------------------------
0 | model   | VGG              | 134 M  | train
1 | loss_fn | CrossEntropyLoss | 0      | train
-----------------------------------------------------
119 M     Trainable params
14.7 M    Non-trainable params
134 M     Total params
537.075   Total estimated model params size (MB)
45        Modules in train mode
0         Modules in eval mode
Epoch 4: 100%|██████████| 35/35 [00:04<00:00,  8.62it/s, v_num=c0f3,

[I 2025-06-10 16:40:59,285] Trial 4 finished with value: 0.6809089183807373 and parameters: {'LEARNING_RATE': 0.07927818925746836, 'BATCH_SIZE': 8}. Best is trial 4 with value: 0.6809089183807373.


New best model saved with val_loss=0.6809 at artifacts\training\model.pth



Best trial:
  Value: 0.6809089183807373
  Params: {'LEARNING_RATE': 0.07927818925746836, 'BATCH_SIZE': 8}
Best model checkpoint can be found in: artifacts\training\model.pth
Cleaning up temporary trial checkpoint directories...
Cleanup complete. Only the best model directory remains.
