# **DATA RELATED STUFF**

In [13]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mohamedhanyyy/chest-ctscan-images")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/mohamedhanyyy/chest-ctscan-images?dataset_version_number=1...


100%|██████████| 119M/119M [00:19<00:00, 6.25MB/s] 

Extracting model files...





Path to dataset files: C:\Users\arpit\.cache\kagglehub\datasets\mohamedhanyyy\chest-ctscan-images\versions\1


In [14]:
import shutil
shutil.move(path, "C:\Projects\Chest-Cancer-Classification-App")

'C:\\Projects\\Chest-Cancer-Classification-App\\1'

In [18]:
import gdown 

file_id = "1Rfn_h7aGCgpSAuZJVm2oGM31Rl6f2Xs3"
url = f"https://drive.google.com/uc?/export=download&id={file_id}"

In [23]:
print(url)

https://drive.google.com/uc?/export=download&id=1Rfn_h7aGCgpSAuZJVm2oGM31Rl6f2Xs3


In [19]:
gdown.download(url, "Chest-Data.zip")

Downloading...
From (original): https://drive.google.com/uc?/export=download&id=1Rfn_h7aGCgpSAuZJVm2oGM31Rl6f2Xs3
From (redirected): https://drive.google.com/uc?%2Fexport=download&id=1Rfn_h7aGCgpSAuZJVm2oGM31Rl6f2Xs3&confirm=t&uuid=1e2b5a97-2cbe-4fa6-9d2c-33d48180ff0b
To: c:\Projects\Chest-Cancer-Classification-App\Research\Chest-Data.zip
100%|██████████| 124M/124M [00:18<00:00, 6.57MB/s] 


'Chest-Data.zip'

# **DATA INGESTION STEP**

In [1]:
%pwd

'c:\\Projects\\Chest-Cancer-Classification-App\\Research'

In [2]:
import os
os.chdir("../")

In [3]:
%pwd

'c:\\Projects\\Chest-Cancer-Classification-App'

### CONSTANTS

In [5]:
from pathlib import Path

CONFIG_FILE_PATH = Path("config/config.yaml")
PARAMS_FILE_PATH = Path("params.yaml")

### CONFIG.YAML

- artifacts_root: artifacts

- data_ingestion:
  - root_dir: artifacts/data_ingestion
  - source_url: https://drive.google.com/uc?/export=download&id=1Rfn_h7aGCgpSAuZJVm2oGM31Rl6f2Xs3
  - data_dir: artifacts/data_ingestion/data.zip
  - unzip_dir: artifacts/data_ingestion

### CONFIG ENTITY

In [6]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    root_dir: Path
    source_url: str
    data_dir: Path
    unzip_dir: Path

### CONFIGURATION

In [7]:
from src.logger import logger
from src.exception import CustomException
from src.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from src.utils import create_directories, read_yaml
## from src.entity.config_entity import DataIngestionConfig

class AppConfig:
    def __init__(self):
        self.config_filepath = CONFIG_FILE_PATH
        self.params_filepath = PARAMS_FILE_PATH

        self.config = read_yaml(self.config_filepath)
        self.params = read_yaml(self.params_filepath)

        create_directories([self.config.artifacts_root])   ### Now always pass path to this function as list
    
    def get_data_ingestion_config(self) -> DataIngestionConfig:
        config = self.config.data_ingestion

        create_directories([config.root_dir])              ### Now always pass path to this function as list

        data_ingestion_config = DataIngestionConfig(
            root_dir = config.root_dir,
            source_url = config.source_url,
            data_dir = config.data_dir,
            unzip_dir = config.unzip_dir
        )
        
        return data_ingestion_config

### COMPONENTS

In [8]:
from src.logger import logger
from src.exception import CustomException
import gdown
import os, sys
import zipfile
## from src.configuration.configuration import AppConfig

class DataIngestion:
    def __init__(self, config: DataIngestionConfig):
        self.config = config

    def download_data(self) -> str:
        try:
            root_dir = self.config.root_dir
            data_url = self.config.source_url
            zip_download_dir = self.config.data_dir

            create_directories([root_dir])            ### Now always pass path to this function as list

            gdown.download(data_url, zip_download_dir)

        except Exception as e:
            raise CustomException(e, sys)
    
    def unzip_data(self):
        try:
            unzip_dir = self.config.unzip_dir
            zip_download_dir = self.config.data_dir

            create_directories([unzip_dir])             ### Now always pass path to this function as list

            with zipfile.ZipFile(zip_download_dir, 'r') as zip_ref:
                zip_ref.extractall(unzip_dir)

        except Exception as e:
            raise CustomException(e, sys)



### PIPELINE

In [38]:
## from src.configuration.configuration import AppConfig
## from src.components.data_ingestion import DataIngestion
import sys
from src.logger import logger
from src.exception import CustomException

try:
    config = AppConfig()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion_obj = DataIngestion(config=data_ingestion_config)
    data_ingestion_obj.download_data()
    data_ingestion_obj.unzip_data()
except Exception as e:
    raise CustomException(e, sys)

[32m[2025-06-25 04:01:12]   19 | INFO     | yaml file: config\config.yaml loaded successfully[0m
[32m[2025-06-25 04:01:12]   19 | INFO     | yaml file: params.yaml loaded successfully[0m
Downloading...
From (original): https://drive.google.com/uc?/export=download&id=1Rfn_h7aGCgpSAuZJVm2oGM31Rl6f2Xs3
From (redirected): https://drive.google.com/uc?%2Fexport=download&id=1Rfn_h7aGCgpSAuZJVm2oGM31Rl6f2Xs3&confirm=t&uuid=5c07c6eb-c2b1-4613-b485-e79f90b567a0
To: c:\Projects\Chest-Cancer-Classification-App\artifacts\data_ingestion\data.zip
100%|██████████| 124M/124M [00:18<00:00, 6.58MB/s] 


# **BASE MODEL STEP**

### CONSTANTS

In [9]:
from pathlib import Path

CONFIG_FILE_PATH = Path("config/config.yaml")
PARAMS_FILE_PATH = Path("params.yaml")

### CONFIG.YAML

- base_model_preparation:
  - root_dir: artifacts/base_model
  - base_model_path: artifacts/base_model/base_model.h5
  - updated_base_model_path: artifacts/base_modeln/updated_base_model.h5

### PARAMS.YAML

- AUGMENTATION: True
- IMAGE_SIZE: [224, 224, 3]   ## as per VGG19
- BATCH_SIZE: 32
- INCLUDE_TOP: False
- EPOCHS: 1
- CLASSES: 4
- WEIGHTS: imagenet
- LEARNING_RATE: 0.001

### CONFIG ENTITY

In [10]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class PrepareBaseModelConfig:
    root_dir: Path
    base_model_path: Path
    updated_base_model_path: Path
    augmentation: bool
    image_size: list
    learning_rate: float
    epochs: int
    batch_size: int
    num_classes: int
    include_top: bool
    weights: str

### CONFIGURATION

In [11]:
from src.logger import logger
from src.exception import CustomException
from src.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from src.utils import create_directories, read_yaml
##from src.entity.config_entity import DataIngestionConfig

class AppConfig:
    def __init__(self):
        self.config_filepath = CONFIG_FILE_PATH
        self.params_filepath = PARAMS_FILE_PATH

        self.config = read_yaml(self.config_filepath)
        self.params = read_yaml(self.params_filepath)

        create_directories([self.config.artifacts_root])   ### Now always pass path to this function as list
    
    def get_data_ingestion_config(self) -> DataIngestionConfig:
        config = self.config.data_ingestion

        create_directories([config.root_dir])              ### Now always pass path to this function as list

        data_ingestion_config = DataIngestionConfig(
            root_dir = config.root_dir,
            source_url = config.source_url,
            data_dir = config.data_dir,
            unzip_dir = config.unzip_dir
        )
        
        return data_ingestion_config

    def get_base_model_config(self) -> PrepareBaseModelConfig:
        config = self.config.base_model_preparation
        params = self.params

        create_directories([config.root_dir])            ### Now always pass path to this function as list

        base_model_config = PrepareBaseModelConfig(
            root_dir = Path(config.root_dir),
            base_model_path = Path(config.base_model_path),
            updated_base_model_path = Path(config.updated_base_model_path),
            augmentation = params.AUGMENTATION,
            image_size = params.IMAGE_SIZE,
            learning_rate = params.LEARNING_RATE,
            epochs = params.EPOCHS,
            batch_size = params.BATCH_SIZE,
            num_classes = params.CLASSES,
            include_top = params.INCLUDE_TOP,
            weights = params.WEIGHTS
        )

        return base_model_config

### COMPONENT

In [16]:
from src.logger import logger
from src.exception import CustomException
import urllib.request as request
import os, sys
import tensorflow as tf
## from src.configuration.configuration import AppConfig
## from src.entity.config_entity import PrepareBaseModelConfig

class PrepareBaseModel:
    def __init__(self, config: PrepareBaseModelConfig):
        self.config = config
    
    @staticmethod
    def save_model(path: Path, model: tf.keras.Model):
        model.save(path)

    def download_base_mode(self):
        self.model = tf.keras.applications.VGG19(
            input_shape=self.config.image_size,
            include_top=self.config.include_top,
            weights=self.config.weights,
        )

        self.save_model(path=self.config.base_model_path, model=self.model)
    
    @staticmethod
    def _prepare_full_model(model, classes, freeze_all, freeze_till, learning_rate):
        if freeze_all:
            for layer in model.layers:
                model.trainable = False
        elif (freeze_till is not None) and (freeze_till > 0):
            for layer in model.layers[:-freeze_till]:
                model.trainable = False
            
        flatten_in = tf.keras.layers.Flatten()(model.output)
        prediction = tf.keras.layers.Dense(
            units=classes,
            activation="softmax"
        )(flatten_in)

        full_model = tf.keras.models.Model(
            inputs = model.input,
            outputs = prediction
        )

        full_model.compile(
            optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate),
            loss=tf.keras.losses.CategoricalCrossentropy(),
            metrics=['accuracy']
        )

        full_model.summary()
        
        return full_model

    def update_base_model(self):
        self.full_model = self._prepare_full_model(
            model = self.model,
            classes = self.config.num_classes,
            freeze_all=True,
            freeze_till=None,
            learning_rate=self.config.learning_rate
        )

        self.save_model(path=self.config.updated_base_model_path, model=self.full_model)  


### PIPELINE

In [17]:
try:
    config = AppConfig()
    prepare_base_model = config.get_base_model_config()
    prepare_base_model =  PrepareBaseModel(config=prepare_base_model)
    prepare_base_model.download_base_mode()
    prepare_base_model.update_base_model()
except Exception as e:
    raise CustomException(e, sys)


[32m[2025-06-25 15:12:58]   19 | INFO     | yaml file: config\config.yaml loaded successfully[0m
[32m[2025-06-25 15:12:58]   19 | INFO     | yaml file: params.yaml loaded successfully[0m


Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0   

# **MODEL TRAINER STEP**

### CONSTANTS

In [48]:
from pathlib import Path

CONFIG_FILE_PATH = Path("config/config.yaml")
PARAMS_FILE_PATH = Path("params.yaml")

### CONFIG.YAML

- model_trainer:
  - root_dir: artifacts/model_trainer
  - trained_model_path: artifacts/model_trainer/trained_model.h5
  - train_data_path: artifacts/data_ingestion/Data/train
  - valid_data_path: artifacts/data_ingestion/Data/valid
  - test_data_path: artifacts/data_ingestion/Data/test

### CONFIG ENTITY

In [49]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir: Path
    trained_model_path: Path
    updated_base_model_path: Path
    train_data_path: Path
    test_data_path: Path
    valid_data_path: Path
    augmentation: bool
    image_size: list
    epochs: int
    batch_size: int
    train_history_dir: Path
    loss_images_path: Path
    accuracy_images_path: Path
    history_json_path: Path

### CONFIGURATION

In [50]:
from src.logger import logger
from src.exception import CustomException
from src.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from src.utils import create_directories, read_yaml
## from src.entity.config_entity import ModelTrainerConfig
from pathlib import Path

class AppConfig:
    def __init__(self):
        self.config_filepath = CONFIG_FILE_PATH
        self.params_filepath = PARAMS_FILE_PATH

        self.config = read_yaml(self.config_filepath)
        self.params = read_yaml(self.params_filepath)

        create_directories([self.config.artifacts_root])   ### Now always pass path to this function as list
    
    def get_model_trainer_config(self):
        model_trainer = self.config.model_trainer
        params = self.params
        base_model = self.config.base_model_preparation

        create_directories([model_trainer.root_dir])              ### Now always pass path to this function as list

        model_trainer_config = ModelTrainerConfig(
            root_dir = Path(model_trainer.root_dir),
            trained_model_path = Path(model_trainer.trained_model_path),
            updated_base_model_path = Path(base_model.updated_base_model_path),
            train_data_path= Path(model_trainer.train_data_path),
            test_data_path= Path(model_trainer.test_data_path),
            valid_data_path= Path(model_trainer.valid_data_path),
            augmentation = params.AUGMENTATION,
            image_size = params.IMAGE_SIZE,
            epochs = params.EPOCHS,
            batch_size = params.BATCH_SIZE,
            train_history_dir = Path(model_trainer.train_history_dir),
            loss_images_path = Path(model_trainer.loss_images_path),
            accuracy_images_path = Path(model_trainer.accuracy_images_path),
            history_json_path = Path(model_trainer.history_json_path)
        )
        
        return model_trainer_config


### COMPONENT

In [51]:
from src.logger import logger
from src.exception import CustomException
import urllib.request as request
import os, sys
import tensorflow as tf
from keras.callbacks import EarlyStopping, ModelCheckpoint
## from src.configuration.configuration import AppConfig
## from src.entity.config_entity import ModelTrainerConfig
from src.utils import create_directories, save_json
import matplotlib.pyplot as plt

class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config

    def get_model(self):
        self.model = tf.keras.models.load_model(
            self.config.updated_base_model_path
        )

    @staticmethod
    def normalize_img(image, label):
        image = tf.cast(image, tf.float32) / 255.0
        return image, label
    
    @staticmethod
    def save_model(path: Path, model: tf.keras.Model):
        model.save(path)

    def get_data_for_training(self):

        self.train_data = tf.keras.preprocessing.image_dataset_from_directory(
            directory=self.config.train_data_path,
            batch_size=self.config.batch_size,
            interpolation = "bilinear",
            image_size = self.config.image_size[:-1],
            label_mode="categorical",
            shuffle=True
        )

        self.valid_data = tf.keras.preprocessing.image_dataset_from_directory(
            directory=self.config.valid_data_path,
            batch_size=self.config.batch_size,
            interpolation = "bilinear",
            image_size = self.config.image_size[:-1],
            label_mode="categorical",
            shuffle=False
        )

        self.test_data = tf.keras.preprocessing.image_dataset_from_directory(
            directory=self.config.test_data_path,
            image_size=self.config.image_size[:-1],
            interpolation="bilinear",
            batch_size=self.config.batch_size,
            label_mode="categorical",
            shuffle=False
        )
    
    def train(self):

        early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True
        )

        checkpoint = ModelCheckpoint(
            filepath=str(self.config.trained_model_path),
            monitor='val_loss',
            save_best_only=True
        )

        self.history = self.model.fit(
            self.train_data.map(ModelTrainer.normalize_img),
            validation_data=self.valid_data.map(ModelTrainer.normalize_img),
            epochs=self.config.epochs,
            callbacks=[early_stopping, checkpoint],
            )
        
        self.save_model(path=self.config.trained_model_path, model=self.model)
    
    def save_training_metrics(self):
        try:
            history_dir = Path(self.config.train_history_dir)
            metrics_path = self.config.history_json_path
            loss_image = self.config.loss_images_path
            acc_image = self.config.accuracy_images_path

            create_directories([history_dir])

            history_dict = self.history.history

            save_json(path=metrics_path, data=history_dict)

            plt.figure(figsize=(10, 8))
            plt.plot(history_dict['loss'], label='Train Loss')
            plt.plot(history_dict['val_loss'], label='Val Loss')
            plt.title('Loss Over Epochs')
            plt.xlabel('Epochs')
            plt.ylabel('Loss')
            plt.legend() 
            plt.savefig(loss_image)
            plt.close()
            logger.info(f"Loss plot saved to {loss_image}")

            plt.figure(figsize=(10, 8))
            plt.plot(history_dict['accuracy'], label='Train Accuracy')
            plt.plot(history_dict['val_accuracy'], label='Val Accuracy')
            plt.title('Accuracy Over Epochs')
            plt.xlabel('Epochs')
            plt.ylabel('Accuracy')
            plt.legend()
            plt.savefig(acc_image)
            plt.close()
            logger.info(f"Accuracy plot saved to {acc_image}")
                
        except Exception as e:
            logger.error(e)
            raise CustomException(e, sys)

    

### PIPELINE

In [53]:
try:
    config = AppConfig()
    trainer_config = config.get_model_trainer_config()
    training = ModelTrainer(config=trainer_config)
    training.get_model()
    training.get_data_for_training()
    training.train()
    training.save_training_metrics()
except Exception as e:
    raise CustomException(e, sys)

[32m[2025-06-25 18:23:41]   19 | INFO     | yaml file: config\config.yaml loaded successfully[0m
[32m[2025-06-25 18:23:41]   19 | INFO     | yaml file: params.yaml loaded successfully[0m


Found 613 files belonging to 4 classes.
Found 72 files belonging to 4 classes.
Found 315 files belonging to 4 classes.
Epoch 1/3
Epoch 2/3
Epoch 3/3


[32m[2025-06-25 18:24:01]   43 | INFO     | json file saved at: artifacts\model_trainer\model_history\model_history.json[0m
[32m[2025-06-25 18:24:02]  104 | INFO     | Loss plot saved to artifacts\model_trainer\model_history\loss_image.png[0m
[32m[2025-06-25 18:24:02]  115 | INFO     | Accuracy plot saved to artifacts\model_trainer\model_history\accuracy_image.png[0m


# **MODEL EVALUATION**

In [4]:
import os
os.chdir("../")

In [5]:
%pwd

'c:\\Projects\\Chest-Cancer-Classification-App'

In [1]:
import tensorflow as tf



In [62]:
model = tf.keras.models.load_model(
    "artifacts/model_trainer/trained_model.h5"
)

### CONFIG ENTITY

In [2]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class EvaluationConfig:
    score_file: Path
    model_path: Path
    train_data_path: Path
    test_data_path: Path
    all_params: dict
    mlflow_uri: str
    image_size: list
    batch_size: int
    loss_images_path: Path
    accuracy_images_path: Path
    history_json_path: Path

### CONFIGURATION

In [6]:
from src.constants import *
from src.utils import read_yaml, save_json, create_directories
from src.logger import logger
from src.exception import CustomException
from src.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
## from src.entity.config_entity import ModelTrainerConfig
from pathlib import Path

class AppConfig:
    def __init__(self):
        self.config_filepath = CONFIG_FILE_PATH
        self.params_filepath = PARAMS_FILE_PATH

        self.config = read_yaml(self.config_filepath)
        self.params = read_yaml(self.params_filepath)

        create_directories([self.config.artifacts_root])   ### Now always pass path to this function as list

    def get_evaluation_config(self) -> EvaluationConfig:
        config = self.config.model_evaluation
        trainer_config = self.config.model_trainer

        create_directories([config.root_dir])              ### Now always pass path to this function as list

        evaluation_config = EvaluationConfig(
            score_file=config.score_file, 
            model_path=config.trained_model_path,
            train_data_path=config.train_data_path,
            test_data_path=config.test_data_path,
            all_params=self.params,
            mlflow_uri=os.environ.get("MLFLOW_TRACKING_URI"),
            image_size=self.params.IMAGE_SIZE,
            batch_size=self.params.BATCH_SIZE,
            loss_images_path=trainer_config.loss_images_path,
            accuracy_images_path=trainer_config.accuracy_images_path,
            history_json_path=trainer_config.history_json_path
        )
        
        return evaluation_config

### COMPONENT

In [8]:
## from src.entity.config_entity import EvaluationConfig
from src.logger import logger
from src.exception import CustomException
import tensorflow as tf
from pathlib import Path
import mlflow
import mlflow.keras
from urllib.parse import urlparse

class ModelEvaluation:
    def __init__(self, config: EvaluationConfig):
        self.config = config
    
    @staticmethod
    def normalize_img(image, label):
        image = tf.cast(image, tf.float32) / 255.0
        return image, label

    @staticmethod
    def load_model(path: Path) -> tf.keras.Model:
        return tf.keras.models.load_model(path)
    
    def get_test_data(self):
        
        self.test_data = tf.keras.preprocessing.image_dataset_from_directory(
            directory=self.config.test_data_path,
            image_size=self.config.image_size[:-1],
            interpolation="bilinear",
            batch_size=self.config.batch_size,
            label_mode="categorical",
            shuffle=False
        )

        self.test_data = self.test_data.map(ModelEvaluation.normalize_img)
    
    def evaluate(self):
        self.model = self.load_model(self.config.model_path)
        self.results = self.model.evaluate(self.test_data)
        self.score = {'loss': self.results[0], 'accuracy': self.results[1]}
        save_json(path=Path(self.config.score_file), data=self.score)
    
    def log_into_mlflow(self):
        mlflow.set_registry_uri(self.config.mlflow_uri)
        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

        with mlflow.start_run():
            mlflow.log_params(self.config.all_params)
            mlflow.log_metrics(self.score)

            if tracking_url_type_store != "file":
                mlflow.keras.log_model(self.model, "model", registered_model_name="VGG16Model")
            else:
                mlflow.keras.log_model(self.model, "model")

### PIPELINE

In [None]:
try:
    config = AppConfig()
    eval_config = config.get_evaluation_config()
    evaluation = ModelEvaluation(eval_config)
    evaluation.get_test_data()
    evaluation.evaluate()
    evaluation.log_into_mlflow()
except Exception as e:
    raise CustomException(e, sys)