In [3]:
import os

In [4]:
%pwd

'/home/fintechsys/MLflow projects/kidney-Disease-classification-Deep-Learning--project/research'

In [5]:
# Move the working directory up one level so that relative paths used by the
# later cells are resolved from the parent of the notebook's folder.
# NOTE(review): this is not idempotent - every re-run of this cell moves up one
# more level. Restart the kernel (or check %pwd) before running it again.
os.chdir('../')

In [6]:
%pwd

'/home/fintechsys/MLflow projects/kidney-Disease-classification-Deep-Learning--project'

# Update entity

In [7]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class TrainingConfig:
    """Immutable bundle of the paths and hyper-parameters used by the training stage.

    Instances are built by ``ConfigurationManager.get_training_config`` and
    consumed by ``Training``.
    """
    # Directory for training artifacts; the best-model checkpoint is written here.
    root_dir: Path
    # Destination file for the model saved at the end of training.
    trained_model_path: Path
    # Location of the prepared base model that training loads and fits.
    updated_base_model_path: Path
    # Directory of images passed to flow_from_directory for both subsets.
    training_data: Path
    # Number of epochs passed to Model.fit.
    params_epochs: int
    # Batch size used by both the training and validation generators.
    params_batch_size: int
    # When True, the training generator applies random augmentation.
    params_is_augmentation: bool
    # Image size; all but the last element are used as the generator target_size
    # (presumably [height, width, channels] - TODO confirm against params.yaml).
    params_image_size: list


# Update configuration

In [8]:
from cnnClassifier.constants import *
from cnnClassifier.utils.common import read_yaml, create_directories
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TensorBoard
from datetime import datetime

2024-02-13 22:45:39.495066: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-13 22:45:39.535090: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-13 22:45:39.535872: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
class ConfigurationManager:
    """Reads the project's config and params YAML files and builds stage configs."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath: Path to the main configuration YAML.
            params_filepath: Path to the hyper-parameter YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # The artifacts root is needed by every stage, so create it up front.
        create_directories([self.config.artifacts_root])

    def get_training_config(self) -> TrainingConfig:
        """Assemble the ``TrainingConfig`` for the training stage.

        Creates the training root directory as a side effect.

        Returns:
            A ``TrainingConfig`` populated from the ``training``,
            ``prepare_base_model`` and ``data_preprocessing`` sections of the
            config file plus the values in the params file.
        """
        training_section = self.config.training
        base_model_section = self.config.prepare_base_model
        hyper_params = self.params
        # Training reads its images from the data-preprocessing stage's root directory.
        data_dir = self.config.data_preprocessing.root_dir

        training_root = Path(training_section.root_dir)
        create_directories([training_root])

        return TrainingConfig(
            root_dir=training_root,
            trained_model_path=Path(training_section.trained_model_path),
            updated_base_model_path=Path(base_model_section.updated_base_model_path),
            training_data=Path(data_dir),
            params_epochs=hyper_params.EPOCHS,
            params_batch_size=hyper_params.BATCH_SIZE,
            params_is_augmentation=hyper_params.AUGMENTATION,
            params_image_size=hyper_params.IMAGE_SIZE,
        )

# Update components

In [10]:
import os
import urllib.request as request
from zipfile import ZipFile
import tensorflow as tf
import time

In [11]:
class Training:
    """Fits the prepared base model on an image directory and saves the result.

    Expected call order: ``get_base_model()`` -> ``train_valid_generator()``
    -> ``train()``. Each step stores its result on the instance for the next.
    """

    def __init__(self, config: TrainingConfig):
        """Store the training configuration.

        Args:
            config: Paths and hyper-parameters for this training run.
        """
        self.config = config


    def get_base_model(self):
        """Load the updated base model from disk into ``self.model``."""
        self.model = tf.keras.models.load_model(
            self.config.updated_base_model_path
        )


    def train_valid_generator(self):
        """Build ``self.valid_generator`` and ``self.train_generator``.

        Both generators read from ``config.training_data`` and are separated
        with Keras' ``validation_split`` / ``subset`` mechanism. The validation
        generator only rescales; the training generator additionally applies
        random augmentation when ``config.params_is_augmentation`` is true.
        """
        # These options MUST be shared by both generators. In particular the
        # training generator needs the same validation_split: without it,
        # subset="training" selects the whole directory and the validation
        # images leak into the training set.
        datagenerator_kwargs = dict(
            rescale=1./255,
            validation_split=0.20
        )

        dataflow_kwargs = dict(
            # Drop the last element of the configured image size
            # (presumably the channel count - TODO confirm).
            target_size=self.config.params_image_size[:-1],
            batch_size=self.config.params_batch_size,
            interpolation="bilinear",
            # class_mode is a flow_from_directory option, not an
            # ImageDataGenerator option, so it belongs here.
            class_mode='binary'
        )

        valid_datagenerator = tf.keras.preprocessing.image.ImageDataGenerator(**datagenerator_kwargs)
        self.valid_generator = valid_datagenerator.flow_from_directory(
            directory=self.config.training_data,
            subset="validation",
            shuffle=False,
            **dataflow_kwargs
        )

        if self.config.params_is_augmentation:
            train_datagenerator = tf.keras.preprocessing.image.ImageDataGenerator(
                horizontal_flip=True,         # random left-right mirroring
                width_shift_range=0.1,        # horizontal shift of up to 10% of the width
                height_shift_range=0.1,       # vertical shift of up to 10% of the height
                brightness_range=[0.8, 1.2],  # brightness scaled to 80-120% of the original
                zoom_range=0.2,               # random zoom in the range 80-120%
                fill_mode='nearest',          # fill pixels exposed by a shift or zoom
                # Same rescale and validation_split as the validation generator.
                **datagenerator_kwargs
            )
        else:
            train_datagenerator = valid_datagenerator

        self.train_generator = train_datagenerator.flow_from_directory(
            directory=self.config.training_data,
            subset="training",
            shuffle=True,
            **dataflow_kwargs
        )


    @staticmethod
    def save_model(path: Path, model: tf.keras.Model):
        """Save ``model`` to ``path`` using ``Model.save``."""
        model.save(path)


    @staticmethod
    def _steps_for(samples: int, batch_size: int) -> int:
        """Return how many batches are needed to cover ``samples`` items."""
        # Ceiling division: the final, possibly smaller, batch is counted too.
        return -(-samples // batch_size)


    def train(self):
        """Fit ``self.model`` on the generators and save the final model.

        Requires ``get_base_model()`` and ``train_valid_generator()`` to have
        been called first. The best checkpoint (lowest ``val_loss``) is written
        to ``<root_dir>/best_model.h5``, TensorBoard logs go to a timestamped
        folder under ``logs/fit/``, and the model held after fitting is saved
        to ``config.trained_model_path``.
        """
        # Round up so the last partial batch is not skipped and a subset
        # smaller than one batch still gets one step instead of zero.
        self.steps_per_epoch = self._steps_for(
            self.train_generator.samples, self.train_generator.batch_size
        )
        self.validation_steps = self._steps_for(
            self.valid_generator.samples, self.valid_generator.batch_size
        )

        # Callbacks
        checkpoint_path = str(self.config.root_dir / "best_model.h5")
        log_dir = "logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")

        callbacks = [
            # Keep only the weights with the lowest validation loss seen so far.
            ModelCheckpoint(
                filepath=checkpoint_path,
                save_best_only=True,
                monitor='val_loss',
                mode='min',
                verbose=1
            ),
            # Stop after 10 epochs without improvement and roll back to the best weights.
            EarlyStopping(
                monitor='val_loss',
                patience=10,
                verbose=1,
                restore_best_weights=True
            ),
            # Divide the learning rate by 10 after 5 epochs without improvement.
            ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.1,
                patience=5,
                verbose=1,
                mode='min',
                min_delta=0.0001,
                cooldown=0,
                min_lr=0
            ),
            TensorBoard(
                log_dir=log_dir,
                histogram_freq=1
            )
        ]

        # Training
        self.model.fit(
            self.train_generator,
            epochs=self.config.params_epochs,
            steps_per_epoch=self.steps_per_epoch,
            validation_steps=self.validation_steps,
            validation_data=self.valid_generator,
            callbacks=callbacks
        )

        # Save the final model
        self.save_model(
            path=self.config.trained_model_path,
            model=self.model
        )

# Create pipeline

In [12]:
# Run the training stage end to end: load the configuration, load the base
# model, build the data generators, then fit and save the model.
# There is deliberately no try/except here: catching an exception only to
# re-raise it adds nothing, so errors propagate with their original traceback.
config = ConfigurationManager()
training_config = config.get_training_config()
training = Training(config=training_config)
training.get_base_model()
training.train_valid_generator()
training.train()

[2024-02-13 22:45:46,555: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-02-13 22:45:46,558: INFO: common: yaml file: params.yaml loaded successfully]
[2024-02-13 22:45:46,559: INFO: common: created directory at: artifacts]
[2024-02-13 22:45:46,560: INFO: common: created directory at: artifacts/training]


2024-02-13 22:45:46.609553: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-02-13 22:45:46.610300: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Found 790 images belonging to 2 classes.
Found 3168 images belonging to 2 classes.
Epoch 1/10


2024-02-13 22:45:47.368122: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]
2024-02-13 22:45:47.846908: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 205520896 exceeds 10% of free system memory.
2024-02-13 22:45:47.917735: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 205520896 exceeds 10% of free system memory.


  1/198 [..............................] - ETA: 9:45 - loss: 0.7979 - accuracy: 0.5000

2024-02-13 22:45:50.361493: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 205520896 exceeds 10% of free system memory.
2024-02-13 22:45:50.422040: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 205520896 exceeds 10% of free system memory.


  2/198 [..............................] - ETA: 8:10 - loss: 4.5952 - accuracy: 0.4375

2024-02-13 22:45:52.862284: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 205520896 exceeds 10% of free system memory.




2024-02-13 22:52:21.859854: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]



Epoch 1: val_accuracy improved from inf to 0.57908, saving model to artifacts/training/best_model.h5
Epoch 2/10
Epoch 2: val_accuracy did not improve from 0.57908
Epoch 3/10
Epoch 3: val_accuracy did not improve from 0.57908
Epoch 4/10
Epoch 4: val_accuracy did not improve from 0.57908
Epoch 5/10
Epoch 5: val_accuracy did not improve from 0.57908
Epoch 6/10
Epoch 6: val_accuracy did not improve from 0.57908
Epoch 7/10
Epoch 7: val_accuracy did not improve from 0.57908

Epoch 7: ReduceLROnPlateau reducing learning rate to 0.0009999999776482583.
Epoch 8/10
Epoch 8: val_accuracy did not improve from 0.57908
Epoch 9/10
Epoch 9: val_accuracy did not improve from 0.57908
Epoch 10/10
Epoch 10: val_accuracy did not improve from 0.57908
