In [1]:
import os

In [2]:
%pwd

'C:\\Users\\DIKSHANT PATEL\\Kidney-Disease-Classification\\research'

In [3]:
# The notebook lives in research/; step up one level so that relative paths
# used by later cells resolve from the project root.
# Guarded on the folder name so that re-running this cell does not keep
# climbing above the project root (the unconditional chdir was not idempotent).
if os.path.basename(os.getcwd()) == "research":
    os.chdir("..")

In [4]:
%pwd

'C:\\Users\\DIKSHANT PATEL\\Kidney-Disease-Classification'

In [26]:
from dataclasses import dataclass

@dataclass
class DataLoaderConfig:
    """Settings consumed by the data-loading stage.

    The first four fields locate the split CSV files; the remaining fields
    are image-flow and augmentation parameters.
    """

    # --- locations of the split CSV files ---------------------------------
    root_dir: str       # directory that contains the split CSV files
    train_data: str     # training CSV file name, joined onto root_dir
    valid_data: str     # validation CSV file name, joined onto root_dir
    test_data: str      # test CSV file name, joined onto root_dir

    # --- image flow settings ------------------------------------------------
    target_size: tuple  # image size setting read from the params file
    batch_size: int     # number of samples per batch
    color_mode: str     # forwarded to flow_from_dataframe(color_mode=...)
    class_mode: str     # forwarded to flow_from_dataframe(class_mode=...)
    seed: int           # forwarded to flow_from_dataframe(seed=...)

    # --- augmentation settings (forwarded to ImageDataGenerator) -----------
    rotation_range: int
    width_shift_range: float
    height_shift_range: float
    shear_range: float
    zoom_range: float
    horizontal_flip: bool
    fill_mode: str

In [27]:
from cnnClassifier.constants import *
from cnnClassifier.utils.common import read_yaml, create_directories

In [28]:
class ConfigurationManager:
    """Loads the config and params YAML files and builds stage configs from them."""

    def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files once and keep the parsed results on the instance.

        Args:
            config_filepath: path of the config YAML file.
            params_filepath: path of the params YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

    def get_data_loader_config(self) -> DataLoaderConfig:
        """Assemble a DataLoaderConfig from the ``data_loader`` sections.

        Path-related fields are read from the config file; every other field
        is read from the params file.

        Returns:
            DataLoaderConfig: populated with the values of both sections.
        """
        path_section = self.config.data_loader
        param_section = self.params.data_loader

        # Fields sourced from the config file.
        path_fields = ("root_dir", "train_data", "valid_data", "test_data")
        # Fields sourced from the params file.
        param_fields = (
            "target_size",
            "batch_size",
            "color_mode",
            "class_mode",
            "seed",
            "rotation_range",
            "width_shift_range",
            "height_shift_range",
            "shear_range",
            "zoom_range",
            "horizontal_flip",
            "fill_mode",
        )

        settings = {name: getattr(path_section, name) for name in path_fields}
        settings.update((name, getattr(param_section, name)) for name in param_fields)
        return DataLoaderConfig(**settings)
        

In [29]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from cnnClassifier import logger

In [32]:
class DataLoader:
    """Builds Keras image generators for the train / validation / test CSV splits.

    Each split is a CSV file with a ``filepath`` and a ``label`` column. The
    training stream mixes un-augmented and augmented samples in every batch;
    validation and test streams are un-augmented and unshuffled.
    """

    def __init__(self, config: "DataLoaderConfig"):
        """Store the data-loader settings.

        Args:
            config: DataLoaderConfig holding paths, flow and augmentation settings.
        """
        self.config = config

    def create_generator(self):
        """Create the augmenting and the plain (rescale-only) image generators.

        Returns:
            tuple: ``(aug, ori)`` where ``aug`` applies the configured random
            transforms and ``ori`` only rescales pixel values by 1/255.
        """
        aug = ImageDataGenerator(
            rescale=1.0 / 255,
            rotation_range=self.config.rotation_range,
            width_shift_range=self.config.width_shift_range,
            height_shift_range=self.config.height_shift_range,
            shear_range=self.config.shear_range,
            zoom_range=self.config.zoom_range,
            horizontal_flip=self.config.horizontal_flip,
            fill_mode=self.config.fill_mode,
        )
        ori = ImageDataGenerator(rescale=1.0 / 255)
        logger.info("Generators created successfully.")
        return aug, ori

    def load_dataframe(self, file_path):
        """Read one split CSV into a DataFrame.

        Args:
            file_path: path of the CSV file.

        Returns:
            pandas.DataFrame: the parsed CSV.

        Raises:
            Exception: whatever ``pandas.read_csv`` raised, after logging it.
        """
        logger.info(f"Loading dataframe from file: {file_path}")
        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            logger.error(f"Error loading dataframe from {file_path}: {str(e)}")
            # Bare raise keeps the original traceback untouched.
            raise
        logger.info(f"Dataframe loaded successfully with {len(df)} records.")
        return df

    def validate_filepaths(self, df):
        """Drop rows whose ``filepath`` does not point to an existing file.

        The input frame is left untouched. The result carries an extra boolean
        ``valid_filepath`` column (always True in the returned rows).

        Args:
            df: DataFrame with a ``filepath`` column.

        Returns:
            pandas.DataFrame: an independent copy holding only the valid rows.
        """
        # astype(bool) keeps the mask usable even when df has no rows.
        is_file = df['filepath'].apply(os.path.isfile).astype(bool)
        checked = df.assign(valid_filepath=is_file)
        invalid_files = checked[~is_file]

        if not invalid_files.empty:
            logger.warning(f"Found {len(invalid_files)} invalid file paths!")
            logger.warning(f"Invalid file paths: \n{invalid_files[['filepath']]}")

        logger.info(f"Valid file paths count: {is_file.sum()}")
        # Return a real copy so callers can assign columns without pandas
        # raising SettingWithCopyWarning on a filtered slice.
        return checked[is_file].copy()

    def get_flow(self, df, generator, shuffle=True):
        """Create a batch iterator over ``df`` using ``generator``.

        Args:
            df: DataFrame with ``filepath`` and ``label`` columns.
            generator: object exposing ``flow_from_dataframe`` (an ImageDataGenerator).
            shuffle: whether the iterator shuffles the rows.

        Returns:
            Whatever ``generator.flow_from_dataframe`` returns.
        """
        # Use the configured size instead of a hard-coded (224, 224). Only the
        # first two entries are kept so a (height, width, channels) setting
        # also works.
        target_size = tuple(self.config.target_size)[:2]
        return generator.flow_from_dataframe(
            dataframe=df,
            x_col="filepath",
            y_col="label",
            target_size=target_size,
            batch_size=self.config.batch_size,
            class_mode=self.config.class_mode,
            color_mode=self.config.color_mode,
            shuffle=shuffle,
            seed=self.config.seed,
        )

    def combined_generator(self, aug, ori):
        """Yield batches made of original samples followed by augmented samples.

        From every pair of batches, the first ``batch_size // 2`` samples are
        taken from ``ori`` and the remaining positions from ``aug``.

        Args:
            aug: iterator yielding ``(images, labels)`` augmented batches.
            ori: iterator yielding ``(images, labels)`` un-augmented batches.

        Yields:
            tuple: ``(images, labels)`` numpy arrays concatenated along axis 0.
        """
        logger.info("Combining augmented and original data generators.")
        n_orig = self.config.batch_size // 2
        logger.info(f"Type of aug: {type(aug)}")
        first_batch = True
        while True:
            aug_images, aug_labels = next(aug)
            ori_images, ori_labels = next(ori)

            # Shapes are reported at INFO once; later batches log at DEBUG so
            # a training run does not flood the log with four lines per step.
            log = logger.info if first_batch else logger.debug
            log(f"Augmented image batch shape: {aug_images.shape}")
            log(f"Augmented labels shape: {aug_labels.shape}")
            log(f"Original image batch shape: {ori_images.shape}")
            log(f"Original labels shape: {ori_labels.shape}")
            first_batch = False

            # Originals fill positions [0, n_orig), augmented samples fill
            # [n_orig, end). Slicing the augmented batch from n_orig (rather
            # than from batch_size - n_orig) keeps the batch full when
            # batch_size is odd.
            images = np.concatenate((ori_images[:n_orig], aug_images[n_orig:]), axis=0)
            labels = np.concatenate((ori_labels[:n_orig], aug_labels[n_orig:]), axis=0)

            yield images, labels

    def get_generators(self):
        """Load the three splits and build their generators.

        Returns:
            tuple: ``(train, valid, test, train_df, ori_train)`` where ``train``
            is the combined original/augmented generator, ``valid`` and ``test``
            are unshuffled un-augmented flows, ``train_df`` is the validated
            training frame and ``ori_train`` is the un-augmented training flow.

        Raises:
            Exception: propagated from ``load_dataframe`` when a CSV cannot be read.
        """
        # File paths for the CSVs
        train_path = f"{self.config.root_dir}/{self.config.train_data}"
        valid_path = f"{self.config.root_dir}/{self.config.valid_data}"
        test_path = f"{self.config.root_dir}/{self.config.test_data}"

        # load_dataframe already logs and re-raises failures, so no extra
        # try/except is needed here (it only duplicated the error log).
        train_df = self.load_dataframe(train_path)
        valid_df = self.load_dataframe(valid_path)
        test_df = self.load_dataframe(test_path)

        # Validate file paths
        train_df = self.validate_filepaths(train_df)
        valid_df = self.validate_filepaths(valid_df)
        test_df = self.validate_filepaths(test_df)

        # Ensure the labels are in the correct format (str)
        train_df['label'] = train_df['label'].astype(str)
        valid_df['label'] = valid_df['label'].astype(str)
        test_df['label'] = test_df['label'].astype(str)

        # Create generators
        aug_gen, ori_gen = self.create_generator()

        # Both training flows receive the same seed; presumably this makes them
        # visit the rows in the same order -- TODO confirm against Keras.
        aug_train = self.get_flow(train_df, aug_gen, shuffle=True)
        ori_train = self.get_flow(train_df, ori_gen, shuffle=True)

        # Combine the generators
        train = self.combined_generator(aug_train, ori_train)
        valid = self.get_flow(valid_df, ori_gen, shuffle=False)
        test = self.get_flow(test_df, ori_gen, shuffle=False)

        logger.info("Successfully created train, validation, and test generators.")
        # NOTE(review): ori_train is also consumed by `train`; drawing batches
        # from it directly would advance the iterator that `train` reads from.
        return train, valid, test, train_df, ori_train


In [31]:
# Build the configuration, create the data loader and fetch all generators.
try:
    config = ConfigurationManager()
    data_loader_config = config.get_data_loader_config()
    data_loader = DataLoader(config=data_loader_config)
    train_generator, valid_generator, test_generator,train_df, ori_train = data_loader.get_generators()

    # Trigger one batch to activate logging from combined_generator
    logger.info("Fetching one batch to trigger logging inside combined_generator...")
    images, labels = next(train_generator)
    logger.info(f"Combined batch shape: {images.shape}, Labels shape: {labels.shape}")

except Exception as e:
    # logger.exception records the traceback alongside the message; the bare
    # raise re-raises the active exception without adding an extra frame.
    logger.exception(f"Exception occurred during generator setup: {str(e)}")
    raise

[2025-05-11 22:18:30,495: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-05-11 22:18:30,513: INFO: common: yaml file: params.yaml loaded successfully]
[2025-05-11 22:18:30,516: INFO: 144888782: Loading dataframe from file: artifacts/data_split/train.csv]
[2025-05-11 22:18:30,545: INFO: 144888782: Dataframe loaded successfully with 8712 records.]
[2025-05-11 22:18:30,546: INFO: 144888782: Loading dataframe from file: artifacts/data_split/val.csv]
[2025-05-11 22:18:30,553: INFO: 144888782: Dataframe loaded successfully with 1121 records.]
[2025-05-11 22:18:30,554: INFO: 144888782: Loading dataframe from file: artifacts/data_split/test.csv]
[2025-05-11 22:18:30,566: INFO: 144888782: Dataframe loaded successfully with 2613 records.]
[2025-05-11 22:18:31,215: INFO: 144888782: Valid file paths count: 8712]
[2025-05-11 22:18:31,288: INFO: 144888782: Valid file paths count: 1121]
[2025-05-11 22:18:31,428: INFO: 144888782: Valid file paths count: 2613]
[2025-05-11 22:18: