In [1]:
import os

In [2]:
# Show the kernel's current working directory (checked before the chdir in the next cell).
%pwd

'C:\\Users\\DIKSHANT PATEL\\Kidney-Disease-Classification\\research'

In [3]:
# Step up to the parent directory only while the kernel is still inside
# `research/`. An unconditional chdir moves one level higher on every
# re-run of this cell, which breaks the relative `artifacts/...` paths below.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("..")

In [4]:
# Show the working directory again to confirm the result of the chdir above.
%pwd

'C:\\Users\\DIKSHANT PATEL\\Kidney-Disease-Classification'

In [None]:
from cnnClassifier.constants import *
from cnnClassifier.utils.common import read_yaml, create_directories

In [5]:
import pandas as pd

In [6]:
# Load Data.csv from artifacts/data_preprocessing and preview ten randomly sampled rows.
data_csv_path = "artifacts/data_preprocessing/Data.csv"
df = pd.read_csv(data_csv_path)
df.sample(n=10)

Unnamed: 0,filepath,label
7100,artifacts/data_ingestion/CT-KIDNEY-DATASET-Nor...,Normal
12306,artifacts/data_ingestion/CT-KIDNEY-DATASET-Nor...,Tumor
2328,artifacts/data_ingestion/CT-KIDNEY-DATASET-Nor...,Cyst
5965,artifacts/data_ingestion/CT-KIDNEY-DATASET-Nor...,Normal
11910,artifacts/data_ingestion/CT-KIDNEY-DATASET-Nor...,Tumor
11718,artifacts/data_ingestion/CT-KIDNEY-DATASET-Nor...,Tumor
2230,artifacts/data_ingestion/CT-KIDNEY-DATASET-Nor...,Cyst
11159,artifacts/data_ingestion/CT-KIDNEY-DATASET-Nor...,Tumor
11041,artifacts/data_ingestion/CT-KIDNEY-DATASET-Nor...,Tumor
8973,artifacts/data_ingestion/CT-KIDNEY-DATASET-Nor...,Stone


In [7]:
# Integer id assigned to each class name found in the `label` column.
label_mapping = {"Normal": 0,
                 "Cyst": 1,
                 "Tumor": 2,
                 "Stone": 3}

# Encode only while the column still holds class names. Re-running this cell
# on an already-encoded column would turn every value into NaN, because the
# integer ids are not keys of `label_mapping`.
if not pd.api.types.is_numeric_dtype(df["label"]):
    df["label"] = df["label"].map(label_mapping)


In [9]:
# Preview ten random rows again, this time with the encoded `label` column.
df.sample(n=10)

Unnamed: 0,filepath,label
1980,artifacts/data_ingestion/CT-KIDNEY-DATASET-Nor...,1
9572,artifacts/data_ingestion/CT-KIDNEY-DATASET-Nor...,3
5616,artifacts/data_ingestion/CT-KIDNEY-DATASET-Nor...,0
9754,artifacts/data_ingestion/CT-KIDNEY-DATASET-Nor...,3
1763,artifacts/data_ingestion/CT-KIDNEY-DATASET-Nor...,1
5798,artifacts/data_ingestion/CT-KIDNEY-DATASET-Nor...,0
4268,artifacts/data_ingestion/CT-KIDNEY-DATASET-Nor...,0
9701,artifacts/data_ingestion/CT-KIDNEY-DATASET-Nor...,3
3244,artifacts/data_ingestion/CT-KIDNEY-DATASET-Nor...,1
6846,artifacts/data_ingestion/CT-KIDNEY-DATASET-Nor...,0


In [11]:
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold

In [14]:
# First split: keep 70 % of the rows for training and hold out 30 %.
# Second split: divide the held-out rows again; `test_df` receives the larger
# part (70 % of the hold-out) and `valid_df` the remaining 30 %.
# Both splits are stratified on `label` and use a fixed seed.
train_df , temp_df = train_test_split(df, test_size = 0.3, random_state = 42, stratify = df["label"])
test_df, valid_df = train_test_split(temp_df, test_size = 0.3, random_state=42, stratify = temp_df["label"])

# Each line is labelled with the split it actually reports.
print("Training set shape:",train_df.shape)
print("Testing set shape:",test_df.shape)
print("Validation set shape:",valid_df.shape)

Training set shapes: (8712, 2)
Training set shapes: (2613, 2)
Training set shapes: (1121, 2)


In [16]:
# Print the normalised class frequencies of every split so the effect of
# stratification can be compared side by side.
split_reports = (
    ("\nTraining Set class Distribution:", train_df),
    ("\nTesting Set class Distribution:", test_df),
    ("\nvalidation Set class Distribution:", valid_df),
)
for heading, split_frame in split_reports:
    print(heading)
    print(split_frame["label"].value_counts(normalize = True))



Training Set class Distribution:
label
0    0.407943
1    0.297980
2    0.183425
3    0.110652
Name: proportion, dtype: float64

Testing Set class Distribution:
label
0    0.407960
1    0.298125
2    0.183314
3    0.110601
Name: proportion, dtype: float64

validation Set class Distribution:
label
0    0.407672
1    0.297948
2    0.183764
3    0.110616
Name: proportion, dtype: float64


In [17]:
from dataclasses import dataclass
import pandas as pd
from sklearn.model_selection import train_test_split
from pathlib import Path
from cnnClassifier.utils.common import read_yaml, create_directories
from cnnClassifier.constants import *
from cnnClassifier import logger

In [38]:
@dataclass(frozen = True)
class DataSplitConfig:
    """Immutable settings for the data-split stage.

    The four class fields (``Normal``, ``Cyst``, ``Tumor``, ``Stone``) hold the
    integer id that replaces the class name of the same spelling in the
    ``label`` column.
    """

    # Directory that is created before the split files are written.
    root_dir: Path
    # CSV that is read and split.
    base_csv_path: Path
    # Output locations of the three splits.
    train_csv_path: Path
    val_csv_path: Path
    test_csv_path: Path
    # Fraction passed as `test_size` to both train_test_split calls.
    test_size: float
    # Seed passed as `random_state` to train_test_split; an integer seed, not a float.
    random_state: int
    Normal: int
    Cyst: int
    Tumor: int
    Stone: int

In [39]:
class ConfigurationManager:
    """Reads the config and params YAML files and builds stage configurations."""

    def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files with ``read_yaml`` and keep the results on the instance."""
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

    def get_data_split_config(self) -> DataSplitConfig:
        """Create the split root directory and return a populated ``DataSplitConfig``.

        Path and split settings come from the ``data_split`` section of the
        config file; the class ids come from ``label_mapping`` in the params file.
        """
        split_section = self.config.data_split
        class_ids = self.params["label_mapping"]

        create_directories([split_section.root_dir])

        # Collect the keyword arguments in the same order as the dataclass fields.
        section_keys = (
            "root_dir",
            "base_csv_path",
            "train_csv_path",
            "val_csv_path",
            "test_csv_path",
            "test_size",
            "random_state",
        )
        settings = {key: getattr(split_section, key) for key in section_keys}
        for class_name in ("Normal", "Cyst", "Tumor", "Stone"):
            settings[class_name] = class_ids[class_name]

        return DataSplitConfig(**settings)
        

In [40]:
class DataSplitter:
    """Encodes the labels of the base CSV and writes train/validation/test CSV files."""

    # The annotation is quoted so the class can be defined without
    # DataSplitConfig being resolvable at definition time.
    def __init__(self, config: "DataSplitConfig"):
        self.config = config

    @staticmethod
    def _encode_labels(df, label_mapping):
        """Return a copy of ``df`` with ``label`` replaced by its mapped value.

        Args:
            df: frame with a ``label`` column.
            label_mapping: dict from original label value to encoded value.

        Raises:
            ValueError: if any row's label is not a key of ``label_mapping``
                (such rows would otherwise silently become NaN).
        """
        encoded = df["label"].map(label_mapping)
        unmapped = encoded.isna()
        if unmapped.any():
            unknown = sorted(df.loc[unmapped, "label"].astype(str).unique())
            raise ValueError(
                f"{int(unmapped.sum())} row(s) have labels missing from the label mapping: {unknown}"
            )
        return df.assign(label=encoded)

    def split_data(self):
        """Read the base CSV, encode labels, split it and save the three parts.

        The frame is split twice with the same ``test_size`` and ``random_state``,
        stratified on ``label`` both times. The first split separates the
        training rows from a held-out part; the second divides the held-out
        part so that ``valid_df`` receives the ``test_size`` fraction of it and
        ``test_df`` the remainder.

        Raises:
            ValueError: if the CSV contains a label that has no mapping.
            Exception: any other error is logged and re-raised unchanged.
        """
        try:
            df = pd.read_csv(self.config.base_csv_path)
            label_mapping = {
                "Normal": self.config.Normal,
                "Cyst": self.config.Cyst,
                "Tumor": self.config.Tumor,
                "Stone": self.config.Stone,
            }

            df = self._encode_labels(df, label_mapping)
            logger.info(f"Applied label mapping: {label_mapping}")

            train_df, temp_df = train_test_split(
                df,
                test_size=self.config.test_size,
                stratify=df["label"],
                random_state=self.config.random_state
            )

            test_df, valid_df = train_test_split(
                temp_df,
                test_size=self.config.test_size,
                stratify=temp_df["label"],
                random_state=self.config.random_state
            )

            # Save all
            create_directories([self.config.root_dir])
            train_df.to_csv(self.config.train_csv_path, index=False)
            valid_df.to_csv(self.config.val_csv_path, index=False)
            test_df.to_csv(self.config.test_csv_path, index=False)

            logger.info("Train, validation, and test files saved successfully!")
            logger.info(
                f"""
                Train shape: {train_df.shape}
                Training Set Class Distribution:
                {train_df["label"].value_counts(normalize=True)}
            
                Validation shape: {valid_df.shape}
                Validation Set Class Distribution:
                {valid_df["label"].value_counts(normalize=True)}
            
                Test shape: {test_df.shape}
                Test Set Class Distribution:
                {test_df["label"].value_counts(normalize=True)}
                """)

        except Exception as e:
            logger.exception(f"Error in data splitting: {e}")
            # Bare raise keeps the original exception and traceback intact.
            raise

In [41]:
# Run the data-split stage: load the configuration, then split and save.
# No try/except wrapper is needed here; a handler that only re-raises adds
# nothing, and any exception propagates to the notebook unchanged.
config = ConfigurationManager()
data_split_config = config.get_data_split_config()
data_splitter = DataSplitter(config = data_split_config)
data_splitter.split_data()

[2025-04-21 18:43:27,188: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-04-21 18:43:27,193: INFO: common: yaml file: params.yaml loaded successfully]
[2025-04-21 18:43:27,195: INFO: common: created directory at: artifacts/data_split]
[2025-04-21 18:43:27,618: INFO: 2822761280: Applied label mapping: {'Normal': 0, 'Cyst': 1, 'Tumor': 2, 'Stone': 3}]
[2025-04-21 18:43:27,696: INFO: common: created directory at: artifacts/data_split]
[2025-04-21 18:43:27,754: INFO: 2822761280: Train, validation, and test files saved successfully!]
[2025-04-21 18:43:27,759: INFO: 2822761280: 
                Train shape: (8712, 2)
                Training Set Class Distribution:
                label
0    0.407943
1    0.297980
2    0.183425
3    0.110652
Name: proportion, dtype: float64
            
                Validation shape: (1121, 2)
                Validation Set Class Distribution:
                label
0    0.407672
1    0.297948
2    0.183764
3    0.110616
Name: propo