In [1]:
import os
# Display the kernel's current working directory before it is changed below.
%pwd

'c:\\Users\\Prarthana\\Desktop\\projects\\noise_reduction_autoencoder\\research'

In [2]:
os.chdir('../')
%pwd

'c:\\Users\\Prarthana\\Desktop\\projects\\noise_reduction_autoencoder'

In [3]:
#update entity
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataPreprocessingConfig:
    """Immutable settings consumed by the data preprocessing component."""

    # Root folder that is expected to contain
    # <split>/{clean,noisy}/{NORMAL,PNEUMONIA}/*.jpeg for each split.
    data_dir: Path
    # Side length in pixels; images are resized to image_size x image_size.
    image_size: int
    # Number of (noisy, clean) pairs per batch, applied to every split.
    batch_size: int
    # Buffer size handed to Dataset.shuffle; only used for the "train" split.
    shuffle_buffer_size: int

In [4]:
#update configuration manager
from src import logger
from src.utils.common import read_yaml, create_directories
#from src.entity.config_entity import DataIngestionConfig, DataSplittingConfig, DataPreprocessingConfig

class ConfigurationManager:
    """Load the project YAML files and build typed config objects from them.

    Args:
        config_filepath: Path to the main configuration YAML.
        params_filepath: Path to the parameters YAML.

    Side effects:
        Calls ``create_directories`` on the configured ``artifacts_root``
        when the manager is constructed.
    """

    def __init__(self, config_filepath="config/config.yaml", params_filepath="params.yaml"):
        # read_yaml results are accessed by attribute below
        # (e.g. self.config.artifacts_root).
        self.config = read_yaml(Path(config_filepath))
        self.params = read_yaml(Path(params_filepath))
        create_directories([self.config.artifacts_root])

    def get_data_preprocessing_config(self) -> DataPreprocessingConfig:
        """Return the ``data_preprocessing`` section as a DataPreprocessingConfig.

        ``data_dir`` is wrapped in ``Path`` so the value matches the type
        declared on the dataclass instead of being the raw YAML value.
        """
        section = self.config.data_preprocessing
        return DataPreprocessingConfig(
            data_dir=Path(section.data_dir),
            image_size=section.image_size,
            batch_size=section.batch_size,
            shuffle_buffer_size=section.shuffle_buffer_size,
        )

In [6]:
#update components
import tensorflow as tf
from pathlib import Path
import numpy as np
from src import logger
#from src.entity.config_entity import DataPreprocessingConfig

class DataPreprocessing:
    """Build batched ``tf.data`` pipelines of (noisy, clean) image pairs.

    The images are read from
    ``<data_dir>/<split>/{clean,noisy}/{NORMAL,PNEUMONIA}/*.jpeg`` for the
    splits "train", "val" and "test".
    """

    # Class sub-folders searched inside every clean/noisy directory.
    _CLASS_DIRS = ("NORMAL", "PNEUMONIA")

    def __init__(self, config: DataPreprocessingConfig):
        self.config = config

    def _load_image(self, path):
        """Read one JPEG as grayscale, scale it to [0, 1] and resize it."""
        raw = tf.io.read_file(path)
        image = tf.image.decode_jpeg(raw, channels=1)  # Grayscale
        image = tf.cast(image, tf.float32) / 255.0
        return tf.image.resize(image, [self.config.image_size, self.config.image_size])

    def load_and_preprocess_image(self, clean_path, noisy_path):
        """Load and preprocess a pair of clean and noisy images.

        Args:
            clean_path: Path (string tensor) of the clean JPEG.
            noisy_path: Path (string tensor) of the matching noisy JPEG.

        Returns:
            ``(noisy_img, clean_img)``: the model input first, the target
            second. Both are float32 tensors in [0, 1] with shape
            ``(image_size, image_size, 1)``.
        """
        clean_img = self._load_image(clean_path)
        noisy_img = self._load_image(noisy_path)
        return noisy_img, clean_img  # Input: noisy, Target: clean

    def create_dataset(self, clean_paths, noisy_paths, split):
        """Create a batched, prefetched TensorFlow dataset for a given split.

        Args:
            clean_paths: Clean image paths, aligned index-by-index with
                ``noisy_paths``.
            noisy_paths: Noisy image paths.
            split: Split name; only "train" is shuffled.

        Returns:
            A ``tf.data.Dataset`` yielding ``(noisy_batch, clean_batch)``.
        """
        dataset = tf.data.Dataset.from_tensor_slices((clean_paths, noisy_paths))

        # Shuffle the path pairs *before* decoding: the shuffle buffer then
        # holds short strings rather than decoded image tensors. Only the
        # training split is shuffled.
        if split == "train":
            dataset = dataset.shuffle(buffer_size=self.config.shuffle_buffer_size)

        dataset = dataset.map(
            self.load_and_preprocess_image,
            num_parallel_calls=tf.data.AUTOTUNE
        )

        # Batching and prefetching apply to every split.
        dataset = dataset.batch(self.config.batch_size)
        dataset = dataset.prefetch(tf.data.AUTOTUNE)

        return dataset

    def _collect_image_paths(self, split, kind):
        """Return the sorted ``*.jpeg`` paths of one split for ``kind`` ("clean"/"noisy")."""
        base_dir = Path(self.config.data_dir) / split / kind
        paths = [
            str(p)
            for class_dir in self._CLASS_DIRS
            for p in (base_dir / class_dir).glob("*.jpeg")
        ]
        paths.sort()
        return paths

    def preprocess(self):
        """Preprocess images and create datasets for train, val, and test splits.

        Returns:
            Dict mapping "train", "val" and "test" to their datasets.

        Raises:
            ValueError: If a split has a different number of clean and noisy
                images, or contains no images at all.
        """
        try:
            logger.info("Starting data preprocessing")

            datasets = {}

            for split in ["train", "val", "test"]:
                # NOTE(review): pairing relies on both lists sorting into the
                # same order; this assumes clean and noisy files share file
                # names. Only the counts are verified below. TODO confirm.
                clean_paths = self._collect_image_paths(split, "clean")
                noisy_paths = self._collect_image_paths(split, "noisy")

                # Log counts
                logger.info(f"{split} set: {len(clean_paths)} clean images, {len(noisy_paths)} noisy images")

                if len(clean_paths) != len(noisy_paths):
                    raise ValueError(f"Mismatch in {split} set: {len(clean_paths)} clean vs {len(noisy_paths)} noisy images")

                if len(clean_paths) == 0:
                    raise ValueError(f"No images found in {split} set")

                datasets[split] = self.create_dataset(clean_paths, noisy_paths, split)

            logger.info("Data preprocessing completed")
            return datasets

        except Exception as e:
            logger.error(f"Data preprocessing failed: {e}", exc_info=True)
            # Bare raise re-raises the active exception without adding a frame.
            raise

In [7]:
#update pipeline
# Run the preprocessing stage end to end. The previous try/except wrapper only
# re-raised the caught exception unchanged, so it is removed; any failure still
# propagates (and is already logged inside DataPreprocessing.preprocess).
config = ConfigurationManager()
data_preprocessing_config = config.get_data_preprocessing_config()
data_preprocessing = DataPreprocessing(config=data_preprocessing_config)
datasets = data_preprocessing.preprocess()

[2025-04-23 22:26:07,526: INFO: common: YAML file: config\config.yaml loaded successfully]
[2025-04-23 22:26:07,531: INFO: common: YAML file: params.yaml loaded successfully]
[2025-04-23 22:26:07,533: INFO: common: Created directory at: artifacts]
[2025-04-23 22:26:07,534: INFO: 1408370280: Starting data preprocessing]
[2025-04-23 22:26:07,548: INFO: 1408370280: train set: 436 clean images, 436 noisy images]
[2025-04-23 22:26:07,899: INFO: 1408370280: val set: 93 clean images, 93 noisy images]
[2025-04-23 22:26:07,917: INFO: 1408370280: test set: 95 clean images, 95 noisy images]
[2025-04-23 22:26:07,934: INFO: 1408370280: Data preprocessing completed]


In [None]:
#research

In [3]:
import tensorflow as tf
from src.config.configuration import ConfigurationManager
from src.components.data_preprocessing import DataPreprocessing

# Build the preprocessing component from the project configuration and
# create the per-split datasets
config = ConfigurationManager()
data_preprocessing_config = config.get_data_preprocessing_config()
data_preprocessing = DataPreprocessing(config=data_preprocessing_config)
datasets = data_preprocessing.preprocess()

# Report how many batches each split yields
for split in datasets:
    dataset = datasets[split]
    dataset_size = tf.data.experimental.cardinality(dataset).numpy()
    print(f"{split} dataset size (batches): {dataset_size}")

[2025-04-23 22:34:31,142: INFO: common: YAML file: c:\Users\Prarthana\Desktop\projects\noise_reduction_autoencoder\config\config.yaml loaded successfully]
[2025-04-23 22:34:31,144: INFO: common: YAML file: c:\Users\Prarthana\Desktop\projects\noise_reduction_autoencoder\params.yaml loaded successfully]
[2025-04-23 22:34:31,145: INFO: common: Created directory at: artifacts]
[2025-04-23 22:34:31,145: INFO: data_preprocessing: Starting data preprocessing]
[2025-04-23 22:34:31,151: INFO: data_preprocessing: train set: 436 clean images, 436 noisy images]
[2025-04-23 22:34:31,251: INFO: data_preprocessing: val set: 93 clean images, 93 noisy images]
[2025-04-23 22:34:31,266: INFO: data_preprocessing: test set: 95 clean images, 95 noisy images]
[2025-04-23 22:34:31,289: INFO: data_preprocessing: Data preprocessing completed]
train dataset size (batches): 14
val dataset size (batches): 3
test dataset size (batches): 3


In [8]:
import matplotlib.pyplot as plt
import tensorflow as tf
from src.config.configuration import ConfigurationManager
from src.components.data_preprocessing import DataPreprocessing
from pathlib import Path

# Ensure the save directory exists
save_dir = Path("docs/plots")
save_dir.mkdir(parents=True, exist_ok=True)

# Load the datasets
config = ConfigurationManager()
data_preprocessing_config = config.get_data_preprocessing_config()
data_preprocessing = DataPreprocessing(config=data_preprocessing_config)
datasets = data_preprocessing.preprocess()

# Extract a sample clean-noisy pair from the training dataset.
# Each batch is a (noisy_batch, clean_batch) tuple; index 0 picks the first pair.
train_dataset = datasets["train"]
batch = next(iter(train_dataset.take(1)))  # Get the first batch
noisy_img, clean_img = batch[0][0], batch[1][0]

# Create the plot with an explicit figure/axes pair instead of pyplot's
# implicit "current figure" state.
fig, axes = plt.subplots(1, 2, figsize=(8, 4))
panels = zip(axes, (noisy_img, clean_img), ("Noisy Image", "Clean Image"))
for ax, img, title in panels:
    # The tensors carry a trailing single-channel axis (H, W, 1); drop it so
    # imshow always receives a plain 2-D array. vmin/vmax pin both panels to
    # the [0, 1] range the pipeline normalises to; without them each panel is
    # auto-scaled to its own min/max and the two are not directly comparable.
    ax.imshow(img.numpy().squeeze(axis=-1), cmap="gray", vmin=0.0, vmax=1.0)
    ax.set_title(title)
    ax.axis("off")  # Hide axes for cleaner visuals
fig.tight_layout()

# Save the plot and release this specific figure
save_path = save_dir / "sample_clean_noisy_pair.png"
fig.savefig(save_path)
plt.close(fig)
print(f"Plot saved to {save_path}")

[2025-04-23 22:39:47,153: INFO: common: YAML file: c:\Users\Prarthana\Desktop\projects\noise_reduction_autoencoder\config\config.yaml loaded successfully]
[2025-04-23 22:39:47,157: INFO: common: YAML file: c:\Users\Prarthana\Desktop\projects\noise_reduction_autoencoder\params.yaml loaded successfully]
[2025-04-23 22:39:47,159: INFO: common: Created directory at: artifacts]
[2025-04-23 22:39:47,161: INFO: data_preprocessing: Starting data preprocessing]
[2025-04-23 22:39:47,173: INFO: data_preprocessing: train set: 436 clean images, 436 noisy images]
[2025-04-23 22:39:47,201: INFO: data_preprocessing: val set: 93 clean images, 93 noisy images]
[2025-04-23 22:39:47,223: INFO: data_preprocessing: test set: 95 clean images, 95 noisy images]
[2025-04-23 22:39:47,241: INFO: data_preprocessing: Data preprocessing completed]
Plot saved to docs\plots\sample_clean_noisy_pair.png
