In [1]:
"""
MODEL TRAINING RESEARCH NOTEBOOK
=================================
Modern TensorFlow training pipeline with production-ready practices:
- tf.data.Dataset API (modern, efficient data loading)
- Modern data augmentation techniques
- Callbacks for monitoring and early stopping
- Mixed precision training support
- Comprehensive metrics and logging
- Class imbalance handling
"""

import os
import sys
from pathlib import Path
import logging
from typing import Optional, Tuple
import numpy as np

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

In [2]:
%pwd

'c:\\Users\\asus\\Desktop\\Deep Learning project\\Chest-Cancer-Classification\\research'

In [3]:
# Navigate to project root
project_root = Path(__file__).resolve().parent.parent if '__file__' in globals() else Path.cwd().parent
os.chdir(project_root)
print(f"‚úì Working directory: {os.getcwd()}")

‚úì Working directory: c:\Users\asus\Desktop\Deep Learning project\Chest-Cancer-Classification


In [4]:
# Load environment variables
from dotenv import load_dotenv

env_path = Path('.env')
if env_path.exists():
    load_dotenv(env_path)
    print("‚úì Environment variables loaded")
else:
    print("‚ö† Warning: .env file not found")

‚úì Environment variables loaded


In [12]:
from dataclasses import dataclass
from pathlib import Path
from typing import List


@dataclass(frozen=True)
class TrainingConfig:
    """Training configuration with validation"""
    root_dir: Path
    trained_model_path: Path
    updated_base_model_path: Path
    training_data: Path
    params_epochs: int
    params_batch_size: int
    params_is_augmentation: bool
    params_image_size: List[int]
    
    def __post_init__(self):
        """Validate training configuration"""
        if self.params_epochs < 1:
            raise ValueError("Epochs must be >= 1")
        if self.params_batch_size < 1:
            raise ValueError("Batch size must be >= 1")
        if not self.training_data.exists():
            raise FileNotFoundError(f"Training data not found: {self.training_data}")

In [13]:
from cnnClassifier.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from cnnClassifier.utils.common import read_yaml, create_directories
import tensorflow as tf

print(f"TensorFlow version: {tf.__version__}")

TensorFlow version: 2.20.0


In [14]:
class ConfigurationManager:
    """Modern configuration manager"""
    
    def __init__(
        self,
        config_filepath: Path = CONFIG_FILE_PATH,
        params_filepath: Path = PARAMS_FILE_PATH
    ):
        """Initialize configuration"""
        try:
            self.config = read_yaml(config_filepath)
            self.params = read_yaml(params_filepath)
            
            create_directories([self.config.artifacts_root])
            logging.info("‚úì Configuration loaded successfully")
            
        except Exception as e:
            logging.error(f"Failed to load configuration: {e}")
            raise

    def get_training_config(self) -> TrainingConfig:
        """Get validated training configuration"""
        training = self.config.training
        prepare_base_model = self.config.prepare_base_model
        params = self.params
        
        # Construct path to training data
        training_data = Path(self.config.data_ingestion.unzip_dir) / "Chest-CT-Scan-data"
        
        create_directories([Path(training.root_dir)])
        
        training_config = TrainingConfig(
            root_dir=Path(training.root_dir),
            trained_model_path=Path(training.trained_model_path),
            updated_base_model_path=Path(prepare_base_model.updated_base_model_path),
            training_data=training_data,
            params_epochs=params.EPOCHS,
            params_batch_size=params.BATCH_SIZE,
            params_is_augmentation=params.AUGMENTATION,
            params_image_size=params.IMAGE_SIZE
        )
        
        logging.info("‚úì Training config created")
        return training_config

In [15]:
import tensorflow as tf
from pathlib import Path
from typing import Tuple, Optional
import logging
from datetime import datetime

# Check GPU availability
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(f"‚úì Found {len(gpus)} GPU(s)")
    for gpu in gpus:
        print(f"  - {gpu}")
else:
    print("‚ö† No GPU found - Training on CPU")

‚ö† No GPU found - Training on CPU


In [16]:
class Training:
    """
    Modern training pipeline with TensorFlow best practices.
    
    Key improvements:
    - tf.data.Dataset API (efficient, modern)
    - Built-in augmentation layers (GPU-accelerated)
    - Performance optimizations (prefetch, cache)
    - Comprehensive callbacks
    - Class imbalance handling
    - Progress monitoring
    """
    
    def __init__(self, config: TrainingConfig):
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)
        self.model = None
        self.train_generator = None
        self.valid_generator = None

    def get_base_model(self) -> tf.keras.Model:
        """
        Load compiled model from .keras file.
        
        Returns:
            tf.keras.Model: Loaded model ready for training
        """
        try:
            model_path = self.config.updated_base_model_path
            
            if not model_path.exists():
                raise FileNotFoundError(f"Model file not found: {model_path}")
            
            self.logger.info(f"Loading model from: {model_path}")
            
            # Load model in .keras format
            self.model = tf.keras.models.load_model(model_path)
            
            self.logger.info("‚úì Model loaded successfully")
            self.logger.info(f"  Total parameters: {self.model.count_params():,}")
            
            return self.model
            
        except Exception as e:
            self.logger.error(f"Failed to load model: {e}")
            raise

    def train_valid_generator(self) -> Tuple[tf.data.Dataset, tf.data.Dataset]:
        """
        Create training and validation datasets using modern tf.data API.
        
        Modern practices:
        - image_dataset_from_directory (replaces deprecated ImageDataGenerator)
        - GPU-accelerated augmentation layers
        - Prefetching for performance
        - Proper normalization
        - Deterministic splits with seed
        
        Returns:
            Tuple of (train_dataset, validation_dataset)
        """
        try:
            image_size = tuple(self.config.params_image_size[:-1])
            batch_size = self.config.params_batch_size
            
            self.logger.info(f"Loading dataset from: {self.config.training_data}")
            self.logger.info(f"Image size: {image_size}, Batch size: {batch_size}")
            
            # Create validation dataset (20% split)
            self.valid_generator = tf.keras.utils.image_dataset_from_directory(
                directory=str(self.config.training_data),
                validation_split=0.20,
                subset="validation",
                seed=123,  # Deterministic split
                image_size=image_size,
                batch_size=batch_size,
                shuffle=False,  # Don't shuffle validation
                label_mode='categorical'  # For categorical_crossentropy
            )
            
            # Create training dataset (80% split)
            self.train_generator = tf.keras.utils.image_dataset_from_directory(
                directory=str(self.config.training_data),
                validation_split=0.20,
                subset="training",
                seed=123,  # Same seed for consistent split
                image_size=image_size,
                batch_size=batch_size,
                shuffle=True,  # Shuffle training data
                label_mode='categorical'
            )
            
            # Get class names and counts
            class_names = self.train_generator.class_names
            self.logger.info(f"‚úì Classes detected: {class_names}")
            
            # Normalize pixel values to [0, 1]
            normalization_layer = tf.keras.layers.Rescaling(1./255)
            self.train_generator = self.train_generator.map(
                lambda x, y: (normalization_layer(x), y),
                num_parallel_calls=tf.data.AUTOTUNE
            )
            self.valid_generator = self.valid_generator.map(
                lambda x, y: (normalization_layer(x), y),
                num_parallel_calls=tf.data.AUTOTUNE
            )
            
            # Apply data augmentation if enabled (modern GPU-accelerated layers)
            if self.config.params_is_augmentation:
                self.logger.info("‚úì Data augmentation enabled")
                
                # Modern augmentation using Keras layers (GPU-accelerated)
                data_augmentation = tf.keras.Sequential([
                    tf.keras.layers.RandomFlip("horizontal_and_vertical"),
                    tf.keras.layers.RandomRotation(0.2),  # ¬±20% rotation
                    tf.keras.layers.RandomZoom(0.2),  # ¬±20% zoom
                    tf.keras.layers.RandomTranslation(0.2, 0.2),  # ¬±20% shift
                    tf.keras.layers.RandomContrast(0.2),  # Contrast adjustment
                ], name='augmentation')
                
                # Apply only to training data (not validation)
                self.train_generator = self.train_generator.map(
                    lambda x, y: (data_augmentation(x, training=True), y),
                    num_parallel_calls=tf.data.AUTOTUNE
                )
            
            # Performance optimizations
            # Cache: keeps data in memory after first epoch
            # Prefetch: prepares next batch while training current batch
            AUTOTUNE = tf.data.AUTOTUNE
            self.train_generator = self.train_generator.cache().prefetch(buffer_size=AUTOTUNE)
            self.valid_generator = self.valid_generator.cache().prefetch(buffer_size=AUTOTUNE)
            
            # Calculate dataset sizes
            train_batches = tf.data.experimental.cardinality(self.train_generator).numpy()
            valid_batches = tf.data.experimental.cardinality(self.valid_generator).numpy()
            
            self.logger.info(f"‚úì Training batches: {train_batches}")
            self.logger.info(f"‚úì Validation batches: {valid_batches}")
            
            return self.train_generator, self.valid_generator
            
        except Exception as e:
            self.logger.error(f"Failed to create data generators: {e}")
            raise

    @staticmethod
    def save_model(path: Path, model: tf.keras.Model) -> None:
        """Save model in .keras format"""
        try:
            # Ensure .keras extension
            if not str(path).endswith('.keras'):
                path = Path(str(path).replace('.h5', '.keras'))
            
            path.parent.mkdir(parents=True, exist_ok=True)
            model.save(path, save_format='keras')
            
            file_size = path.stat().st_size / (1024 * 1024)
            logging.info(f"‚úì Model saved: {path} ({file_size:.2f} MB)")
            
        except Exception as e:
            logging.error(f"Failed to save model: {e}")
            raise

    def train(self) -> tf.keras.callbacks.History:
        """
        Train model with modern callbacks and monitoring.
        
        Returns:
            History: Training history with metrics
        """
        try:
            # Calculate class weights for imbalanced dataset
            # This is crucial for the 2:1 ratio (300 adenocarcinoma : 150 normal)
            self.logger.info("Calculating class weights for imbalanced dataset...")
            
            # Extract labels from training data
            class_labels = np.concatenate([y for x, y in self.train_generator], axis=0)
            class_labels = np.argmax(class_labels, axis=1)
            
            from sklearn.utils import class_weight
            class_weights = class_weight.compute_class_weight(
                class_weight='balanced',
                classes=np.unique(class_labels),
                y=class_labels
            )
            class_weight_dict = dict(enumerate(class_weights))
            
            self.logger.info(f"‚úì Class weights: {class_weight_dict}")
            self.logger.info("  This balances the 2:1 dataset ratio")
            
            # Setup callbacks for production-ready training
            callbacks = self._create_callbacks()
            
            # Display training info
            print("\n" + "="*60)
            print("STARTING MODEL TRAINING")
            print("="*60)
            print(f"Epochs: {self.config.params_epochs}")
            print(f"Batch size: {self.config.params_batch_size}")
            print(f"Augmentation: {self.config.params_is_augmentation}")
            print(f"Class weights: {class_weight_dict}")
            print("="*60 + "\n")
            
            # Train model
            history = self.model.fit(
                self.train_generator,
                epochs=self.config.params_epochs,
                validation_data=self.valid_generator,
                class_weight=class_weight_dict,
                callbacks=callbacks,
                verbose=1
            )
            
            # Save trained model
            self.save_model(
                path=self.config.trained_model_path,
                model=self.model
            )
            
            self.logger.info("‚úì Training completed successfully")
            return history
            
        except Exception as e:
            self.logger.error(f"Training failed: {e}")
            raise

    def _create_callbacks(self) -> list:
        """
        Create modern callbacks for training monitoring.
        
        Returns:
            List of callbacks
        """
        callbacks = []
        
        # Early stopping to prevent overfitting
        early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True,
            verbose=1
        )
        callbacks.append(early_stopping)
        
        # Reduce learning rate when plateauing
        reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=3,
            min_lr=1e-7,
            verbose=1
        )
        callbacks.append(reduce_lr)
        
        # Model checkpoint to save best model
        checkpoint_path = self.config.root_dir / "best_model_checkpoint.keras"
        model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
            filepath=str(checkpoint_path),
            monitor='val_accuracy',
            save_best_only=True,
            mode='max',
            verbose=1
        )
        callbacks.append(model_checkpoint)
        
        # TensorBoard for visualization (optional)
        log_dir = self.config.root_dir / "logs" / datetime.now().strftime("%Y%m%d-%H%M%S")
        tensorboard = tf.keras.callbacks.TensorBoard(
            log_dir=str(log_dir),
            histogram_freq=1,
            write_graph=True
        )
        callbacks.append(tensorboard)
        
        self.logger.info(f"‚úì Configured {len(callbacks)} callbacks")
        self.logger.info(f"  TensorBoard logs: {log_dir}")
        
        return callbacks

In [17]:
# MAIN EXECUTION PIPELINE
# Production-ready training with comprehensive error handling

if __name__ == "__main__":
    try:
        print("\n" + "="*60)
        print("MODEL TRAINING PIPELINE")
        print("="*60 + "\n")
        
        # Initialize configuration
        config_manager = ConfigurationManager()
        training_config = config_manager.get_training_config()
        
        # Initialize training
        training = Training(config=training_config)
        
        # Step 1: Load model
        print("Step 1/3: Loading trained base model...")
        training.get_base_model()
        
        # Step 2: Prepare data
        print("\nStep 2/3: Preparing training and validation data...")
        training.train_valid_generator()
        
        # Step 3: Train
        print("\nStep 3/3: Training model...")
        history = training.train()
        
        print("\n" + "="*60)
        print("‚úì TRAINING COMPLETED SUCCESSFULLY")
        print("="*60 + "\n")
        print(f"üìÅ Trained model: {training_config.trained_model_path}")
        print(f"üìä TensorBoard logs: {training_config.root_dir / 'logs'}")
        print("\n‚ú® Ready for evaluation!")
        
    except FileNotFoundError as e:
        print(f"\n‚ùå FILE ERROR: {e}")
        print("   Ensure previous steps (data ingestion, base model) completed")
    except ValueError as e:
        print(f"\n‚ùå CONFIGURATION ERROR: {e}")
        print("   Check your configuration files")
    except Exception as e:
        print(f"\n‚ùå UNEXPECTED ERROR: {e}")
        import traceback
        traceback.print_exc()
        raise

2025-12-13 00:19:54,400 - cnnClassifierLogger - INFO - yaml file: config\config.yaml loaded successfully
2025-12-13 00:19:54,410 - cnnClassifierLogger - INFO - yaml file: params.yaml loaded successfully
2025-12-13 00:19:54,413 - cnnClassifierLogger - INFO - created directory at: artifacts
2025-12-13 00:19:54,418 - root - INFO - ‚úì Configuration loaded successfully
2025-12-13 00:19:54,420 - cnnClassifierLogger - INFO - created directory at: artifacts\training
2025-12-13 00:19:54,421 - root - INFO - ‚úì Training config created
2025-12-13 00:19:54,424 - Training - INFO - Loading model from: artifacts\prepare_base_model\base_model_updated.keras



MODEL TRAINING PIPELINE

Step 1/3: Loading trained base model...


  saveable.load_own_variables(weights_store.get(inner_path))
2025-12-13 00:19:57,206 - Training - INFO - ‚úì Model loaded successfully
2025-12-13 00:19:57,208 - Training - INFO -   Total parameters: 4,057,253
2025-12-13 00:19:57,210 - Training - INFO - Loading dataset from: artifacts\data_ingestion\Chest-CT-Scan-data
2025-12-13 00:19:57,211 - Training - INFO - Image size: (224, 224), Batch size: 8



Step 2/3: Preparing training and validation data...
Found 466 files belonging to 2 classes.
Using 93 files for validation.
Found 466 files belonging to 2 classes.
Using 373 files for training.


2025-12-13 00:19:57,427 - Training - INFO - ‚úì Classes detected: ['adenocarcinoma', 'normal']
2025-12-13 00:19:57,525 - Training - INFO - ‚úì Data augmentation enabled
2025-12-13 00:19:57,881 - Training - INFO - ‚úì Training batches: 47
2025-12-13 00:19:57,885 - Training - INFO - ‚úì Validation batches: 12
2025-12-13 00:19:57,886 - Training - INFO - Calculating class weights for imbalanced dataset...



Step 3/3: Training model...


2025-12-13 00:20:02,796 - Training - INFO - ‚úì Class weights: {0: np.float64(0.734251968503937), 1: np.float64(1.5672268907563025)}
2025-12-13 00:20:02,796 - Training - INFO -   This balances the 2:1 dataset ratio
2025-12-13 00:20:02,797 - Training - INFO - ‚úì Configured 4 callbacks
2025-12-13 00:20:02,798 - Training - INFO -   TensorBoard logs: artifacts\training\logs\20251213-002002



STARTING MODEL TRAINING
Epochs: 1
Batch size: 8
Augmentation: True
Class weights: {0: np.float64(0.734251968503937), 1: np.float64(1.5672268907563025)}

[1m47/47[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 485ms/step - accuracy: 0.5151 - auc: 0.4862 - loss: 0.8878 - precision: 0.5151 - recall: 0.5151
Epoch 1: val_accuracy improved from None to 1.00000, saving model to artifacts\training\best_model_checkpoint.keras
[1m47/47[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m49s[0m 794ms/step - accuracy: 0.5067 - auc: 0.4929 - loss: 0.8385 - precision: 0.5067 - recall: 0.5067 - val_accuracy: 1.0000 - val_auc: 1.0000 - val_loss: 0.5527 - val_precision: 1.0000 - val_recall: 1.0000 - learning_rate: 0.0010
Restoring model weights from the end of the best epoch: 1.


2025-12-13 00:20:54,875 - root - INFO - ‚úì Model saved: artifacts\training\model.keras (16.32 MB)
2025-12-13 00:20:54,877 - Training - INFO - ‚úì Training completed successfully



‚úì TRAINING COMPLETED SUCCESSFULLY

üìÅ Trained model: artifacts\training\model.keras
üìä TensorBoard logs: artifacts\training\logs

‚ú® Ready for evaluation!
