In [1]:
"""
DATA INGESTION RESEARCH NOTEBOOK
=================================
This notebook implements modern best practices for data ingestion:
- Environment variable management with python-dotenv
- Type hints and dataclasses
- Proper error handling and logging
- Context managers for file operations
- Path validation and security
"""

import os
import sys
from pathlib import Path
from typing import Optional
import logging

# Root logger setup: INFO and above, each record showing time, logger name, level and message
LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

In [2]:
# Show the kernel's current working directory (inspected before the chdir in the next cell)
%pwd

'c:\\Users\\asus\\Desktop\\Deep Learning project\\Chest-Cancer-Classification\\research'

In [3]:
# Change to the project root directory using pathlib (cross-platform).
def find_project_root(start: Path, markers=("params.yaml", ".git")) -> Path:
    """
    Locate the project root by walking upwards from ``start``.

    Args:
        start: Directory to begin the search from.
        markers: File or directory names whose presence identifies the root.

    Returns:
        Path: The first directory (``start`` itself or one of its ancestors)
        that contains any marker. When no marker is found anywhere, the parent
        of ``start`` is returned, which is what this cell did unconditionally
        before.
    """
    start = start.resolve()
    for candidate in (start, *start.parents):
        if any((candidate / marker).exists() for marker in markers):
            return candidate
    return start.parent


# `__file__` is only defined when this code runs as a script; in a notebook the
# marker search is used instead. Unlike a bare `Path.cwd().parent`, re-running
# this cell after the chdir does not climb one more directory each time.
project_root = (
    Path(__file__).resolve().parent.parent
    if '__file__' in globals()
    else find_project_root(Path.cwd())
)
os.chdir(project_root)
print(f"‚úì Working directory: {os.getcwd()}")

‚úì Working directory: c:\Users\asus\Desktop\Deep Learning project\Chest-Cancer-Classification


In [4]:
# Load environment variables from .env file
from dotenv import load_dotenv

# Pull settings (API keys, URLs and other sensitive values) from a local .env file when present
env_path = Path('.env')
if not env_path.exists():
    # Nothing to load: tell the user how to create the file
    print("‚ö† Warning: .env file not found. Using config defaults.")
    print("  Create .env file from .env.example for secure credential management")
else:
    load_dotenv(env_path)
    print("‚úì Environment variables loaded from .env")

‚úì Environment variables loaded from .env


In [5]:
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional


@dataclass(frozen=True)
class DataIngestionConfig:
    """
    Settings consumed by the data ingestion pipeline.

    The dataclass is frozen, so fields cannot be reassigned after
    construction. Paths are stored as ``pathlib.Path`` objects.

    Attributes:
        root_dir: Directory that holds the ingestion artifacts.
        source_URL: HTTP/HTTPS address the dataset is downloaded from.
        local_data_file: Where the downloaded archive is stored.
        unzip_dir: Directory the archive is extracted into.

    Raises:
        ValueError: If ``source_URL`` is empty, does not start with
            ``http://`` or ``https://``, or has no host part.
    """
    root_dir: Path
    source_URL: str
    local_data_file: Path
    unzip_dir: Path

    def __post_init__(self):
        """Validate configuration after initialization"""
        # Local import keeps this class self-contained within its notebook cell.
        from urllib.parse import urlparse

        if not self.source_URL:
            raise ValueError("source_URL cannot be empty")

        url = str(self.source_URL)
        # A scheme prefix alone is not enough: "http://" or "https:///path"
        # would pass a startswith check although there is no host to contact.
        if not url.startswith(('http://', 'https://')) or not urlparse(url).netloc:
            raise ValueError("source_URL must be a valid HTTP/HTTPS URL")

In [6]:
from cnnClassifier.constants import *
from cnnClassifier.utils.common import read_yaml,create_directories

In [7]:
from cnnClassifier.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from cnnClassifier.utils.common import read_yaml, create_directories
from typing import Optional


class ConfigurationManager:
    """
    Loads the project's YAML configuration and builds typed config objects.

    - Reads the main config and the params file with ``read_yaml``
    - Creates the artifact directories the pipeline writes into
    - Lets the ``DATASET_URL`` environment variable override the YAML source URL
    """

    def __init__(
        self,
        config_filepath: Path = CONFIG_FILE_PATH,
        params_filepath: Path = PARAMS_FILE_PATH
    ):
        """
        Initialize configuration manager.

        Args:
            config_filepath: Path of the main YAML configuration file.
            params_filepath: Path of the YAML parameters file.

        Raises:
            Exception: Whatever ``read_yaml`` / ``create_directories`` raise;
                the error is logged and re-raised unchanged.
        """
        # Class-named logger, matching the style used by DataIngestion
        self.logger = logging.getLogger(self.__class__.__name__)
        try:
            self.config = read_yaml(config_filepath)
            self.params = read_yaml(params_filepath)

            # Create root artifacts directory
            create_directories([self.config.artifacts_root])
            self.logger.info("‚úì Configuration loaded successfully")

        except Exception as e:
            self.logger.error(f"Failed to load configuration: {e}")
            raise

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Get data ingestion configuration.

        Side effect: creates the ingestion root directory.

        Returns:
            DataIngestionConfig: Validated configuration object

        Raises:
            ValueError: If the resulting source URL fails the validation done
                by ``DataIngestionConfig``.
        """
        config = self.config.data_ingestion

        # Create required directories
        create_directories([config.root_dir])

        # Environment variable wins over the YAML value, but only when it
        # actually carries a value: a variable that is set but blank
        # (e.g. `DATASET_URL=` in .env) must not wipe out the YAML default.
        env_url = (os.getenv('DATASET_URL') or '').strip()
        source_url = env_url or config.source_URL

        data_ingestion_config = DataIngestionConfig(
            root_dir=Path(config.root_dir),
            source_URL=source_url,
            local_data_file=Path(config.local_data_file),
            unzip_dir=Path(config.unzip_dir)
        )

        self.logger.info("‚úì Data ingestion config created")
        return data_ingestion_config

In [8]:
import os
import zipfile
import gdown
import logging
from pathlib import Path
from typing import Optional
from cnnClassifier.utils.common import get_size

In [9]:
class DataIngestion:
    """
    Downloads a zipped dataset from Google Drive and extracts it locally.

    What the class actually does:
    - Skips the download when a valid archive is already present
    - Downloads to a temporary ``.part`` file, validates it, then moves it
      into place, so a failed download never leaves a file at the final path
    - Verifies archive integrity, member paths and free disk space before
      extracting
    - Logs every step and re-raises all errors to the caller
    """

    # Forward reference keeps this class definable on its own.
    def __init__(self, config: "DataIngestionConfig"):
        """
        Args:
            config: Object providing ``source_URL``, ``local_data_file`` and
                ``unzip_dir``.
        """
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)

    @staticmethod
    def _extract_drive_file_id(url: str) -> Optional[str]:
        """Return the Google Drive file ID found in ``url``, or None if there is none."""
        import re
        from urllib.parse import parse_qs, urlparse

        parsed = urlparse(url)
        # Share links: https://drive.google.com/file/d/<id>/view?usp=sharing
        match = re.search(r"/d/([A-Za-z0-9_-]+)", parsed.path)
        if match:
            return match.group(1)
        # Direct links: https://drive.google.com/uc?id=<id> or .../open?id=<id>
        ids = parse_qs(parsed.query).get("id")
        return ids[0] if ids else None

    @staticmethod
    def _ensure_members_inside(members, destination: Path) -> None:
        """Raise BadZipFile if any archive member would land outside ``destination``."""
        root = os.path.abspath(str(destination))
        prefix = root.rstrip(os.sep) + os.sep
        for info in members:
            # Lexical check: joins the member name onto the target directory
            # and collapses "..", so absolute names and parent-directory
            # escapes both end up outside `prefix`.
            target = os.path.normpath(os.path.join(root, info.filename))
            if target != root and not target.startswith(prefix):
                raise zipfile.BadZipFile(f"Unsafe path in zip archive: {info.filename}")

    @staticmethod
    def _ensure_free_space(members, destination: Path) -> None:
        """Raise OSError(ENOSPC) if the uncompressed members do not fit on the target disk."""
        import errno
        import shutil

        required = sum(info.file_size for info in members)
        available = shutil.disk_usage(destination).free
        if required > available:
            raise OSError(
                errno.ENOSPC,
                f"Not enough disk space to extract archive: "
                f"{required} bytes needed, {available} bytes free"
            )

    def download_file(self) -> Optional[Path]:
        """
        Download the dataset archive from Google Drive.

        Returns:
            Path: Path to the archive (freshly downloaded or already present).
            Failures are raised rather than reported as None.

        Raises:
            FileNotFoundError: If gdown finishes without producing a file.
            zipfile.BadZipFile: If the downloaded payload is not a zip archive
                (for example an HTML error page).
            Exception: Any other error is logged and re-raised.
        """
        try:
            dataset_url = self.config.source_URL
            zip_download_dir = self.config.local_data_file

            # Create directory if not exists
            zip_download_dir.parent.mkdir(parents=True, exist_ok=True)

            # Reuse an existing archive only if it really is a zip file;
            # otherwise a broken earlier download would block every later run.
            if zip_download_dir.exists():
                if zipfile.is_zipfile(zip_download_dir):
                    file_size = get_size(zip_download_dir)
                    self.logger.info(f"‚úì File already exists: {zip_download_dir} ({file_size})")
                    return zip_download_dir
                self.logger.warning(
                    f"Existing file is not a valid zip archive, downloading again: {zip_download_dir}"
                )
                zip_download_dir.unlink()

            self.logger.info(f"‚¨á Downloading from: {dataset_url}")
            self.logger.info(f"üìÅ Saving to: {zip_download_dir}")

            # Build the direct-download URL when a file ID can be found;
            # otherwise hand the original URL to gdown and rely on fuzzy=True.
            file_id = self._extract_drive_file_id(dataset_url)
            if file_id:
                download_url = 'https://drive.google.com/uc?export=download&id=' + file_id
            else:
                download_url = dataset_url

            # Download next to the target, validate, then move into place
            partial_path = zip_download_dir.with_name(zip_download_dir.name + ".part")
            try:
                gdown.download(
                    url=download_url,
                    output=str(partial_path),
                    quiet=False,
                    fuzzy=True  # More robust URL parsing
                )

                if not partial_path.exists():
                    raise FileNotFoundError(f"Downloaded file not found: {zip_download_dir}")
                if not zipfile.is_zipfile(partial_path):
                    raise zipfile.BadZipFile(
                        f"Downloaded file is not a valid zip archive: {download_url}"
                    )
                os.replace(partial_path, zip_download_dir)
            finally:
                # Never leave a half-finished or rejected payload behind
                if partial_path.exists():
                    partial_path.unlink()

            file_size = get_size(zip_download_dir)
            self.logger.info(f"‚úì Download complete: {file_size}")
            return zip_download_dir

        except Exception as e:
            self.logger.error(f"‚ùå Download failed: {str(e)}")
            raise

    def extract_zip_file(self) -> Path:
        """
        Extract the downloaded archive into the configured directory.

        Extraction is skipped when the "Chest-CT-Scan-data" folder already
        exists inside the target directory and is not empty.

        Returns:
            Path: The extraction directory (``config.unzip_dir``).

        Raises:
            FileNotFoundError: If the archive does not exist.
            zipfile.BadZipFile: If the archive is corrupt or contains a member
                that would be written outside the extraction directory.
            OSError: If there is not enough free disk space.
        """
        try:
            unzip_path = self.config.unzip_dir
            zip_file_path = self.config.local_data_file

            # Validate zip file exists
            if not zip_file_path.exists():
                raise FileNotFoundError(f"Zip file not found: {zip_file_path}")

            # Check if data folder exists (the actual extracted folder, not just parent)
            expected_data_folder = unzip_path / "Chest-CT-Scan-data"
            if expected_data_folder.exists() and any(expected_data_folder.iterdir()):
                file_count = sum(1 for _ in expected_data_folder.rglob('*'))
                self.logger.info(f"‚úì Data already extracted at: {expected_data_folder}")
                self.logger.info(f"  Total files/folders: {file_count}")
                return unzip_path

            # Create extraction directory
            unzip_path.mkdir(parents=True, exist_ok=True)

            self.logger.info(f"üì¶ Extracting: {zip_file_path}")
            self.logger.info(f"üìÇ To: {unzip_path}")

            # Context manager closes the archive even if a check below fails
            with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
                # CRC check of every member; returns the first bad name or None
                bad_file = zip_ref.testzip()
                if bad_file is not None:
                    raise zipfile.BadZipFile(f"Corrupted zip file detected: {bad_file}")

                members = zip_ref.infolist()
                self._ensure_members_inside(members, unzip_path)
                self._ensure_free_space(members, unzip_path)

                total_files = len(members)
                self.logger.info(f"  Extracting {total_files} files...")

                # Extract all files
                zip_ref.extractall(unzip_path)

            # Report what is now present in the extraction directory
            extracted_files = list(unzip_path.rglob('*'))
            self.logger.info(f"‚úì Extraction complete: {len(extracted_files)} items extracted")

            # Verify expected folder structure
            if expected_data_folder.exists():
                self.logger.info(f"‚úì Data folder verified: {expected_data_folder}")
            else:
                self.logger.warning(f"‚ö† Expected folder not found: {expected_data_folder}")
                self.logger.info(f"  Extracted contents: {[f.name for f in unzip_path.iterdir()]}")

            return unzip_path

        except zipfile.BadZipFile as e:
            self.logger.error(f"‚ùå Invalid zip file: {e}")
            raise
        except Exception as e:
            self.logger.error(f"‚ùå Extraction failed: {e}")
            raise

In [10]:
# MAIN EXECUTION PIPELINE
# Download, then extract. File and configuration errors print a short hint;
# any other error is printed and re-raised.

if __name__ == "__main__":
    banner = "=" * 60
    try:
        # Build the configuration object, then the worker that consumes it
        config_manager = ConfigurationManager()
        data_ingestion_config = config_manager.get_data_ingestion_config()
        data_ingestion = DataIngestion(config=data_ingestion_config)

        print("\n" + banner)
        print("STARTING DATA INGESTION PIPELINE")
        print(banner + "\n")

        # Step 1 fetches the archive, step 2 unpacks it
        downloaded_file = data_ingestion.download_file()
        extracted_dir = data_ingestion.extract_zip_file()

        print("\n" + banner)
        print("‚úì DATA INGESTION COMPLETED SUCCESSFULLY")
        print(banner + "\n")
        print(f"üìÇ Extracted data location: {extracted_dir}")

    except ValueError as e:
        print(f"\n‚ùå CONFIGURATION ERROR: {e}")
        print("   Check your config.yaml and .env files")
    except FileNotFoundError as e:
        print(f"\n‚ùå FILE ERROR: {e}")
        print("   Check if the file path is correct")
    except Exception as e:
        print(f"\n‚ùå UNEXPECTED ERROR: {e}")
        print("   Check logs for details")
        raise

2025-12-13 00:14:59,895 - cnnClassifierLogger - INFO - yaml file: config\config.yaml loaded successfully
2025-12-13 00:14:59,911 - cnnClassifierLogger - INFO - yaml file: params.yaml loaded successfully
2025-12-13 00:14:59,917 - cnnClassifierLogger - INFO - created directory at: artifacts
2025-12-13 00:14:59,923 - root - INFO - ‚úì Configuration loaded successfully
2025-12-13 00:14:59,929 - cnnClassifierLogger - INFO - created directory at: artifacts/data_ingestion
2025-12-13 00:14:59,933 - root - INFO - ‚úì Data ingestion config created
2025-12-13 00:14:59,937 - DataIngestion - INFO - ‚¨á Downloading from: https://drive.google.com/file/d/1u7AkBJ0aH3QWV1MH0l79BXvQABzW_0Qb/view?usp=sharing
2025-12-13 00:14:59,941 - DataIngestion - INFO - üìÅ Saving to: artifacts\data_ingestion\data.zip



STARTING DATA INGESTION PIPELINE



Downloading...
From (original): https://drive.google.com/uc?id=1u7AkBJ0aH3QWV1MH0l79BXvQABzW_0Qb
From (redirected): https://drive.google.com/uc?id=1u7AkBJ0aH3QWV1MH0l79BXvQABzW_0Qb&confirm=t&uuid=c0864ec2-82d3-4406-8d16-6ad039aa712c
To: c:\Users\asus\Desktop\Deep Learning project\Chest-Cancer-Classification\artifacts\data_ingestion\data.zip
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63.8M/63.8M [01:01<00:00, 1.04MB/s]
2025-12-13 00:16:05,028 - DataIngestion - INFO - ‚úì Download complete: ~ 62261 KB
2025-12-13 00:16:05,037 - DataIngestion - INFO - üì¶ Extracting: artifacts\data_ingestion\data.zip
2025-12-13 00:16:05,044 - DataIngestion - INFO - üìÇ To: artifacts\data_ingestion
2025-12-13 00:16:05,788 - DataIngestion - INFO -   Extracting 469 files...
2025-12-13 00:16:06,946 - DataIngestion - INFO - ‚úì Extraction complete: 470 items extracted
2025-12-13 00:16:06,947 - DataIngestion - INFO - ‚úì Data folder verified: artifacts\data_ingestion\Chest-CT-Scan-data



‚úì DATA INGESTION COMPLETED SUCCESSFULLY

üìÇ Extracted data location: artifacts\data_ingestion
