In [1]:
import os

In [2]:
%pwd

'c:\\Users\\Adithya\\Desktop\\chest_cancer_classification\\research'

In [3]:
# Step up from research/ to the project root so that relative paths such as
# config/ and artifacts/ resolve. Guarded on the directory name so re-running
# this cell does not climb one more level each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\Adithya\\Desktop\\chest_cancer_classification'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the data-ingestion stage.

    Built by ``ConfigurationManager.get_data_ingestion_config`` from the
    ``data_ingestion`` section of the loaded configuration.

    NOTE(review): a later cell defines a plain class with the same name, which
    replaces this dataclass in the kernel namespace once that cell has run.
    """

    root_dir: Path  # stage directory, created before this config is built
    source_URL: str  # URL the dataset archive is downloaded from
    local_data_file: Path  # file the downloaded archive is written to
    unzip_dir: Path  # directory the archive is extracted into

In [6]:
from cnnClassifier.constants import *
from cnnClassifier.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Loads the project YAML files and builds per-stage configuration objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read the config and params YAML files and create the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Create the ingestion directory and return the ingestion settings.

        Returns:
            A ``DataIngestionConfig`` whose path fields are ``pathlib.Path``
            objects, matching the types the config class declares.
        """
        config = self.config.data_ingestion

        create_directories([config.root_dir])

        # The YAML values are plain strings; convert them so consumers can rely
        # on Path methods such as `.parent` and `.mkdir`.
        return DataIngestionConfig(
            root_dir=Path(config.root_dir),
            source_URL=config.source_URL,
            local_data_file=Path(config.local_data_file),
            unzip_dir=Path(config.unzip_dir),
        )
      

In [8]:
import os
import zipfile
import gdown
from cnnClassifier import logger
from cnnClassifier.utils.common import get_size

In [9]:
import os
import requests
import zipfile
import gdown
import logging
from pathlib import Path

# Set up logging configuration
# NOTE(review): `logger` was already imported from cnnClassifier in the previous
# cell; the assignment below rebinds that name to a stdlib logger for this module.
# basicConfig() does nothing if the root logger already has handlers — confirm
# whether the package has configured logging before relying on this level.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class DataIngestionConfig:
    """Plain settings holder for the ingestion stage; path arguments become ``Path`` objects."""

    def __init__(self, source_URL, local_data_file, root_dir, unzip_dir):
        """Store the source URL as given and normalise the three locations.

        Args:
            source_URL: Where the dataset archive is fetched from.
            local_data_file: File the archive is saved to.
            root_dir: Directory belonging to the ingestion stage.
            unzip_dir: Directory the archive is unpacked into.
        """
        # The URL is kept untouched.
        self.source_URL = source_URL
        # Convert every filesystem location in one pass, in declaration order.
        self.local_data_file, self.root_dir, self.unzip_dir = (
            Path(location) for location in (local_data_file, root_dir, unzip_dir)
        )

class DataIngestion:
    """Downloads the dataset archive and unpacks it, driven by a ``DataIngestionConfig``."""

    # String annotation: the config class is looked up lazily, so defining this
    # class does not depend on which cell defined ``DataIngestionConfig``.
    def __init__(self, config: "DataIngestionConfig"):
        """Keep the configuration whose URL and paths the other methods read."""
        self.config = config

    @staticmethod
    def _resolve_download_url(dataset_url: str) -> str:
        """Turn a Google Drive share link into a direct-download URL.

        Handles both ``.../d/<id>/...`` path-style links and ``...?id=<id>``
        query-style links. URLs that are not on drive.google.com, or from
        which no file ID can be extracted, are returned unchanged.
        """
        # Imported here so the helper does not rely on notebook-level imports.
        import re
        from urllib.parse import parse_qs, urlparse

        if "drive.google.com" not in dataset_url:
            return dataset_url

        parsed = urlparse(dataset_url)
        path_match = re.search(r"/d/([^/]+)", parsed.path)
        if path_match:
            file_id = path_match.group(1)
        else:
            id_values = parse_qs(parsed.query).get("id")
            file_id = id_values[0] if id_values else None

        if not file_id:
            return dataset_url
        return 'https://drive.google.com/uc?export=download&id=' + file_id

    def download_file(self) -> Path:
        """Fetch the archive from ``config.source_URL`` into ``config.local_data_file``.

        Returns:
            Path of the downloaded archive.

        Raises:
            Exception: If the download fails or the resulting file is missing
                or empty. The error is logged before it propagates.
        """
        try:
            dataset_url = self.config.source_URL
            # Accept str as well as Path so either config class works.
            zip_download_dir = Path(self.config.local_data_file)

            # Ensure the download directory exists
            zip_download_dir.parent.mkdir(parents=True, exist_ok=True)
            logger.info(f"Downloading data from {dataset_url} into file {zip_download_dir}")

            download_url = self._resolve_download_url(dataset_url)

            # Download using gdown
            downloaded = gdown.download(download_url, str(zip_download_dir), quiet=False)

            # A None result signals a failed download; checking it stops a stale
            # archive from an earlier run passing the size check below.
            if (
                downloaded is None
                or not zip_download_dir.exists()
                or zip_download_dir.stat().st_size == 0
            ):
                raise Exception("Downloaded file is empty or does not exist. Please check the URL or file permissions.")

            logger.info(f"Downloaded data from {dataset_url} into file {zip_download_dir}")
            return zip_download_dir

        except Exception as e:
            logger.error(f"Error in downloading file: {e}")
            raise  # bare raise keeps the original traceback intact

    def extract_zip_file(self):
        """Extract ``config.local_data_file`` into ``config.unzip_dir``.

        Raises:
            zipfile.BadZipFile: If the local file is not a valid zip archive.
            Exception: Any other failure while creating the directory or
                extracting. Errors are logged before they propagate.
        """
        try:
            archive_path = Path(self.config.local_data_file)
            unzip_path = Path(self.config.unzip_dir)
            unzip_path.mkdir(parents=True, exist_ok=True)  # Ensure the unzip directory exists

            # Check if the file is actually a zip file
            if not zipfile.is_zipfile(str(archive_path)):
                raise zipfile.BadZipFile("The downloaded file is not a zip file.")

            # Extract the zip file
            with zipfile.ZipFile(str(archive_path), 'r') as zip_ref:
                zip_ref.extractall(unzip_path)
            logger.info(f"Extracted zip file to {unzip_path}")

        except zipfile.BadZipFile as bzfe:
            logger.error(f"BadZipFile error: {bzfe}")
            raise
        except Exception as e:
            logger.error(f"Error in extracting zip file: {e}")
            raise

if __name__ == "__main__":
    try:
        # Stand-alone demo settings (swap in your own URL and paths as needed)
        config = DataIngestionConfig(
            root_dir="artifacts/data_ingestion",
            unzip_dir="artifacts/data_ingestion",
            local_data_file="artifacts/data_ingestion/data.zip",
            source_URL="https://drive.google.com/file/d/11T_7lm2Fuhn94UR3PnKn6OtOjXsRO8qT/view?usp=sharing",
        )
        data_ingestion = DataIngestion(config)
        logger.info(">>>>>> stage Data Ingestion stage started <<<<<<")

        # Fetch the archive first, then unpack it
        data_ingestion.download_file()
        data_ingestion.extract_zip_file()

        logger.info(">>>>>> stage Data Ingestion completed <<<<<<")

    except Exception as e:
        # Record the stage-level failure, then let it propagate to the caller
        logger.error(f"Error in Data Ingestion stage: {e}")
        raise e

[2024-10-12 13:52:16,569:INFO:2403714904:>>>>>> stage Data Ingestion stage started <<<<<<]
[2024-10-12 13:52:16,569:INFO:2403714904:Downloading data from https://drive.google.com/file/d/11T_7lm2Fuhn94UR3PnKn6OtOjXsRO8qT/view?usp=sharing into file artifacts\data_ingestion\data.zip]


Downloading...
From (original): https://drive.google.com/uc?export=download&id=11T_7lm2Fuhn94UR3PnKn6OtOjXsRO8qT
From (redirected): https://drive.google.com/uc?export=download&id=11T_7lm2Fuhn94UR3PnKn6OtOjXsRO8qT&confirm=t&uuid=42b4b950-535c-43a1-95ba-93691999ffba
To: c:\Users\Adithya\Desktop\chest_cancer_classification\artifacts\data_ingestion\data.zip
100%|██████████| 49.0M/49.0M [00:07<00:00, 6.23MB/s]

[2024-10-12 13:52:29,528:INFO:2403714904:Downloaded data from https://drive.google.com/file/d/11T_7lm2Fuhn94UR3PnKn6OtOjXsRO8qT/view?usp=sharing into file artifacts\data_ingestion\data.zip]





[2024-10-12 13:52:30,166:INFO:2403714904:Extracted zip file to artifacts\data_ingestion]
[2024-10-12 13:52:30,166:INFO:2403714904:>>>>>> stage Data Ingestion completed <<<<<<]


In [10]:
# Run the ingestion stage end-to-end from the YAML configuration.
# No try/except wrapper: catching an exception only to re-raise it adds nothing,
# and download/extract failures are already logged inside DataIngestion.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.download_file()
data_ingestion.extract_zip_file()

[2024-10-12 13:52:30,190:INFO:common:yaml file: config\config.yaml loaded successfully]
[2024-10-12 13:52:30,222:INFO:common:yaml file: params.yaml loaded successfully]
[2024-10-12 13:52:30,232:INFO:common:created directory at: artifacts]
[2024-10-12 13:52:30,234:INFO:common:created directory at: artifacts/data_ingestion]
[2024-10-12 13:52:30,234:INFO:2403714904:Downloading data from https://drive.google.com/file/d/11T_7lm2Fuhn94UR3PnKn6OtOjXsRO8qT/view?usp=sharing into file artifacts\data_ingestion\data.zip]


Downloading...
From (original): https://drive.google.com/uc?export=download&id=11T_7lm2Fuhn94UR3PnKn6OtOjXsRO8qT
From (redirected): https://drive.google.com/uc?export=download&id=11T_7lm2Fuhn94UR3PnKn6OtOjXsRO8qT&confirm=t&uuid=cb8a1632-8877-4bd2-b401-8d1f6a17dc82
To: c:\Users\Adithya\Desktop\chest_cancer_classification\artifacts\data_ingestion\data.zip
100%|██████████| 49.0M/49.0M [00:07<00:00, 6.17MB/s]

[2024-10-12 13:52:42,841:INFO:2403714904:Downloaded data from https://drive.google.com/file/d/11T_7lm2Fuhn94UR3PnKn6OtOjXsRO8qT/view?usp=sharing into file artifacts\data_ingestion\data.zip]





[2024-10-12 13:52:43,593:INFO:2403714904:Extracted zip file to artifacts\data_ingestion]
