In [1]:
import os

In [2]:
%pwd

'c:\\Users\\User\\Desktop\\Project Anchor\\MLflow-DVC-Kidney-Disease-Classification\\research'

In [3]:
# Step up to the project root only while the kernel is still inside the
# "research" folder, so re-running this cell does not keep climbing the
# directory tree.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\User\\Desktop\\Project Anchor\\MLflow-DVC-Kidney-Disease-Classification'

In [7]:
# Define the entity: a typed, immutable record that the configuration getter returns
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of settings used by the data-ingestion stage.

    Attributes:
        root_dir: Root directory where the ingested data is stored.
        source_URL: URL the dataset is downloaded from.
        local_data_file: Path of the downloaded data file on disk.
        unzip_dir: Directory the downloaded archive is extracted into.
    """

    root_dir: Path
    source_URL: str
    local_data_file: Path
    unzip_dir: Path


In [6]:
from cnnClassifier.constants import *
from cnnClassifier.utils.common import read_yaml,create_directories

In [8]:
# Class for managing configuration settings
class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH
    ):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path to the configuration YAML file.
            params_filepath: Path to the parameters YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Every stage writes below this directory, so create it up front.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the loaded configuration.

        Creates the ingestion root directory as a side effect.

        Returns:
            DataIngestionConfig: Values taken from the ``data_ingestion``
            section of the configuration file.
        """
        ingestion_section = self.config.data_ingestion

        # The ingestion stage's own directory must exist before it is used.
        create_directories([ingestion_section.root_dir])

        return DataIngestionConfig(
            root_dir=ingestion_section.root_dir,
            source_URL=ingestion_section.source_URL,
            local_data_file=ingestion_section.local_data_file,
            unzip_dir=ingestion_section.unzip_dir,
        )


In [9]:
import os
import zipfile
import gdown
from cnnClassifier import logger
from cnnClassifier.utils.common import get_size

In [10]:
class DataIngestion:
    """Downloads the dataset archive from Google Drive and unpacks it locally."""

    def __init__(self, config: "DataIngestionConfig"):
        """
        Initializes DataIngestion instance with provided configuration.

        Args:
            config (DataIngestionConfig): Configuration for data ingestion.
        """
        self.config = config

    @staticmethod
    def _extract_drive_file_id(url: str) -> str:
        """
        Pulls the Google Drive file id out of a share or download URL.

        Recognizes ``.../d/<id>[/...]`` and ``...?id=<id>`` forms; any other
        URL falls back to its second-to-last ``/``-separated segment.

        Args:
            url (str): Google Drive URL of the file.

        Returns:
            str: The file id.

        Raises:
            ValueError: If no candidate id can be found in the URL.
        """
        import re  # local import: only this helper needs it

        for pattern in (r"/d/([A-Za-z0-9_-]+)", r"[?&]id=([A-Za-z0-9_-]+)"):
            match = re.search(pattern, url)
            if match:
                return match.group(1)

        # Fallback for URL shapes not recognized above.
        segments = url.split("/")
        if len(segments) >= 2 and segments[-2]:
            return segments[-2]
        raise ValueError(f"Could not find a Google Drive file id in URL: {url!r}")

    def download_file(self) -> str:
        """
        Downloads data from the specified URL.

        Returns:
            str: Path to the downloaded file.

        Raises:
            ValueError: If the file id cannot be extracted from the source URL.
            RuntimeError: If gdown reports no downloaded file.
        """
        dataset_url = self.config.source_URL
        # Normalize to str so a pathlib.Path from the config is handled the
        # same way as a plain string.
        zip_download_dir = str(self.config.local_data_file)

        # Create the folder that will hold the archive, derived from the
        # configured file path rather than a hard-coded location.
        target_dir = os.path.dirname(zip_download_dir)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        logger.info(f"Downloading data from {dataset_url} into file {zip_download_dir}")

        file_id = self._extract_drive_file_id(dataset_url)
        # Direct-download endpoint; the query string starts right after "?".
        prefix = 'https://drive.google.com/uc?export=download&id='

        # Download the file using gdown library
        downloaded = gdown.download(prefix + file_id, zip_download_dir)
        # NOTE(review): depending on the gdown version, a failed download may
        # return None instead of raising - treat that as an error here.
        if downloaded is None:
            raise RuntimeError(f"gdown could not download {dataset_url}")

        logger.info(f"Downloaded data from {dataset_url} into file {zip_download_dir}")

        return zip_download_dir

    def extract_zip_file(self) -> None:
        """
        Extracts the contents of the zip file into the specified directory.

        Reads the archive at ``config.local_data_file`` and writes its members
        under ``config.unzip_dir``, creating that directory if needed.
        """
        unzip_path = self.config.unzip_dir

        # Create directory for extracted data if it doesn't exist
        os.makedirs(unzip_path, exist_ok=True)

        # Extract the zip file
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)


In [11]:
# Run the data-ingestion stage end to end. No handler is installed here: any
# exception propagates to the notebook with its full traceback.

# Initialize ConfigurationManager to get configuration settings
config_manager = ConfigurationManager()

# Get data ingestion configuration from the configuration manager
data_ingestion_config = config_manager.get_data_ingestion_config()

# Initialize DataIngestion with the obtained configuration
data_ingestion = DataIngestion(config=data_ingestion_config)

# Download data from the specified URL
data_ingestion.download_file()

# Extract the downloaded zip file
data_ingestion.extract_zip_file()


[2024-03-05 15:13:36,301: INFO: common: YAML file 'config\config.yaml' loaded successfully]
[2024-03-05 15:13:36,319: INFO: common: YAML file 'params.yaml' loaded successfully]
[2024-03-05 15:13:36,321: INFO: common: Created directory at: artifacts]
[2024-03-05 15:13:36,323: INFO: common: Created directory at: artifacts/data_ingestion]
[2024-03-05 15:13:36,324: INFO: 1284514612: Downloading data from https://drive.google.com/file/d/1e9oof67nDls9kd6CRzhJvk8Le3WmIh5y/view?usp=sharing into file artifacts/data_ingestion/data.zip]


Downloading...
From (original): https://drive.google.com/uc?/export=download&id=1e9oof67nDls9kd6CRzhJvk8Le3WmIh5y
From (redirected): https://drive.google.com/uc?%2Fexport=download&id=1e9oof67nDls9kd6CRzhJvk8Le3WmIh5y&confirm=t&uuid=5a2db451-cc59-4221-a6ae-53e7c51c7024
To: c:\Users\User\Desktop\Project Anchor\MLflow-DVC-Kidney-Disease-Classification\artifacts\data_ingestion\data.zip
100%|██████████| 940M/940M [01:57<00:00, 8.02MB/s] 

[2024-03-05 15:15:35,738: INFO: 1284514612: Downloaded data from https://drive.google.com/file/d/1e9oof67nDls9kd6CRzhJvk8Le3WmIh5y/view?usp=sharing into file artifacts/data_ingestion/data.zip]



