In [1]:
import os

In [2]:
%pwd

'c:\\Users\\user\\Desktop\\End-to-End-ML-project-MLflow\\research'

In [None]:
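# NOTE(review): %pwd above reports the `research/` folder, while the config files are
# later read via relative paths — presumably from the project root. After a kernel
# restart, run the line below exactly once (it is not idempotent) before the cells
# that follow; confirm against CONFIG_FILE_PATH.
#os.chdir("..")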

In [5]:
# entity

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Read-only bundle of settings for the data-ingestion step.

    The dataclass is frozen, so fields cannot be reassigned once an
    instance has been constructed.
    """

    # Directory that belongs to the ingestion step.
    root_dir: Path
    # Location the dataset archive is fetched from.
    source_URL: str
    # Path where the downloaded archive is stored.
    local_data_file: Path
    # Directory the archive is extracted into.
    unzip_dir: Path

In [6]:
from my_project.constants import *
from my_project.utils.common import read_yaml, create_directories
from pathlib import Path

2025-08-25 13:13:47,530 - my_project - INFO - Initializing my_project package


In [None]:
# configuration manager
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects.

    The parsed config, params and schema are kept on the instance as
    ``self.config``, ``self.params`` and ``self.schema``.
    """

    def __init__(
        self,
        config_filepath: Path = CONFIG_FILE_PATH,
        params_filepath: Path = PARAMS_FILE_PATH,
        schema_filepath: Path = SCHEMA_FILE_PATH,
    ):
        """Read the three YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        # Load configs
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Ensure artifacts root exists. The path is wrapped in a list so this
        # call uses the same argument shape as the one in
        # get_data_ingestion_config (a bare Path is not iterable).
        create_directories([Path(self.config.artifacts_root)])

    # data ingestion config
    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` section.

        Creates the stage's root directory as a side effect.

        Returns:
            A DataIngestionConfig whose path fields are ``Path`` objects and
            whose ``source_URL`` is taken from the config unchanged.
        """
        cfg = self.config.data_ingestion

        root_dir = Path(cfg.root_dir)
        create_directories([root_dir])

        return DataIngestionConfig(
            root_dir=root_dir,
            source_URL=cfg.source_URL,
            local_data_file=Path(cfg.local_data_file),
            unzip_dir=Path(cfg.unzip_dir),
        )


In [8]:
# component dependency
import os
import urllib.request as request
import zipfile
from my_project import logger
from my_project.utils.common import get_size

In [9]:
import os
import zipfile
from pathlib import Path
from urllib import request

from my_project.utils.common import get_size
from my_project import logger


class DataIngestion:
    """Downloads the dataset archive and extracts it, driven by a DataIngestionConfig."""

    def __init__(self, config: DataIngestionConfig):
        """Keep the ingestion settings for later use.

        Args:
            config: Supplies ``source_URL``, ``local_data_file`` and ``unzip_dir``.
        """
        self.config = config

    def download_file(self) -> None:
        """Downloads the dataset from the source URL if not already present.

        The data is first written to a sibling ``<name>.part`` file and only
        moved to ``local_data_file`` once the transfer has finished. A failed
        or interrupted download therefore never leaves a truncated file that a
        later run would mistake for a complete one.

        Raises:
            Exception: Whatever the download or the final move raised; it is
                logged and re-raised unchanged.
        """
        local_path = Path(self.config.local_data_file)

        if local_path.exists():
            logger.info(f"File already exists at: [{local_path}] (size: {get_size(local_path)} bytes)")
            return

        logger.info(f"Downloading file from: [{self.config.source_URL}] to: [{local_path}]")
        partial_path = local_path.with_name(local_path.name + ".part")
        try:
            # The destination folder may not exist if local_data_file lives
            # outside the directories created by the configuration step.
            local_path.parent.mkdir(parents=True, exist_ok=True)
            request.urlretrieve(self.config.source_URL, partial_path)
            # Publish the file under its final name only after a full transfer.
            partial_path.replace(local_path)
        except Exception as e:
            logger.error(f"Failed to download file from {self.config.source_URL}: {e}")
            # Best-effort removal of the incomplete temp file.
            try:
                partial_path.unlink()
            except FileNotFoundError:
                pass  # nothing had been written yet
            except OSError as cleanup_error:
                logger.warning(f"Could not remove partial download {partial_path}: {cleanup_error}")
            raise

        # Reported outside the try block so a size-lookup error is not
        # mislabelled as a download failure.
        logger.info(f"File downloaded successfully. Size: {get_size(local_path)} bytes")

    def unzip_and_clean(self) -> None:
        """Unzips the dataset and removes the original zip file.

        The archive is deleted only after extraction succeeded. Failing to
        delete it is logged as a warning and does not raise.

        Raises:
            zipfile.BadZipFile: If ``local_data_file`` is not a valid zip archive.
            Exception: Any other error raised while opening or extracting.
        """
        local_path = Path(self.config.local_data_file)
        unzip_dir = Path(self.config.unzip_dir)

        logger.info(f"Unzipping file: [{local_path}] to dir: [{unzip_dir}]")
        try:
            with zipfile.ZipFile(local_path, "r") as zip_ref:
                zip_ref.extractall(unzip_dir)
            logger.info("Unzipping completed successfully.")
        except zipfile.BadZipFile as e:
            logger.error(f"Invalid zip file: {local_path} ({e})")
            raise
        except Exception as e:
            logger.error(f"Error while unzipping {local_path}: {e}")
            raise

        # Clean up zip file only after successful extraction
        try:
            local_path.unlink()
            logger.info(f"Deleted zip file: [{local_path}]")
        except Exception as e:
            # Deliberately non-fatal: the extracted data is already in place.
            logger.warning(f"Could not delete zip file {local_path}: {e}")


In [10]:
# pipeline: build the configuration, then download and extract the dataset.
try:
    # No arguments: the manager falls back to its default config/params/schema paths.
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()

    data_ingestion = DataIngestion(config=data_ingestion_config)
    # Order matters: the archive must be on disk before it can be extracted.
    data_ingestion.download_file()
    data_ingestion.unzip_and_clean()

    logger.info("===== Data Ingestion completed successfully =====")

except Exception as e:
    # Log the failure, then re-raise so the cell still fails visibly.
    # NOTE(review): presumably `logger` is a stdlib logging.Logger, in which case
    # .exception() also records the traceback — confirm in my_project.
    logger.exception(f"Pipeline failed due to error: {e}")
    raise


2025-08-25 13:13:52,965 - my_project - INFO - YAML file 'config\config.yaml' read successfully.
2025-08-25 13:13:52,969 - my_project - INFO - YAML file 'params.yaml' read successfully.
2025-08-25 13:13:52,969 - my_project - INFO - YAML file 'schema.yaml' read successfully.
2025-08-25 13:13:52,981 - my_project - INFO - Directory created: 'artifacts'
2025-08-25 13:13:52,981 - my_project - INFO - Directory created: 'artifacts\data_ingestion'
2025-08-25 13:13:52,987 - my_project - INFO - Downloading file from: [https://github.com/entbappy/Branching-tutorial/raw/master/winequality-data.zip] to: [artifacts\data_ingestion\data.zip]
2025-08-25 13:14:00,351 - my_project - INFO - Size of file 'artifacts\data_ingestion\data.zip': 23329 bytes
2025-08-25 13:14:00,357 - my_project - INFO - File downloaded successfully. Size: 23329 bytes
2025-08-25 13:14:00,363 - my_project - INFO - Unzipping file: [artifacts\data_ingestion\data.zip] to dir: [artifacts\data_ingestion]
2025-08-25 13:14:00,468 - my_pro