In [2]:
pwd

'c:\\Users\\Arpit Kadam\\Desktop\\DataScienceProject\\data-science-project\\research'

In [3]:
os.chdir("../")

In [4]:
pwd

'c:\\Users\\Arpit Kadam\\Desktop\\DataScienceProject\\data-science-project'

In [45]:
from dataclasses import dataclass
from pathlib import Path
import os
import urllib.request as request
import zipfile
from src.datascienceproject.constants import *
from src.datascienceproject.utils.common import read_yaml, create_directories
import logging

# Logger setup
logger = logging.getLogger("DataIngestion")

@dataclass
class DataIngestionConfig:
    root_dir: Path
    source_URL: str
    local_data_file: Path
    unzip_dir: Path

class ConfigurationManager:
    def __init__(self, 
                 config_filepath=CONFIG_FILE_PATH, 
                 params_filepath=PARAMS_FILE_PATH, 
                 schema_filepath=SCHEMA_FILE_PATH):
        self.config = read_yaml(config_filepath)
        print("Debug Config:", self.config)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)
        
        # Validate configuration structure
        if "artifacts" not in self.config or "artifacts_root" not in self.config["artifacts"]:
            raise KeyError("'artifacts_root' is missing in the configuration.")

        # Create root directory
        create_directories([self.config["artifacts"]["artifacts_root"]])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        config = self.config["data_ingestion"]  # Corrected access
        create_directories([config["root_dir"]])

        # Create DataIngestionConfig object
        data_ingestion_config = DataIngestionConfig(
            root_dir=Path(config["root_dir"]),
            source_URL=config["source_url"],
            local_data_file=Path(config["local_data"]),
            unzip_dir=Path(config["unzip_dir"])
        )
        return data_ingestion_config

class DataIngestion:
    def __init__(self, config: DataIngestionConfig):
        self.config = config

    def download_file(self):
        if not os.path.exists(self.config.local_data_file):
            filename, headers = request.urlretrieve(
                url=self.config.source_URL, 
                filename=self.config.local_data_file
            )
            logger.info(f"{filename} downloaded successfully.\nHeaders: {headers}")
        else:
            logger.info(f"File already exists at {self.config.local_data_file}")

    def extract_zip_file(self):
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)
        logger.info(f"Extracted files to {unzip_path}")

# Main script
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.download_file()
    data_ingestion.extract_zip_file()
except KeyError as e:
    print(f"Configuration key error: {e}")
except Exception as e:
    raise e


[2024-12-26 01:19:22,905: INFO: common: Yaml file: config\config.yaml read successfully]
Debug Config: {'artifacts': {'artifacts_root': './artifacts'}, 'data_ingestion': {'root_dir': './artifacts/data_ingestion', 'source_url': 'https://github.com/krishnaik06/datasets/raw/refs/heads/main/winequality-data.zip', 'local_data': './artifacts/data_ingestion/data.zip', 'unzip_dir': './artifacts/data_ingestion'}}
[2024-12-26 01:19:22,908: INFO: common: Yaml file: params.yaml read successfully]
[2024-12-26 01:19:22,910: INFO: common: Yaml file: schema.yaml read successfully]
[2024-12-26 01:19:22,912: INFO: common: Created directory at: ./artifacts]
[2024-12-26 01:19:22,915: INFO: common: Created directory at: ./artifacts/data_ingestion]
[2024-12-26 01:19:24,450: INFO: 3162971701: artifacts\data_ingestion\data.zip downloaded successfully.
Headers: Connection: close
Content-Length: 23329
Cache-Control: max-age=300
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox
Cont