This is the data ingestion notebook, intended for testing purposes.

In [1]:
from dataclasses import dataclass
from pathlib import Path
from MLProject.constants import *
from MLProject.utils.common import read_yaml, create_directories
import os
import urllib.request as request
import zipfile
from MLProject import logger
from MLProject.utils.common import get_size

In [2]:
# Show the kernel's working directory before any directory change is made.
%pwd

'd:\\Projects\\ML Projects\\End-to-End Wine Quality\\End-to-End-ML-Project\\research'

In [3]:
# Step up from the notebook folder to its parent directory.
# The guard keeps this cell idempotent: re-running it (or starting the kernel
# somewhere other than the "research" folder) must not climb a further level.
if Path.cwd().name == "research":
    os.chdir("../")

In [4]:
%pwd

'''
It's necessary to change the present working directory or it will give error in further process.
'''

"\nIt's necessary to change the present working directory or it will give error in further process.\n"

In [5]:
@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings consumed by the data ingestion stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    """

    # Working directory of the ingestion stage.
    root_dir: Path
    # URL the archive is downloaded from.
    source_URL: str
    # Local file the downloaded archive is saved to and later opened as a zip.
    local_data_path: Path
    # Directory the archive contents are extracted into.
    unzip_dir: Path

In [6]:
class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config objects.

    The config, params and schema files are read once at construction and
    kept on ``self.config``, ``self.params`` and ``self.schema``.
    """

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH, schema_filepath=SCHEMA_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data ingestion settings from the ``data_ingestion`` section.

        The stage's root directory is created as a side effect.

        Returns:
            DataIngestionConfig: Settings whose path fields are ``Path``
            objects, matching the dataclass annotations.
        """
        ingestion = self.config.data_ingestion

        create_directories([ingestion.root_dir])

        # Wrap the raw config values so the path fields really are Path
        # instances, as DataIngestionConfig declares.
        return DataIngestionConfig(
            root_dir=Path(ingestion.root_dir),
            source_URL=ingestion.source_url,
            local_data_path=Path(ingestion.local_data_file),
            unzip_dir=Path(ingestion.unzip_dir),
        )

In [7]:
class DataIngestion:
    """Downloads the source archive and unpacks it according to a DataIngestionConfig."""

    def __init__(self, config: DataIngestionConfig):
        """Store the ingestion settings used by the download and extract steps."""
        self.config = config

    def download_file(self):
        """Download the archive to ``config.local_data_path`` unless it already exists.

        The download is written to a temporary ``.part`` file next to the
        target and moved into place only after it completes. An interrupted
        download therefore never leaves a truncated file that a later run
        would mistake for a finished one.

        Returns:
            None. The outcome is reported through the logger.
        """
        target = self.config.local_data_path

        if os.path.exists(target):
            logger.info(f"Filename already exists of size: {get_size(target)}")
            return

        partial = f"{os.fspath(target)}.part"
        try:
            _, headers = request.urlretrieve(
                url=self.config.source_URL,
                filename=partial
            )
            os.replace(partial, target)
        finally:
            # After a successful move the partial file is gone; this only
            # cleans up when the download or the move failed.
            if os.path.exists(partial):
                os.remove(partial)

        logger.info(f"{target} download! with the following info: \n{headers}")

    def extract_zip(self):
        """Extract the downloaded archive into ``config.unzip_dir``.

        The target directory is created if it does not exist. The archive is
        read from ``config.local_data_path``; the method takes no arguments.

        Returns:
            None.
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_path, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)

In [8]:
# Run the ingestion stage end to end: load settings, download the archive,
# then unpack it. There is deliberately no try/except wrapper, so any
# failure surfaces with its original traceback.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.download_file()
data_ingestion.extract_zip()

[2025-04-25 15:20:14,298: INFO: common: yaml file: config\config.yaml loaded successfully.]
[2025-04-25 15:20:14,310: INFO: common: yaml file: params.yaml loaded successfully.]
[2025-04-25 15:20:14,314: INFO: common: yaml file: schema.yaml loaded successfully.]
[2025-04-25 15:20:14,315: INFO: common: Created directory at: artifacts]
[2025-04-25 15:20:14,318: INFO: common: Created directory at: artifacts/data_ingestion]
[2025-04-25 15:20:15,297: INFO: 2752968892: artifacts/data_ingestion/data.zip download! with the following info: 
Connection: close
Content-Length: 21296
Cache-Control: max-age=300
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox
Content-Type: application/zip
ETag: "e515d25569fe5aabd61e15e84d2705a9f6cff52f48e995d477759975ea241744"
Strict-Transport-Security: max-age=31536000
X-Content-Type-Options: nosniff
X-Frame-Options: deny
X-XSS-Protection: 1; mode=block
X-GitHub-Request-Id: FD76:226149:6EFA5:8687F:680B5AD6
Accept-Ranges: bytes
Date: Fr