In [1]:
import os


In [2]:
%pwd

'f:\\OneDrive - MSFT\\wine_quality_ML\\Machine-Learning-End-to-End-Project\\research'

In [3]:
# go back one folder
os.chdir('../')

In [4]:
from dataclasses import dataclass
from pathlib import Path
@dataclass(frozen=True)
class DataingestionConfig:
    root_dir:Path
    Source_URL: str
    local_data_files:Path
    unzip_dir: Path

In [5]:
# configuration Manager
from src.MLProject_WineQT.constants.const import *
from src.MLProject_WineQT.utils.common import read_yaml, create_directories

In [6]:
class ConfigurationManager:
     def __init__(
         self,
         config_filepath = CONFIG_FILE_PATH,
         params_filepath = PARAMS_FILE_PATH,
         schema_filepath = SCHEMA_FILE_PATH):
         
         self.config = read_yaml(config_filepath)
         self.params = read_yaml(params_filepath)
         self.schema = read_yaml(schema_filepath)
         
         create_directories([self.config.artifacts_root])
         
     def get_data_ingestion_config(self) -> DataingestionConfig:
        config = self.config.data_ingestion
        create_directories([config.root_dir])
        get_data_ingestion_config=DataingestionConfig(root_dir=config.root_dir,
                                                      Source_URL=config.source_URL,
                                                      local_data_files=config.local_data_file,
                                                      unzip_dir=config.unzip_dir)
        return get_data_ingestion_config
           
             

In [7]:
# update the component
import os
import urllib.request as request
import zipfile
from src.MLProject_WineQT.my_logging.loger import logger
from src.MLProject_WineQT.utils.common import get_size


In [8]:
# updata the component 
class DataIngestion:
    def __init__(self,config: DataingestionConfig):
        self.config = config
    
    def download_file(self):
        if not os.path.exists(self.config.local_data_files):
            filename,header = request.urlretrieve(
                url=self.config.Source_URL,
                filename=self.config.local_data_files
            ) 
            logger.info(f"{filename} download with following info: /n {header}")
        else:
            logger.info(f"file already exist fo size: {get_size(Path(self.config.local_data_files))}")
            
    def extract_zip_file(self):
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path,exist_ok=True)
        if zipfile.is_zipfile(self.config.local_data_files):
            with zipfile.ZipFile(self.config.local_data_files, 'r') as zip_ref:
                zip_ref.extractall(unzip_path)
        else:
            raise zipfile.BadZipFile(f"{self.config.local_data_files} is not a valid zip file")

In [6]:
from src.MLProject_WineQT.constants.const import *
from src.MLProject_WineQT.utils.common import read_yaml, create_directories
import os
import urllib.request as request
import zipfile
from src.MLProject_WineQT.my_logging.loger import logger
from src.MLProject_WineQT.utils.common import get_size
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataingestionConfig:
    root_dir: Path
    Source_URL: str
    local_data_files: Path
    unzip_dir: Path

class ConfigurationManager:
    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH
    ):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)
        
        # Create root directory
        create_directories([self.config.artifacts_root])
        
    def get_data_ingestion_config(self) -> DataingestionConfig:
        config = self.config.data_ingestion
        create_directories([config.root_dir])
        
        return DataingestionConfig(
            root_dir=Path(config.root_dir),
            Source_URL=config.source_URL,
            local_data_files=Path(config.local_data_file),
            unzip_dir=Path(config.unzip_dir)
        )

class DataIngestion:
    def __init__(self, config: DataingestionConfig):
        self.config = config

    def download_file(self):
        if not os.path.exists(self.config.local_data_files):
            try:
                filename, header = request.urlretrieve(
                    url=self.config.Source_URL,
                    filename=self.config.local_data_files
                )
                logger.info(f"{filename} downloaded with the following info: \n{header}")
                
                if not zipfile.is_zipfile(self.config.local_data_files):
                    raise zipfile.BadZipFile(f"{self.config.local_data_files} is not a valid zip file")
                    
            except Exception as e:
                logger.error(f"Error downloading file: {e}")
                raise
        else:
            logger.info(f"File already exists with size: {get_size(self.config.local_data_files)}")

    def extract_zip_file(self):
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        
        if zipfile.is_zipfile(self.config.local_data_files):
            with zipfile.ZipFile(self.config.local_data_files, 'r') as zip_ref:
                zip_ref.extractall(unzip_path)
            logger.info(f"Extracted {self.config.local_data_files} to {unzip_path}")
        else:
            raise zipfile.BadZipFile(f"{self.config.local_data_files} is not a valid zip file")



In [9]:
# Pipeline
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.download_file()
    data_ingestion.extract_zip_file()
except Exception as e:
    raise e

[2024-10-04 04:04:06,449: INFO: common: yaml file:config\config.yaml load successfuly]
[2024-10-04 04:04:06,455: INFO: common: yaml file:params.yaml load successfuly]
[2024-10-04 04:04:06,463: INFO: common: yaml file:schema.yaml load successfuly]
[2024-10-04 04:04:06,467: INFO: common: created directories at: artifacts]
[2024-10-04 04:04:06,469: INFO: common: created directories at: artifacts/data_ingestion]
[2024-10-04 04:04:09,254: INFO: 254835833: artifacts/data_ingestion/data.zip download with following info: /n Connection: close
Content-Length: 21952
Cache-Control: max-age=300
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox
Content-Type: application/zip
ETag: "d8ac884c0b43591e6fccccdf678b39bfabcbcbb00f3ba0dc2886bc75bc948168"
Strict-Transport-Security: max-age=31536000
X-Content-Type-Options: nosniff
X-Frame-Options: deny
X-XSS-Protection: 1; mode=block
X-GitHub-Request-Id: A221:356DFF:2A809D1:2B987A7:66FFCBA8
Accept-Ranges: bytes
Date: Fri, 04 Oct 2

In [12]:
import os
import urllib.request as request
import zipfile
from pathlib import Path
from dataclasses import dataclass
from src.MLProject_WineQT.utils.common import read_yaml, create_directories
from src.MLProject_WineQT.my_logging.loger import logger
# data Entity 
@dataclass(frozen=True)
class DataingestionConfig:
    root_dir: Path
    Source_URL: str
    local_data_files: Path
    unzip_dir: Path
# configuration manager
class ConfigurationManager:
    def __init__(self, config_filepath=CONFIG_FILE_PATH):
        self.config = read_yaml(config_filepath)
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataingestionConfig:
        config = self.config.data_ingestion
        create_directories([config.root_dir])
        return DataingestionConfig(
            root_dir=config.root_dir,
            Source_URL=config.source_URL,
            local_data_files=config.local_data_file,
            unzip_dir=config.unzip_dir
        )

class DataIngestion:
    def __init__(self, config: DataingestionConfig):
        self.config = config

    def download_file(self):
        if not os.path.exists(self.config.local_data_files):
            try:
                filename, header = request.urlretrieve(
                    url=self.config.Source_URL,
                    filename=self.config.local_data_files
            )
                logger.info(f"{filename} downloaded with the following info: \n{header}")

            # After downloading, validate if it's a valid zip file
                if not zipfile.is_zipfile(self.config.local_data_files):
                    raise zipfile.BadZipFile(f"{self.config.local_data_files} is not a valid zip file")
            except Exception as e:
                logger.error(f"Error downloading file: {e}")
                raise 
            else:
                logger.info(f"File already exists with size: {get_size(Path(self.config.local_data_files))}")

    def extract_zip_file(self):
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)

        if zipfile.is_zipfile(self.config.local_data_files):
            with zipfile.ZipFile(self.config.local_data_files, 'r') as zip_ref:
                zip_ref.extractall(unzip_path)
            logger.info(f"Extracted {self.config.local_data_files} to {unzip_path}")
        else:
            raise zipfile.BadZipFile(f"{self.config.local_data_files} is not a valid zip file")

# Usage
if __name__ == "__main__":
    try:
        config_manager = ConfigurationManager()
        data_ingestion_config = config_manager.get_data_ingestion_config()
        data_ingestion = DataIngestion(config=data_ingestion_config)
        data_ingestion.download_file()
        data_ingestion.extract_zip_file()
    except Exception as e:
        logger.error(f"An error occurred: {e}")


[2024-10-04 03:55:37,562: INFO: common: yaml file:config\config.yaml load successfuly]
[2024-10-04 03:55:37,684: INFO: common: created directories at: artifacts]
[2024-10-04 03:55:37,684: INFO: common: created directories at: artifacts/data_ingestion]
[2024-10-04 03:55:39,302: ERROR: 4006031347: Error downloading file: HTTP Error 404: Not Found]
[2024-10-04 03:55:39,302: ERROR: 4006031347: An error occurred: HTTP Error 404: Not Found]
