In [1]:
import os
# Step up from the notebook's folder to the project root so that the `src`
# package imported below and the relative config paths resolve.
# The guard keeps this cell idempotent: once the working directory already
# contains `src`, re-running the cell no longer climbs one more level.
if not os.path.isdir("src"):
    os.chdir("../")

What is happening here?

1) We created a class called DataIngestionConfig. It is a blueprint whose entries are root_dir, source_URL, local_data_file (where the downloaded archive is saved) and unzip_dir (where the archive is unzipped).
2) We created another class called ConfigurationManager. This class reads the YAML files (config and params) and creates the directories listed in the config.yaml file.
3) Now that the directories are created, we are going to create a class called DataIngestion. This class will download and unzip the data files.


In [2]:
from src.utils.common import create_directories
from src.utils.common import read_yaml
from src.constants import constant_paths

In [3]:
# Look the project paths up once instead of instantiating `constant_paths`
# and calling `get_paths()` separately for each entry.
# Index 0 is used as the config file path and index 1 as the params file path.
_project_paths = constant_paths().get_paths()
CONFIG_FILE_PATH = _project_paths[0]
PARAMS_FILE_PATH = _project_paths[1]

Next, we need to read the config file, which contains the paths to the (compressed) dataset.

The constants folder holds the config path and the params path. We use them to read the config file and the params file.

In [4]:
from dataclasses import dataclass
from pathlib import Path

# This class stores the data paths; they are used later on to create directories.

@dataclass
class DataIngestionConfig:
    """Container for the locations used by the data-ingestion step.

    Instances are built by ``ConfigurationManager.get_data_ingestion`` from
    the ``data_ingestion`` section of the config file and consumed by
    ``DataIngestion``.
    """

    # Directory of the ingestion step; created when the config is built.
    root_dir: Path
    # Source location of the dataset as given in the config file
    # (not read by the code visible in this notebook).
    source_URL: str
    # Path of the downloaded archive; passed to the download and unzip helpers.
    local_data_file: Path
    # Destination passed to the unzip helper.
    unzip_dir: Path

In [5]:
# Create a class called ConfigurationManager.
# It also reads the YAML files (config and params).
# read_yaml returns a dictionary-like object called ConfigBox.

class ConfigurationManager:
    """Loads the project's YAML settings and prepares per-stage configs.

    Creating an instance reads the config and params files with
    ``read_yaml`` and creates the top-level artifacts directory.
    """

    def __init__(
        self,
        config_file_path=CONFIG_FILE_PATH,
        params_file_path=PARAMS_FILE_PATH,
    ):
        """Read both YAML files and create the artifacts directory.

        Args:
            config_file_path: Location of the config YAML file.
            params_file_path: Location of the params YAML file.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)

        artifacts_root = self.config.artifacts_dir
        create_directories([artifacts_root])

    def get_data_ingestion(self) -> DataIngestionConfig:
        """Assemble the data-ingestion settings from the loaded config.

        The ingestion root directory is created as a side effect.

        Returns:
            DataIngestionConfig: filled from the ``data_ingestion`` section
            of the config file.
        """
        ingestion_section = self.config.data_ingestion
        create_directories([ingestion_section.root_dir])

        # Hand back the populated settings object directly.
        return DataIngestionConfig(
            root_dir=ingestion_section.root_dir,
            source_URL=ingestion_section.source_URL,
            local_data_file=ingestion_section.local_data_file,
            unzip_dir=ingestion_section.unzip_dir,
        )


Now that the configuration is done, we need to create the DataIngestion class.

In [6]:
import os
from src.utils.common import download_zip_files
from src.utils.common import unzip_zip_files
from src.utils.common import download_s3
from src.logger import logging


In [7]:
class DataIngestion:
    """Performs the download and extraction steps of data ingestion."""

    def __init__(self, config: DataIngestionConfig):
        """Keep the ingestion settings for use by the step methods.

        Args:
            config: Settings holding the archive path and unzip directory.
        """
        self.config = config

    def download_data(self):
        """Download the data from S3 to the configured ``local_data_file``."""
        archive_path = self.config.local_data_file
        download_s3(archive_path)

    def unzip_zip_files(self):
        """Unzip the downloaded archive into the configured ``unzip_dir``."""
        settings = self.config
        # The call below resolves to the module-level helper of the same
        # name, not to this method.
        unzip_zip_files(settings.local_data_file, settings.unzip_dir)
        

In [8]:
from src.exception import CustomException
import sys

In [9]:
# Run the ingestion step end to end: load the configuration, build the
# ingestion settings, then download and unzip the data.
try:
    configmanager = ConfigurationManager()
    data_ingestion_config = configmanager.get_data_ingestion()
    data_ingestion = DataIngestion(data_ingestion_config)
    data_ingestion.download_data()
    data_ingestion.unzip_zip_files()
except Exception as e:
    # Chain explicitly so the traceback records the original error as the
    # direct cause of the CustomException.
    raise CustomException(e, sys) from e


[2023-07-27 02:33:25,192] 37 root INFO - yaml file: config\config.yaml loaded successfully
[2023-07-27 02:33:25,192] 37 root INFO - yaml file: params.yaml loaded successfully
[2023-07-27 02:33:25,202] 56 root INFO - created directory at: artifacts
[2023-07-27 02:33:25,202] 56 root INFO - created directory at: artifacts/data_ingestion
[2023-07-27 02:33:25,587] 89 root INFO - Inside download function
[2023-07-27 02:33:35,626] 91 root INFO - downloaded s3 data successfully: artifacts/data_ingestion/data.zip
[2023-07-27 02:33:35,632] 72 root INFO - Inside unzip function
[2023-07-27 02:33:42,557] 75 root INFO - unzipped file: artifacts/data_ingestion/data.zip
