In [1]:
from boto3 import Session
import botocore
import os

In [2]:
# Show the kernel's current working directory before the next cell changes it.
%pwd

'/Users/saptarshi/BITS/dissertation/project/predictive-maintenance/research'

In [3]:
os.chdir('../')
%pwd

'/Users/saptarshi/BITS/dissertation/project/predictive-maintenance'

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of settings consumed by the data-ingestion step."""
    # Directory for data-ingestion artifacts; created by ConfigurationManager.
    root_dir: Path
    # Name of the S3 bucket the raw archive is downloaded from.
    source_bucket: str
    # Key of the archive object inside `source_bucket`.
    source_object_key: str
    # Local path the archive is downloaded to and later opened as a zip file.
    local_data_file: Path
    # Directory the archive contents are extracted into.
    unzip_dir: Path


In [5]:
from linear_regressor.constants import *
from linear_regressor.utils.commons import *

class ConfigurationManager:
    """Loads the config and params files and hands out per-stage settings."""

    def __init__(self, config_file_path = CONFIG_FILE_PATH, params_file_path = PARAMS_FILE_PATH):
        """Load both files via `load_yaml` and create the artifact directories.

        Args:
            config_file_path: Path of the main configuration file.
            params_file_path: Path of the parameters file.
        """
        self.config = load_yaml(config_file_path)
        self.params = load_yaml(params_file_path)

        # Collect every directory the ingestion stage writes into, then
        # create them in a single call.
        artifacts_root = self.config.artifacts_root
        ingestion = self.config.data_ingestion
        required_dirs = [artifacts_root, ingestion.root_dir, ingestion.unzip_dir]
        create_directories(required_dirs)

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build a DataIngestionConfig from the `data_ingestion` config section.

        Returns:
            A DataIngestionConfig populated field-for-field from the loaded
            configuration.
        """
        section = self.config.data_ingestion
        return DataIngestionConfig(
            root_dir=section.root_dir,
            source_bucket=section.source_bucket,
            source_object_key=section.source_object_key,
            local_data_file=section.local_data_file,
            unzip_dir=section.unzip_dir,
        )


In [6]:
from linear_regressor import logger
import zipfile

class DataIngestion:
    """Downloads the raw data archive from S3 and unpacks it locally."""

    def __init__(self, config: DataIngestionConfig):
        """Store the ingestion settings.

        Args:
            config: Bucket name, object key and local paths used by
                `download_data` and `extract_data`.
        """
        self.config = config

    def download_data(self):
        """Download the configured S3 object to `config.local_data_file`.

        A missing object (error code "404") is reported on stdout and the
        method returns without raising. Every other S3 client error is
        re-raised to the caller.

        Raises:
            botocore.exceptions.ClientError: For any S3 client error whose
                code is not "404".
        """
        session = Session()
        s3 = session.resource('s3')

        try:
            bucket = self.config.source_bucket
            object_key = self.config.source_object_key
            local_file = self.config.local_data_file
            s3.Bucket(bucket).download_file(object_key, local_file)
            logger.info(f'Raw data downloaded from s3://{bucket}/{object_key} to {local_file}')
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == "404":
                print("The object does not exist.")
            else:
                # Access-denied, missing-bucket and similar failures must not
                # pass silently: the caller would otherwise carry on and try
                # to extract a file that was never downloaded.
                raise

    def extract_data(self, delete_zip=False):
        """Extract the downloaded archive into `config.unzip_dir`.

        Args:
            delete_zip: When true, remove the archive file after a
                successful extraction.

        Errors raised by `zipfile.ZipFile` or `os.remove` (for example when
        the archive is missing or is not a valid zip file) are not caught
        here and propagate to the caller.
        """
        local_file = self.config.local_data_file
        unzip_path = self.config.unzip_dir

        with zipfile.ZipFile(local_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)
            logger.info(f'Data successfully extracted from: {local_file}')

        # Delete only after the archive has been closed by the `with` block.
        if delete_zip:
            os.remove(local_file)
            logger.info(f'Zip file: {local_file} deleted')
    
    

In [7]:
# Run the ingestion stage end to end: load the configuration, download the
# archive from S3, then unpack it.
# No try/except wrapper: catching an exception only to re-raise it unchanged
# adds nothing, and an uncaught error already stops the cell with a traceback.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.download_data()
# delete_zip=True removes the downloaded archive once it has been extracted.
data_ingestion.extract_data(delete_zip=True)


2024-05-01 13:22:30,852: INFO: commons: yaml file: config/config.yaml loaded successfully
2024-05-01 13:22:30,854: INFO: commons: yaml file: params.yaml loaded successfully
2024-05-01 13:22:30,855: INFO: commons: Created directory at: artifacts
2024-05-01 13:22:30,856: INFO: commons: Created directory at: artifacts/data_ingestion
2024-05-01 13:22:30,857: INFO: commons: Created directory at: artifacts/data_ingestion
2024-05-01 13:22:32,291: INFO: 3343967143: Raw data downloaded from s3://mlflow-bucket-predictive-maintenance/raw_data/train_FD004.txt.zip to artifacts/data_ingestion/train_FD004.txt.zip
2024-05-01 13:22:32,334: INFO: 3343967143: Data successfully extracted from artifacts/data_ingestion/train_FD004.txt.zip
2024-05-01 13:22:32,335: INFO: 3343967143: Zip file artifacts/data_ingestion/train_FD004.txt.zip deleted
