## DATA INGESTION NOTEBOOK

In [17]:
from dataclasses import dataclass
from pathlib import Path
from src.utils.commons import read_yaml,create_directories
from src.cloud_storage.S3_object_store import S3Client
from botocore.exceptions import ClientError
import os
import sys
from datetime import datetime
from src.logger import logging
import pandas  as pd
from src.constants import *
import time
from datetime import datetime
from src.utils.commons import unzip_files
from glob import glob


# Switch the working directory to the project checkout; presumably so that the
# relative paths used later in this notebook resolve against it.
# NOTE(review): this is an absolute, machine-specific path - adjust it before
# running the notebook on another machine.
os.chdir("C:/Users/giddy/Documents/RECOMMENDATION_SYSTEM")




In [18]:
# entity

@dataclass
class DataIngestionConfig:
    """Settings consumed by the data ingestion step."""

    # Directory under which ingestion outputs are written.
    root_dir: Path
    # S3 bucket the archive is downloaded from.
    bucket_name: str
    # Local path the downloaded archive is saved to (and unzipped from).
    filename: str
    # Name (key) of the object to download from the bucket.
    object_name: str

In [19]:
# creating a configuration manager class to manage configuration
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH):
        """Read the config, params and schema YAML files.

        Each path is converted to ``str`` before being passed to
        ``read_yaml``; the parsed results are stored on ``self.config``,
        ``self.params`` and ``self.schema`` respectively.
        """
        # Load the three files in a fixed order: config, params, schema.
        yaml_sources = (
            ('config', config_filepath),
            ('params', params_filepath),
            ('schema', schema_filepath),
        )
        for attribute, filepath in yaml_sources:
            setattr(self, attribute, read_yaml(str(filepath)))

        logging.info(f'configuration: {self.config}')

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the ``DataIngestionConfig`` from the ``DATA_INGESTION`` section.

        ``create_directories`` is called with the section's ``root_dir``
        before the config object is constructed and returned.
        """
        section = self.config.DATA_INGESTION

        # Folder that will hold the ingested data.
        create_directories([section.root_dir])

        return DataIngestionConfig(
            root_dir=section.root_dir,
            bucket_name=section.bucket_name,
            filename=section.filename,
            object_name=section.object_name,
        )







In [23]:
# DATA INGESTION COMPONENT
class DataIngestion:
    """Download the raw archive from S3, unpack it and record run metadata."""

    def __init__(self, config: DataIngestionConfig):
        """Keep the ingestion settings used by ``initiate_data_ingestion``."""
        self.config = config

    def initiate_data_ingestion(self):
        """Run the ingestion step.

        The method:

        1. downloads ``config.object_name`` from ``config.bucket_name`` to
           ``config.filename`` through ``S3Client``;
        2. unzips that file into ``<root_dir>/Raw_ingested_data``;
        3. counts the extracted files whose names end in ``csv``;
        4. writes a small metadata record (timestamps, duration, file count,
           source bucket, output path) as JSON under ``root_dir``;
        5. logs the ingestion speed in files/second.

        Returns:
            None.

        A ``botocore`` ``ClientError`` raised inside the run is logged and
        not re-raised; any other exception propagates to the caller.
        """
        start_time = time.time()
        start_timestamp = datetime.now().strftime("%m_%d_%Y %H:%M:%S")

        try:
            # Download the archive from the landing S3 bucket.
            s3_client = S3Client()
            s3_client.download_file(bucket=self.config.bucket_name,
                                    object_name=self.config.object_name,
                                    filename=self.config.filename)

            # Unzip the ingested archive.
            output_dir = os.path.join(self.config.root_dir, 'Raw_ingested_data')
            unzip_files(self.config.filename, output_dir)

            # Build the pattern with os.path.join rather than a literal
            # backslash: '\*' is an invalid escape sequence in a normal
            # string and the hard-coded separator only works on Windows.
            total_files = len(glob(os.path.join(output_dir, '*csv')))
            logging.info(f'Number of ingested files : {total_files}')

            # Save run metadata.
            end_time = time.time()
            end_timestamp = datetime.now().strftime("%m_%d_%Y %H:%M:%S")
            duration = end_time - start_time
            metadata = {
                'start_time': start_timestamp,
                'end_time': end_timestamp,
                'duration': duration,
                'Number of files loaded': total_files,
                'data_source': self.config.bucket_name,
                'output_path': output_dir,
            }
            # NOTE(review): the file name 'metadat.json' looks like a typo of
            # 'metadata.json'; it is kept unchanged so the output path stays
            # the same for anything that already reads it.
            metadata_path = os.path.join(self.config.root_dir, 'metadat.json')
            pd.Series(metadata).to_json(metadata_path)
            logging.info(f'saved ingestion pipeline metadat into {metadata_path}')

            # Monitoring metric. Guard the division: a coarse clock can
            # report zero (or a negative) elapsed time.
            if duration > 0:
                ingestion_speed = total_files / duration
                logging.info(f'Ingestion speed: {ingestion_speed} files/second')
            else:
                logging.info('Ingestion speed: not available (elapsed time was not positive)')

        except ClientError as e:
            # Still best-effort (the error is not re-raised), but report the
            # failure at ERROR level so it is not lost among INFO messages.
            logging.error(f'error occured {e}')
            

In [None]:
# Build the data ingestion settings through the configuration manager.
manager = ConfigurationManager()
data_ingestion_config = manager.get_data_ingestion_config()


# Run the ingestion step with those settings.
data = DataIngestion(config=data_ingestion_config)
data.initiate_data_ingestion()


[2024-11-12 23:58:18,068 ] 38 root - INFO - Yaml file:  config\config.yaml loaded suscessfully
[2024-11-12 23:58:18,072 ] 38 root - INFO - Yaml file:  params.yaml loaded suscessfully
[2024-11-12 23:58:18,076 ] 38 root - INFO - Yaml file:  schema.yaml loaded suscessfully
[2024-11-12 23:58:18,078 ] 14 root - INFO - configuration: {'DATA_INGESTION': {'root_dir': 'Artifacts', 'bucket_name': 'cosmetic-store', 'filename': 'Artifacts/Ingested_data.zip', 'object_name': 'archive (1).zip'}}
[2024-11-12 23:58:18,080 ] 61 root - INFO - File directory create at : Artifacts
DataIngestionConfig(root_dir='Artifacts', bucket_name='cosmetic-store', filename='Artifacts/Ingested_data.zip', object_name='archive (1).zip')
[2024-11-12 23:58:22,225 ] 78 root - INFO - successfully downloaded file archive (1).zip from S3 bucket to Artifacts/Ingested_data.zip
[2024-11-12 23:58:22,229 ] 61 root - INFO - File directory create at : Artifacts\Raw_ingested_data
[2024-11-12 23:58:22,231 ] 189 root - INFO - Unzipping d