## DATA INGESTION NOTEBOOK

In [1]:
import os
os.chdir('C:/Users/giddy/Documents/RECOMMENDATION_SYSTEM')

from dataclasses import dataclass
from pathlib import Path
from src.utils.commons import read_yaml,create_directories
from src.cloud_storage.S3_object_store import S3Client
from botocore.exceptions import ClientError
import os
import sys
from datetime import datetime
from src.logger import logging
import pandas  as pd
from src.constants import *
import time
from datetime import datetime
from src.utils.commons import unzip_files
from glob import glob



In [2]:
@dataclass
class DataIngestionConfig:
    root_dir : Path
    bucket : str
    local_path : Path
    target_path : str
    object_name_prefix : str

In [3]:


class ConfigurationManager:
    def __init__(self,config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH):
        '''
        Initiating the configuration manager
        '''
        # read YAML configuration files to initatiate configuration parameters
        self.config = read_yaml(str(config_filepath))
        self.params = read_yaml(str(params_filepath))
        self.schema = read_yaml(str(schema_filepath))

        logging.info(f'configuration: {self.config}')


        

    #configuration manager
    def get_data_ingestion_config(self) -> DataIngestionConfig:
        '''
        Function to get configuration settings
        '''

        # read data ingestion configuration section from config.yaml
        config = self.config.DATA_INGESTION

        # create a new artifacts folder to store ingested data if doesn't exist already 
        create_directories([config.root_dir])

        # create and return dataingestion configuration object

        config_object = DataIngestionConfig(
            root_dir=config.root_dir,
            local_path= config.local_path,
            target_path=config.target_path,
            bucket = config.bucket,
            object_name_prefix= config.object_name_prefix
           
        )

        return config_object

In [19]:
class DataIngestion:
    def __init__(self,config:DataIngestionConfig):

        self.config = config

    def initiate_data_ingestion(self):

        start_time = time.time()
        start_timestamp = datetime.now().strftime("%m_%d_%Y %H:%M:%S")

        try:

            logging.info(f'Unziping file from {self.config.local_path} into {self.config.root_dir} ')
            unzip_folder = os.path.join(self.config.root_dir,'unzipped_data') # folder to extract data to

            if not os.path.exists(unzip_folder) or not os.listdir(unzip_folder) :
                unzip_files(self.config.local_path,unzip_folder)
                logging.info(f'Data from {self.config.local_path} unzippeded into {unzip_folder}')
            else:
                logging.info(f'data from {self.config.local_path} already unzipped')
                pass


            s3 = S3Client()
            logging.info('Creating s3 bucket')
            s3.create_bucket(bucketname='cosmetic-store-1')
            logging.info('uploading files into s3 bucket')
            s3.upload_folder(folder_path=unzip_folder,bucket=self.config.bucket,object_name_prefix=self.config.object_name_prefix)
            logging.info('downloadin to local machine')
            s3.download_folder(bucket=self.config.bucket,folder_prefix=self.config.object_name_prefix,local_dir=self.config.target_path)
        


            number_of_files = len(os.listdir(unzip_folder))
            logging.info(f'Number of ingested files : {number_of_files}')


            # save metadata
            end_time = time.time()
            end_timestamp = datetime.now().strftime("%m_%d_%Y %H:%M:%S")
            duration = end_time - start_time
            metadata = {
                'start_time' : start_timestamp,
                'end_time' : end_timestamp,
                'duration' : duration,
                'Number of files loaded to workspace' : number_of_files,
                'data_source' : self.config.local_path,
                'target_path' : self.config.target_path,
                'Project name' : 'Data ingestion'
            }

        
            metadata_path = os.path.join(self.config.root_dir,'Data_ingestion_metadata.json')
            pd.Series(metadata).to_json(metadata_path)
    
            logging.info(f'saved ingestion pipeline metadata into {metadata_path}')
            
        except ClientError as e:             
            logging.info(f'error occured {e}')

In [20]:
manager = ConfigurationManager()
data_ingestion_config = manager.get_data_ingestion_config()


data = DataIngestion(config=data_ingestion_config)
filepath = data.initiate_data_ingestion()


[2025-01-06 18:07:02,450 ] 39 root - INFO - Yaml file:  config\config.yaml loaded suscessfully
[2025-01-06 18:07:02,461 ] 39 root - INFO - Yaml file:  params.yaml loaded suscessfully
[2025-01-06 18:07:02,523 ] 39 root - INFO - Yaml file:  schema.yaml loaded suscessfully
[2025-01-06 18:07:02,536 ] 13 root - INFO - configuration: {'DATA_INGESTION': {'root_dir': 'Artifacts', 'bucket': 'cosmetic-store-1', 'object_name_prefix': 'customer_interation_data', 'local_path': 'Artifacts\\Ingested_data.zip', 'target_path': 'Artifacts\\ingested_data_files'}, 'DATA_VALIDATION': {'root_dir': 'Artifacts/data_validation', 'status_file': 'Artifacts/data_validation/status.json'}, 'DATA_TRANSFORMATION': {'source_datapath': 'Artifacts/Raw_ingested_data/*.csv', 'source_parquetpath': 'Artifacts/loaded_data.parquet', 'feature_store_path': 'Artifacts/FeatureStore', 'train_datapath': 'Artifacts/train_data', 'val_datapath': 'Artifacts/val_data', 'test_datapath': 'Artifacts/test_data'}}
[2025-01-06 18:07:02,545 ] 

[2025-01-06 18:07:02,625 ] 25 root - INFO - Creating s3 bucket
[2025-01-06 18:07:02,662 ] 39 root - INFO - Could not create bucket cosmetic-store-1 because An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.
[2025-01-06 18:07:02,663 ] 27 root - INFO - uploading files into s3 bucket
[2025-01-06 18:07:11,766 ] 89 root - INFO - Uploaded Artifacts\unzipped_data\2019-Dec.csv as customer_interation_data/2019-Dec.csv into cosmetic-store-1 S3 bucket
[2025-01-06 18:07:24,883 ] 89 root - INFO - Uploaded Artifacts\unzipped_data\2019-Nov.csv as customer_interation_data/2019-Nov.csv into cosmetic-store-1 S3 bucket
[2025-01-06 18:07:38,136 ] 89 root - INFO - Uploaded Artifacts\unzipped_data\2019-Oct.csv as customer_interation_data/2019-Oct.csv into cosmetic-store-1 S3 bucket
[2025-01-06 18:07:52,766 ] 89 root - INFO - Uploaded Artifacts\unzipped_data\2020-Feb.csv as customer_interation

In [10]:
print(filepath)

None
