## DATA INGESTION NOTEBOOK

In [1]:
import os
os.chdir('/home/azureuser/cloudfiles/code/Users/oviemunooboro/Product_recommendation_system')

from dataclasses import dataclass
from pathlib import Path
from src.utils.commons import read_yaml,create_directories
from src.cloud_storage.azure_blob_storage import AzureDatastore
from botocore.exceptions import ClientError
import os
import sys
from datetime import datetime
from src.logger import logging
import pandas  as pd
from src.constants import *
import time
from datetime import datetime
from src.utils.commons import unzip_files
from glob import glob


[2024-12-07 12:29:45,009 ] 161 numexpr.utils - INFO - NumExpr defaulting to 4 threads.


In [2]:
@dataclass
class DataIngestionConfig:
    root_dir : Path
    local_path : Path
    target_path : str
    registered_name : str

In [3]:


class ConfigurationManager:
    def __init__(self,config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH):
        '''
        Initiating the configuration manager
        '''
        # read YAML configuration files to initatiate configuration parameters
        self.config = read_yaml(str(config_filepath))
        self.params = read_yaml(str(params_filepath))
        self.schema = read_yaml(str(schema_filepath))

        logging.info(f'configuration: {self.config}')


        

    #configuration manager
    def get_data_ingestion_config(self) -> DataIngestionConfig:
        '''
        Function to get configuration settings
        '''

        # read data ingestion configuration section from config.yaml
        config = self.config.DATA_INGESTION

        # create a new artifacts folder to store ingested data if doesn't exist already 
        create_directories([config.root_dir])

        # create and return dataingestion configuration object

        config_object = DataIngestionConfig(
            root_dir=config.root_dir,
            local_path= config.local_path,
            target_path=config.target_path,
            registered_name= config.registered_name
        )

        return config_object

In [4]:
class DataIngestion:
    def __init__(self,config:DataIngestionConfig):

        self.config = config

    def initiate_data_ingestion(self):

        start_time = time.time()
        start_timestamp = datetime.now().strftime("%m_%d_%Y %H:%M:%S")

        try:

            logging.info(f'Unziping file from {self.config.local_path} into {self.config.root_dir} ')
            unzip_folder = os.path.join(self.config.root_dir,'unzipped_data') # folder to extract data to

            if not os.path.exists(unzip_folder) or not os.listdir(unzip_folder) :
                unzip_files(self.config.local_path,unzip_folder)
                logging.info(f'Data from {self.config.local_path} unzippeded into {unzip_folder}')
            else:
                logging.info(f'data from {self.config.local_path} already unzipped')
                pass


            Azure_ws = AzureDatastore()

            Azure_ws.load_local_data_to_Azure_datastore(
                src_dir = unzip_folder,
                target_path = self.config.target_path,
                registered_name = self.config.registered_name
            )

            number_of_files = len(os.listdir(unzip_folder))
            logging.info(f'Number of ingested files : {number_of_files}')


            # save metadata
            end_time = time.time()
            end_timestamp = datetime.now().strftime("%m_%d_%Y %H:%M:%S")
            duration = end_time - start_time
            metadata = {
                'start_time' : start_timestamp,
                'end_time' : end_timestamp,
                'duration' : duration,
                'Number of files loaded to workspace' : number_of_files,
                'data_source' : self.config.local_path,
                'target_path_from_datastore' : self.config.target_path,
                'registered_name' : self.config.registered_name,
                'Project name' : 'Data ingestion'
            }

            # download data
            logging.info('downloading data from datastore')
            Azure_ws.download_from_datastore(
                registered_name=self.config.registered_name,
                target_path=self.config.target_path
            )
            metadata_path = os.path.join(unzip_folder,'metadata.json')
            pd.Series(metadata).to_json(metadata_path)
    
            logging.info(f'saved ingestion pipeline metadata into {metadata_path}')
            
        except ClientError as e:             
            logging.info(f'error occured {e}')

In [5]:
manager = ConfigurationManager()
data_ingestion_config = manager.get_data_ingestion_config()


data = DataIngestion(config=data_ingestion_config)
filepath = data.initiate_data_ingestion()


[2024-12-07 12:29:57,209 ] 38 root - INFO - Yaml file:  config/config.yaml loaded suscessfully
[2024-12-07 12:29:57,236 ] 38 root - INFO - Yaml file:  params.yaml loaded suscessfully
[2024-12-07 12:29:57,261 ] 38 root - INFO - Yaml file:  schema.yaml loaded suscessfully
[2024-12-07 12:29:57,263 ] 13 root - INFO - configuration: {'DATA_INGESTION': {'root_dir': 'Artifacts', 'local_path': '/home/azureuser/cloudfiles/code/Users/oviemunooboro/Product_recommendation_system/Artifacts/customer_interactions.zip', 'target_path': 'Artifacts/Unzipped_data', 'registered_name': 'customer_interaction_dataset'}, 'DATA_VALIDATION': {'root_dir': 'Artifacts/data_validation', 'data_source': 'Artifacts/Raw_ingested_data/', 'status_file': 'Artifacts/data_validation/status.json', 'critical_columns': ['event_time', 'event_type', 'product_id', 'category_id', 'category_code', 'brand', 'price', 'user_id', 'user_session']}}
[2024-12-07 12:29:57,267 ] 61 root - INFO - File directory create at : Artifacts
[2024-12-

[2024-12-07 12:30:21,149 ] 195 root - INFO - Files extracted successfully to Artifacts/unzipped_data
[2024-12-07 12:30:21,150 ] 18 root - INFO - Data from /home/azureuser/cloudfiles/code/Users/oviemunooboro/Product_recommendation_system/Artifacts/customer_interactions.zip unzippeded into Artifacts/unzipped_data
[2024-12-07 12:30:21,151 ] 16 root - INFO - configuring workspace
[2024-12-07 12:30:21,184 ] 291 azureml.core.workspace - INFO - Found the config file in: /config.json
[2024-12-07 12:30:22,089 ] 991 azureml.data.datastore_client - INFO - <azureml.core.authentication.InteractiveLoginAuthentication object at 0x7f86ea6536d0>
[2024-12-07 12:30:23,586 ] 37 root - INFO - Uploading data from local machine into blob storage using datastore
[2024-12-07 12:30:23,588 ] 923 azureml.data.azure_storage_datastore - INFO - Called AzureBlobDatastore.upload
Uploading an estimated of 5 files
[2024-12-07 12:30:23,802 ] 372 azureml.data.azure_storage_datastore - INFO - Uploading an estimated of 5 fi

In [10]:
print(filepath)

None
