In [1]:
import os
# Step up one directory from the current working directory so later cells
# run from the parent folder (presumably the project root, where the
# relative config/ and artifacts/ paths resolve -- TODO confirm).
# NOTE(review): the path is relative, so re-running this cell without a
# kernel restart moves up one more level each time.
os.chdir("../")
# Display the resulting working directory as a sanity check.
%pwd

'd:\\Final-Year-Project\\Credit-Card-Fraud-Detection-Using-GNN'

In [2]:
import zipfile
import subprocess  # Used for running shell commands
from pathlib import Path
from dataclasses import dataclass

# Importing constants and utility functions
from Credit_Card_Fraud_Detection.constants import *
from Credit_Card_Fraud_Detection.utils.common import read_yaml, create_directories
from Credit_Card_Fraud_Detection import logger

In [3]:
# ====================================================
# ENTITY: DataIngestionConfig
# ====================================================

@dataclass(frozen=True)
class DataIngestionConfig:
    """
    Immutable bundle of settings consumed by the data ingestion step.

    The dataclass is frozen, so fields cannot be reassigned once an
    instance has been constructed.
    """
    # Main directory where ingestion data is stored.
    root_dir: Path
    # URL the dataset archive is downloaded from.
    source_URL: str
    # Path where the downloaded archive is stored.
    local_data_file: Path
    # Directory the archive contents are extracted into.
    unzip_dir: Path

In [4]:
# ====================================================
# CONFIGURATION MANAGER
# ====================================================

class ConfigurationManager:
    """
    Loads the project's YAML settings and hands out typed config objects.

    On construction the config, params, and schema files are read (in that
    order) and stored on the instance as ``config``, ``params`` and
    ``schema``. The directory named by ``config.artifacts_root`` is then
    created via ``create_directories``.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        # Load each YAML file onto its attribute, one after another, so the
        # read order (config -> params -> schema) stays fixed.
        yaml_sources = (
            ("config", config_filepath),
            ("params", params_filepath),
            ("schema", schema_filepath),
        )
        for attribute, filepath in yaml_sources:
            setattr(self, attribute, read_yaml(filepath))

        # The artifacts root must exist before any stage writes into it.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Build the DataIngestionConfig from the ``data_ingestion`` section.

        The section's ``root_dir`` is created via ``create_directories``
        before the config object is returned. Path-like values are wrapped
        in ``Path``; ``source_URL`` is passed through unchanged.
        """
        section = self.config.data_ingestion
        create_directories([section.root_dir])

        root_dir = Path(section.root_dir)
        source_url = section.source_URL
        archive_path = Path(section.local_data_file)
        unzip_dir = Path(section.unzip_dir)

        return DataIngestionConfig(
            root_dir=root_dir,
            source_URL=source_url,
            local_data_file=archive_path,
            unzip_dir=unzip_dir,
        )

In [5]:

# ====================================================
# COMPONENT: Data Ingestion
# ====================================================

class DataIngestion:
    """
    Downloads the dataset archive and extracts it.

    The behaviour is driven by a DataIngestionConfig-like object exposing
    ``source_URL``, ``local_data_file`` and ``unzip_dir``. Failures are
    logged and then re-raised, so a caller never continues past a failed
    download or extraction as if it had succeeded.
    """
    def __init__(self, config: "DataIngestionConfig"):
        """
        Store the configuration used by the download and extraction steps.

        Args:
            config (DataIngestionConfig): Configuration object containing
                download and extraction details.
        """
        self.config = config

    def download_file(self):
        """
        Download the archive from the source URL unless it already exists.

        The download is delegated to the 'gdown' command-line tool. If the
        file is already present at ``local_data_file`` nothing is downloaded.
        On failure the error is logged, any partially written output file is
        removed, and the exception is re-raised.

        Raises:
            subprocess.CalledProcessError: If 'gdown' exits with a non-zero status.
            OSError: If the 'gdown' executable cannot be started
                (e.g. it is not installed).
        """
        target = self.config.local_data_file

        # Skip the download when the archive is already on disk.
        if target.exists():
            logger.info(f"File already exists at {target}")
            return

        # Make sure the destination directory exists before writing into it.
        target.parent.mkdir(parents=True, exist_ok=True)

        try:
            # '--fuzzy' lets gdown extract the Google Drive file ID from a
            # full sharing URL; '-O' sets the output file path.
            # 'check=True' raises CalledProcessError on a non-zero exit code.
            subprocess.run(
                ["gdown", "--fuzzy", self.config.source_URL, "-O", str(target)],
                check=True,
            )
        except (subprocess.CalledProcessError, OSError) as e:
            logger.error(f"Failed to download file: {e}")
            # Drop any partial output so the next run does not mistake it
            # for a completed download and skip fetching it again.
            if target.exists():
                target.unlink()
            raise

        logger.info(f"Downloaded file to {target}")

    def extract_zip_file(self):
        """
        Extract the downloaded ZIP file into the configured unzip directory.

        The unzip directory is created if it does not exist. Errors are
        logged and then re-raised.

        Raises:
            zipfile.BadZipFile: If the downloaded file is not a valid ZIP file.
            FileNotFoundError: If the ZIP file is not found.
        """
        archive = self.config.local_data_file
        destination = self.config.unzip_dir

        # Create the unzip directory if it doesn't exist
        os.makedirs(destination, exist_ok=True)

        try:
            with zipfile.ZipFile(archive, 'r') as zip_ref:
                # Extract every member of the archive into the destination
                zip_ref.extractall(destination)
        except zipfile.BadZipFile:
            logger.error("The downloaded file is not a valid ZIP file.")
            raise
        except FileNotFoundError:
            logger.error("ZIP file not found.")
            raise

        logger.info(f"Extracted ZIP file to {destination}")

In [6]:
# ====================================================
# PIPELINE: Run the Data Ingestion Process
# ====================================================
# Run the ingestion stage end to end: load config, then download and extract.
if __name__ == "__main__":
    try:
        # Step 1: Load Configuration
        config_manager = ConfigurationManager()
        data_ingestion_config = config_manager.get_data_ingestion_config()

        # Step 2: Initialize Data Ingestion
        data_ingestion = DataIngestion(config=data_ingestion_config)

        # Step 3: Download and Extract Data
        data_ingestion.download_file()
        data_ingestion.extract_zip_file()

    except Exception as e:
        logger.error(f"Pipeline failed due to: {e}")
        # Re-raise after logging so the cell visibly fails instead of
        # finishing as if ingestion had succeeded.
        raise

[2025-03-26 10:35:28,528: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-03-26 10:35:28,532: INFO: common: yaml file: params.yaml loaded successfully]
[2025-03-26 10:35:28,532: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-03-26 10:35:28,535: INFO: common: created directory at: artifacts]
[2025-03-26 10:35:28,536: INFO: common: created directory at: artifacts/data_ingestion]
[2025-03-26 10:35:28,536: INFO: 3685546946: File already exists at artifacts\data_ingestion\data.zip]
[2025-03-26 10:35:30,617: INFO: 3685546946: Extracted ZIP file to artifacts\data_ingestion]
