In [1]:
import os  # Standard-library OS interface; used here to change the working directory
# Step out of the notebook's folder so that relative paths resolve one level up.
# NOTE(review): a relative chdir is not idempotent - every re-run of this cell
# moves the working directory up one more level. Restart the kernel before re-running.
os.chdir("../")  # Changing the working directory to one level up
%pwd  # Displaying the current working directory 


'd:\\Final-Year-Project\\Credit-Card-Fraud-Detection-Using-GNN'

In [2]:
import pickle  # Used for saving and loading Python objects
from pathlib import Path
from dataclasses import dataclass
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Importing constants and utility functions
from Credit_Card_Fraud_Detection.constants import *
from Credit_Card_Fraud_Detection.utils.common import read_yaml, create_directories
from Credit_Card_Fraud_Detection import logger

In [3]:
# ====================================================
# ENTITY: DataTransformationConfig
# ====================================================

@dataclass(frozen=True)
class DataTransformationConfig:
    """
    Immutable bundle of filesystem locations used by the data transformation stage.

    The instance is frozen: fields are set once at construction and cannot be
    reassigned afterwards.

    Attributes:
        root_dir: Directory that receives the transformed dataset.
        data_path: CSV file holding the input data to transform.
        customer_mapping_path: File the customer ID mapping is pickled to.
        merchant_mapping_path: File the merchant ID mapping is pickled to.
        label_encoders_path: File the label encoders are pickled to.
        scaler_path: File the feature scaler is pickled to.
    """
    # Output directory of the stage
    root_dir: Path
    # Input dataset
    data_path: Path
    # Persisted artifacts
    customer_mapping_path: Path
    merchant_mapping_path: Path
    label_encoders_path: Path
    scaler_path: Path

In [4]:
# ====================================================
# CONFIGURATION MANAGER
# ====================================================

class ConfigurationManager:
    """
    Loads the project's YAML settings and builds the configuration object
    consumed by the data transformation stage.

    Attributes:
        config: Parsed contents of the main configuration file.
        params: Parsed contents of the parameters file.
        schema: Parsed contents of the schema file.
    """
    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH):
        """
        Parse the three YAML files and create the transformation output directory.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        # Parse every settings file up front
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Make sure the stage's output directory is available
        transformation_root = self.config.data_transformation.root_dir
        create_directories([transformation_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """
        Build the data transformation configuration from the loaded settings.

        The stage's root directory is created (again) before the configuration
        object is returned.

        Returns:
            DataTransformationConfig: All configured locations wrapped in `Path`.
        """
        stage = self.config.data_transformation
        create_directories([stage.root_dir])

        # Field name -> raw value from the YAML section
        raw_locations = {
            "root_dir": stage.root_dir,
            "data_path": stage.data_path,
            "customer_mapping_path": stage.customer_mapping_file,
            "merchant_mapping_path": stage.merchant_mapping_file,
            "label_encoders_path": stage.label_encoders_file,
            "scaler_path": stage.scaler_file,
        }
        return DataTransformationConfig(
            **{field: Path(raw) for field, raw in raw_locations.items()}
        )

In [5]:
# ====================================================
# COMPONENT: Data Transformation
# ====================================================

class DataTransformation:
    """
    This class handles data preprocessing, including:
    - Loading data,
    - Handling missing values,
    - Assigning unique IDs,
    - Extracting datetime features,
    - Encoding categorical variables,
    - Engineering new features,
    - Selecting final features,
    - Normalizing numerical features.
    """
    def __init__(self, config, customer_mapping=None, merchant_mapping=None, label_encoders=None, scaler=None):
        """
        Initializes the DataTransformation class with the provided configuration and optional mappings/encoders.

        Args:
            config: Configuration object containing data transformation details.
            customer_mapping (dict, optional): Mapping of customer IDs. Defaults to None.
            merchant_mapping (dict, optional): Mapping of merchant IDs. Defaults to None.
            label_encoders (dict, optional): Dictionary of LabelEncoders for categorical features. Defaults to None.
            scaler (StandardScaler, optional): StandardScaler object for normalization. Defaults to None.
        """
        self.config = config
        self.label_encoders = label_encoders or {}  # Use provided or create empty dict
        self.scaler = scaler or StandardScaler()  # Use provided or create new StandardScaler
        self.customer_mapping = customer_mapping or {}  # Use provided or create empty dict
        self.merchant_mapping = merchant_mapping or {}  # Use provided or create empty dict

    def load_data(self):
        """
        Loads the dataset from the specified path.

        Returns:
            pd.DataFrame: Loaded DataFrame if successful, None otherwise.
        """
        try:
            return pd.read_csv(self.config.data_path)  # Load data from CSV file
        except FileNotFoundError:
            logger.error(f"File not found: {self.config.data_path}")  # Log error if file not found
            return None  # Return None if loading fails

    def handle_missing_values(self, df):
        """
        Fills missing values for categorical and numerical columns.

        Args:
            df (pd.DataFrame): DataFrame to handle missing values.

        Returns:
            pd.DataFrame: DataFrame with filled missing values.
        """
        df.fillna({"category": "unknown", "state": "unknown"}, inplace=True)  # Fill categorical missing values with 'unknown'
        num_features = ["amt", "city_pop", "lat", "long", "merch_lat", "merch_long"]  # Numerical features to handle
        df[num_features] = df[num_features].apply(pd.to_numeric, errors='coerce').fillna(0)  # Convert to numeric, fill NaN with 0
        return df

    def create_ids(self, df):
        """
        Assigns unique numerical IDs to customers and merchants.

        Args:
            df (pd.DataFrame): DataFrame to assign IDs.

        Returns:
            pd.DataFrame: DataFrame with assigned customer and merchant IDs.
        """
        self.customer_mapping = {customer: idx for idx, customer in enumerate(df['cc_num'].unique())}  # Create mapping for customers
        df['customer_id'] = df['cc_num'].map(self.customer_mapping)  # Map customer IDs

        self.merchant_mapping = {merchant: idx + len(self.customer_mapping) for idx, merchant in enumerate(df['merchant'].unique())}  # Create mapping for merchants
        df['merchant_id'] = df['merchant'].map(self.merchant_mapping)  # Map merchant IDs
        
        df.drop(columns=['cc_num', 'merchant'], inplace=True)  # Drop original customer and merchant columns
        return df

    def extract_datetime(self, df):
        """
        Extracts hour information from the transaction timestamp.

        Args:
            df (pd.DataFrame): DataFrame with transaction timestamps.

        Returns:
            pd.DataFrame: DataFrame with extracted hour information.
        """
        if 'trans_date_trans_time' in df.columns:  # Check if timestamp column exists
            df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])  # Convert to datetime
            df['trans_hour'] = df['trans_date_trans_time'].dt.hour  # Extract hour
        return df

    def encode_categorical(self, df):
        """
        Encodes categorical variables using Label Encoding.

        Args:
            df (pd.DataFrame): DataFrame with categorical columns.

        Returns:
            pd.DataFrame: DataFrame with encoded categorical columns.
        """
        df["gender"] = df["gender"].map({"M": 1, "F": 0}) if "gender" in df.columns else df["gender"] # Map gender to numerical if exists.
        for col in ["category", "state"]:  # Loop through categorical columns
            if col not in self.label_encoders:  # If encoder not already created
                self.label_encoders[col] = LabelEncoder()  # Create new LabelEncoder
                df[col] = self.label_encoders[col].fit_transform(df[col])  # Fit and transform
            else:
                df[col] = self.label_encoders[col].transform(df[col])  # Transform using existing encoder
        return df

    def engineer_features(self, df):
        """
        Creates new features for better fraud detection analysis.

        Args:
            df (pd.DataFrame): DataFrame to engineer features on.

        Returns:
            pd.DataFrame: DataFrame with new engineered features.
        """
        df["transaction_unique"] = range(len(df))  # Unique transaction ID
        df["customer_avg_amt"] = df.groupby("customer_id")["amt"].transform("mean")  # Average amount per customer
        df["merchant_avg_amt"] = df.groupby("merchant_id")["amt"].transform("mean")  # Average amount per merchant
        df["high_amt"] = (df["amt"] > df["customer_avg_amt"] + 3 * df.groupby("customer_id")["amt"].transform("std").fillna(0)).astype(int)  # High amount flag
        df["amt_ratio_merchant"] = df["amt"] / (df["merchant_avg_amt"] + 1e-9)  # Amount ratio to merchant average
        df["amt_diff_customer_avg"] = df["amt"] - df["customer_avg_amt"]  # Amount difference from customer average
        df["hour_cos"] = np.cos(2 * np.pi * df["trans_hour"] / 24)  # Cosine transformation of hour
        df["amt_per_city_pop"] = df["amt"] / (df["city_pop"] + 1e-9)  # Amount per city population
        df["customer_min_amt"] = df.groupby("customer_id")["amt"].transform("min")  # Minimum amount per customer
        df["merchant_min_amt"] = df.groupby("merchant_id")["amt"].transform("min")  # Minimum amount per merchant
        df["customer_amt_std"] = df.groupby("customer_id")["amt"].transform("std").fillna(0)  # Standard deviation of amount per customer
        df["merchant_amt_std"] = df.groupby("merchant_id")["amt"].transform("std").fillna(0)  # Standard deviation of amount per merchant
        df["sqrt_amt"] = np.sqrt(df["amt"]) # Square root of amount.
        return df

    def select_final_features(self, df):
        """
        Selects only relevant features for modeling.

        Args:
            df (pd.DataFrame): DataFrame to select features from.

        Returns:
            pd.DataFrame: DataFrame with selected features.
        """
        columns_to_keep = ['high_amt', 'amt_ratio_merchant', 'sqrt_amt', 'amt', 'customer_avg_amt',
                            'amt_diff_customer_avg', 'hour_cos', 'amt_per_city_pop', 'customer_id', 'merchant_id',
                            'merchant_avg_amt','merchant_min_amt','customer_min_amt','customer_amt_std','merchant_amt_std','transaction_unique']
        if 'is_fraud' in df.columns:  # If target column exists
            columns_to_keep.append('is_fraud')  # Add target column
        return df[columns_to_keep]

    def normalize_features(self, df):
        """
        Normalizes numerical features using StandardScaler.

        Args:
            df (pd.DataFrame): DataFrame to normalize.

        Returns:
            pd.DataFrame: DataFrame with normalized features.
        """
        num_features = [col for col in df.select_dtypes(include=np.number) if col not in ['is_fraud', 'customer_id', 'merchant_id', 'transaction_unique']]  # Select numerical columns
        df[num_features] = self.scaler.fit_transform(df[num_features]).round(5)  # Normalize and round to 5 decimal places
        return df

    def preprocess(self):
        """
        Main function to execute the full preprocessing pipeline.
        Loads, cleans, transforms, and saves the processed dataset.

        Returns:
            tuple: customer_mapping, merchant_mapping, label_encoders, scaler if successful, else tuple of None.
        """
        df = self.load_data()  # Load data
        if df is None:  # If loading failed
            return None, None, None, None  # Return None for all mappings and encoders
        
        df = self.handle_missing_values(df)  # Handle missing values
        df = self.create_ids(df)  # Create customer and merchant IDs
        df = self.extract_datetime(df)  # Extract datetime features
        df = self.encode_categorical(df)  # Encode categorical features
        df = self.engineer_features(df)  # Engineer new features
        df = self.select_final_features(df)  # Select final features
        df = self.normalize_features(df)  # Normalize numerical features
        
        preprocessed_path = os.path.join(self.config.root_dir, "transformed_dataset.csv")  # Define output path
        df.to_csv(preprocessed_path, index=False)  # Save preprocessed data to CSV
        logger.info(f"Preprocessed data saved at {preprocessed_path}")  # Log successful saving
        
        return self.customer_mapping, self.merchant_mapping, self.label_encoders, self.scaler  # Return mappings and encoders

In [6]:
# ===========================================
# TRAINING DATA PIPELINE
# ===========================================
try:
    # Step 1: Load configuration settings
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()

    # Step 2: Initialize the DataTransformation class with the configuration
    data_transformation = DataTransformation(config=data_transformation_config)

    # Step 3: Perform data preprocessing
    customer_mapping, merchant_mapping, label_encoders, scaler = data_transformation.preprocess()

    # preprocess() reports a failed data load by returning None for every artifact.
    # Stop here instead of silently pickling None into the artifact files.
    if any(item is None for item in (customer_mapping, merchant_mapping, label_encoders, scaler)):
        raise FileNotFoundError(
            f"Preprocessing produced no artifacts; could not load data from {data_transformation_config.data_path}"
        )

    # Step 4: Save transformation artifacts (customer/merchant mappings, encoders, and scaler)
    artifacts = (
        (data_transformation_config.customer_mapping_path, customer_mapping),
        (data_transformation_config.merchant_mapping_path, merchant_mapping),
        (data_transformation_config.label_encoders_path, label_encoders),
        (data_transformation_config.scaler_path, scaler),
    )
    for artifact_path, artifact in artifacts:
        with open(artifact_path, 'wb') as f:
            pickle.dump(artifact, f)

except Exception as e:
    # Wrap any failure so the caller sees which pipeline it came from;
    # the original exception stays attached as the cause.
    raise RuntimeError(f"Error in training pipeline: {str(e)}") from e


[2025-03-26 10:58:27,161: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-03-26 10:58:27,164: INFO: common: yaml file: params.yaml loaded successfully]
[2025-03-26 10:58:27,165: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-03-26 10:58:27,165: INFO: common: created directory at: artifacts/data_transformation]
[2025-03-26 10:58:27,168: INFO: common: created directory at: artifacts/data_transformation]
[2025-03-26 10:58:42,846: INFO: 2449968333: Preprocessed data saved at artifacts\data_transformation\transformed_dataset.csv]
