In [1]:
import os
# Move the working directory up one level; presumably this makes the project root
# the current directory so that relative config/artifact paths resolve — confirm.
# NOTE(review): the path is relative to the *current* directory, so re-running this
# cell without restarting the kernel moves up one more level each time.
os.chdir("../")
%pwd

'd:\\Final-Year-Project\\Credit-Card-Fraud-Detection-Using-GNN'

In [2]:
from Credit_Card_Fraud_Detection.constants import *
from Credit_Card_Fraud_Detection.utils.common import read_yaml, create_directories
from Credit_Card_Fraud_Detection import logger

In [3]:
# ConfigurationManager.py
from dataclasses import dataclass
from pathlib import Path
import os
from box import ConfigBox
import yaml



@dataclass(frozen=True)
class TestDataTransformationConfig:
    """Immutable set of filesystem paths consumed by the test-data transformation step.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``.
    """

    # Directory the transformed test dataset is written into.
    root_dir: Path
    # CSV file holding the raw test data.
    data_path: Path
    # Pickled customer mapping (card number -> customer id).
    customer_mapping_path: Path
    # Pickled merchant mapping (merchant name -> merchant id).
    merchant_mapping_path: Path
    # Pickled collection of per-column label encoders.
    label_encoders_path: Path
    # Pickled feature scaler.
    scaler_path: Path



In [4]:
class ConfigurationManager:
    """Reads the project's YAML settings and builds typed per-step config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Load the config, params and schema YAML files and create the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        # The files are read in the same order as the attributes are listed.
        self.config, self.params, self.schema = (
            read_yaml(yaml_path)
            for yaml_path in (config_filepath, params_filepath, schema_filepath)
        )
        create_directories([self.config.artifacts_root])

    def get_test_data_transformation_config(self) -> TestDataTransformationConfig:
        """Build the path bundle for the test-data transformation step.

        Creates the step's ``root_dir`` as a side effect. The four artifact paths
        are formed by joining ``data_transformation.root_dir`` with the file names
        configured in the ``data_transformation`` section.

        Returns:
            A populated ``TestDataTransformationConfig``.
        """
        step_settings = self.config.test_data_transformation
        output_dir = Path(step_settings.root_dir)
        create_directories([output_dir])

        def training_artifact(file_name):
            # Resolve an artifact file name against the data_transformation directory.
            return Path(os.path.join(self.config.data_transformation.root_dir, file_name))

        training_settings = self.config.data_transformation
        return TestDataTransformationConfig(
            root_dir=output_dir,
            data_path=Path(step_settings.data_path),
            customer_mapping_path=training_artifact(training_settings.customer_mapping_file),
            merchant_mapping_path=training_artifact(training_settings.merchant_mapping_file),
            label_encoders_path=training_artifact(training_settings.label_encoders_file),
            scaler_path=training_artifact(training_settings.scaler_file),
        )

In [5]:
import pandas as pd
import numpy as np
import pickle
import os
import logging

# Configure logging (replace with your actual logging setup)
# NOTE(review): `logger` was already imported from Credit_Card_Fraud_Detection in an
# earlier cell; the assignment below rebinds that name to a logger named after
# this module instead.
# NOTE(review): logging.basicConfig does nothing when the root logger already has
# handlers, so the format given here may not take effect if the package import
# configured logging first — confirm against the actual log output.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class TestDataTransformation:
    """Applies previously saved preprocessing artifacts to the test dataset.

    The pipeline (see ``preprocess``) loads the test CSV and four pickled
    artifacts (customer mapping, merchant mapping, label encoders, scaler),
    derives the engineered features, scales them with the loaded scaler and
    writes the result to ``<config.root_dir>/transformed_test_dataset.csv``.
    """

    def __init__(self, config, customer_mapping_path, merchant_mapping_path, label_encoders_path, scaler_path):
        """Store the step configuration and the artifact locations.

        Args:
            config: Object exposing ``data_path`` (input CSV) and ``root_dir``
                (output directory).
            customer_mapping_path: Pickle file with the ``cc_num`` -> customer id mapping.
            merchant_mapping_path: Pickle file with the ``merchant`` -> merchant id mapping.
            label_encoders_path: Pickle file with a mapping of column name -> encoder.
            scaler_path: Pickle file with an object exposing ``transform``.
        """
        self.config = config
        self.customer_mapping_path = customer_mapping_path
        self.merchant_mapping_path = merchant_mapping_path
        self.label_encoders_path = label_encoders_path
        self.scaler_path = scaler_path

    def load_data(self):
        """Read the test CSV at ``config.data_path``.

        Returns:
            The loaded DataFrame, or ``None`` (after logging an error) when the
            file does not exist.
        """
        try:
            return pd.read_csv(self.config.data_path)
        except FileNotFoundError:
            logger.error(f"File not found: {self.config.data_path}")
            return None

    @staticmethod
    def _load_pickle(path):
        """Unpickle and return the object stored at ``path``."""
        # SECURITY: pickle.load can execute arbitrary code embedded in the file;
        # only point this at artifacts produced by a trusted pipeline.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def load_artifacts(self):
        """Load the four pickled preprocessing artifacts.

        Returns:
            ``(customer_mapping, merchant_mapping, label_encoders, scaler)``, or
            ``(None, None, None, None)`` (after logging an error) when any of
            the files is missing.
        """
        artifact_paths = (
            self.customer_mapping_path,
            self.merchant_mapping_path,
            self.label_encoders_path,
            self.scaler_path,
        )
        try:
            return tuple(self._load_pickle(path) for path in artifact_paths)
        except FileNotFoundError as e:
            logger.error(f"Artifact file not found: {e}")
            return None, None, None, None

    def handle_missing_values(self, df: pd.DataFrame) -> pd.DataFrame:
        """Fill missing categorical and numeric values in place and return ``df``.

        ``category`` and ``state`` (both required) get ``"unknown"``; the numeric
        columns that are present are coerced to numbers, with unparseable or
        missing entries replaced by 0.
        """
        df["category"] = df["category"].fillna("unknown")
        df["state"] = df["state"].fillna("unknown")
        num_features = ["amt", "city_pop", "lat", "long", "merch_lat", "merch_long"]
        existing_features = [col for col in num_features if col in df.columns]
        df[existing_features] = df[existing_features].apply(pd.to_numeric, errors='coerce').fillna(0)
        return df

    @staticmethod
    def _assign_ids(keys, mapping):
        """Translate ``keys`` to integer ids, registering unseen keys in ``mapping``.

        Unseen keys receive consecutive ids starting right after the largest id
        already in ``mapping`` (or at 0 when ``mapping`` is empty), in order of
        first appearance. ``mapping`` is extended in place.
        """
        raw_ids = keys.map(mapping)
        # Detect unseen keys through NaN *before* any integer cast: casting a
        # series that contains NaN to int raises.
        unseen_keys = keys[raw_ids.isna()].unique()
        if len(unseen_keys) == 0:
            return raw_ids.astype(int)
        next_id = max(mapping.values(), default=-1) + 1
        for offset, key in enumerate(unseen_keys):
            mapping[key] = next_id + offset
        return keys.map(mapping).astype(int)

    def create_ids(self, df: pd.DataFrame, customer_mapping, merchant_mapping) -> pd.DataFrame:
        """Replace ``cc_num`` and ``merchant`` with integer ``customer_id`` / ``merchant_id``.

        Keys that are absent from the supplied mappings are given fresh ids
        (continuing after the current maximum) and are added to the mappings,
        which are therefore mutated. ``df`` is modified in place and returned.

        Args:
            df: Frame containing ``cc_num`` and ``merchant`` columns.
            customer_mapping: Dict of ``cc_num`` -> integer id.
            merchant_mapping: Dict of ``merchant`` -> integer id.
        """
        df['customer_id'] = self._assign_ids(df['cc_num'], customer_mapping)
        df['merchant_id'] = self._assign_ids(df['merchant'], merchant_mapping)
        df.drop(columns=['cc_num', 'merchant'], inplace=True)
        return df

    def extract_datetime(self, df: pd.DataFrame) -> pd.DataFrame:
        """Parse ``trans_date_trans_time`` and add ``trans_hour`` when the column exists.

        When the column is absent the frame is returned unchanged; note that
        ``engineer_features`` later requires ``trans_hour``.
        """
        if 'trans_date_trans_time' in df.columns:
            df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
            df['trans_hour'] = df['trans_date_trans_time'].dt.hour
        return df

    def encode_categorical(self, df: pd.DataFrame, label_encoders) -> pd.DataFrame:
        """Encode ``gender`` (if present), ``category`` and ``state`` in place.

        ``gender`` is mapped ``"M"`` -> 1 and ``"F"`` -> 0; any other value becomes NaN.
        ``category`` and ``state`` are passed through ``label_encoders[col].transform``.
        """
        if "gender" in df.columns:
            df["gender"] = df["gender"].map({"M": 1, "F": 0})
        for col in ["category", "state"]:
            # NOTE(review): if these are scikit-learn LabelEncoders, values not seen
            # at fit time (possibly including the "unknown" filler) raise here — confirm.
            df[col] = label_encoders[col].transform(df[col])
        return df

    def engineer_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add per-customer / per-merchant amount statistics and derived features.

        Requires ``customer_id``, ``merchant_id``, ``amt``, ``city_pop`` and
        ``trans_hour``. Statistics are computed over the rows of ``df`` itself.
        ``df`` is modified in place and returned.
        """
        customer_amt = df.groupby('customer_id')['amt']
        merchant_amt = df.groupby('merchant_id')['amt']
        # Computed once and reused for both 'high_amt' and 'customer_amt_std';
        # groups with a single row have an undefined std, treated as 0.
        customer_std = customer_amt.transform('std').fillna(0)

        df["transaction_unique"] = range(len(df))
        df['customer_avg_amt'] = customer_amt.transform('mean')
        df['merchant_avg_amt'] = merchant_amt.transform('mean')
        # Flag amounts more than three standard deviations above the customer's mean.
        df['high_amt'] = (df['amt'] > df['customer_avg_amt'] + 3 * customer_std).astype(int)
        # The 1e-9 terms guard against division by zero.
        df['amt_ratio_merchant'] = df['amt'] / (df['merchant_avg_amt'] + 1e-9)
        df['amt_diff_customer_avg'] = df['amt'] - df['customer_avg_amt']
        df['hour_cos'] = np.cos(2 * np.pi * df['trans_hour'] / 24)
        df['amt_per_city_pop'] = df['amt'] / (df['city_pop'] + 1e-9)
        df['customer_min_amt'] = customer_amt.transform('min')
        df['merchant_min_amt'] = merchant_amt.transform('min')
        df['customer_amt_std'] = customer_std
        df['merchant_amt_std'] = merchant_amt.transform('std').fillna(0)
        # Negative amounts yield NaN here.
        df['sqrt_amt'] = np.sqrt(df['amt'])
        return df

    def select_final_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a new frame holding only the model features, ids and ``is_fraud``.

        Raises ``KeyError`` when any of the listed columns is missing.
        """
        columns_to_keep = ['high_amt', 'amt_ratio_merchant', 'sqrt_amt', 'amt', 'customer_avg_amt',
                           'amt_diff_customer_avg', 'hour_cos', 'amt_per_city_pop', 'customer_min_amt',
                           'merchant_avg_amt', 'merchant_min_amt', 'customer_amt_std', 'merchant_amt_std',
                           'customer_id', 'merchant_id', 'transaction_unique', 'is_fraud']  # Keep 'is_fraud'
        # Explicit copy: normalize_features assigns into the result, which would
        # otherwise be flagged as writing to a slice of the original frame.
        return df[columns_to_keep].copy()

    def normalize_features(self, df: pd.DataFrame, scaler) -> pd.DataFrame:
        """Scale every numeric column except the ids and the label, rounded to 5 decimals.

        ``df`` is modified in place and returned.
        """
        excluded = ['customer_id', 'merchant_id', 'transaction_unique', 'is_fraud']  # don't scale 'is_fraud'
        num_features_to_scale = [col for col in df.select_dtypes(include=np.number).columns.tolist()
                                 if col not in excluded]
        # NOTE(review): presumably the scaler was fitted on these columns in this
        # order — verify against the training transformation.
        df[num_features_to_scale] = scaler.transform(df[num_features_to_scale]).round(5)
        return df

    def preprocess(self):
        """Run the full transformation and write the result as CSV.

        Returns:
            The transformed DataFrame, or ``None`` when the input CSV or any
            artifact file is missing (an error is logged in that case).
        """
        df = self.load_data()
        if df is None:
            return None
        customer_mapping, merchant_mapping, label_encoders, scaler = self.load_artifacts()
        if customer_mapping is None:
            return None

        df = self.handle_missing_values(df)
        df = self.create_ids(df, customer_mapping, merchant_mapping)
        df = self.extract_datetime(df)
        df = self.encode_categorical(df, label_encoders)
        df = self.engineer_features(df)
        df = self.select_final_features(df)
        df = self.normalize_features(df, scaler)

        preprocessed_path = os.path.join(self.config.root_dir, "transformed_test_dataset.csv")
        df.to_csv(preprocessed_path, index=False)
        logger.info(f"Preprocessed test data saved at {preprocessed_path}")
        return df

In [6]:
# Run the test-data transformation step end to end. Exceptions propagate to the
# notebook unchanged, so no try/except wrapper that merely re-raises is needed.
config = ConfigurationManager()
test_data_transformation_config = config.get_test_data_transformation_config()
test_data_transformation = TestDataTransformation(
    config=test_data_transformation_config,
    customer_mapping_path=test_data_transformation_config.customer_mapping_path,
    merchant_mapping_path=test_data_transformation_config.merchant_mapping_path,
    label_encoders_path=test_data_transformation_config.label_encoders_path,
    scaler_path=test_data_transformation_config.scaler_path,
)
# Bind the result so the cell does not echo the whole DataFrame. preprocess()
# returns None (after logging an error) when the input CSV or an artifact file
# is missing.
transformed_test_df = test_data_transformation.preprocess()

[2025-03-24 16:50:17,920: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-03-24 16:50:17,922: INFO: common: yaml file: params.yaml loaded successfully]
[2025-03-24 16:50:17,923: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-03-24 16:50:17,925: INFO: common: created directory at: artifacts]
[2025-03-24 16:50:17,926: INFO: common: created directory at: artifacts\Testing]
[2025-03-24 16:50:25,440: INFO: 231566482: Preprocessed test data saved at artifacts\Testing\transformed_test_dataset.csv]
