In [1]:
import os
os.chdir("../")
%pwd

'd:\\Final-Year-Project\\Credit-Card-Fraud-Detection-Using-GNN'

In [2]:
# Entity

from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage."""

    # Directory into which the stage writes its transformed CSV.
    root_dir: Path
    # Location of the input CSV that the stage reads.
    data_path: Path

In [3]:
from Credit_Card_Fraud_Detection.constants import *
from Credit_Card_Fraud_Detection.utils.common import read_yaml, create_directories

In [4]:
# Configuration

class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the three YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        # Files are read in the order config -> params -> schema.
        self.config, self.params, self.schema = map(
            read_yaml, (config_filepath, params_filepath, schema_filepath)
        )

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config for the data-transformation stage.

        Passes the stage's ``root_dir`` to ``create_directories`` before
        returning.

        Returns:
            A ``DataTransformationConfig`` populated from the
            ``data_transformation`` section of the main configuration.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
        )

In [5]:
import os
import numpy as np
import pandas as pd
from Credit_Card_Fraud_Detection import logger
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [6]:


class DataTransformation:
    """Turn the raw transaction CSV into a scaled feature table.

    The stages are exposed as separate methods and chained by ``preprocess``:
    missing-value handling, integer ID creation, datetime extraction,
    categorical encoding, feature engineering, column selection and scaling.
    Most stages modify the DataFrame they receive and return that same object;
    ``select_final_features`` returns a new frame.
    """

    def __init__(self, config):
        """Store the configuration and create empty encoder/scaler state.

        Args:
            config: Object exposing ``data_path`` (CSV to read) and ``root_dir``
                (directory the transformed CSV is written to).
        """
        self.config = config
        self.label_encoders = {}    # column name -> fitted LabelEncoder
        self.scaler = StandardScaler()
        self.customer_mapping = {}  # original cc_num value -> customer_id
        self.merchant_mapping = {}  # original merchant value -> merchant_id

    def load_data(self):
        """Read the CSV at ``config.data_path``.

        Returns:
            The loaded DataFrame, or ``None`` (after logging an error) when the
            file does not exist.
        """
        try:
            return pd.read_csv(self.config.data_path)
        except FileNotFoundError:
            logger.error(f"File not found: {self.config.data_path}")
            return None

    def handle_missing_values(self, df):
        """Fill gaps in the categorical and numeric input columns.

        ``category`` and ``state`` get the placeholder ``"unknown"``. Whichever
        of the known numeric columns are present are coerced to numbers, with
        unparseable or missing entries replaced by 0.

        Returns:
            The same DataFrame, modified.
        """
        df["category"] = df["category"].fillna("unknown")
        df["state"] = df["state"].fillna("unknown")

        num_features = ["amt", "city_pop", "lat", "long", "merch_lat", "merch_long"]
        existing_features = [col for col in num_features if col in df.columns]
        df[existing_features] = (
            df[existing_features].apply(pd.to_numeric, errors='coerce').fillna(0)
        )
        return df

    def create_ids(self, df):
        """Replace ``cc_num`` and ``merchant`` with integer IDs.

        Customers are numbered from 0 in order of first appearance; merchants
        are numbered the same way but offset so that the first merchant ID is
        one past the largest customer ID. The value -> ID dictionaries are kept
        on ``self.customer_mapping`` and ``self.merchant_mapping``.

        Returns:
            The same DataFrame with ``customer_id`` and ``merchant_id`` added
            and the ``cc_num`` and ``merchant`` columns dropped in place.
        """
        unique_customers = df['cc_num'].unique()
        self.customer_mapping = {
            customer: idx for idx, customer in enumerate(unique_customers)
        }
        df['customer_id'] = df['cc_num'].map(self.customer_mapping).astype(int)

        unique_merchants = df['merchant'].unique()
        # Merchant IDs continue where the customer IDs stop.
        max_customer_id = df['customer_id'].max()
        self.merchant_mapping = {
            merchant: idx + max_customer_id + 1
            for idx, merchant in enumerate(unique_merchants)
        }
        df['merchant_id'] = df['merchant'].map(self.merchant_mapping).astype(int)

        df.drop(columns=['cc_num', 'merchant'], inplace=True)
        return df

    def extract_datetime(self, df):
        """Parse ``trans_date_trans_time`` and add the hour as ``trans_hour``.

        Does nothing when the timestamp column is absent.

        Returns:
            The same DataFrame, modified.
        """
        if 'trans_date_trans_time' in df.columns:
            df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
            df['trans_hour'] = df['trans_date_trans_time'].dt.hour
        return df

    def encode_categorical(self, df):
        """Convert ``gender``, ``category`` and ``state`` to numeric codes.

        ``gender`` (when present) is mapped with ``{"M": 1, "F": 0}``.
        ``category`` and ``state`` are label-encoded: an encoder is fitted the
        first time a column is seen by this instance and reused afterwards.

        Returns:
            The same DataFrame, modified.
        """
        if "gender" in df.columns:
            df["gender"] = df["gender"].map({"M": 1, "F": 0})

        for col in ["category", "state"]:
            if col not in self.label_encoders:
                self.label_encoders[col] = LabelEncoder()
                df[col] = self.label_encoders[col].fit_transform(df[col])
            else:
                df[col] = self.label_encoders[col].transform(df[col])
        return df

    def engineer_features(self, df):
        """Add the derived amount, customer, merchant and time features.

        Requires the columns ``customer_id``, ``merchant_id``, ``amt``,
        ``category``, ``is_fraud``, ``trans_hour`` and ``city_pop``.

        Returns:
            The same DataFrame with the new feature columns added.
        """
        customer_amt = df.groupby('customer_id')['amt']
        merchant_amt = df.groupby('merchant_id')['amt']
        # Computed once and reused for both the outlier flag and the std feature.
        customer_amt_std = customer_amt.transform('std').fillna(0)

        df["transaction_unique"] = range(len(df))
        df['customer_avg_amt'] = customer_amt.transform('mean')
        # Mean of is_fraud within each category, broadcast back to every row.
        # transform() aligns on the frame's own index, so the result does not
        # depend on the index being a default RangeIndex.
        # NOTE(review): this is derived from the target over the whole frame; if a
        # train/test split happens downstream it leaks label information — confirm.
        df['merchant_category_fraud_risk'] = (
            df.groupby('category')['is_fraud'].transform('mean')
        )
        df['merchant_avg_amt'] = merchant_amt.transform('mean')
        # Flag amounts more than three standard deviations above the customer's mean.
        df['high_amt'] = (
            df['amt'] > df['customer_avg_amt'] + 3 * customer_amt_std
        ).astype(int)
        # The 1e-9 terms guard against division by zero.
        df['amt_ratio_merchant'] = df['amt'] / (df['merchant_avg_amt'] + 1e-9)
        df['amt_diff_customer_avg'] = df['amt'] - df['customer_avg_amt']
        # Cosine of the hour's position on a 24-hour cycle.
        df['hour_cos'] = np.cos(2 * np.pi * df['trans_hour'] / 24)
        df['amt_per_city_pop'] = df['amt'] / (df['city_pop'] + 1e-9)
        df['customer_min_amt'] = customer_amt.transform('min')
        df['merchant_min_amt'] = merchant_amt.transform('min')
        df['customer_amt_std'] = customer_amt_std
        df['merchant_amt_std'] = merchant_amt.transform('std').fillna(0)
        # np.sqrt yields NaN for negative amounts.
        df['sqrt_amt'] = np.sqrt(df['amt'])
        return df

    def select_final_features(self, df):
        """Return a new frame holding only the model features, IDs and label.

        The result is an explicit copy so that later in-place column assignment
        (see ``normalize_features``) never targets a slice of the input frame.
        """
        columns_to_keep = [
            'high_amt', 'amt_ratio_merchant', 'sqrt_amt', 'amt', 'customer_avg_amt',
            'amt_diff_customer_avg', 'hour_cos', 'amt_per_city_pop', 'customer_min_amt',
            'merchant_category_fraud_risk', 'merchant_avg_amt', 'merchant_min_amt',
            'customer_amt_std', 'merchant_amt_std', 'customer_id', 'merchant_id',
            'transaction_unique', 'is_fraud',
        ]
        return df[columns_to_keep].copy()

    def normalize_features(self, df):
        """Standard-scale the numeric feature columns, rounded to 5 decimals.

        ``is_fraud``, ``customer_id``, ``merchant_id`` and
        ``transaction_unique`` are left untouched. The scaler is (re)fitted on
        the frame passed in.

        Returns:
            The same DataFrame, modified.
        """
        excluded = ['is_fraud', 'customer_id', 'merchant_id', 'transaction_unique']
        num_features_to_scale = [
            col
            for col in df.select_dtypes(include=np.number).columns.tolist()
            if col not in excluded
        ]
        df[num_features_to_scale] = (
            self.scaler.fit_transform(df[num_features_to_scale]).round(5)
        )
        return df

    def preprocess(self):
        """Run the full pipeline and write the result to disk.

        The output is saved as ``transformed_dataset.csv`` inside
        ``config.root_dir``.

        Returns:
            The transformed DataFrame, or ``None`` when the input file could
            not be loaded (``load_data`` logs the error).
        """
        df = self.load_data()
        if df is None:
            return

        df = self.handle_missing_values(df)
        df = self.create_ids(df)
        df = self.extract_datetime(df)
        df = self.encode_categorical(df)
        df = self.engineer_features(df)
        df = self.select_final_features(df)
        df = self.normalize_features(df)

        preprocessed_path = os.path.join(self.config.root_dir, "transformed_dataset.csv")
        df.to_csv(preprocessed_path, index=False)
        logger.info(f"Preprocessed data saved at {preprocessed_path}")
        return df

In [7]:
# Pipeline

try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    # preprocess() returns None when the input CSV could not be loaded.
    transformed_data = data_transformation.preprocess()
except Exception:
    # Record the failure in the project log, then propagate it unchanged.
    logger.exception("Data transformation stage failed")
    raise

[2025-03-24 09:31:12,312: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-03-24 09:31:12,312: INFO: common: yaml file: params.yaml loaded successfully]
[2025-03-24 09:31:12,315: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-03-24 09:31:12,315: INFO: common: created directory at: artifacts]
[2025-03-24 09:31:12,316: INFO: common: created directory at: artifacts/data_transformation]
[2025-03-24 09:31:28,115: INFO: 2947974737: Preprocessed data saved at artifacts/data_transformation\transformed_dataset.csv]
