In [1]:
import os
# Step up one directory so the relative paths used by later cells resolve from
# the parent folder (the captured output below shows the resulting directory).
# NOTE: the path is relative to the *current* directory, so re-running this
# cell without restarting the kernel moves up one more level each time.
os.chdir("../")
%pwd

'd:\\Final-Year-Project\\Credit-Card-Fraud-Detection-Using-GNN'

In [2]:
# Entity

from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage."""
    # Directory the stage writes `transformed_dataset.csv` into.
    root_dir: Path
    # Location of the input CSV that the stage reads with `pd.read_csv`.
    data_path: Path

In [3]:
from Credit_Card_Fraud_Detection.constants import *
from Credit_Card_Fraud_Detection.utils.common import read_yaml, create_directories

In [4]:
# Configuration

class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Load config, params and schema, then create the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        # Loaded in the order config -> params -> schema; each result is
        # stored on the instance under the matching attribute name.
        yaml_sources = (
            ("config", config_filepath),
            ("params", params_filepath),
            ("schema", schema_filepath),
        )
        for attribute, filepath in yaml_sources:
            setattr(self, attribute, read_yaml(filepath))

        # presumably read_yaml returns an attribute-accessible mapping -- the
        # code relies on `self.config.artifacts_root`; confirm in utils.common
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Create the stage directory and return the data-transformation config.

        Returns:
            A ``DataTransformationConfig`` populated from the
            ``data_transformation`` section of the main config.
        """
        section = self.config.data_transformation

        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
        )

In [5]:
import os
import numpy as np
import pandas as pd
from Credit_Card_Fraud_Detection import logger
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE

In [6]:
# import pandas as pd
# from sklearn.preprocessing import LabelEncoder, StandardScaler
# import os
# import logging
# import numpy as np
# from geopy.distance import geodesic

# # Configure logging to see what the code is doing
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# logger = logging.getLogger(__name__)

# class DataTransformation:
#     def __init__(self, config):
#         self.config = config
#         self.label_encoders = {}
#         self.scaler = StandardScaler()
#         self.customer_mapping = {}
#         self.merchant_mapping = {}

#     def load_data(self):
#         try:
#             df = pd.read_csv(self.config.data_path)
#             logger.info("Data loaded successfully.") # Tells us the data was loaded
#             return df
#         except FileNotFoundError:
#             logger.error(f"File not found: {self.config.data_path}") # Tells us if the file wasn't found
#             return None

#     def drop_unwanted_columns(self, df):
#         # We don't need some columns, so we remove them
#         columns_to_drop = ["merch_zipcode", "Unnamed: 0", "trans_num", "unix_time", "zip", "street", "city", "job", "first", "last"]
#         df = df.drop(columns=columns_to_drop, errors="ignore")
#         logger.info(f"Unwanted columns {columns_to_drop} dropped.") # Tells us what columns were removed
#         return df

#     def handle_missing_categorical(self, df):
#         # If some categories or states are missing, we fill them with "unknown"
#         df["category"] = df["category"].fillna("unknown")
#         df["state"] = df["state"].fillna("unknown")
#         logger.info("Missing categorical values handled.") # Tells us missing values were filled
#         return df

#     def handle_numerical_features(self, df):
#         # We make sure numbers are numbers, and fill missing numbers with 0
#         num_features = ["amt", "city_pop", "lat", "long", "merch_lat", "merch_long"]
#         existing_features = [col for col in num_features if col in df.columns]
#         df[existing_features] = df[existing_features].apply(pd.to_numeric, errors='coerce').fillna(0)
#         logger.info("Numerical features converted and missing values handled.") # Tells us numbers were fixed
#         return df

#     def normalize_numerical_features(self, df):
#         # We make the numbers have similar scales
#         num_features = ["amt", "city_pop", "lat", "long", "merch_lat", "merch_long"]
#         existing_features = [col for col in num_features if col in df.columns]
#         df[existing_features] = self.scaler.fit_transform(df[existing_features])
#         for col in ["lat", "long", "merch_lat", "merch_long"]:
#             if col in df.columns:
#                 df[col] = df[col].round(3)
#         logger.info("Numerical features normalized.") # Tells us the numbers were scaled
#         return df

#     def create_customer_and_merchant_ids(self, df):
#         # We give each customer and merchant a unique number
#         unique_customers = df['cc_num'].unique()
#         self.customer_mapping = {customer: idx for idx, customer in enumerate(unique_customers)}
#         df['customer_id'] = df['cc_num'].map(self.customer_mapping).astype(int)
#         unique_merchants = df['merchant'].unique()
#         self.merchant_mapping = {merchant: idx + len(unique_customers) for idx, merchant in enumerate(unique_merchants)}
#         df['merchant_id'] = df['merchant'].map(self.merchant_mapping).astype(int)
#         df.drop(columns=['cc_num', 'merchant'], inplace=True)
#         logger.info("Customer and merchant IDs created.") # Tells us IDs were created
#         return df

#     def extract_datetime_components(self, df):
#         # We break down the date and time into year, month, day, etc.
#         if 'trans_date_trans_time' in df.columns:
#             df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
#             df['trans_year'] = df['trans_date_trans_time'].dt.year
#             df['trans_month'] = df['trans_date_trans_time'].dt.month
#             df['trans_day'] = df['trans_date_trans_time'].dt.day
#             df['trans_hour'] = df['trans_date_trans_time'].dt.hour
#             df['trans_minute'] = df['trans_date_trans_time'].dt.minute
#             df['trans_second'] = df['trans_date_trans_time'].dt.second
#             df['trans_weekday'] = df['trans_date_trans_time'].dt.day_name()
#             def get_season(month):
#                 if month in [3, 4, 5]: return 'Spring'
#                 elif month in [6, 7, 8]: return 'Summer'
#                 elif month in [9, 10, 11]: return 'Autumn'
#                 else: return 'Winter'
#             df['trans_season'] = df['trans_month'].apply(get_season)
#             df.drop('trans_date_trans_time', axis=1, inplace=True)
#             logger.info("Datetime components extracted and trans_date_trans_time dropped.") # Tells us time was broken down
#         else:
#             logger.error("trans_date_trans_time column not found.") # Tells us if time column is missing
#         return df

#     def calculate_age(self, df):
#         # We calculate the age of the customer
#         if 'dob' in df.columns and 'trans_year' in df.columns:
#             df['dob'] = pd.to_datetime(df['dob'])
#             df['age'] = (df['trans_year'] - df['dob'].dt.year) - (
#                 (df['trans_month'] < df['dob'].dt.month) |
#                 ((df['trans_month'] == df['dob'].dt.month) & (df['trans_day'] < df['dob'].dt.day))
#             )
#             df.drop('dob', axis=1, inplace=True)
#             logger.info("Age calculated from dob and trans_year.") # Tells us age was calculated
#         else:
#             logger.error("dob or trans_year column not found.") # Tells us if age columns are missing
#         return df

#     def encode_data(self, df):
#         # We turn categories and states into numbers
#         if "gender" in df.columns:
#             df["gender"] = df["gender"].map({"M": 1, "F": 0})
#             logger.info("Gender column encoded (M -> 1, F -> 0).") # Tells us gender was encoded
#         for col in ["category", "state"]:
#             if col not in self.label_encoders:
#                 self.label_encoders[col] = LabelEncoder()
#                 df[col] = self.label_encoders[col].fit_transform(df[col])
#             else:
#                 df[col] = self.label_encoders[col].transform(df[col])
#         logger.info("Categorical columns category and state encoded.") # Tells us categories were encoded
#         if "trans_weekday" in df.columns:
#             weekday_mapping = {
#                 "Monday": 0, "Tuesday": 1, "Wednesday": 2, "Thursday": 3,
#                 "Friday": 4, "Saturday": 5, "Sunday": 6
#             }
#             df["trans_weekday"] = df["trans_weekday"].map(weekday_mapping)
#             logger.info("trans_weekday column encoded.") # Tells us day of week was encoded
#         if "trans_season" in df.columns:
#             season_mapping = {"Spring": 0, "Summer": 1, "Autumn": 2, "Winter": 3}
#             df["trans_season"] = df["trans_season"].map(season_mapping)
#             logger.info("trans_season column encoded.") # Tells us season was encoded
#         return df

#     def feature_engineering(self, df):
#         # We create new features to help the model learn better
#         # Transaction Frequency and Recency (How often and how recently)
#         df['customer_trans_freq'] = df.groupby('customer_id')['transaction_unique'].transform('count') # How many transactions per customer
#         df['merchant_trans_freq'] = df.groupby('merchant_id')['transaction_unique'].transform('count') # How many transactions per merchant
#         df['customer_last_trans'] = df.groupby('customer_id')['trans_year'].transform('max') # Last transaction year per customer
#         df['customer_trans_recency'] = df['trans_year'] - df['customer_last_trans'] # How recent the customer's last transaction was

#         # Time Based Features (Time of day, weekend)
#         def time_of_day(hour):
#             if 6 <= hour < 12: return 'Morning'
#             elif 12 <= hour < 18: return 'Afternoon'
#             elif 18 <= hour < 24: return 'Evening'
#             else: return 'Night'
#         df['time_of_day'] = df['trans_hour'].apply(time_of_day)
#         df = pd.get_dummies(df, columns=['time_of_day'], prefix='time_of_day')  # one hot encoding
#         for col in ['time_of_day_Afternoon', 'time_of_day_Evening', 'time_of_day_Morning', 'time_of_day_Night']:
#             if col in df.columns:
#                 df[col] = df[col].astype(int)

#         df['is_weekend'] = df['trans_weekday'].apply(lambda x: 1 if x in ['Saturday', 'Sunday'] else 0)

#         # Location Based Features (Distance, state frequency)
#         def calculate_distance(row):
#             try:
#                 customer_coords = (row['lat'], row['long'])
#                 merchant_coords = (row['merch_lat'], row['merch_long'])
#                 return geodesic(customer_coords, merchant_coords).km
#             except ValueError:
#                 return np.nan  # Handle cases with missing lat/long

#         df['distance_customer_merchant'] = df.apply(calculate_distance, axis=1) # Distance between customer and merchant
#         df['state_freq'] = df.groupby('state')['transaction_unique'].transform('count') # How often transactions happen in each state

#         # Transaction Amount Features (Average amount, ratio, std, large transaction)
#         df['customer_avg_amt'] = df.groupby('customer_id')['amt'].transform('mean') # Average transaction amount per customer
#         df['amt_ratio_avg'] = df['amt'] / (df['customer_avg_amt'] + 1e-9)  # Avoid division by zero, ratio of transaction amount to average
#         df['customer_amt_std'] = df.groupby('customer_id')['amt'].transform('std').fillna(0) # Standard deviation of transaction amounts per customer
#         df['large_transaction'] = (df['amt'] > df['customer_avg_amt'] + 3 * df['customer_amt_std']).astype(int) # Indicator if transaction is much larger than usual

#         # Customer Behavior Features (Most used category, category count)
#         df['customer_most_used_category'] = df.groupby('customer_id')['category'].transform(lambda x: x.mode()[0] if not x.mode().empty else np.nan) # Most frequent transaction category per customer
#         df['customer_category_count'] = df.groupby(['customer_id', 'category'])['transaction_unique'].transform('count') # How many transactions per customer and category

#         # Merchant Behavior Features (Merchant amount std and average)
#         df['merchant_amt_std'] = df.groupby('merchant_id')['amt'].transform('std').fillna(0) # Standard deviation of transaction amounts per merchant
#         df['merchant_avg_amt'] = df.groupby('merchant_id')['amt'].transform('mean') # Average transaction amount per merchant

#         #Time since last category change, and previous category
#         df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'], errors='coerce')
#         df.sort_values(['customer_id', 'trans_date_trans_time'], inplace=True)
#         df['prev_category'] = df.groupby('customer_id')['category'].shift(1).fillna(df['category']) # Previous category
#         df['time_since_last_category_change'] = df.groupby('customer_id')['trans_date_trans_time'].diff().dt.total_seconds().fillna(0) # Time since customer changed categories

#         #Transaction speed
#         df['transaction_speed'] = df.groupby('customer_id')['trans_date_trans_time'].diff().dt.total_seconds().fillna(0) # Time between transactions

#         #Merchant category fraud risk.
#         merchant_cat_fraud = df.groupby('category')['is_fraud'].mean().to_dict()
#         df['merchant_category_fraud_risk'] = df['category'].map(merchant_cat_fraud) #average fraud rate per category

#         logger.info("Advanced features engineered.") # Tells us new features were created
#         return df

#     def preprocess(self):
#         df = self.load_data()
#         if df is None:
#             return

#         df = self.drop_unwanted_columns(df)
#         df = self.handle_missing_categorical(df)
#         df = self.handle_numerical_features(df)
#         df = self.normalize_numerical_features(df)
#         df = self.create_customer_and_merchant_ids(df)
#         df = self.extract_datetime_components(df)
#         df = self.calculate_age(df)
#         df = self.encode_data(df)

#         df["transaction_unique"] = range(len(df))

#         df = self.feature_engineering(df)

#         preprocessed_path = os.path.join(self.config.root_dir, "transformed_dataset.csv")
#         df.to_csv(preprocessed_path, index=False)
#         logger.info(f"Preprocessed data saved at {preprocessed_path}") # Tells us data was saved

#         return df

In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
import os
import logging
import numpy as np
from geopy.distance import geodesic

# basicConfig only configures the root logger when it has no handlers yet, so
# this call has no effect if logging was already set up earlier in the session
# (for example by an imported package).
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Rebinds `logger`, replacing the object imported from
# Credit_Card_Fraud_Detection in an earlier cell with a plain module logger.
logger = logging.getLogger(__name__)

class DataTransformation:
    """Builds the scaled feature table from the raw transactions CSV.

    Each stage is a separate method so it can be run and inspected on its
    own; ``preprocess`` chains them in order and writes the result to
    ``<config.root_dir>/transformed_dataset.csv``.
    """

    def __init__(self, config):
        """Store the stage configuration and create empty encoder/scaler state.

        Args:
            config: Object exposing ``data_path`` (input CSV) and ``root_dir``
                (output directory).
        """
        self.config = config
        self.label_encoders = {}
        self.scaler = StandardScaler()
        self.customer_mapping = {}
        self.merchant_mapping = {}

    def load_data(self):
        """Read the input CSV.

        Returns:
            The loaded DataFrame, or ``None`` (after logging an error) when
            the file does not exist.
        """
        try:
            return pd.read_csv(self.config.data_path)
        except FileNotFoundError:
            logger.error(f"File not found: {self.config.data_path}")
            return None

    def handle_missing_values(self, df):
        """Fill missing categoricals with "unknown" and numerics with 0.

        Modifies ``df`` in place and returns it. Numeric columns that are
        absent from ``df`` are skipped; unparseable values become 0.
        """
        df["category"] = df["category"].fillna("unknown")
        df["state"] = df["state"].fillna("unknown")
        num_features = ["amt", "city_pop", "lat", "long", "merch_lat", "merch_long"]
        existing_features = [col for col in num_features if col in df.columns]
        df[existing_features] = df[existing_features].apply(pd.to_numeric, errors='coerce').fillna(0)
        return df

    def create_ids(self, df):
        """Replace ``cc_num`` / ``merchant`` with integer ids.

        Customer ids start at 0; merchant ids continue after the last
        customer id, so the two id ranges never overlap. The mappings are
        kept on ``self.customer_mapping`` / ``self.merchant_mapping``.
        Modifies ``df`` in place and returns it.
        """
        unique_customers = df['cc_num'].unique()
        self.customer_mapping = {customer: idx for idx, customer in enumerate(unique_customers)}
        df['customer_id'] = df['cc_num'].map(self.customer_mapping).astype(int)

        unique_merchants = df['merchant'].unique()
        merchant_offset = len(unique_customers)
        self.merchant_mapping = {
            merchant: idx + merchant_offset for idx, merchant in enumerate(unique_merchants)
        }
        df['merchant_id'] = df['merchant'].map(self.merchant_mapping).astype(int)

        df.drop(columns=['cc_num', 'merchant'], inplace=True)
        return df

    def extract_datetime(self, df):
        """Parse ``trans_date_trans_time`` and derive ``trans_hour``.

        When the timestamp column is absent the frame is returned unchanged
        and an error is logged, because ``engineer_features`` needs
        ``trans_hour`` and would otherwise fail later with a bare KeyError.
        """
        if 'trans_date_trans_time' in df.columns:
            df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
            df['trans_hour'] = df['trans_date_trans_time'].dt.hour
        else:
            logger.error("trans_date_trans_time column not found.")
        return df

    def encode_categorical(self, df):
        """Encode ``gender``, ``category`` and ``state`` as integers.

        ``gender`` is mapped M -> 1, F -> 0 (any other value becomes NaN).
        ``category`` and ``state`` use one LabelEncoder each: fitted on the
        first call and reused on later calls with the same instance.
        """
        if "gender" in df.columns:
            df["gender"] = df["gender"].map({"M": 1, "F": 0})
        for col in ["category", "state"]:
            if col not in self.label_encoders:
                self.label_encoders[col] = LabelEncoder()
                df[col] = self.label_encoders[col].fit_transform(df[col])
            else:
                df[col] = self.label_encoders[col].transform(df[col])
        return df

    def engineer_features(self, df):
        """Add per-customer, per-merchant and amount-derived features.

        Requires the columns ``customer_id``, ``merchant_id``, ``amt``,
        ``category``, ``is_fraud``, ``trans_hour`` and ``city_pop``.
        Modifies ``df`` in place and returns it.
        """
        df["transaction_unique"] = range(len(df))
        df['customer_avg_amt'] = df.groupby('customer_id')['amt'].transform('mean')

        # Mean fraud rate per category, looked up by each row's category.
        # The previous code stored the dict in a column first and then used
        # that column as the mapper, which only produced the right values
        # when the frame had a default 0..n-1 index and 0..k-1 category codes.
        # NOTE(review): computed from `is_fraud` over the whole frame -- if
        # the data is split into train/test afterwards this leaks labels.
        category_fraud_rate = df.groupby('category')['is_fraud'].mean()
        df['merchant_category_fraud_risk'] = df['category'].map(category_fraud_rate)

        df['merchant_avg_amt'] = df.groupby('merchant_id')['amt'].transform('mean')

        # Single-transaction groups have an undefined std; treat it as 0.
        customer_std = df.groupby('customer_id')['amt'].transform('std').fillna(0)
        df['high_amt'] = (df['amt'] > df['customer_avg_amt'] + 3 * customer_std).astype(int)

        # 1e-9 guards against division by zero.
        df['amt_ratio_merchant'] = df['amt'] / (df['merchant_avg_amt'] + 1e-9)
        df['amt_diff_customer_avg'] = df['amt'] - df['customer_avg_amt']
        # Cyclic encoding of the hour; cosine alone maps h and 24-h to the
        # same value.
        df['hour_cos'] = np.cos(2 * np.pi * df['trans_hour'] / 24)
        df['amt_per_city_pop'] = df['amt'] / (df['city_pop'] + 1e-9)
        df['customer_min_amt'] = df.groupby('customer_id')['amt'].transform('min')
        df['merchant_min_amt'] = df.groupby('merchant_id')['amt'].transform('min')
        df['customer_amt_std'] = customer_std
        df['merchant_amt_std'] = df.groupby('merchant_id')['amt'].transform('std').fillna(0)
        df['sqrt_amt'] = np.sqrt(df['amt'])
        return df

    def select_final_features(self, df):
        """Return only the model features, ids and the ``is_fraud`` label.

        The result is an independent copy, so the scaling step can assign
        into it without pandas' chained-assignment warning.
        """
        columns_to_keep = [
            'high_amt', 'amt_ratio_merchant', 'sqrt_amt', 'amt', 'customer_avg_amt',
            'amt_diff_customer_avg', 'hour_cos', 'amt_per_city_pop', 'customer_min_amt',
            'merchant_category_fraud_risk', 'merchant_avg_amt', 'merchant_min_amt',
            'customer_amt_std', 'merchant_amt_std', 'customer_id', 'merchant_id',
            'transaction_unique', 'is_fraud',
        ]
        return df[columns_to_keep].copy()

    def normalize_features(self, df):
        """Standard-scale every numeric column except the label and ids.

        The scaler is fitted on ``df`` itself; scaled values are rounded to
        5 decimals. Modifies ``df`` in place and returns it.
        """
        excluded = ['is_fraud', 'customer_id', 'merchant_id', 'transaction_unique']
        num_features_to_scale = [
            col for col in df.select_dtypes(include=np.number).columns.tolist()
            if col not in excluded
        ]
        df[num_features_to_scale] = self.scaler.fit_transform(df[num_features_to_scale]).round(5)
        return df

    def preprocess(self):
        """Run all stages and write the result as CSV.

        Returns:
            The transformed DataFrame, or ``None`` when the input file could
            not be loaded.
        """
        df = self.load_data()
        if df is None:
            return None
        df = self.handle_missing_values(df)
        df = self.create_ids(df)
        df = self.extract_datetime(df)
        df = self.encode_categorical(df)
        df = self.engineer_features(df)
        df = self.select_final_features(df)
        df = self.normalize_features(df)

        preprocessed_path = os.path.join(self.config.root_dir, "transformed_dataset.csv")
        df.to_csv(preprocessed_path, index=False)
        logger.info(f"Preprocessed data saved at {preprocessed_path}")
        return df

In [8]:
# Pipeline

try:
    # Build the stage config from the YAML files, then run the full transformation.
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    # preprocess() returns None when the input CSV could not be found.
    transformed_data = data_transformation.preprocess()
except Exception:
    # Record the failure, then re-raise with a bare `raise` so the original
    # traceback is propagated unchanged.
    logger.exception("Data transformation pipeline failed")
    raise

[2025-03-22 21:54:04,648: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-03-22 21:54:04,655: INFO: common: yaml file: params.yaml loaded successfully]
[2025-03-22 21:54:04,656: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-03-22 21:54:04,657: INFO: common: created directory at: artifacts]
[2025-03-22 21:54:04,658: INFO: common: created directory at: artifacts/data_transformation]
[2025-03-22 21:54:20,887: INFO: 2722166197: Preprocessed data saved at artifacts/data_transformation\transformed_dataset.csv]
