In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE
from scipy.stats import chi2_contingency

In [3]:
# Load Dataset
data = pd.read_csv('transaction_data_updated.csv')

In [4]:
# Display basic info
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 781207 entries, 0 to 781206
Data columns (total 24 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Unnamed: 0                781207 non-null  int64  
 1   accountNumber             781207 non-null  int64  
 2   customerId                781207 non-null  int64  
 3   creditLimit               781207 non-null  float64
 4   availableMoney            781207 non-null  float64
 5   transactionDateTime       781207 non-null  object 
 6   transactionAmount         781207 non-null  float64
 7   merchantName              781207 non-null  object 
 8   acqCountry                781207 non-null  object 
 9   merchantCountryCode       781207 non-null  object 
 10  posEntryMode              781207 non-null  int64  
 11  posConditionCode          781207 non-null  int64  
 12  merchantCategoryCode      781207 non-null  object 
 13  currentExpDate            781207 non-null  o

In [5]:
# Drop missing values
data.dropna(axis=0, inplace=True)

In [6]:
# Target column
target_column = 'isFraud'

In [7]:
### ---- CATEGORICAL FEATURE SELECTION ---- ###
# Identify categorical columns
categorical_columns = data.select_dtypes(include=['object']).columns

In [8]:
# Calculate Cramér's V for categorical associations
associations = {}
for column in categorical_columns:
    contingency_table = pd.crosstab(data[column], data[target_column])
    chi2, _, _, _ = chi2_contingency(contingency_table)
    n = contingency_table.sum().sum()
    phi2 = chi2 / n
    min_dim = min(contingency_table.shape) - 1
    associations[column] = np.sqrt(phi2 / min_dim)

In [9]:
# Define threshold for useful categorical features
association_threshold = 0.1
columns_to_drop = [col for col, assoc in associations.items() if assoc < association_threshold]
data.drop(columns=columns_to_drop, inplace=True)

In [10]:
### ---- NUMERIC FEATURE SELECTION ---- ###
# Select numeric columns
numeric_columns = data.select_dtypes(include=['int64', 'float64']).columns

In [11]:
# Compute correlations
correlations = data[numeric_columns].corrwith(data[target_column])

In [12]:
# Drop weakly correlated numeric features
correlation_threshold = 0.05  # Increased from 0.005 to keep more relevant features
columns_to_drop = [col for col, corr in correlations.items() if abs(corr) < correlation_threshold]
data.drop(columns=columns_to_drop, inplace=True)

In [13]:
### ---- BOOLEAN FEATURES ---- ###
# Convert boolean columns to integers
boolean_columns = data.select_dtypes(include=['bool']).columns
data[boolean_columns] = data[boolean_columns].astype(np.int64)

In [14]:
### ---- DATE & TIME FEATURE ENGINEERING ---- ###
# Convert date columns to datetime format
date_columns = ['accountOpenDate', 'dateOfLastAddressChange']
for col in date_columns:
    data[col] = pd.to_datetime(data[col], errors='coerce')

# Create time-based features
data['account_age_days'] = (pd.Timestamp.now() - data['accountOpenDate']).dt.days
data['days_since_address_change'] = (pd.Timestamp.now() - data['dateOfLastAddressChange']).dt.days

In [15]:
# Drop original date columns
data.drop(columns=date_columns, inplace=True)

In [16]:
### ---- APPLY FREQUENCY ENCODING ---- ###
categorical_columns = data.select_dtypes(include=['object']).columns
for col in categorical_columns:
    freq_map = data[col].value_counts().to_dict()
    data[col] = data[col].map(freq_map)

In [17]:
### ---- CHECK FOR DUPLICATE COLUMNS ---- ###
columns = data.columns
for i in range(len(columns)):
    for j in range(i + 1, len(columns)):
        if (data[columns[i]] == data[columns[j]]).all():
            data.drop(columns=[columns[j]], inplace=True)

In [18]:
# Save semi-processed data
data.to_csv('semi-treated_data.csv', index=False)

In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 781207 entries, 0 to 781206
Data columns (total 8 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   transactionDateTime        781207 non-null  int64  
 1   transactionAmount          781207 non-null  float64
 2   merchantName               781207 non-null  int64  
 3   cardPresent                781207 non-null  int64  
 4   expirationDateKeyInMatch   781207 non-null  int64  
 5   isFraud                    781207 non-null  int64  
 6   account_age_days           781207 non-null  int64  
 7   days_since_address_change  781207 non-null  int64  
dtypes: float64(1), int64(7)
memory usage: 47.7 MB


In [20]:
print(data["isFraud"].unique())


[0 1]


In [21]:
print(data["isFraud"].value_counts())

isFraud
0    769094
1     12113
Name: count, dtype: int64


In [22]:
### ---- SPLIT DATA INTO TRAIN & TEST SETS ---- ###
X = data.drop(target_column, axis=1)
y = data[target_column]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
### ---- HANDLE CLASS IMBALANCE WITH SMOTE ---- ###
smote = SMOTE(sampling_strategy=0.2, random_state=42)  # Upsample fraud cases to 20% of the dataset
X_train, y_train = smote.fit_resample(X_train, y_train)

In [24]:
print("\n✅ Test Data NaN values:\n", pd.concat([X_test, y_test], axis=1).isnull().sum())


✅ Test Data NaN values:
 transactionDateTime          0
transactionAmount            0
merchantName                 0
cardPresent                  0
expirationDateKeyInMatch     0
account_age_days             0
days_since_address_change    0
isFraud                      0
dtype: int64


In [25]:
### ---- SCALE NUMERIC FEATURES ---- ###
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [26]:
y_test

229478    0
343389    0
595173    0
457112    0
83306     0
         ..
100524    0
668215    0
175827    0
15420     0
249854    0
Name: isFraud, Length: 156242, dtype: int64

In [27]:
# Save train and test sets
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

train_data.to_csv('train_fraud_transactions.csv', index=False)
test_data.to_csv('test_fraud_transactions.csv', index=False)
