In [10]:
import pandas as pd
import numpy as np

In [4]:
# Load the raw transactions CSV from the sibling `datasets` directory.
# The path uses forward slashes because they resolve on Windows, Linux and
# macOS alike; a backslash-separated path is only split into directories on
# Windows and is treated as one literal file name elsewhere.
df = pd.read_csv('../datasets/financial_fraud_detection_dataset.csv')

In [5]:
# Preview the loaded frame; the notebook renders only the leading and trailing rows.
df

Unnamed: 0,transaction_id,timestamp,sender_account,receiver_account,amount,transaction_type,merchant_category,location,device_used,is_fraud,fraud_type,time_since_last_transaction,spending_deviation_score,velocity_score,geo_anomaly_score,payment_channel,ip_address,device_hash
0,T100000,2023-08-22T09:22:43.516168,ACC877572,ACC388389,343.78,withdrawal,utilities,Tokyo,mobile,False,,,-0.21,3,0.22,card,13.101.214.112,D8536477
1,T100001,2023-08-04T01:58:02.606711,ACC895667,ACC944962,419.65,withdrawal,online,Toronto,atm,False,,,-0.14,7,0.96,ACH,172.52.47.194,D2622631
2,T100002,2023-05-12T11:39:33.742963,ACC733052,ACC377370,2773.86,deposit,other,London,pos,False,,,-1.78,20,0.89,card,185.98.35.23,D4823498
3,T100003,2023-10-10T06:04:43.195112,ACC996865,ACC344098,1666.22,deposit,online,Sydney,pos,False,,,-0.60,6,0.37,wire_transfer,107.136.36.87,D9961380
4,T100004,2023-09-24T08:09:02.700162,ACC584714,ACC497887,24.43,transfer,utilities,Toronto,mobile,False,,,0.79,13,0.27,ACH,108.161.108.255,D7637601
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4999995,T5099995,2023-11-17T23:20:29.746144,ACC597319,ACC749300,10.87,withdrawal,retail,Toronto,atm,False,,1416.524233,-0.14,17,0.18,UPI,243.92.38.163,D4439579
4999996,T5099996,2023-09-23T11:23:20.659686,ACC749625,ACC709783,181.40,payment,grocery,Sydney,atm,False,,999.089702,-1.79,4,0.58,wire_transfer,28.252.18.249,D5029311
4999997,T5099997,2023-11-18T00:52:34.527092,ACC629492,ACC680736,12.54,payment,utilities,New York,mobile,False,,3871.584025,-0.30,6,0.99,card,111.199.174.121,D6333607
4999998,T5099998,2023-03-25T04:32:13.609837,ACC984720,ACC296935,376.29,deposit,restaurant,Dubai,pos,False,,-4096.765453,-1.43,5,0.32,wire_transfer,221.110.215.14,D1551203


In [8]:
# Structural overview of the loaded frame: its (rows, columns) shape first,
# then the per-column dtype listing that DataFrame.info() prints.
print(f"Data Shape: {df.shape}")
print("\nData Info:")
df.info()

Data Shape: (5000000, 18)

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000000 entries, 0 to 4999999
Data columns (total 18 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   transaction_id               object 
 1   timestamp                    object 
 2   sender_account               object 
 3   receiver_account             object 
 4   amount                       float64
 5   transaction_type             object 
 6   merchant_category            object 
 7   location                     object 
 8   device_used                  object 
 9   is_fraud                     bool   
 10  fraud_type                   object 
 11  time_since_last_transaction  float64
 12  spending_deviation_score     float64
 13  velocity_score               int64  
 14  geo_anomaly_score            float64
 15  payment_channel              object 
 16  ip_address                   object 
 17  device_hash                  object 
dtypes: b

In [9]:
# Class balance of the target: relative frequency of each `is_fraud` value
# (normalize=True yields proportions instead of raw counts).
print(
    "\nFraud Distribution:",
    df['is_fraud'].value_counts(normalize=True),
    sep="\n",
)


Fraud Distribution:
is_fraud
False    0.964089
True     0.035911
Name: proportion, dtype: float64


In [13]:
# Parse the ISO 8601 timestamp strings into real datetime values
df['timestamp'] = pd.to_datetime(df['timestamp'], format='ISO8601')

# --- Feature Engineering ---
# Calendar features derived from the parsed timestamp
df['transaction_hour'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.dayofweek  # pandas convention: Monday=0 ... Sunday=6

# --- Data Cleaning ---
# Fill the gaps in time_since_last_transaction with the column median; the
# filled Series is assigned back to the column rather than filled in place
median_time = df['time_since_last_transaction'].median()
df['time_since_last_transaction'] = df['time_since_last_transaction'].fillna(median_time)

# Target (y) and features (X): X excludes the identifier-like columns, the raw
# timestamp, and both fraud label columns
y = df['is_fraud']
X = df.drop(
    [
        'transaction_id', 'timestamp', 'sender_account', 'receiver_account',
        'ip_address', 'device_hash', 'is_fraud', 'fraud_type',
    ],
    axis='columns',
)

# Partition the feature column names by dtype: numeric vs text/categorical
numerical_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

# Summary of the resulting feature set
print("--- Preprocessing Complete ---")
print(f"Number of features: {len(X.columns)}")
print("\nCategorical Features:", list(categorical_features))
print("Numerical Features:", list(numerical_features))
print("\nSample of the processed feature set (X):")
print(X.head())

--- Preprocessing Complete ---
Number of features: 12

Categorical Features: ['transaction_type', 'merchant_category', 'location', 'device_used', 'payment_channel']
Numerical Features: ['amount', 'time_since_last_transaction', 'spending_deviation_score', 'velocity_score', 'geo_anomaly_score', 'transaction_hour', 'day_of_week']

Sample of the processed feature set (X):
    amount transaction_type merchant_category location device_used  \
0   343.78       withdrawal         utilities    Tokyo      mobile   
1   419.65       withdrawal            online  Toronto         atm   
2  2773.86          deposit             other   London         pos   
3  1666.22          deposit            online   Sydney         pos   
4    24.43         transfer         utilities  Toronto      mobile   

   time_since_last_transaction  spending_deviation_score  velocity_score  \
0                     0.844275                     -0.21               3   
1                     0.844275                     -0.14