In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc 

In [11]:
df = pd.read_csv(r'..\datasets\financial_fraud_detection_dataset.csv')

In [12]:
df

Unnamed: 0,transaction_id,timestamp,sender_account,receiver_account,amount,transaction_type,merchant_category,location,device_used,is_fraud,fraud_type,time_since_last_transaction,spending_deviation_score,velocity_score,geo_anomaly_score,payment_channel,ip_address,device_hash
0,T100000,2023-08-22T09:22:43.516168,ACC877572,ACC388389,343.78,withdrawal,utilities,Tokyo,mobile,False,,,-0.21,3,0.22,card,13.101.214.112,D8536477
1,T100001,2023-08-04T01:58:02.606711,ACC895667,ACC944962,419.65,withdrawal,online,Toronto,atm,False,,,-0.14,7,0.96,ACH,172.52.47.194,D2622631
2,T100002,2023-05-12T11:39:33.742963,ACC733052,ACC377370,2773.86,deposit,other,London,pos,False,,,-1.78,20,0.89,card,185.98.35.23,D4823498
3,T100003,2023-10-10T06:04:43.195112,ACC996865,ACC344098,1666.22,deposit,online,Sydney,pos,False,,,-0.60,6,0.37,wire_transfer,107.136.36.87,D9961380
4,T100004,2023-09-24T08:09:02.700162,ACC584714,ACC497887,24.43,transfer,utilities,Toronto,mobile,False,,,0.79,13,0.27,ACH,108.161.108.255,D7637601
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4999995,T5099995,2023-11-17T23:20:29.746144,ACC597319,ACC749300,10.87,withdrawal,retail,Toronto,atm,False,,1416.524233,-0.14,17,0.18,UPI,243.92.38.163,D4439579
4999996,T5099996,2023-09-23T11:23:20.659686,ACC749625,ACC709783,181.40,payment,grocery,Sydney,atm,False,,999.089702,-1.79,4,0.58,wire_transfer,28.252.18.249,D5029311
4999997,T5099997,2023-11-18T00:52:34.527092,ACC629492,ACC680736,12.54,payment,utilities,New York,mobile,False,,3871.584025,-0.30,6,0.99,card,111.199.174.121,D6333607
4999998,T5099998,2023-03-25T04:32:13.609837,ACC984720,ACC296935,376.29,deposit,restaurant,Dubai,pos,False,,-4096.765453,-1.43,5,0.32,wire_transfer,221.110.215.14,D1551203


In [13]:
# Initial data inspection
print("Data Shape:", df.shape)
print("\nData Info:")
df.info(verbose=True, memory_usage='deep', show_counts=True)

Data Shape: (5000000, 18)

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000000 entries, 0 to 4999999
Data columns (total 18 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   transaction_id               5000000 non-null  object 
 1   timestamp                    5000000 non-null  object 
 2   sender_account               5000000 non-null  object 
 3   receiver_account             5000000 non-null  object 
 4   amount                       5000000 non-null  float64
 5   transaction_type             5000000 non-null  object 
 6   merchant_category            5000000 non-null  object 
 7   location                     5000000 non-null  object 
 8   device_used                  5000000 non-null  object 
 9   is_fraud                     5000000 non-null  bool   
 10  fraud_type                   179553 non-null   object 
 11  time_since_last_transaction  4103487 non-null  float64
 12  spen

In [14]:
# Check for class imbalance (crucial for fraud detection)
print("\nFraud Distribution:")
print(df['is_fraud'].value_counts(normalize=True))


Fraud Distribution:
is_fraud
False    0.964089
True     0.035911
Name: proportion, dtype: float64


In [15]:
# Data preprocessing function
def preprocess_data(df):
    df = df.drop([
        'transaction_id', 'fraud_type', 'sender_account', 
        'receiver_account', 'ip_address', 'device_hash'
    ], axis=1)
    df['timestamp'] = pd.to_datetime(df['timestamp'], format='mixed')
    df['hour_of_day'] = df['timestamp'].dt.hour
    df['day_of_week'] = df['timestamp'].dt.dayofweek
    df = df.drop('timestamp', axis=1)
    df['time_since_last_transaction'] = df['time_since_last_transaction'].fillna(-1)
    return df

df_processed = preprocess_data(df.copy())

In [16]:
# Feature and target definition
target_column = 'is_fraud'
feature_columns = [col for col in df_processed.columns if col != target_column]

X = df_processed[feature_columns]
y = df_processed[target_column]

# Train-test split
TEST_SIZE = 0.2
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
)

# Preprocessing pipeline
numerical_features = [
    'amount', 'time_since_last_transaction', 'spending_deviation_score',
    'velocity_score', 'geo_anomaly_score', 'hour_of_day', 'day_of_week'
]
categorical_features = [
    'transaction_type', 'merchant_category', 'location',
    'device_used', 'payment_channel'
]

numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough'
)

# Define the modeling pipeline
model_pipeline = ImbPipeline(steps=[
    ('preprocessing', preprocessor),
    ('oversampling', SMOTE(random_state=RANDOM_STATE)),
    ('classifier', xgb.XGBClassifier(
        eval_metric='logloss',
        random_state=RANDOM_STATE
    ))
])

In [17]:
model_pipeline.fit(X_train, y_train)

In [20]:
def evaluate_model(pipeline, X_test, y_test):
    """
    Evaluates a classification model using metrics suitable for imbalanced datasets.
    
    Parameters:
        pipeline: Fitted model pipeline.
        X_test: Test features.
        y_test: True labels for test set.
    """
    # Predict labels and probabilities
    y_pred = pipeline.predict(X_test)
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

    # Classification report
    print("\n--- Classification Report ---")
    print("Unique values in y_test:", set(y_test))
    print("Unique values in y_pred:", set(y_pred))
    print(classification_report(y_test, y_pred, digits=4))

    # ROC-AUC score
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    print(f"ROC-AUC Score: {roc_auc:.4f}")

    # Precision-Recall AUC
    precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
    prc_auc = auc(recall, precision)
    print(f"PRC-AUC Score: {prc_auc:.4f}")

In [21]:
evaluate_model(model_pipeline, X_test, y_test)


--- Classification Report ---
Unique values in y_test: {False, True}
Unique values in y_pred: {np.int64(0)}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

       False     0.9641    1.0000    0.9817    964089
        True     0.0000    0.0000    0.0000     35911

    accuracy                         0.9641   1000000
   macro avg     0.4820    0.5000    0.4909   1000000
weighted avg     0.9295    0.9641    0.9465   1000000

ROC-AUC Score: 0.5947
PRC-AUC Score: 0.0440
