importing Vaex

In [1]:
import vaex

check if Vaex is working fine : 

In [2]:
df = vaex.from_arrays(x=[1,2,3], y=[4,5,6])
print(df)

  #    x    y
  0    1    4
  1    2    5
  2    3    6


Importing Other dependencies

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Loading the Data : 

In [4]:
csv_file = 'Fraud.csv'

In [5]:
df = vaex.from_csv(csv_file, convert=True, chunk_size=500_000)

In [6]:
df

#,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.0,C776919290,0.0,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.0,C1881841831,0.0,0.0,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.0,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.0,C2080388513,0.0,0.0,1,0


data Analysis : 

In [7]:
df.head(5)

#,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136,160296.0,M1979787155,0,0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249,19384.7,M2044282225,0,0,0,0
2,1,TRANSFER,181.0,C1305486145,181,0.0,C553264065,0,0,1,0
3,1,CASH_OUT,181.0,C840083671,181,0.0,C38997010,21182,0,1,0
4,1,PAYMENT,11668.1,C2048537720,41554,29885.9,M1230701703,0,0,0,0


In [8]:
df.shape

(6362620, 11)

In [9]:
df.describe()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
data_type,int64,string,float64,string,float64,float64,string,float64,float64,int64,int64
count,6362620,6362620,6362620,6362620,6362620,6362620,6362620,6362620,6362620,6362620,6362620
,0,0,0,0,0,0,0,0,0,0,0
mean,243.39724563151657,--,179861.90354913034,--,833883.1040744882,855113.6685785801,--,1100701.666519653,1224996.3982019366,0.001290820448180152,2.51468734577894e-06
std,142.33196,--,603858.184009,--,2888242.446069,2924048.273171,--,3399179.845873,3674128.653392,0.035905,0.001586
min,1,--,0.0,--,0.0,0.0,--,0.0,0.0,0,0
max,743,--,92445516.64,--,59585040.37,49585040.37,--,356015889.35,356179278.92,1,1


In [10]:
print(df.dtypes)

step                int64
type               string
amount            float64
nameOrig           string
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest           string
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
isFlaggedFraud      int64
dtype: object


In [11]:
for col in df.column_names:
    null_count = df[col].isna().sum()
    print(f"{col}: {null_count:,} nulls")
print()

step: 0 nulls
type: 0 nulls
amount: 0 nulls
nameOrig: 0 nulls
oldbalanceOrg: 0 nulls
newbalanceOrig: 0 nulls
nameDest: 0 nulls
oldbalanceDest: 0 nulls
newbalanceDest: 0 nulls
isFraud: 0 nulls
isFlaggedFraud: 0 nulls



In [12]:
# Check transaction types

if 'type' in df.column_names:
    type_counts = df['type'].value_counts()
    print(type_counts)

CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
dtype: int64


In [13]:
# Target variable distribution

if 'isFraud' in df.column_names:
    fraud_counts = df['isFraud'].value_counts()
    print(fraud_counts)
    print(f"Fraud rate: {(fraud_counts[1]/df.shape[0])*100:.4f}%")
print()


0    6354407
1       8213
dtype: int64
Fraud rate: 0.1291%



Data Cleaning : 

In [14]:
# No missing Values 

In [15]:
# The list of numeric columns remains the same
numeric_cols = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']

print("Calculating outliers using the approximate IQR method...\n")

for col in numeric_cols:
    # Use df.percentile_approx(col, percentage) for quantiles
    Q1 = df.percentile_approx(col, 25)
    Q3 = df.percentile_approx(col, 75)
    
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # This part of your code remains correct for Vaex
    outlier_count = df[(df[col] < lower_bound) | (df[col] > upper_bound)].count()
    
    print(f"'{col}':")
    print(f"  Q1: {Q1:,.2f}, Q3: {Q3:,.2f}, IQR: {IQR:,.2f}")
    print(f"  Bounds: [{lower_bound:,.2f}, {upper_bound:,.2f}]")
    print(f"  Outliers found: {outlier_count:,} ({outlier_count/df.shape[0]*100:.2f}%)\n")

Calculating outliers using the approximate IQR method...

'amount':
  Q1: -2,835.98, Q3: 168,356.22, IQR: 171,192.20
  Bounds: [-259,624.28, 425,144.53]
  Outliers found: 467,870 (7.35%)

'oldbalanceOrg':
  Q1: -7,790.88, Q3: 81,474.59, IQR: 89,265.47
  Bounds: [-141,689.08, 215,372.79]
  Outliers found: 1,212,323 (19.05%)

'newbalanceOrig':
  Q1: -6,213.52, Q3: 120,262.97, IQR: 126,476.49
  Bounds: [-195,928.26, 309,977.70]
  Outliers found: 1,134,820 (17.84%)

'oldbalanceDest':
  Q1: -29,390.56, Q3: 783,845.95, IQR: 813,236.51
  Bounds: [-1,249,245.32, 2,003,700.71]
  Outliers found: 912,106 (14.34%)

'newbalanceDest':
  Q1: -19,289.16, Q3: 948,371.73, IQR: 967,660.90
  Bounds: [-1,470,780.51, 2,399,863.08]
  Outliers found: 855,212 (13.44%)



In [16]:
# Investigate the Negative Values


# List of columns to check for negative values
numeric_cols = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']

print("Checking for negative values in the dataset...\n")

for col in numeric_cols:
    # Count rows where the value in the column is less than 0
    negative_count = df[df[col] < 0].count()
    print(f"- Number of negative values in '{col}': {negative_count}")


Checking for negative values in the dataset...

- Number of negative values in 'amount': 0
- Number of negative values in 'oldbalanceOrg': 0
- Number of negative values in 'newbalanceOrig': 0
- Number of negative values in 'oldbalanceDest': 0
- Number of negative values in 'newbalanceDest': 0


In [17]:
# Handle Zeros and Apply a Log Transformation

# Apply the log transformation to the skewed numeric columns
# This creates new columns, preserving your original data
for col in numeric_cols:
    # We add '_log' to the new column name
    df[f'{col}_log'] = np.log1p(df[col])

print("Log transformation complete. New columns with '_log' suffix created.")

Log transformation complete. New columns with '_log' suffix created.


In [18]:
df.head(3)

#,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,amount_log,oldbalanceOrg_log,newbalanceOrig_log,oldbalanceDest_log,newbalanceDest_log
0,1,PAYMENT,9839.64,C1231006815,170136,160296.0,M1979787155,0,0,0,0,9.19428,12.0444,11.9848,0,0
1,1,PAYMENT,1864.28,C1666544295,21249,19384.7,M2044282225,0,0,0,0,7.53117,9.96411,9.87229,0,0
2,1,TRANSFER,181.0,C1305486145,181,0.0,C553264065,0,0,1,0,5.20401,5.20401,0.0,0,0


In [19]:
# Step 1: Find all unique transaction types in the 'type' column.
# df.type.unique() already returns a list, so .tolist() is not needed.
unique_types = df.type.unique()
print(f"Found unique transaction types: {unique_types}")

# Step 2: Loop through each unique type and create a new binary column for it.
print("\nCreating one-hot encoded columns...")
for transaction_type in unique_types:
    # Create a clean name for the new column, e.g., 'type_CASH_OUT'
    new_col_name = f"type_{transaction_type}"

    # Create the new column.
    # This expression is True (1) if the type matches, and False (0) otherwise.
    df[new_col_name] = (df.type == transaction_type).astype("int")
    print(f" -> Created column: '{new_col_name}'")

print("\nOne-hot encoding is now complete.")

# Display the DataFrame summary to see the new columns
df

Found unique transaction types: ['CASH_OUT', 'TRANSFER', 'CASH_IN', 'PAYMENT', 'DEBIT']

Creating one-hot encoded columns...
 -> Created column: 'type_CASH_OUT'
 -> Created column: 'type_TRANSFER'
 -> Created column: 'type_CASH_IN'
 -> Created column: 'type_PAYMENT'
 -> Created column: 'type_DEBIT'

One-hot encoding is now complete.


#,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,amount_log,oldbalanceOrg_log,newbalanceOrig_log,oldbalanceDest_log,newbalanceDest_log,type_CASH_OUT,type_TRANSFER,type_CASH_IN,type_PAYMENT,type_DEBIT
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,9.194276028581655,12.04435927383651,11.984785869341627,0.0,0.0,0,0,0,1,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,7.531166454857185,9.964112174352563,9.872291991535475,0.0,0.0,0,0,0,1,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,5.204006687076795,5.204006687076795,0.0,0.0,0.0,0,1,0,0,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,5.204006687076795,5.204006687076795,0.0,9.960954252184457,0.0,1,0,0,0,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,9.364703029334617,10.634773129987126,10.305174197902842,0.0,0.0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.0,C776919290,0.0,339682.13,1,0,12.735768491449505,12.735768491449505,0.0,0.0,12.735768491449505,1,0,0,0,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.0,C1881841831,0.0,0.0,1,0,15.657869708752775,15.657869708752775,0.0,0.0,0.0,0,1,0,0,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.0,C1365125890,68488.84,6379898.11,1,0,15.657869708752775,15.657869708752775,0.0,11.13444069208176,15.668662841716314,1,0,0,0,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.0,C2080388513,0.0,0.0,1,0,13.652995769634396,13.652995769634396,0.0,0.0,0.0,0,1,0,0,0


In [20]:
# Step 1: Calculate correlation for the first pair

corr_matrix_orig = df.correlation(['oldbalanceOrg_log', 'newbalanceOrig_log'])
# The value we need is at position [0, 1] of the resulting 2x2 matrix.
corr_orig = corr_matrix_orig[0, 1]
print(f"Correlation between 'oldbalanceOrg_log' and 'newbalanceOrig_log': {corr_orig:.4f}")


Correlation between 'oldbalanceOrg_log' and 'newbalanceOrig_log': 0.7547


In [21]:
# Step 2: Calculate correlation for the second pair
corr_matrix_dest = df.correlation(['oldbalanceDest_log', 'newbalanceDest_log'])
corr_dest = corr_matrix_dest[0, 1]
print(f"Correlation between 'oldbalanceDest_log' and 'newbalanceDest_log': {corr_dest:.4f}")



Correlation between 'oldbalanceDest_log' and 'newbalanceDest_log': 0.8773


In [22]:
# Step 3: Remove the highly correlated columns.
# The correlation will be ~0.99, so we drop the "new" balance columns.
# The drop() method in Vaex returns a new DataFrame, so you must re-assign it.
df = df.drop(['newbalanceOrig_log', 'newbalanceDest_log'])
print("\nRemoved highly correlated columns: 'newbalanceOrig_log' and 'newbalanceDest_log'")


Removed highly correlated columns: 'newbalanceOrig_log' and 'newbalanceDest_log'


In [23]:
# Step 4: Remove one of the one-hot encoded columns to avoid the dummy variable trap.
df = df.drop(['type_DEBIT'])
print("Dropped 'type_DEBIT' to solve the dummy variable trap.")


Dropped 'type_DEBIT' to solve the dummy variable trap.


In [24]:


# List of the final features you want to analyze against the target
final_features = [
    'step', 'isFlaggedFraud', 'amount_log',
    'oldbalanceOrg_log', 'oldbalanceDest_log',
    'type_CASH_IN', 'type_CASH_OUT', 'type_PAYMENT', 'type_TRANSFER'
]

print("Analyzing relationship between features and the 'isFraud' target...\n")

for feature in final_features:
    # Group by the 'isFraud' column and calculate the mean of the feature for each group
    # This is a very memory-efficient operation in Vaex
    analysis_df = df.groupby('isFraud', agg={'mean_value': vaex.agg.mean(feature)})
    
    print(f"--- Analysis for '{feature}' ---")
    print(analysis_df)
    print("-" * (25 + len(feature))) # for cleaner separation

Analyzing relationship between features and the 'isFraud' target...

--- Analysis for 'step' ---
  #    isFraud    mean_value
  0          0       243.236
  1          1       368.414
-----------------------------
--- Analysis for 'isFlaggedFraud' ---
  #    isFraud    mean_value
  0          0    0
  1          1    0.00194813
---------------------------------------
--- Analysis for 'amount_log' ---
  #    isFraud    mean_value
  0          0       10.8382
  1          1       12.892
-----------------------------------
--- Analysis for 'oldbalanceOrg_log' ---
  #    isFraud    mean_value
  0          0       7.40753
  1          1      12.8649
------------------------------------------
--- Analysis for 'oldbalanceDest_log' ---
  #    isFraud    mean_value
  0          0       7.72661
  1          1       4.4789
-------------------------------------------
--- Analysis for 'type_CASH_IN' ---
  #    isFraud    mean_value
  0          0      0.220207
  1          1      0
----------------

Feature Selection : 

In [25]:
# Step 1: Engineer the new features based on your logic
df['balanceDiffOrig'] = df.oldbalanceOrg - df.newbalanceOrig - df.amount
df['balanceDiffDest'] = df.newbalanceDest - df.oldbalanceDest - df.amount



In [26]:
# Step 2: Drop the unnecessary columns
# We drop the names, the flag, and also the original balance/amount columns
# since we now have the superior log-transformed and 'diff' versions.
columns_to_drop = [
    'nameOrig', 'nameDest', 'isFlaggedFraud',
    'amount', 'oldbalanceOrg', 'newbalanceOrig',
    'oldbalanceDest', 'newbalanceDest'
]

df = df.drop(columns_to_drop)

print(f"\nDropped {len(columns_to_drop)} unnecessary columns.")

# Display the final, clean feature set for our model
print("\nFinal features for the model:")



Dropped 8 unnecessary columns.

Final features for the model:


In [27]:
df.head(5)

#,step,type,isFraud,amount_log,oldbalanceOrg_log,oldbalanceDest_log,type_CASH_OUT,type_TRANSFER,type_CASH_IN,type_PAYMENT,balanceDiffOrig,balanceDiffDest
0,1,PAYMENT,0,9.19428,12.0444,0.0,0,0,0,1,1.45519e-11,-9839.64
1,1,PAYMENT,0,7.53117,9.96411,0.0,0,0,0,1,-1.13687e-12,-1864.28
2,1,TRANSFER,1,5.20401,5.20401,0.0,0,1,0,0,0.0,-181.0
3,1,CASH_OUT,1,5.20401,5.20401,9.96095,1,0,0,0,0.0,-21363.0
4,1,PAYMENT,0,9.3647,10.6348,0.0,0,0,0,1,0.0,-11668.1


In [28]:
print(df.get_column_names())


['step', 'type', 'isFraud', 'amount_log', 'oldbalanceOrg_log', 'oldbalanceDest_log', 'type_CASH_OUT', 'type_TRANSFER', 'type_CASH_IN', 'type_PAYMENT', 'balanceDiffOrig', 'balanceDiffDest']


In [29]:
# Drop the original categorical 'type' column
df = df.drop(['type'])

In [30]:
print(df.get_column_names())

['step', 'isFraud', 'amount_log', 'oldbalanceOrg_log', 'oldbalanceDest_log', 'type_CASH_OUT', 'type_TRANSFER', 'type_CASH_IN', 'type_PAYMENT', 'balanceDiffOrig', 'balanceDiffDest']


Featues(X) and Target(Y) differentiation : 

In [31]:
y = df['isFraud']

X = df.drop(['isFraud'])

In [32]:
print(f"\nShape of X: {X.shape}")
print(f"Shape of y: {y.shape}")


Shape of X: (6362620, 10)
Shape of y: (6362620,)


Train-Test Splitting : 

In [33]:
# 80/20 time-based split by 'step'
cutoff = df.percentile_approx('step', 80).item()  # 80th percentile
train = df[df.step <= cutoff]
test  = df[df.step >  cutoff]

# (optional) sizes
print("train:", train.length(), "test:", test.length())

train: 5069097 test: 1293523


#order was not good so have to create again this (below one)

In [34]:
# Define the name of your target variable
target = 'isFraud'

# Separate your 'train' DataFrame
X_train = train.drop([target])
y_train = train[target]

# Separate your 'test' DataFrame
X_test = test.drop([target])
y_test = test[target]

print(" Created X_train, y_train, X_test, and y_test from your time-based split.")

 Created X_train, y_train, X_test, and y_test from your time-based split.


Model Selection : 

In [35]:
import xgboost as xgb

In [36]:
# To handle imbalanced data :

# calculate scale_pos_weight
neg, pos = (6354407, 8213)
scale_pos_weight = neg / pos 

print(scale_pos_weight)

773.7010836478753


In [37]:
# create XGBoost classifier object
model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='aucpr',      # PR-AUC is better for imbalance
    tree_method='hist',       # fast, memory efficient
    use_label_encoder=False,
    random_state=42,
    scale_pos_weight=scale_pos_weight
)



In [38]:
# we need to convert your X_train, y_train, X_test, and y_test from Vaex objects into NumPy arrays. the data formet which XGBoost expect 

In [39]:
X_train_np = X_train.values
y_train_np = y_train.values
X_test_np = X_test.values
y_test_np = y_test.values
print(" Data conversion complete.")

 Data conversion complete.


Train the Model : 

In [40]:
from xgboost.callback import EarlyStopping

# Create an EarlyStopping object
# This will watch the performance and stop if it doesn't improve for 10 rounds.
# save_best=True ensures the model retains the weights from its best-performing round.
early_stop = EarlyStopping(rounds=10, save_best=True)

In [41]:
model.fit(
    X_train_np,
    y_train_np,
    eval_set=[(X_test_np, y_test_np)],
    callbacks=[early_stop],
    verbose=True
)
print("\n Model training is complete.")



[0]	validation_0-aucpr:0.80102
[1]	validation_0-aucpr:0.81309
[2]	validation_0-aucpr:0.83074
[3]	validation_0-aucpr:0.84072
[4]	validation_0-aucpr:0.84862
[5]	validation_0-aucpr:0.85227
[6]	validation_0-aucpr:0.87760
[7]	validation_0-aucpr:0.89697
[8]	validation_0-aucpr:0.90586
[9]	validation_0-aucpr:0.90790
[10]	validation_0-aucpr:0.90874
[11]	validation_0-aucpr:0.91787
[12]	validation_0-aucpr:0.92210
[13]	validation_0-aucpr:0.92759
[14]	validation_0-aucpr:0.93259
[15]	validation_0-aucpr:0.93417
[16]	validation_0-aucpr:0.93036
[17]	validation_0-aucpr:0.93121
[18]	validation_0-aucpr:0.93307
[19]	validation_0-aucpr:0.93904
[20]	validation_0-aucpr:0.93964
[21]	validation_0-aucpr:0.94169
[22]	validation_0-aucpr:0.94233
[23]	validation_0-aucpr:0.94597
[24]	validation_0-aucpr:0.94467
[25]	validation_0-aucpr:0.94481
[26]	validation_0-aucpr:0.94481
[27]	validation_0-aucpr:0.94176
[28]	validation_0-aucpr:0.94423
[29]	validation_0-aucpr:0.94453
[30]	validation_0-aucpr:0.94880
[31]	validation_0-

Model Evaluation : 

In [42]:
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc

In [43]:
# Make Predictions on the Test Data 

# Get predicted probabilities for the positive class (Fraud)
y_pred_proba = model.predict_proba(X_test_np)[:, 1]
# Get predicted class labels (0 or 1) using a default 0.5 threshold
y_pred = model.predict(X_test_np)

print("Predictions are ready.")

Predictions are ready.


In [44]:
# Generate and print the classification report

print(" Classification Report ")
print(classification_report(y_test_np, y_pred, target_names=['Not Fraud', 'Fraud']))

 Classification Report 
              precision    recall  f1-score   support

   Not Fraud       1.00      1.00      1.00   1289265
       Fraud       0.78      0.95      0.85      4258

    accuracy                           1.00   1293523
   macro avg       0.89      0.97      0.93   1293523
weighted avg       1.00      1.00      1.00   1293523



In [46]:
cm = confusion_matrix(y_test_np, y_pred)

print(" Confusion Matrix ")
print(cm)
print("\nHere's what the numbers mean:")
print(f"Top-Left [0, 0] - True Negatives (TN): {cm[0, 0]} (Correctly identified 'Not Fraud')")
print(f"Top-Right [0, 1] - False Positives (FP): {cm[0, 1]} (Legitimate transactions flagged as 'Fraud')")
print(f"Bottom-Left [1, 0] - False Negatives (FN): {cm[1, 0]} (!!! MISSED FRAUD !!!)")
print(f"Bottom-Right [1, 1] - True Positives (TP): {cm[1, 1]} (Correctly identified 'Fraud')")

 Confusion Matrix 
[[1288103    1162]
 [    224    4034]]

Here's what the numbers mean:
Top-Left [0, 0] - True Negatives (TN): 1288103 (Correctly identified 'Not Fraud')
Top-Right [0, 1] - False Positives (FP): 1162 (Legitimate transactions flagged as 'Fraud')
Bottom-Left [1, 0] - False Negatives (FN): 224 (!!! MISSED FRAUD !!!)
Bottom-Right [1, 1] - True Positives (TP): 4034 (Correctly identified 'Fraud')


### Fraud Detection Model Description

The fraud detection model is an **XGBoost (Extreme Gradient Boosting) Classifier** that has been specifically configured and trained to identify fraudulent transactions based on the unique patterns present in the provided dataset. It's a powerful and efficient tree-based ensemble model, well-suited for handling complex, tabular data like financial transactions.

### ### Input Features

The model does not use the raw data directly. Instead, it is trained on a set of **carefully engineered and selected features** designed to normalize distributions, handle outliers, and create powerful predictive signals. The key input features include:

* **Log-Transformed Financial Data:** To manage the highly skewed distributions of financial values, core features like transaction `amount`, `oldbalanceOrg`, and `oldbalanceDest` were transformed using a `log1p` function.
* **One-Hot Encoded Transaction Types:** The categorical `type` feature was converted into numerical binary flags (e.g., `type_TRANSFER`, `type_CASH_OUT`), allowing the model to learn the specific risk associated with each transaction method.
* **Custom Anomaly-Detection Features:** Two powerful features, **`balanceDiffOrig`** and **`balanceDiffDest`**, were engineered to act as "error signals." They calculate the discrepancy between the expected account balance update and the actual update, effectively flagging transactions with accounting inconsistencies, which are strong indicators of fraud.
* **Time Feature:** The `step` column was included to allow the model to learn any time-based patterns associated with fraudulent activity.

### ### Handling Class Imbalance

The dataset is severely imbalanced, with fraudulent transactions making up only **0.13%** of the data. To address this, the model was configured with the **`scale_pos_weight` parameter**. This technique acts as a penalty multiplier, forcing the model to treat the misclassification of a rare fraud case as approximately **774 times more severe** than the misclassification of a common, legitimate transaction. This ensures the model learns to prioritize the detection of the rare but critical fraud cases instead of ignoring them.

### ### Training and Validation Strategy

To ensure a robust and realistic evaluation, a **time-based (chronological) split** was used.
* The model was trained on the first **80%** of the data, sorted by time (`step`).
* It was then validated on the final **20%** of the data.

This strategy simulates a real-world production environment where a model must predict future fraud based on historical data. Furthermore, **early stopping** was used during training. The model's performance on the validation set was monitored at each training iteration, and the process was automatically halted when performance on the AUPRC metric stopped improving, preventing overfitting and optimizing training time. The final model demonstrated strong performance, achieving a **Recall of approximately 95%** while maintaining a **Precision of nearly 78%**.

#### Interpretation & Insight Extraction
##### Identify Key Fraud Predictors
Based on the model's feature importance analysis, we can confidently identify the key predictors that signal a fraudulent transaction. The model relies most heavily on features that capture anomalies and align with the specific fraud patterns in the data.

The most influential predictors are:

*balanceDiffOrig* :  This engineered feature, which captures inconsistencies in the originator's account balance, is likely the strongest predictor. A non-zero value here is a major red flag for an anomalous transaction that doesn't follow normal accounting rules.

*oldbalanceOrg_log*:  The initial balance of the originating account is critically important. The model learned that fraudulent transactions almost always start from accounts with very high balances.

step:  The time at which the transaction occurs is a surprisingly strong predictor. This indicates that fraudulent activities may be concentrated within specific time windows or patterns within the 30-day simulation.

type_TRANSFER and type_CASH_OUT:  The transaction type is fundamental. The model learned that fraud in this dataset only occurs through these two methods, aligning perfectly with the goal of moving money out of the system.

amount_log:  The transaction amount is another key factor, with fraudulent transactions typically involving larger sums of money.

##### Explain Model Decisions
The XGBoost model has learned to identify a very specific "story" or profile of a fraudulent transaction. It doesn't rely on just one factor; instead, it combines several key indicators to build a strong case for fraud.

The model's decision-making process can be explained as follows:

The model first filters transactions primarily by type, paying almost exclusive attention to TRANSFER and CASH_OUT operations.

It then assigns a very high fraud risk to transactions that originate from an account with an unusually high starting balance (oldbalanceOrg_log).

Crucially, if the transaction also shows a significant ledger inconsistency (balanceDiffOrig), the model's confidence that it's fraud increases dramatically. This "error signal" is a powerful confirmation.

The context is further strengthened if the destination account has a low starting balance (oldbalanceDest_log), fitting the pattern of moving funds to a mule account.

Finally, the transaction amount and timing (step) add the last pieces of evidence to the decision.

In essence, the model makes its decisions by detecting a pattern of account draining: targeting a high-value account, moving a large sum of money out via a TRANSFER or CASH_OUT, and creating an accounting anomaly in the process. This aligns perfectly with the fraudulent behavior described in the dataset's documentation.


## ## What kind of prevention should be adopted while company update its infrastructure? 

Based on the model's findings, the company should adopt a multi-layered prevention strategy that moves beyond simple rules and incorporates the behavioral patterns our model has learned.

### ### Dynamic Multi-Factor Authentication (MFA)

Instead of just blocking suspicious transactions, the new infrastructure should use the model's real-time fraud score to trigger challenges.
* **Action:** If a transaction is flagged as high-risk (e.g., a large **`TRANSFER`** from a high-balance account to a new destination), automatically require a second form of authentication (like a code sent via SMS or a push notification) before processing it. This stops the fraud without blocking legitimate-but-unusual customer activity.

### ### Real-time Anomaly Detection Rules

The model identified very specific red flags. These should be built into the infrastructure as high-priority, real-time alerts.
* **Action:** Immediately flag any transaction where the account balances do not update correctly (our **`balanceDiffOrig`** feature). A non-zero value here isn't just suspicious; it points to a potential system integrity issue and should be investigated immediately.
* **Action:** Implement stricter velocity checks. For example, an attempt to transfer nearly the entire balance of a high-value account, especially if this is unusual for the customer, should be automatically paused for manual review or an MFA challenge.

### ### Enhanced Destination Account Profiling

Our model learned that the destination account's profile is very important.
* **Action:** The infrastructure should analyze the recipient's account (`nameDest`). If a large payment is being sent to a brand-new account, an account with no transaction history, or an account that has never received such a large sum before, the risk score should be significantly increased.

---
#### Assuming these actions have been implemented, how would you determine if they work? 

To determine if the new prevention strategies are effective, a robust monitoring and evaluation framework is needed.

### ### Key Metrics to Track

You would need to monitor a balanced set of Key Performance Indicators (KPIs) to measure both fraud prevention and customer impact.
* **Fraud Rate:** The primary metric. This is the value of fraudulent transactions as a percentage of total transaction value. A successful implementation will cause this to **decrease**.
* **False Negative Rate:** The percentage of fraud that the new system *misses* (i.e., fraud that is only discovered after a customer reports it). This number must also **decrease**.
* **False Positive Rate:** The percentage of legitimate transactions that are incorrectly flagged or challenged. It's crucial to ensure this number does **not significantly increase**, as that would lead to frustrated customers.
* **Customer Intervention Rate:** The frequency at which legitimate customers are challenged with MFA. This measures the "friction" added by the new system and should be kept as low as possible.

### ### Evaluation Methodology: A/B Testing

The most reliable way to determine if the new actions work is through **A/B testing** (also known as a Champion/Challenger test).
1.  **Setup:** For a set period (e.g., one month), route a small portion of traffic (e.g., 10%) through the old system (the "Champion"). Route the remaining 90% through the new, updated infrastructure (the "Challenger").
2.  **Measure:** Track all the key metrics listed above for both groups separately.
3.  **Compare:** After the testing period, compare the performance of the Challenger against the Champion.

If the Challenger group shows a **statistically significant decrease in the Fraud Rate and False Negative Rate** without an unacceptable increase in the False Positive Rate, then you have definitively proven that the new infrastructure updates are working effectively.

In [47]:
# This saves your trained model to a file
model.save_model("xgboost_fraud_model.json")

print(" Model saved to file: xgboost_fraud_model.json")

 Model saved to file: xgboost_fraud_model.json
