In [1]:
import pandas as pd
txn = pd.read_csv("synthetic_transactions_small.csv", parse_dates=['timestamp'])
refund_count = txn['is_refund'].sum()
chargeback_count = txn['is_chargeback'].sum()

print("Total refunds:", refund_count)
print("Total chargebacks:", chargeback_count)

Total refunds: 583
Total chargebacks: 100


Per Merchant Metrics:
- refund ratio
- chargeback ratio
- amount 

In [2]:
txn_by_merchant = txn.groupby('merchant_id')
metrics = txn_by_merchant.agg(
	txn_count = ('txn_id', 'count'),
	refund_count = ('is_refund', 'sum'),
	chargeback_count = ('is_chargeback', 'sum'),
	amount_sum = ('amount', 'sum'),
	amount_avg = ('amount', 'mean'),
	amount_p95 = ('amount', lambda s: s.quantile(0.95)), # Takes a series (s)(like all amount of transactions per merchant)
	# For each merchant group, apply this function to the 'amount' column and record the 95th percentile value as amount_p95
).reset_index()
metrics['refund_ratio'] = metrics['refund_count']/metrics['txn_count']
metrics['chargeback_ratio'] = metrics['chargeback_count']/metrics['txn_count']
metrics.head(10)

Unnamed: 0,merchant_id,txn_count,refund_count,chargeback_count,amount_sum,amount_avg,amount_p95,refund_ratio,chargeback_ratio
0,M001,158,4,0,3942.89,24.955,58.4975,0.025316,0.0
1,M002,316,7,1,9608.49,30.406614,78.9825,0.022152,0.003165
2,M003,449,41,3,13516.42,30.103385,82.12,0.091314,0.006682
3,M004,264,7,1,6453.38,24.444621,53.7855,0.026515,0.003788
4,M005,337,5,3,8564.7,25.41454,65.264,0.014837,0.008902
5,M006,294,4,1,7700.11,26.19085,65.6255,0.013605,0.003401
6,M007,268,6,1,6967.33,25.9975,67.4985,0.022388,0.003731
7,M008,559,12,1,14677.58,26.256852,64.304,0.021467,0.001789
8,M009,368,20,5,10887.49,29.585571,74.872,0.054348,0.013587
9,M010,530,8,3,13535.49,25.53866,64.8205,0.015094,0.00566


Anomaly detection methods:
- Interquantile Range (IQR) - measure of statistical dispersion (spread of data)

In [3]:
def iqr_upper(s):
	"""Defines a function that takes a pandas series (eg.'refund_ratio') and returns a single number:
	The upper cut-off beyond which values are considered outliers"""
	q1, q3 = s.quantile(0.25), s.quantile(0.75) # Computes the 75% percentage and the 25% percentage
	iqr = q3 - q1 # Measures the spread of the middle 50% of the data - higher IQR = more dispersion in the bulk of values
	return q3 + 1.5*iqr # Return upper fence from Tukey's rule: anything > Q3 + 1.5 * IQR -> outlier
thr_refund = iqr_upper(metrics['refund_ratio'])
thr_cb = iqr_upper(metrics['chargeback_ratio'])

# Flag merchants who exceed this threshold
metrics['outlier_refund_iqr'] = metrics['refund_ratio'] > thr_refund
metrics['outlier_cb_iqr'] = metrics['chargeback_ratio'] > thr_cb
metrics.head(10)

Unnamed: 0,merchant_id,txn_count,refund_count,chargeback_count,amount_sum,amount_avg,amount_p95,refund_ratio,chargeback_ratio,outlier_refund_iqr,outlier_cb_iqr
0,M001,158,4,0,3942.89,24.955,58.4975,0.025316,0.0,False,False
1,M002,316,7,1,9608.49,30.406614,78.9825,0.022152,0.003165,False,False
2,M003,449,41,3,13516.42,30.103385,82.12,0.091314,0.006682,True,False
3,M004,264,7,1,6453.38,24.444621,53.7855,0.026515,0.003788,False,False
4,M005,337,5,3,8564.7,25.41454,65.264,0.014837,0.008902,False,False
5,M006,294,4,1,7700.11,26.19085,65.6255,0.013605,0.003401,False,False
6,M007,268,6,1,6967.33,25.9975,67.4985,0.022388,0.003731,False,False
7,M008,559,12,1,14677.58,26.256852,64.304,0.021467,0.001789,False,False
8,M009,368,20,5,10887.49,29.585571,74.872,0.054348,0.013587,True,True
9,M010,530,8,3,13535.49,25.53866,64.8205,0.015094,0.00566,False,False


Anomaly detection methods:
- Z-score (standard score) = how far a data point is from the mean, measured in units of standard deviation

In [4]:
z_refund = (metrics['refund_ratio'] - metrics['refund_ratio'].mean()) / metrics['refund_ratio'].std(ddof=0)
z_cb = (metrics['chargeback_ratio'] - metrics['chargeback_ratio'].mean()) / metrics['chargeback_ratio'].std(ddof=0)
 # How far a data point deviates from the mean, measured in units of standard deviation
metrics['outlier_refund_z'] = z_refund.abs() > 2.5
metrics['outlier_cb_z'] = z_cb.abs() > 2.5
metrics.head()

Unnamed: 0,merchant_id,txn_count,refund_count,chargeback_count,amount_sum,amount_avg,amount_p95,refund_ratio,chargeback_ratio,outlier_refund_iqr,outlier_cb_iqr,outlier_refund_z,outlier_cb_z
0,M001,158,4,0,3942.89,24.955,58.4975,0.025316,0.0,False,False,False,False
1,M002,316,7,1,9608.49,30.406614,78.9825,0.022152,0.003165,False,False,False,False
2,M003,449,41,3,13516.42,30.103385,82.12,0.091314,0.006682,True,False,True,False
3,M004,264,7,1,6453.38,24.444621,53.7855,0.026515,0.003788,False,False,False,False
4,M005,337,5,3,8564.7,25.41454,65.264,0.014837,0.008902,False,False,False,False


Anomaly detection methods:
- Isolation Forest 
  - unsupervised multivariate model that builds a forest 
  - each tree randomly picks an attribute (from a given list)
  - randomly chooses a split value
  - cuts the data based on that split value
  - repeats until all values are isolated (builds an isolation tree)
  - Theory: 
    - outliers will require few splits
    - normal points will require many splits
  - Outliers will be picked based on the n of splits in the forest


In [5]:
from sklearn.ensemble import IsolationForest
X = metrics[['refund_ratio', 'chargeback_ratio', 'amount_avg', 'amount_p95', 'txn_count']] # Ratios and distributional features

iso = IsolationForest(contamination = 0.05, random_state = 42)
metrics['isoforest_outlier'] = (iso.fit_predict(X) == -1)
metrics.head()


Unnamed: 0,merchant_id,txn_count,refund_count,chargeback_count,amount_sum,amount_avg,amount_p95,refund_ratio,chargeback_ratio,outlier_refund_iqr,outlier_cb_iqr,outlier_refund_z,outlier_cb_z,isoforest_outlier
0,M001,158,4,0,3942.89,24.955,58.4975,0.025316,0.0,False,False,False,False,False
1,M002,316,7,1,9608.49,30.406614,78.9825,0.022152,0.003165,False,False,False,False,False
2,M003,449,41,3,13516.42,30.103385,82.12,0.091314,0.006682,True,False,True,False,True
3,M004,264,7,1,6453.38,24.444621,53.7855,0.026515,0.003788,False,False,False,False,False
4,M005,337,5,3,8564.7,25.41454,65.264,0.014837,0.008902,False,False,False,False,False


In [6]:
from sklearn.ensemble import IsolationForest
import numpy as np 
import shap

# 1. Select features you used in IsolationForest
feature_cols = ["refund_ratio", "chargeback_ratio", "amount_avg", "amount_p95", "txn_count"]
X = metrics[feature_cols].fillna(0)

# 2. Fit IsolationForest (or reuse your already-fitted model)
iso = IsolationForest(
    n_estimators=200,
    contamination=0.05,
    random_state=42
)
iso.fit(X)

# 3. Use SHAP to explain the IsolationForest anomaly scores
#    TreeExplainer works directly on tree-based models
explainer = shap.TreeExplainer(iso)

# model_output="raw" so we explain the raw anomaly score
shap_values = explainer.shap_values(X)

# shap_values will be shape (n_samples, n_features)

# 4. Compute mean |SHAP| per feature => global importance
mean_abs_shap = np.mean(np.abs(shap_values), axis=0)

feature_importance = pd.Series(mean_abs_shap, index=feature_cols)

# 5. Turn importances into normalized weights that sum to 1
weights = feature_importance / feature_importance.sum()

print("SHAP-based feature importance:")
print(feature_importance.sort_values(ascending=False))

print("\nNormalized weights for composite risk score:")
print(weights.sort_values(ascending=False))

  from .autonotebook import tqdm as notebook_tqdm


SHAP-based feature importance:
amount_p95          0.372887
amount_avg          0.367340
refund_ratio        0.358921
txn_count           0.357185
chargeback_ratio    0.327810
dtype: float64

Normalized weights for composite risk score:
amount_p95          0.209001
amount_avg          0.205891
refund_ratio        0.201173
txn_count           0.200200
chargeback_ratio    0.183735
dtype: float64


Composite Risk Score:
- Simple numeric value that summarizes several different risk factors
- Instead of looking at separate metrics (refunds, chargeback, etc.), you combine them into one overall score to rank merchants by relative riskiness


In [7]:
rs = ( # Rank assigns a rank order - pct scales all values to 0-1
	metrics['refund_ratio'].rank(pct=True)*0.5 + # High importance to refund_ratio
	metrics['chargeback_ratio'].rank(pct = True)*0.3 + # A direct compliance and financial risk. Chargebacks involve loss of funds, operational cost and potential Visa Rule breaches
	metrics['amount_p95'].rank(pct = True)*0.1 + # Reflects exposure - merchants with unusually high-value transactions can create larger financial losses per dispute
	metrics['txn_count'].rank(pct = True)*0.1 # Adds scale context - A high-risk behaviour in large merchant has bigger impact on the ecosystem than in a small one
)

metrics['risk_score'] = rs.round(3)
metrics['iqr_score'] = (
	metrics['outlier_refund_iqr'].astype(int)
	+ metrics['outlier_cb_iqr'].astype(int)
)
metrics['z_score'] = (
	metrics['outlier_refund_z'].astype(int) +
	metrics['outlier_cb_z'].astype(int)
)

top_10_iqr_score = metrics.sort_values('iqr_score', ascending = False).head(10)
top_10_isoforest_score = metrics.sort_values('isoforest_outlier', ascending = False).head(10)
top_10_z_score = metrics.sort_values('z_score', ascending = False).head(10)
top_10_risk_score = metrics.sort_values('risk_score', ascending = False).head(10)

Export top by method for Tableau visualisations

In [8]:
# Combine top merchants by each method into one dataset for Tableau
top_10_iqr_score["method"] = "IQR"
top_10_z_score["method"] = "Z-Score"
top_10_isoforest_score["method"] = "IsolationForest"
top_10_risk_score["method"] = "CompositeRiskScore"

# Concatenate all results
top_combined = pd.concat([
    top_10_iqr_score,
    top_10_z_score,
    top_10_isoforest_score,
    top_10_risk_score
], axis=0)

# Keep only relevant columns for Tableau visualization
cols_to_keep = [
    "merchant_id", "txn_count", "refund_ratio", "chargeback_ratio",
    "amount_avg", "amount_p95", "risk_score", "iqr_score", "z_score",
    "isoforest_outlier", "method"
]
top_combined = top_combined[cols_to_keep].reset_index(drop=True)

# Export to CSV for Tableau import
top_combined.to_csv("top_merchants_by_method.csv", index=False)
print("Exported: top_merchants_by_method.csv")


Exported: top_merchants_by_method.csv
