# Unsupervised Fraud Detection with KMeans

This notebook demonstrates how to use **KMeans clustering** for fraud detection on the Kaggle credit card fraud dataset. We train KMeans on a mixed sample of normal and fraudulent transactions, then evaluate its performance.

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.metrics import (
    classification_report, accuracy_score, precision_score,
    recall_score, f1_score, confusion_matrix
)
import kagglehub

# General settings
# Seed shared by the train/test split, the NumPy shuffles and KMeans initialisation.
RANDOM_SEED = 42
# Fraction of the rows held out as the test set by train_test_split.
TEST_SIZE = 0.3

In [None]:
# Fetch the Kaggle dataset, read the CSV into a DataFrame and make sure the
# 'Class' label column holds integers.
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
csv_file = f"{path}/creditcard.csv"
data = pd.read_csv(csv_file).astype({'Class': int})
data.head()

In [None]:
# Drop 'Time'. errors='ignore' keeps this cell re-runnable: on a second run the
# column is already gone from `data` and a plain drop would raise KeyError.
data = data.drop(columns=['Time'], errors='ignore')

# NOTE: 'Amount' is deliberately NOT standardised here. Fitting a scaler on the
# whole dataset lets test-set statistics leak into preprocessing, and the
# StandardScaler fitted on the training split after the train/test split
# standardises every feature (including 'Amount') anyway, so a pass at this
# point would be redundant.

fraud = data[data['Class'] == 1]
nonfraud = data[data['Class'] == 0]

fraud_count = len(fraud)
total_count = len(fraud) + len(nonfraud)

print(f'Fraudulent: {len(fraud)}, Non-fraudulent: {len(nonfraud)}')
print(f'Fraud percentage: {fraud_count}/{total_count} '
      f'({fraud_count / total_count * 100:.3f}%)')

In [None]:
# Separate the feature matrix from the target column
features = data.drop(columns=['Class'])
X = features.to_numpy()
y = data['Class'].to_numpy()

# Hold out a test set while keeping the class ratio equal in both partitions
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
)

# Learn the standardisation parameters from the training partition only,
# then apply that same transform to both partitions
scaler = StandardScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

In [None]:
# Create the KMeans training sample: up to N_NORMAL_TRAIN normal rows and up to
# N_FRAUD_TRAIN fraudulent rows, drawn at random from the training split.
N_NORMAL_TRAIN = 5000
N_FRAUD_TRAIN = 500

np.random.seed(RANDOM_SEED)

normal_train_indices = np.where(y_train == 0)[0]
fraud_train_indices = np.where(y_train == 1)[0]

np.random.shuffle(normal_train_indices)
np.random.shuffle(fraud_train_indices)

# Slicing past the end of an array silently returns fewer rows, so clamp the
# requested sizes explicitly and say so when a class cannot supply enough rows.
n_normal_selected = min(N_NORMAL_TRAIN, len(normal_train_indices))
n_fraud_selected = min(N_FRAUD_TRAIN, len(fraud_train_indices))
for requested, selected, label in (
    (N_NORMAL_TRAIN, n_normal_selected, "normal"),
    (N_FRAUD_TRAIN, n_fraud_selected, "fraud"),
):
    if selected < requested:
        print(f"Warning: requested {requested} {label} rows but only "
              f"{selected} are available in the training split.")

selected_normal_train = normal_train_indices[:n_normal_selected]
selected_fraud_train = fraud_train_indices[:n_fraud_selected]

# Mix the two classes so the sample is not ordered by label
selected_train_indices = np.concatenate([selected_normal_train, selected_fraud_train])
np.random.shuffle(selected_train_indices)

n_x_train = x_train[selected_train_indices]

# Sanity check: one feature row per selected index
assert n_x_train.shape[0] == n_normal_selected + n_fraud_selected

print(f"Custom training sample shape: {n_x_train.shape}")
print(f"Sample composition: {n_normal_selected} normal, {n_fraud_selected} fraud")

In [None]:
# Try different k values and keep the one with the best silhouette score.
K_CANDIDATES = range(2, 8)
# Pin the number of random restarts explicitly: the library default for n_init
# differs between scikit-learn releases, which would otherwise make the chosen
# k and the final clustering depend on the installed version.
KMEANS_N_INIT = 10

scores = []
fitted_models = []
for k in K_CANDIDATES:
    candidate = KMeans(
        n_clusters=k,
        init='k-means++',
        n_init=KMEANS_N_INIT,
        random_state=RANDOM_SEED,
    )
    candidate.fit(n_x_train)
    score = silhouette_score(n_x_train, candidate.labels_)
    scores.append(score)
    fitted_models.append(candidate)
    print(f"k={k}, Silhouette Score={score:.4f}")

# Look the winner up in the candidate range itself instead of adding a
# hard-coded offset that has to be kept in sync with the range start.
best_position = int(np.argmax(scores))
optimal_k = K_CANDIDATES[best_position]
print(f"\nOptimal k: {optimal_k}")

# Final KMeans: reuse the model already fitted for optimal_k. Refitting with the
# same data, parameters and seed would only reproduce the same clustering.
kmeans = fitted_models[best_position]

In [None]:
# Predict on test set: each row of x_test is assigned the id of a cluster from
# the fitted KMeans model. These are cluster ids, not fraud / non-fraud labels.
y_pred_test = kmeans.predict(x_test)

# Align cluster labels
def align_labels(y_true, y_pred, n_clusters):
    """Translate cluster ids into class labels by per-cluster majority vote.

    For every cluster id in ``range(n_clusters)`` the most frequent value of
    ``y_true`` among the samples assigned to that cluster becomes the label of
    all of those samples.

    Parameters
    ----------
    y_true : array-like of non-negative ints
        Reference class label of each sample (``np.bincount`` requires
        non-negative integers).
    y_pred : array-like of ints
        Cluster id assigned to each sample; same length as ``y_true``.
    n_clusters : int
        Number of cluster ids to consider (ids ``0 .. n_clusters - 1``).

    Returns
    -------
    numpy.ndarray
        Array shaped and typed like ``y_pred`` holding the voted class label
        for each sample. Samples whose cluster id lies outside
        ``range(n_clusters)`` keep the default label 0.
    """
    # Coerce to arrays: with a plain list, `y_pred == cluster_id` is a single
    # False rather than an element-wise mask, and every label would silently
    # stay 0.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    labels = np.zeros_like(y_pred)
    for cluster_id in range(n_clusters):
        mask = y_pred == cluster_id
        # Empty clusters need no handling: there is nothing to label.
        if mask.any():
            # argmax of the counts = majority class; ties go to the lower label
            labels[mask] = np.bincount(y_true[mask]).argmax()
    return labels

# Learn the cluster -> class mapping on the labelled TRAINING sample and only
# then apply it to the test predictions. Voting with y_test itself would let the
# very labels used for scoring decide the predictions and inflate every metric.
train_sample_labels = y_train[selected_train_indices]
train_sample_clusters = kmeans.predict(n_x_train)
train_sample_aligned = align_labels(train_sample_labels, train_sample_clusters, optimal_k)

# All members of a cluster carry that cluster's majority class, so scattering
# the aligned labels by cluster id builds the lookup table. A cluster without
# any training member keeps the default class 0.
cluster_to_class = np.zeros(optimal_k, dtype=int)
cluster_to_class[train_sample_clusters] = train_sample_aligned

y_pred_aligned = cluster_to_class[y_pred_test]

In [None]:
def evaluation(y_true, y_pred, model_name="Model"):
    """Print accuracy, precision, recall, F1 and a classification report.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, same length as ``y_true``.
    model_name : str, default "Model"
        Name shown in the heading of the printed report.

    Returns
    -------
    None
        The metrics are only printed.
    """
    # Use zero_division=0 for every precision/recall-based metric, not just
    # precision: when the positive class is never predicted these metrics are
    # undefined, and the default setting emits UndefinedMetricWarning.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    print('         Accuracy:', accuracy)
    print('  Precision Score:', precision)
    print('     Recall Score:', recall)
    print('         F1 Score:', f1)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, zero_division=0))

# Score the class labels derived from the KMeans clusters against the test labels.
evaluation(y_test, y_pred_aligned, model_name="KMeans (Unsupervised)")