# Combined KMeans and Random Forest for Fraud Detection

This notebook demonstrates how to combine **KMeans clustering** and **Random Forest** to detect fraud in the Kaggle credit card fraud dataset.

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    silhouette_score, classification_report, accuracy_score,
    precision_score, recall_score, f1_score
)
import kagglehub

# Seed shared by the train/test split, the NumPy shuffles, KMeans and the
# random forest so that repeated runs are reproducible.
RANDOM_SEED = 42
# Fraction of the rows held out as the test set by train_test_split.
TEST_SIZE = 0.3

In [None]:
# Fetch the Kaggle credit-card fraud dataset and read its single CSV file.
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
csv_file = f"{path}/creditcard.csv"

# Make sure the target column is an integer label.
data = pd.read_csv(csv_file).astype({'Class': int})

# Preview the first rows (rendered as the cell output).
data.head()

In [None]:
# Drop the 'Time' column so it is not used as a feature.
data = data.drop(columns=['Time'])

# NOTE: 'Amount' is deliberately left unscaled here. Fitting a StandardScaler
# on the full dataset before the train/test split would leak test-set
# statistics into training, and it is redundant: every feature column
# (including 'Amount') is standardized after the split with a scaler fitted on
# the training rows only.

# Split by label purely to report the class balance.
fraud = data[data['Class'] == 1]
nonfraud = data[data['Class'] == 0]

n_fraud = len(fraud)
n_nonfraud = len(nonfraud)
print(f'Fraudulent: {n_fraud}, Non-fraudulent: {n_nonfraud}')
print(f'Fraud percentage: {n_fraud/(n_fraud+n_nonfraud)*100:.3f}%')

In [None]:
# Separate the feature matrix from the target labels as plain NumPy arrays.
X = data.drop(columns=['Class']).to_numpy()
y = data['Class'].to_numpy()

# Hold out TEST_SIZE of the rows, stratified on the label so both partitions
# keep the same class proportions.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
)

# Learn the standardization statistics from the training rows only, then apply
# the same transform to both partitions.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Build a reduced training subset (at most 5000 normal + 500 fraud rows) that
# is used to fit KMeans.
np.random.seed(RANDOM_SEED)

# Positions of each class inside the training labels.
normal_train_indices = np.flatnonzero(y_train == 0)
fraud_train_indices = np.flatnonzero(y_train == 1)

# Shuffle in place (normal first, then fraud) so the slices below are random draws.
np.random.shuffle(normal_train_indices)
np.random.shuffle(fraud_train_indices)

# Slicing past the end is harmless: if a class has fewer rows, all are kept.
selected_normal = normal_train_indices[:5000]
selected_fraud = fraud_train_indices[:500]

# Merge both classes and shuffle again so they are interleaved.
selected_train = np.concatenate((selected_normal, selected_fraud))
np.random.shuffle(selected_train)

n_x_train = np.take(x_train, selected_train, axis=0)

In [None]:
# Candidate cluster counts, ranked by silhouette score on the KMeans subset.
k_values = range(2, 8)

scores = []
fitted_models = []
for k in k_values:
    candidate = KMeans(n_clusters=k, init='k-means++', random_state=RANDOM_SEED)
    candidate.fit(n_x_train)
    score = silhouette_score(n_x_train, candidate.labels_)
    scores.append(score)
    fitted_models.append(candidate)
    print(f'k={k}, Silhouette Score={score:.4f}')

# Look the winner up in k_values instead of adding a hard-coded offset, so the
# result stays correct if the candidate range is changed.
best_index = int(np.argmax(scores))
optimal_k = k_values[best_index]
print(f'Optimal k: {optimal_k}')

# Reuse the model already fitted for the winning k: refitting with the same
# data, parameters and seed would only repeat that work.
kmeans = fitted_models[best_index]

In [None]:
# Assign every train/test row to one of the fitted KMeans clusters.
train_clusters = kmeans.predict(x_train)
test_clusters = kmeans.predict(x_test)

# Append the cluster id as one extra trailing feature column.
x_train_combined = np.column_stack((x_train, train_clusters))
x_test_combined = np.column_stack((x_test, test_clusters))

In [None]:
# Hyper-parameter candidates for the forest: number of trees and tree depth.
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [5, 10, None],
}
rf = RandomForestClassifier(random_state=RANDOM_SEED)

# 3-fold cross-validated search over the grid, scored by F1, using all cores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(x_train_combined, y_train)

# Keep the estimator selected by the search for evaluation.
best_rf = grid_search.best_estimator_

In [None]:
# Predict labels for the held-out test rows (features + cluster id column).
y_pred_rf = best_rf.predict(x_test_combined)


def evaluation(y_true, y_pred, model_name="Model"):
    """Print accuracy, precision, recall, F1 and a classification report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        model_name: Label shown in the header of the printed summary.

    Returns:
        None. The metrics are only printed.
    """
    # zero_division=0 is passed to every metric that accepts it, so an
    # undefined ratio (e.g. no predicted or no true positives) is reported as
    # 0 consistently instead of only precision being handled that way.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    print('         Accuracy:', accuracy)
    print('  Precision Score:', precision)
    print('     Recall Score:', recall)
    print('         F1 Score:', f1)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, zero_division=0))


evaluation(y_test, y_pred_rf, model_name="Random Forest (with KMeans feature)")