<h1 style="color: salmon">K-Medoids Clustering</h1>
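Cluster card transactions with K-Medoids on engineered spending, frequency and risk-score features, then compare the fraud rate of each cluster.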

In [3]:
import pandas as pd
import joblib
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn_extra.cluster import KMedoids
import numpy as np
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix

In [4]:
# Read the raw transactions CSV (path is relative to the notebook's working
# directory) and preview the first rows.
raw_csv_path = "Datasets/lastest.csv"
df = pd.read_csv(raw_csv_path, encoding="utf-8")
df.head()

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
0,0,'C1093826151','4','M','28007','M348934600','28007','es_transportation',4.55,0
1,0,'C352968107','2','M','28007','M348934600','28007','es_transportation',39.68,0
2,0,'C2054744914','4','F','28007','M1823072687','28007','es_transportation',26.89,0
3,0,'C1760612790','3','M','28007','M348934600','28007','es_transportation',17.25,0
4,0,'C757503768','5','M','28007','M348934600','28007','es_transportation',35.72,0


### Data Pre-Processing

In [5]:
# Remove the unnecessary single quotes that wrap the raw string values.
# For 'category', additionally keep only the token after the first underscore
# (e.g. 'es_transportation' -> 'transportation').
df['category'] = df['category'].str.strip("'").str.split('_').str[1]
for quoted_col in ('customer', 'age', 'gender', 'merchant'):
    df[quoted_col] = df[quoted_col].str.strip("'")

# Add the age label as in the original paper.
# The keys must be the *unquoted* codes: the quotes were stripped from 'age'
# just above, so a quoted key such as "'6'" would never match and those rows
# would end up as NaN in 'age_labeled'.
age_map = {
    "0": "<=18", "1": "19-25", "2": "26-35", "3": "36-45",
    "4": "46-55", "5": "56-65", "6": ">65", "U": "Unknown"
}
df['age_labeled'] = df['age'].map(age_map)

# Convert from step to hour of day (ex: 2 means 2AM)
def step_to_hour(row):
    """Return the hour of day for a step value, i.e. the step modulo 24.

    Despite the parameter name, the value passed in is expected to be a
    single step number rather than a DataFrame row.

    NOTE(review): this treats one step as one hour -- confirm against the
    dataset documentation.
    """
    hours_per_day = 24
    hour = row % hours_per_day
    return hour

# Derive the hour-of-day feature from the simulation step.
df["hour_of_day"] = df["step"].apply(step_to_hour)

# Drop noise cols. With errors="ignore", columns that are already missing are
# skipped instead of raising.
noise_cols = ['zipcodeOri', 'zipMerchant']
df = df.drop(columns=noise_cols, errors="ignore")

df.head()

Unnamed: 0,step,customer,age,gender,merchant,category,amount,fraud,age_labeled,hour_of_day
0,0,C1093826151,4,M,M348934600,transportation,4.55,0,46-55,0
1,0,C352968107,2,M,M348934600,transportation,39.68,0,26-35,0
2,0,C2054744914,4,F,M1823072687,transportation,26.89,0,46-55,0
3,0,C1760612790,3,M,M348934600,transportation,17.25,0,36-45,0
4,0,C757503768,5,M,M348934600,transportation,35.72,0,56-65,0


## Feature Engineering

### Spending velocity and spending frequency: the amount of money a customer spends, and the number of transactions they make, within a given period of time

In [6]:
# Sort by customer, then by time. The grouped rolling results below come back
# ordered by customer group and are written back positionally via `.values`,
# so the frame has to be in that same order.
df = df.sort_values(by=['customer', 'step']).reset_index(drop=True)

# Create a temporary 'TimeDelta' column => we can then do time math on this col
# (each step is interpreted as one hour here).
df['temp_time'] = pd.to_timedelta(df['step'], unit='h')

# Look-back windows used for both the spending and the frequency features.
ROLLING_WINDOWS = ('3h', '6h', '24h')


def _rolling_amount(frame, window):
    """Return the per-customer, time-based rolling view of 'amount'.

    Parameters
    ----------
    frame : DataFrame with 'customer', 'temp_time' and 'amount' columns,
        sorted by customer and then by time.
    window : offset string such as '3h', measured on 'temp_time'.
    """
    return frame.groupby('customer').rolling(window, on='temp_time')['amount']


# Spending velocity: total amount spent by the customer inside each window.
for window in ROLLING_WINDOWS:
    df[f'spending_vel_{window}'] = _rolling_amount(df, window).sum().values

# Frequency: number of the customer's transactions inside each window.
for window in ROLLING_WINDOWS:
    df[f'frequency_{window}'] = _rolling_amount(df, window).count().values

### High risk Categories and Merchants (Target Encoding using Means)

In [8]:
# Target encoding: mean of 'fraud' per category, mapped back onto every row.
# NOTE(review): the rate is computed on the full frame, before the later
# train/test split, so test-set labels feed into this feature -- confirm this
# is acceptable or fit the map on the training rows only.
category_risk_map = df['fraud'].groupby(df['category']).mean()

df['category_risk_score'] = df['category'].map(category_risk_map)

In [10]:
# Target encoding: mean of 'fraud' per merchant, mapped back onto every row.
# NOTE(review): the rate is computed on the full frame, before the later
# train/test split, so test-set labels feed into this feature -- confirm this
# is acceptable or fit the map on the training rows only.
merchant_risk_map = df['fraud'].groupby(df['merchant']).mean()

df['merchant_risk_score'] = df['merchant'].map(merchant_risk_map)

### Age and Gender targeting

In [12]:
# Target encoding: mean of 'fraud' per age code, mapped back onto every row.
# NOTE(review): the rate is computed on the full frame, before the later
# train/test split, so test-set labels feed into this feature -- confirm this
# is acceptable or fit the map on the training rows only.
age_risk_map = df['fraud'].groupby(df['age']).mean()

df['age_risk_score'] = df['age'].map(age_risk_map)

In [14]:
# According to the EDA, no cases of Enterprise were Fraud
# Flag rows whose gender code is "E" with 1, every other row with 0.
df['is_enterprise'] = (df["gender"] == "E").astype("int64")

In [15]:
# Preview the frame with the engineered feature columns added.
df.head()

Unnamed: 0,step,customer,age,gender,merchant,category,amount,fraud,age_labeled,hour_of_day,...,spending_vel_3h,spending_vel_6h,spending_vel_24h,frequency_3h,frequency_6h,frequency_24h,category_risk_score,merchant_risk_score,age_risk_score,is_enterprise
0,30,C1000148617,5,M,M1888755466,otherservices,143.87,0,56-65,6,...,143.87,143.87,143.87,1.0,1.0,1.0,0.25,0.25,0.010951,0
1,38,C1000148617,5,M,M1741626453,sportsandtoys,16.69,0,56-65,14,...,16.69,16.69,160.56,1.0,1.0,2.0,0.495252,0.371212,0.010951,0
2,42,C1000148617,5,M,M1888755466,otherservices,56.18,0,56-65,18,...,56.18,72.87,216.74,1.0,2.0,3.0,0.25,0.25,0.010951,0
3,43,C1000148617,5,M,M840466850,tech,14.74,0,56-65,19,...,70.92,87.61,231.48,2.0,3.0,4.0,0.066667,0.112938,0.010951,0
4,44,C1000148617,5,M,M1823072687,transportation,47.42,0,56-65,20,...,118.34,118.34,278.9,3.0,3.0,5.0,0.0,0.0,0.010951,0


In [16]:
# List every column currently in the frame.
df.columns.to_list()

['step',
 'customer',
 'age',
 'gender',
 'merchant',
 'category',
 'amount',
 'fraud',
 'age_labeled',
 'hour_of_day',
 'temp_time',
 'spending_vel_3h',
 'spending_vel_6h',
 'spending_vel_24h',
 'frequency_3h',
 'frequency_6h',
 'frequency_24h',
 'category_risk_score',
 'merchant_risk_score',
 'age_risk_score',
 'is_enterprise']

## Final Cleaning and Splitting

In [19]:
# Move the target column 'fraud' to the end: take every other column in its
# current order, then append 'fraud'.
cols = list(df.columns[df.columns != 'fraud']) + ['fraud']
df = df[cols]

# Keep only the engineered features used for clustering, with the 'fraud'
# target last so it can be split off by position afterwards.
feature_cols = [
    'spending_vel_3h', 'spending_vel_6h', 'spending_vel_24h',
    'frequency_3h', 'frequency_6h', 'frequency_24h',
    'category_risk_score', 'merchant_risk_score', 'age_risk_score',
    'is_enterprise',
]
keep_cols = feature_cols + ['fraud']
df = df.loc[:, keep_cols]

In [21]:
# Features are every column except the last one; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# NOTE(review): the split is not stratified on the target; if the positive
# class is rare, consider whether stratify=y is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [25]:
# Fit the scaler on the training features only, so that no test-set
# statistics enter the scaling.
scaler = StandardScaler()
scaler.fit(X_train)

# Transform both splits and wrap the resulting arrays back into DataFrames.
# The original row index is passed through explicitly: without it the scaled
# frames get a fresh 0..n-1 index and no longer line up with
# X_train / X_test / y_train / y_test in index-based operations.
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

In [26]:
# Fit on a fixed-seed random subset of 10,000 scaled training rows rather than
# on the full training set.
X_train_sample = X_train_scaled.sample(n=10000, random_state=42)

# We'll start with 3 clusters (Normal, Suspicious, Outlier)
# We use 'manhattan' distance because it's often more robust for fraud data
kmedoids_params = {
    'n_clusters': 3,
    'metric': 'manhattan',
    'init': 'k-medoids++',
    'random_state': 42,
}
kmed = KMedoids(**kmedoids_params)

kmed.fit(X_train_sample)



0,1,2
,n_clusters,3
,metric,'manhattan'
,method,'alternate'
,init,'k-medoids++'
,max_iter,300
,random_state,42


In [20]:
# Preview the frame after it was reduced to the selected feature columns plus 'fraud'.
df.head()

Unnamed: 0,spending_vel_3h,spending_vel_6h,spending_vel_24h,frequency_3h,frequency_6h,frequency_24h,category_risk_score,merchant_risk_score,age_risk_score,is_enterprise,fraud
0,143.87,143.87,143.87,1.0,1.0,1.0,0.25,0.25,0.010951,0,0
1,16.69,16.69,160.56,1.0,1.0,2.0,0.495252,0.371212,0.010951,0,0
2,56.18,72.87,216.74,1.0,2.0,3.0,0.25,0.25,0.010951,0,0
3,70.92,87.61,231.48,2.0,3.0,4.0,0.066667,0.112938,0.010951,0,0
4,118.34,118.34,278.9,3.0,3.0,5.0,0.0,0.0,0.010951,0,0


## Testing Phase

In [27]:
# 1. Assign a cluster ID to every row of the scaled train and test data,
# using the medoids that were fitted on the training sample.
train_clusters = kmed.predict(X_train_scaled)
test_clusters = kmed.predict(X_test_scaled)

# 2. Attach the cluster IDs to the unscaled feature frames so 'cluster' can be
# compared against 'fraud'. The prediction arrays are attached by position.
# NOTE(review): after this cell X_train / X_test hold non-feature columns and
# should not be fed back into the scaler or a model as-is.
X_train = X_train.assign(cluster=train_clusters)

# Also add the actual fraud labels to the test frame; y_test is aligned to
# X_test by row index.
X_test = X_test.assign(cluster=test_clusters, is_fraud=y_test)

In [28]:
# 3. Check how many fraud cases are in each cluster for the Test Set
test_results = X_test.groupby('cluster')['is_fraud'].agg(['count', 'sum', 'mean'])
test_results.columns = ['Total Transactions', 'Fraud Count', 'Fraud Percentage (%)']

print("--- K-Medoids Test Results ---")
print(test_results)

--- K-Medoids Test Results ---
         Total Transactions  Fraud Count  Fraud Percentage (%)
cluster                                                       
0                     79218          361              0.004557
1                     38584          261              0.006764
2                      1127          824              0.731145
