# In [None]:
# RFM Customer Segmentation with KMeans Clustering

# 📦 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# 📥 2. Load Dataset
# Read the raw transactions CSV, decoding it as ISO-8859-1.
df = pd.read_csv("../data/online_retail.csv", encoding='ISO-8859-1')
# Peek at the first rows.
df.head()

# 🧹 3. Data Cleaning
# Discard rows that have no CustomerID.
df = df.dropna(subset=['CustomerID'])
# Keep only rows whose Quantity and UnitPrice are both strictly positive.
df = df.loc[df['Quantity'].gt(0) & df['UnitPrice'].gt(0)]
# Discard rows that exactly repeat an earlier row.
df = df.drop_duplicates()

# 🧮 4. Feature Engineering
# Parse InvoiceDate into datetime values so date arithmetic works on it.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
# Per-row revenue: quantity multiplied by unit price.
df['TotalPrice'] = df['Quantity'].mul(df['UnitPrice'])

# 📆 5. Calculate RFM
# Reference point for recency: one day after the latest invoice in the data.
snapshot_date = df['InvoiceDate'].max() + pd.Timedelta(days=1)

# One row per customer:
#   Recency   - whole days between the customer's last invoice and snapshot_date
#   Frequency - number of distinct invoice numbers
#   Monetary  - sum of TotalPrice
rfm = (
    df.groupby('CustomerID')
    .agg({
        'InvoiceDate': lambda invoice_dates: (snapshot_date - invoice_dates.max()).days,
        'InvoiceNo': 'nunique',
        'TotalPrice': 'sum',
    })
    .rename(columns={
        'InvoiceDate': 'Recency',
        'InvoiceNo': 'Frequency',
        'TotalPrice': 'Monetary',
    })
)

# 🔄 6. RFM Normalization
# Standardize the RFM columns; the fitted scaler is kept in `scaler`.
scaler = StandardScaler()
rfm_scaled = scaler.fit(rfm).transform(rfm)

# 📊 7. Elbow Method
# Fit KMeans for k = 1..10 and record each model's inertia (SSE).
k_range = range(1, 11)
sse = []
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(rfm_scaled)
    sse.append(kmeans.inertia_)

# Plot SSE against the number of clusters.
plt.figure(figsize=(8, 4))
plt.plot(k_range, sse, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('SSE')
plt.title('Elbow Method')
plt.show()

# ✨ 8. KMeans Clustering
# Fit the final 4-cluster model and store each customer's cluster id.
kmeans = KMeans(n_clusters=4, random_state=42).fit(rfm_scaled)
rfm['Cluster'] = kmeans.labels_

# 🧠 9. PCA Visualization
# Reduce the scaled features to two principal components and keep them on
# `rfm` so they can be plotted next to the cluster ids.
pca = PCA(n_components=2)
rfm_pca = pca.fit_transform(rfm_scaled)
rfm['PCA1'], rfm['PCA2'] = rfm_pca[:, 0], rfm_pca[:, 1]

# Scatter of the two components, coloured by cluster.
plt.figure(figsize=(8, 6))
sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', palette='tab10', data=rfm)
plt.title('Customer Segments (PCA)')
plt.show()

# 📌 10. Cluster Profiling
# Per-cluster mean Recency / Frequency / Monetary plus the number of customers.
# CustomerID is the index of `rfm` (it was the groupby key), not a column, so
# it must be restored as a column before it can be named in the aggregation
# spec; referencing it directly raises a KeyError.
rfm_profile = rfm.reset_index().groupby('Cluster').agg({
    'Recency': 'mean',
    'Frequency': 'mean',
    'Monetary': 'mean',
    'CustomerID': 'count'
}).rename(columns={'CustomerID': 'Count'})
print(rfm_profile)

# 🧩 11. Segment Labels (Manual, Based on Profile)
def label_cluster(row):
    """Return the segment name for the cluster id stored in ``row['Cluster']``.

    Cluster ids 0, 1 and 2 map to 'Loyal', 'At Risk' and 'New'; any other
    value maps to 'Potential'.
    """
    cluster_id = row['Cluster']
    named_clusters = ((0, 'Loyal'), (1, 'At Risk'), (2, 'New'))
    for known_id, segment in named_clusters:
        if cluster_id == known_id:
            return segment
    return 'Potential'

# Attach the segment name to every customer, one row at a time.
rfm['Segment'] = rfm.apply(label_cluster, axis='columns')

# 📈 12. Segment Distribution Visualization
# Pie chart of how many customers fall into each segment.
plt.figure(figsize=(6, 6))
segment_sizes = rfm['Segment'].value_counts()
segment_sizes.plot.pie(autopct='%1.1f%%')
plt.ylabel('')
plt.title('Customer Segment Distribution')
plt.show()

# 💾 13. Save Segmentation
# Write the RFM table, including cluster ids, PCA columns and segment names.
rfm.to_csv("../data/rfm_segmented.csv")
