In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [None]:
# Read the RFM feature table produced earlier from the outputs directory.
rfm_table_path = "../outputs/rfm_table.csv"
rfm = pd.read_csv(rfm_table_path)

# Show the column index so the schema can be checked before clustering.
rfm.columns

Index(['CustomerID', 'Recency', 'Frequency', 'Monetary', 'R', 'F', 'M',
       'RFM_Score'],
      dtype='str')

In [8]:
# Standardize the three raw RFM features so each contributes comparably
# to the distance computations performed by the clustering step.
feature_columns = ["Recency", "Frequency", "Monetary"]
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm[feature_columns])


In [9]:
# Applied K-Means clustering to segment customers.
# A fixed random_state was used for reproducibility. n_init is also set
# explicitly: scikit-learn changed its default from 10 to "auto" in 1.4,
# so leaving it implicit makes the cluster assignments depend on the
# installed library version even with the same seed.
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)

# fit_predict returns one label per row of rfm_scaled, which was built from
# rfm's rows, so the labels can be assigned straight back as a new column.
rfm["Cluster"] = kmeans.fit_predict(rfm_scaled)


In [10]:
# Display the column index to confirm that the "Cluster" column was added to rfm.
rfm.columns


Index(['CustomerID', 'Recency', 'Frequency', 'Monetary', 'R', 'F', 'M',
       'RFM_Score', 'Cluster'],
      dtype='str')

In [11]:
# Count customers per cluster label and list the counts in label order.
# NOTE(review): if some clusters turn out to be very small, that may point to
# outlier customers dominating the scaled features - worth checking before
# interpreting the segments.
cluster_labels = rfm["Cluster"]
cluster_sizes = cluster_labels.value_counts()
cluster_sizes.sort_index()


Cluster
0    1546
1    3461
2      35
3       4
4     832
Name: count, dtype: int64

In [None]:
# Write the RFM table, including the Cluster column, to disk for downstream
# notebooks; the DataFrame index is not written to the file.
clustered_output_path = "../outputs/rfm_with_clusters.csv"
rfm.to_csv(clustered_output_path, index=False)


In [None]:
# Round-trip check: read the saved CSV back and display its columns to
# confirm the file was written with the expected schema.
reload_path = "../outputs/rfm_with_clusters.csv"
check = pd.read_csv(reload_path)
check.columns


Index(['CustomerID', 'Recency', 'Frequency', 'Monetary', 'R', 'F', 'M',
       'RFM_Score', 'Cluster'],
      dtype='str')