In [None]:
# Project: Customer Segmentation using RFM & K-Means Clustering

Business Goal: To segment our customer base into meaningful groups to enable targeted marketing, improve retention, and increase customer lifetime value.

Methodology:
1.  Data Loading & EDA: Load and inspect the transactional data.
2.  Feature Engineering: Calculate Recency, Frequency, and Monetary (RFM) scores for each customer.
3.  Data Preprocessing: Handle data skewness and scale features for the clustering algorithm.
4.  Modeling: Use K-Means to find customer clusters and the Elbow Method to determine the optimal number of clusters.
5.  Segment Profiling: Analyze the characteristics of each cluster to create business-friendly personas.
6.  Export: Save the final segmented data for visualization in Tableau.

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Set plot style: apply seaborn's "whitegrid" style globally so the figures
# drawn in the cells below share one consistent look.
sns.set(style="whitegrid")

In [None]:

# Read the raw transactional export; every later cell builds on `df`.
df = pd.read_csv('customers.csv')

# Structural overview of the raw file (column dtypes and non-null counts).
print("Data Information:")
df.info()

# Null count per column, to spot fields that need cleaning.
print("\nMissing Values:")
print(df.isna().sum())

# First few raw rows as a sanity check on the parsed layout.
print("\nData Head:")
print(df.head())

# Parse order dates only after the inspection prints above, so those prints
# show the column exactly as it was read from disk.
df = df.assign(OrderDate=pd.to_datetime(df['OrderDate']))

### Calculating RFM (Recency, Frequency, Monetary)

Recency: Days since the last purchase. Lower is better.
Frequency: Total number of purchases. Higher is better.
Monetary: Total value of purchases. Higher is better.

In [None]:
# Recency is measured against a fixed reference point: one day after the most
# recent order in the data, so the latest buyer ends up with Recency == 1.
snapshot_date = df['OrderDate'].max() + pd.Timedelta(days=1)
print(f"Snapshot Date: {snapshot_date}")


def _days_since_last_order(order_dates):
    """Return whole days between a customer's latest order and the snapshot date."""
    return (snapshot_date - order_dates.max()).days


# One row per customer; named aggregation yields the final column names
# directly, so no separate rename step is needed.
rfm_df = df.groupby('CustomerID').agg(
    Recency=('OrderDate', _days_since_last_order),
    Frequency=('OrderID', 'count'),
    Monetary=('OrderValue', 'sum'),
)

print("\nRFM DataFrame:")
print(rfm_df.head())

### Preprocessing RFM Data

K-Means groups points by Euclidean distance, so it is sensitive to the scale of each feature and is easily distorted by heavily skewed features. Our RFM values are right-skewed. We will:
1.  Apply a log transformation to reduce skewness.
2.  Scale the data to have a mean of 0 and a standard deviation of 1.

In [None]:
# The three features used for clustering. Selecting them explicitly (instead
# of transforming the whole frame) keeps this cell re-runnable after later
# cells have appended label columns such as 'Cluster' / 'Segment' to rfm_df,
# and guarantees those labels can never leak into the model inputs.
rfm_features = ['Recency', 'Frequency', 'Monetary']

# Plot distributions to check for skewness (one panel per feature)
fig, axes = plt.subplots(1, 3, figsize=(12, 6))
for ax, feature in zip(axes, rfm_features):
    sns.histplot(rfm_df[feature], kde=True, bins=20, ax=ax)
    ax.set_title(f'{feature} Distribution')
fig.tight_layout()
plt.show()

# Apply log transformation (np.log1p handles zeros). log1p is undefined for
# values <= -1 and would produce NaN, which K-Means rejects; a customer whose
# refunds exceed their purchases could have a negative Monetary total, so
# negatives are floored at 0 before the transform. Row order and index are
# unchanged, so cluster labels still line up with rfm_df.
rfm_log = np.log1p(rfm_df[rfm_features].clip(lower=0))

# Scale the data to mean 0 / standard deviation 1 so that no single feature
# dominates the distance computation.
scaler = StandardScaler()
rfm_scaled = pd.DataFrame(
    scaler.fit_transform(rfm_log),
    index=rfm_log.index,
    columns=rfm_log.columns,
)

print("\nScaled RFM Data (Head):")
print(rfm_scaled.head())

### The Elbow Method
We will run K-Means for a range of cluster numbers (K) and plot the Within-Cluster Sum of Squares (WCSS) for each. The "elbow" of the curve, the point after which adding more clusters no longer reduces WCSS sharply, suggests a good number of clusters to use.

In [None]:
# Fit K-Means once for each candidate K (1 through 10) and record the
# resulting inertia, i.e. the within-cluster sum of squares.
wcss = {}
for k in range(1, 11):
    kmeans = KMeans(
        n_clusters=k,
        init='k-means++',
        n_init=10,
        max_iter=300,
        random_state=42,
    ).fit(rfm_scaled)
    wcss[k] = kmeans.inertia_

# Draw the elbow curve: candidate K on the x-axis, WCSS on the y-axis.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(list(wcss), list(wcss.values()), 'o-')
ax.set_xlabel('Number of Clusters (K)')
ax.set_ylabel('WCSS')
ax.set_title('Elbow Method for Optimal K')
plt.show()

In [None]:
# Number of clusters used for the final model.
k = 4

# Fit the final model on the scaled features; fit() returns the estimator
# itself, so construction and training are chained.
kmeans = KMeans(
    n_clusters=k,
    init='k-means++',
    n_init=10,
    max_iter=300,
    random_state=42,
).fit(rfm_scaled)

# Attach each customer's cluster label to the unscaled RFM table so the
# clusters can be profiled in original units.
rfm_df['Cluster'] = kmeans.labels_

print("\nRFM Data with Cluster Labels:")
print(rfm_df.head())

### Segment Profiling
This is the most critical step for a Business Analyst. We translate the numerical clusters into actionable, human-readable business personas by analyzing the mean RFM values for each group.

In [None]:
# Analyze the characteristics of each cluster: mean R/F/M plus cluster size
cluster_profile = rfm_df.groupby('Cluster').agg({
    'Recency': 'mean',
    'Frequency': 'mean',
    'Monetary': ['mean', 'count']
}).round(2)

print("\nCluster Profile:")
print(cluster_profile)

# --- Naming the Segments (BA Interpretation) ---
# Persona names, ordered from most to least valuable.
segment_names = ['Champions', 'Loyal Customers', 'At-Risk', 'Hibernating']

# Every cluster must receive exactly one name; otherwise customers would end
# up with a missing Segment without any warning.
if len(cluster_profile) != len(segment_names):
    raise ValueError(
        f"Found {len(cluster_profile)} clusters but {len(segment_names)} "
        "segment names; update segment_names to match the chosen K."
    )

# Rank the clusters on a single composite score so the mapping is one-to-one.
# Building the map from two independent orderings (one by Recency, one by
# Frequency) can select the same cluster twice and leave another unmapped.
# Rank 1 is best on each dimension: most recent, most frequent, highest spend.
composite_rank = (
    cluster_profile[('Recency', 'mean')].rank(ascending=True)
    + cluster_profile[('Frequency', 'mean')].rank(ascending=False)
    + cluster_profile[('Monetary', 'mean')].rank(ascending=False)
)

# Lowest rank sum = best overall cluster. The stable sort breaks ties by
# cluster label, which keeps the result deterministic.
clusters_best_to_worst = composite_rank.sort_values(kind='mergesort').index
segment_map = dict(zip(clusters_best_to_worst, segment_names))

# IMPORTANT: this ordering is a heuristic. Compare segment_map against the
# cluster_profile printed above and adjust the names if they do not fit.
rfm_df['Segment'] = rfm_df['Cluster'].map(segment_map)

print("\nFinal DataFrame with Segment Personas:")
print(rfm_df.head())

In [None]:
# Scatter of the segments: position encodes Recency/Frequency, colour encodes
# the persona, marker size encodes Monetary value.
fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(
    data=rfm_df,
    x='Recency',
    y='Frequency',
    hue='Segment',
    size='Monetary',
    sizes=(50, 500),
    alpha=0.7,
    ax=ax,
)
ax.set_title('Customer Segments by RFM')
ax.invert_xaxis()  # lower recency is better, so the best customers sit on the right
ax.legend(title='Segment')
plt.show()

# Export for Tableau. CustomerID is the index of rfm_df, so it is moved back
# into a regular column before writing.
final_data = rfm_df.reset_index()
final_data.to_csv('customer_segments.csv', index=False)

print("\n'customer_segments.csv' has been created successfully for Tableau.")