# Customer Cluster Analysis

In this notebook we explore similarities between customers and whether distinct customer clusters exist.

## Imports

In [1]:
import os, tqdm

# ETL and Data Manipulation
import pandas as pd
import numpy as np

# Visualizations
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Preprocessing
from sklearn.preprocessing import StandardScaler

# Clustering
from sklearn.cluster import KMeans

# Dimensionality Reduction
from sklearn.decomposition import PCA

## Load data

In [2]:
# Read the raw order and product tables from the data directory,
# located two levels above the current working directory.
DATA_PATH = os.path.join('..', '..', 'data')

orders_csv_path = os.path.join(DATA_PATH, 'orders.csv')
products_csv_path = os.path.join(DATA_PATH, 'product-supplier.csv')

order_lines = pd.read_csv(orders_csv_path)
products = pd.read_csv(products_csv_path)

## Data Cleaning

#### Datetime columns

In [3]:
# Both date columns are parsed with the same pattern:
# day, abbreviated month name, two-digit year.
date_str_format = '%d-%b-%y'

for date_column in ('Date Order was placed', 'Delivery Date'):
    order_lines[date_column] = pd.to_datetime(order_lines[date_column], format=date_str_format)

#### Customer Status column

In [4]:
# Normalise the status labels to lower case, then cast the column to str.
# NOTE(review): a missing status may end up as the literal string 'nan' after the cast — confirm this is intended.
status_lowercased = order_lines['Customer Status'].str.lower()
order_lines['Customer Status'] = status_lowercased.astype(str)

#### Missing columns

In [5]:
# Derive the retail price of a single unit from the row's total retail price.
total_retail_price = order_lines['Total Retail Price for This Order']
order_lines['Retail price Per Unit'] = total_retail_price.div(order_lines['Quantity Ordered'])

In [6]:
# Total cost of the row: number of units ordered times the cost price of one unit.
unit_cost_price = order_lines['Cost Price Per Unit']
order_lines['Total Cost price'] = order_lines['Quantity Ordered'].mul(unit_cost_price)

#### Merging product information

In [7]:
# Attach the product attributes to every order row. The left join keeps rows
# whose 'Product ID' has no match in `products`; the result is indexed by 'Order ID'.
products_by_id = products.set_index('Product ID')
order_lines_products = (
    order_lines
    .join(products_by_id, on='Product ID', how='left')
    .set_index('Order ID')
)

## Feature engineering

In [8]:
# Start an (initially column-less) feature table with one row per distinct customer,
# indexed by 'Customer ID'.
unique_customer_ids = order_lines['Customer ID'].unique()
customers = pd.DataFrame({'Customer ID': unique_customer_ids}).set_index('Customer ID')

### Activity-based Features

#### Order Count

In [9]:
# Count the non-null 'Order ID' values recorded for each customer.
# NOTE(review): if one order can span several rows, this counts order lines
# rather than distinct orders — confirm which is intended.
order_counts = (
    order_lines.groupby('Customer ID')['Order ID']
    .count()
    .to_frame('order_count')
)
customers = customers.join(order_counts, on='Customer ID', how='left')

#### Months Active

In [10]:
# Earliest and latest order date per customer
order_date_aggregations = {
    'first_order': ('Date Order was placed', 'min'),
    'most_recent_order': ('Date Order was placed', 'max'),
}
activity_extremes = order_lines.groupby('Customer ID').agg(**order_date_aggregations)

# Time span between the first and the most recent order
activity_extremes['lifetime'] = activity_extremes['most_recent_order'].sub(activity_extremes['first_order'])

# Express the lifetime in months, approximating a month as 30 days
activity_extremes['active_months'] = activity_extremes['lifetime'].dt.days.div(30)

# Add the feature to the customer table
customers = customers.join(activity_extremes['active_months'], on='Customer ID', how='left')

#### Avg days between orders

In [11]:
# Fallback for customers with a single order: the longest customer lifetime
# observed in the data, converted back from 30-day months to days.
max_customer_lifetime = customers['active_months'].max() * 30

# Lifetime in days divided by the order count, computed column-wise instead of
# with a row-wise apply (same values, no Python-level loop, and well-defined
# on an empty frame). Rows where order_count is not greater than 1 receive
# the fallback value.
# NOTE(review): n orders have n - 1 gaps between them, so dividing by
# order_count understates the average gap — confirm whether that is intended.
lifetime_days = customers['active_months'] * 30
customers['avg_days_between_orders'] = (
    lifetime_days
    .div(customers['order_count'])
    .where(customers['order_count'] > 1, max_customer_lifetime)
)

### Monetary Features

#### Average order value

In [12]:
# Mean 'Total Cost price' over each customer's rows.
# NOTE(review): this averages cost price (not retail price) per row — confirm
# that this is the intended definition of "order value".
avg_order_value = (
    order_lines.groupby('Customer ID')['Total Cost price']
    .mean()
    .to_frame('avg_order_value')
)
customers = customers.join(avg_order_value, on='Customer ID', how='left')

### Product Diversity

#### Number of unique categories and unique groups bought from

In [13]:
# Number of distinct product categories and product groups each customer bought from.
diversity_aggregations = {
    'n_unique_categories': ('Product Category', 'nunique'),
    'n_unique_groups': ('Product Group', 'nunique'),
}
n_unique = order_lines_products.groupby('Customer ID').agg(**diversity_aggregations)

customers = customers.join(n_unique, on='Customer ID', how='left')

### Seasonality Profile

#### Quarterly Distribution of orders

In [14]:
# Share of each customer's summed 'Total Cost price' that falls into each
# calendar quarter (by delivery date), pooled across years.

# Group orders by (quarter end of the delivery date, customer)
orders_by_quarter = order_lines.set_index('Delivery Date').groupby([pd.Grouper(freq='QE'), 'Customer ID'])

# Sum of cost price per dated quarter and customer
quarterly_orders = orders_by_quarter.agg(sum_quarter_orders = ('Total Cost price' , 'sum')).reset_index()

# Quarter number (1-4) taken from the quarter-end date, so the same quarter
# of different years lands in the same bucket
quarterly_orders['Quarter'] = quarterly_orders['Delivery Date'].dt.quarter

# One row per customer, one column per quarter number
quarterly_orders_reshaped = quarterly_orders.pivot_table(index='Customer ID', columns='Quarter', values='sum_quarter_orders', aggfunc='sum')

# Guarantee exactly four columns in quarter order before renaming. Without
# this, assigning four names raises a length-mismatch error whenever the data
# does not contain deliveries in all four quarters.
quarterly_orders_reshaped = quarterly_orders_reshaped.reindex(columns=[1, 2, 3, 4])
quarterly_orders_reshaped.columns = ['Q1', 'Q2', 'Q3', 'Q4']

# Quarters without any orders count as zero
quarterly_orders_reshaped = quarterly_orders_reshaped.fillna(0)

# Normalise each row by its total so the four values are shares of the customer's total
quarterly_orders_reshaped = quarterly_orders_reshaped.div( quarterly_orders_reshaped.sum(axis=1) , axis=0)

# Mark the columns as rates
quarterly_orders_reshaped.columns = [c+'_rate' for c in quarterly_orders_reshaped.columns]

# Add the features to the customer table
customers = customers.join(quarterly_orders_reshaped, on='Customer ID', how='left')

### Purchases from categories

In [15]:
# Share of each customer's summed 'Total Cost price' spent in each
# product line / product category combination.

# Build the hierarchical '<Product Line>_<Product Category>' name column-wise.
# Unlike a row-wise '_'.join, str.cat does not raise when either part is
# missing (e.g. an order line without a matching product after the left join):
# the name is left missing and the row is skipped by the groupby below.
order_lines_products['product_group_cat_name'] = order_lines_products['Product Line'].str.cat(
    order_lines_products['Product Category'], sep='_'
)

# Sum of cost price per customer and line/category name
customer_orders_sum_prod_group = order_lines_products.groupby(['Customer ID', 'product_group_cat_name']).agg(group_sum = ('Total Cost price', 'sum')).reset_index()

# One row per customer, one column per line/category name
order_lines_product_groups_agg = customer_orders_sum_prod_group.pivot_table(index='Customer ID', columns='product_group_cat_name', values='group_sum', aggfunc='sum')

# Combinations a customer never bought from count as zero
order_lines_product_groups_agg = order_lines_product_groups_agg.fillna(0)

# Normalise each row by its total so the values are shares of the customer's total
order_lines_product_groups_agg = order_lines_product_groups_agg.div( order_lines_product_groups_agg.sum(axis=1) , axis=0)

# Prefix the feature columns so they are recognisable in the customer table
order_lines_product_groups_agg = order_lines_product_groups_agg.add_prefix('category_')

# Add the features to the customer table
customers = customers.join(order_lines_product_groups_agg, on='Customer ID', how='left')

## Preprocessing

### Scaling

In [16]:
# Standardise the customer features before clustering.
# The final cell of this notebook writes a 'cluster' column into `customers`;
# drop it if present so that re-running this cell after a clustering run
# never feeds the previous labels back in as a feature.
feature_columns = customers.drop(columns=['cluster'], errors='ignore')

feature_scaler = StandardScaler()
X_scaled = feature_scaler.fit_transform(feature_columns)

## Clustering

### Find the right number of clusters (Elbow Method)

In [17]:
# Fit k-means for a range of cluster counts and record the within-cluster
# sum of squares (WCSS, exposed by scikit-learn as `inertia_`) for each.
inertias = [] # (k, WCSS) pairs

# Scan k = 1..99, but never ask for more clusters than there are samples,
# which would make KMeans raise.
K = range(1, min(100, X_scaled.shape[0] + 1))

for k in tqdm.tqdm(K):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    inertias.append((k, kmeans.inertia_))

inertias_df = pd.DataFrame(inertias, columns=['K', 'WCSS'])

100%|██████████| 99/99 [00:55<00:00,  1.78it/s]


In [18]:
# Plot WCSS against the number of clusters; the figure is the cell's output.
elbow_fig = px.line(
    inertias_df,
    x='K',
    y='WCSS',
    title='Elbow Method for choosing number of clusters',
)
elbow_fig

In [19]:
# Number of clusters used for the final model.
# NOTE(review): presumably read off the elbow plot above — confirm and record the reasoning.
# This rebinds `K`, which previously held the range of candidate values used in the elbow scan.
K = 14

### Assign Clusters

In [20]:
# Fit the final k-means model with the chosen K and store each customer's
# cluster label in the customer table.
final_KMeans = KMeans(n_clusters=K, random_state=42).fit(X_scaled)
customers['cluster'] = final_KMeans.labels_