In [55]:
import pandas as pd
from sklearn.cluster import KMeans
from datetime import date

# Load the raw transaction export. 'unicode_escape' is the codec this notebook
# relies on to decode the CSV.
data = pd.read_csv('OnlineRetail.csv', encoding='unicode_escape')
data["InvoiceDate"] = pd.to_datetime(data.InvoiceDate)

# Restrict the analysis to transactions whose Country is "United Kingdom".
data_UK = data[data.Country == "United Kingdom"].reset_index(drop=True)

# Half-open three-month window [2011-03-01, 2011-06-01).
# NOTE(review): not referenced by the cells visible here — presumably used later.
in_window = (data_UK.InvoiceDate >= "2011-03-01") & (data_UK.InvoiceDate < "2011-06-01")
tx_3m = data_UK[in_window].reset_index(drop=True)

# One row per distinct customer. Rows with a missing CustomerID are left out
# explicitly: groupby() drops NaN keys by default, so the per-customer merges in
# the following cells would discard that row anyway.
customer_cluster = pd.DataFrame({'CustomerID': data_UK.CustomerID.dropna().unique()})

In [56]:
def sort_cluster(cluster, value, data, ascending):
    """Relabel cluster ids so they are ordered by the per-cluster mean of ``value``.

    The ids stored in ``data[cluster]`` are replaced by consecutive integers
    0, 1, 2, ... assigned in the order of each cluster's mean ``value``.

    Parameters
    ----------
    cluster : str
        Name of the column holding the current cluster ids.
    value : str
        Name of the numeric column whose per-cluster mean defines the order.
    data : pandas.DataFrame
        Frame containing both columns. It is not modified.
    ascending : bool
        If True the cluster with the smallest mean becomes 0; if False the
        cluster with the largest mean becomes 0.

    Returns
    -------
    pandas.DataFrame
        A new frame (the result of a merge, so it carries a fresh index) with
        the same columns as ``data``, except that ``cluster`` now holds the
        ordered ids and is the last column.
    """
    # Temporary name for the new ids. The previous implementation used 'index',
    # which clashed with any existing 'index' column in ``data`` (the merge then
    # produced index_x / index_y and the relabelled column was lost).
    rank_column = '__cluster_rank__'

    # Original cluster ids, ordered by their mean ``value``.
    ordered_ids = (
        data.groupby(cluster)[value]
        .mean()
        .sort_values(ascending=ascending)
        .index
    )
    id_to_rank = pd.DataFrame({
        cluster: ordered_ids,
        rank_column: range(len(ordered_ids)),
    })

    # Attach the new id to every row, then swap it in for the old column.
    relabelled = data.merge(id_to_rank, on=cluster)
    return relabelled.drop(columns=cluster).rename(columns={rank_column: cluster})

In [57]:
# Calculate recency score
# Recency = whole days between a customer's last invoice and the latest
# last-invoice date across all customers in data_UK.
recency_cluster = data_UK.groupby('CustomerID').InvoiceDate.max().reset_index()
recency_cluster.columns = ['CustomerID', 'MaxDate']
recency_cluster['Recency'] = (recency_cluster['MaxDate'].max() - recency_cluster.MaxDate).dt.days

# Merge onto the ID column only, so re-running this cell starts clean instead of
# producing Recency_x / Recency_y columns from a previous run.
customer_cluster = customer_cluster[['CustomerID']].merge(
    recency_cluster[['CustomerID', 'Recency']], on='CustomerID'
)

# Fixed random_state makes the clustering (and the segment counts derived from
# it) reproducible across runs; n_init is pinned to 10 so the number of restarts
# does not depend on the installed scikit-learn version's default.
kmean = KMeans(n_clusters=4, random_state=42, n_init=10)
recency_features = customer_cluster[['Recency']]
kmean.fit(recency_features)
customer_cluster['RecencyCluster'] = kmean.predict(recency_features)

# Descending order: the cluster with the largest mean Recency (least recent
# customers) becomes 0, so a higher cluster id means a more recent customer.
customer_cluster = sort_cluster('RecencyCluster', 'Recency', customer_cluster, False)
customer_cluster

Unnamed: 0,CustomerID,Recency,RecencyCluster
0,17850.0,301,0
1,15100.0,329,0
2,18074.0,373,0
3,16250.0,260,0
4,13747.0,373,0
...,...,...,...
3945,15942.0,133,1
3946,14143.0,133,1
3947,16147.0,133,1
3948,15149.0,133,1


In [58]:
# Calculate Frequency Score
# Frequency = number of non-null InvoiceDate rows per customer.
# NOTE(review): this counts rows of data_UK, not distinct invoices — confirm
# that is the intended definition of frequency.
frequency_cluster = (
    data_UK.groupby('CustomerID')
    .InvoiceDate.count()
    .rename('Frequency')
    .reset_index()
)
customer_cluster = pd.merge(customer_cluster, frequency_cluster, on='CustomerID')

In [59]:

# Cluster customers on Frequency. A fresh, seeded estimator is created here
# rather than refitting whatever `kmean` an earlier cell left behind, so the
# result is reproducible and does not depend on that cell's configuration.
# n_init is pinned to 10 so the number of restarts does not depend on the
# installed scikit-learn version's default.
kmean = KMeans(n_clusters=4, random_state=42, n_init=10)
frequency_features = customer_cluster[['Frequency']]
kmean.fit(frequency_features)
customer_cluster['FrequencyCluster'] = kmean.predict(frequency_features)

# Ascending order: the cluster with the smallest mean Frequency becomes 0.
customer_cluster = sort_cluster('FrequencyCluster', 'Frequency', customer_cluster, True)

In [60]:
# Calculate revenue score
# Revenue per row = UnitPrice * Quantity, summed per customer.
data_UK['Revenue'] = data_UK.UnitPrice * data_UK.Quantity
monetary_cluster = data_UK.groupby('CustomerID').Revenue.sum().reset_index()
customer_cluster = customer_cluster.merge(monetary_cluster, on='CustomerID')

# Fresh, seeded estimator so the clustering is reproducible and independent of
# how `kmean` was configured or fitted by earlier cells. n_init is pinned to 10
# so the number of restarts does not depend on the installed scikit-learn
# version's default.
kmean = KMeans(n_clusters=4, random_state=42, n_init=10)
revenue_features = customer_cluster[['Revenue']]
kmean.fit(revenue_features)
customer_cluster['MonetaryCluster'] = kmean.predict(revenue_features)

# Ascending order: the cluster with the smallest mean Revenue becomes 0.
customer_cluster = sort_cluster('MonetaryCluster', 'Revenue', customer_cluster, True)


In [61]:
# overall scoring
# OverallScore is the sum of the three ordered cluster ids; skipna=False keeps
# the plain-addition behaviour of propagating a missing id.
score_columns = ['RecencyCluster', 'FrequencyCluster', 'MonetaryCluster']
overall_score = customer_cluster[score_columns].sum(axis=1, skipna=False)
customer_cluster['OverallScore'] = overall_score

# Segment thresholds: score > 4 -> High-Value, score > 2 -> Mid-Value,
# everything else -> Low-Value. The later mask wins where both apply.
segment = pd.Series('Low-Value', index=customer_cluster.index)
segment = segment.mask(overall_score > 2, 'Mid-Value')
segment = segment.mask(overall_score > 4, 'High-Value')
customer_cluster['Segment'] = segment


In [65]:
# Number of customers in each value segment.
customer_cluster.groupby('Segment')['Segment'].count()

Segment
High-Value     185
Low-Value     1956
Mid-Value     1809
Name: Segment, dtype: int64