In [23]:
import numpy as np
import pandas as pd
import datetime
from datetime import date
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler, normalize
from sklearn import metrics
from sklearn.mixture import BayesianGaussianMixture
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import warnings

warnings.filterwarnings('ignore')
pd.set_option('display.max.columns', None)


In [24]:
def wrangle_data(data_path):
    data = pd.read_csv(data_path, header = 0, sep=';')

    # Spending variable creation
    data['Age'] = 2014-data['Year_Birth']

    data['Spending'] = data['MntWines'] + data['MntFruits'] + data['MntMeatProducts'] + data['MntFishProducts'] + data['MntSweetProducts'] + data['MntGoldProds']

    # Seniority variable creation
    last_date = date(2014,10, 4)
    # Take data from Dt_Customer, then subtract: Last Date - Dt_Customer to get Seniority data
    data['Seniority'] = pd.to_datetime(data['Dt_Customer'], dayfirst=True, format = '%Y-%m-%d')
    data['Seniority'] = pd.to_numeric(data['Seniority'].dt.date.apply(lambda x:(last_date - x)).dt.days, downcast='integer')/30
    data = data.rename(columns = {'NumWebPurchases': 'Web', 'NumCatalogPurchases' : 'Catalog', 'NumStorePurchases':'Store'})
    # This could have been changed into Married? then True/ False data, I think it is more easy
    data['Marital_Status'] = data['Marital_Status'].replace({'Divorced':'Alone','Single':'Alone','Married':'In couple','Together':'In couple','Absurd':'Alone','Widow':'Alone','YOLO':'Alone'})
    data['Education']=data['Education'].replace({'Basic':'Undergraduate','2n Cycle':'Undergraduate','Graduation':'Postgraduate','Master':'Postgraduate','PhD':'Postgraduate'})
    # Calculate Children data, create Has_child column and replace datq
    data['Children']=data['Kidhome']+data['Teenhome']
    data['Has_child'] = np.where(data.Children> 0, 'Has child', 'No child')
    data['Children'].replace({3: "3 children",2:'2 children',1:'1 child',0:"No child"},inplace=True)
    # Rename columns and retrieve new data
    data=data.rename(columns={'MntWines': "Wines",'MntFruits':'Fruits','MntMeatProducts':'Meat','MntFishProducts':'Fish','MntSweetProducts':'Sweets','MntGoldProds':'Gold'})
    data=data[['Age','Education','Marital_Status','Income','Spending','Seniority','Has_child','Children','Wines','Fruits','Meat','Fish','Sweets','Gold']]
    # Remove outlier
    data = data.dropna(subset=['Income'])
    data = data[data['Income']<600000]

    return data

In [25]:
data = wrangle_data('https://raw.githubusercontent.com/amankharwal/Website-data/master/marketing_campaign.csv')

Unnamed: 0,Age,Education,Marital_Status,Income,Spending,Seniority,Has_child,Children,Wines,Fruits,Meat,Fish,Sweets,Gold
0,57,Postgraduate,Alone,58138.0,1617,25.333333,No child,No child,635,88,546,172,88,88
1,60,Postgraduate,Alone,46344.0,27,7.000000,Has child,2 children,11,1,6,2,1,6
2,49,Postgraduate,In couple,71613.0,776,13.633333,No child,No child,426,49,127,111,21,42
3,30,Postgraduate,In couple,26646.0,53,7.866667,Has child,1 child,11,4,20,10,3,5
4,33,Postgraduate,In couple,58293.0,422,8.600000,Has child,1 child,173,43,118,46,27,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,47,Postgraduate,In couple,61223.0,1341,15.933333,Has child,1 child,709,43,182,42,118,247
2236,68,Postgraduate,In couple,64014.0,444,3.866667,Has child,3 children,406,0,30,0,0,8
2237,33,Postgraduate,Alone,56981.0,1241,8.400000,No child,No child,908,48,217,32,12,24
2238,58,Postgraduate,In couple,69245.0,843,8.433333,Has child,1 child,428,30,214,80,30,61


The clients will be segmented into 4 equally weighted customer segments:
1. **Star**: Old customers with high income and high spending nature
2. **Need Attention**: New customers with below-average income, low spending nature
3. **High Potential**: New customers with high income, high spending nature
4. **Leaky Bucket**: Old customers with below-average income, low spending nature

In [58]:
scaler = StandardScaler()
dataset_temp = data[['Income','Seniority','Spending']]
X_std = scaler.fit_transform(dataset_temp)
X = normalize(X_std,norm='l2')

gmm = BayesianGaussianMixture(n_components=4, covariance_type='spherical',max_iter=2000, random_state=5).fit(X)
labels = gmm.predict(X)
dataset_temp['Cluster'] = labels
dataset_temp = dataset_temp.replace({0:'Stars',1:'Need attention',2:'High potential',3:'Leaky bucket'})
data = data.merge(dataset_temp.Cluster, left_index=True, right_index=True)

pd.options.display.float_format = "{:.0f}".format
summary = dataset_temp[['Income','Spending','Seniority','Cluster']]
summary.set_index('Cluster', inplace = True)
summary = summary.groupby('Cluster').describe().transpose()
summary.head()

Unnamed: 0,Cluster,High potential,Leaky bucket,Need attention,Stars
Income,count,599,626,526,464
Income,mean,34822,37706,69558,73412
Income,std,12124,12354,12005,13742
Income,min,2447,1730,44802,49090
Income,25%,26488,28882,60894,65274


In [34]:
scaler=StandardScaler()
dataset_temp=data[['Income','Seniority','Spending']]
X_std=scaler.fit_transform(dataset_temp)
X = normalize(X_std,norm='l2')

gmm=BayesianGaussianMixture(n_components=4, covariance_type='spherical',max_iter=2000, random_state=5).fit(X)
labels = gmm.predict(X)
dataset_temp['Cluster'] = labels
dataset_temp=dataset_temp.replace({0:'Stars',1:'Need attention',2:'High potential',3:'Leaky bucket'})
data = data.merge(dataset_temp.Cluster, left_index=True, right_index=True)

pd.options.display.float_format = "{:.0f}".format
summary=data[['Income','Spending','Seniority','Cluster']]
summary.set_index("Cluster", inplace = True)
summary=summary.groupby('Cluster').describe().transpose()
summary.head()

Unnamed: 0,Cluster,High potential,Leaky bucket,Need attention,Stars
Income,count,599,626,526,464
Income,mean,34822,37706,69558,73412
Income,std,12124,12354,12005,13742
Income,min,2447,1730,44802,49090
Income,25%,26488,28882,60894,65274


In [None]:
PLOT = go.Figure()
for C in list(data.Cluster.unique()):
    PLOT.add_trace(go.Scatter3d(
        x = data[data.Cluster == C]['Income'],
        y = data[data.Cluster == C]['Seniority'],
        z = data[data.Cluster == C]['Spending'],
        mode = 'markers',marker_size = 6, marker_line_width = 1,
        name = str(C)
    ))

PLOT.update_traces(hovertemplate='Income: %{x} <br>Seniority: %{y} <br>Spending: %{z}')
