In [None]:
import numpy as np
import pandas as pd
import random
pd.set_option('display.max_column', None)

import matplotlib.pyplot as plt
import seaborn as sns

import shap

import scipy.stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier

## Customer Segmentation Analysis

In this project, I will perform a customer segmentation of clients who are regular customers of a fictional product. The scenario is as follows: imagine that this fictitious company wants to launch a new product on the market. Which group of customers is the most suitable target for promoting the product?

The results that I am looking for are:

- Segment the customers in distinct groups
- Explore the characteristics of each group
- Explore why the classification model assigned a customer to his group

The dataset that I will be using is generated by random variables with some expected characteristics that could be provided by a generic company

In [None]:
# Build a synthetic customer table of 1000 rows. The RNG is seeded so the
# same rows are produced on every run.
random.seed(42)
sex = ["Female", "Male"]

# NOTE: the draws inside the dict must stay in this order -- every call
# consumes the shared random stream, so reordering would change the data.
data = [
    {
        'sex': random.choice(sex),
        'age': random.randint(18, 75),
        'monthly_visits': random.randint(1, 45),
        'total_purchase_amount': random.randint(40000, 365000),
        'distinct_products': random.randint(1, 50),
        'monthly_purchases_avg': random.randint(12, 30),
        'avg_purchase_price': random.uniform(105.5, 2500),
    }
    for _ in range(1000)
]

df = pd.DataFrame(data)
# Keep the average price to two decimal places.
df['avg_purchase_price'] = df['avg_purchase_price'].round(2)
df.head()

The dataset contains data about users. Here are the features:

- Age of the user
- Sex of the user
- How many visits in the last month
- Total purchases
- Distinct products bought
- Quantity of products bought last month
- Average purchase price

Since the data was randomly generated, there are no missing values to drop

Let's have a look at the data's stats.

In [None]:
# Display pandas' summary statistics for the freshly generated frame.
df.describe()

In [None]:
# One box plot per numeric feature, laid out on a 2x3 grid.
fig, axs = plt.subplots(ncols=3, nrows=2)

numeric_features = [
    'age', 'monthly_visits', 'total_purchase_amount',
    'distinct_products', 'monthly_purchases_avg', 'avg_purchase_price',
]
# axs.flat walks the grid row by row, so the panel order matches the list.
for panel, feature in zip(axs.flat, numeric_features):
    sns.boxplot(y=df[feature], color='#eea990', ax=panel)

fig.set_size_inches(20, 13)

In [None]:
# Correlation heat map of the numeric features.
# The 'sex' column still holds strings at this point, and DataFrame.corr()
# raises on non-numeric columns in pandas >= 2.0, so only numeric dtypes are
# passed to corr().
plt.figure(figsize=(16, 8))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, cmap='YlGnBu')

Since the data is randomized, a low correlation between the variables and a very evenly distributed dataset are expected

## Data Preprocessing

Data treatment before modeling

1)Encoding categorical variables

2)Standard scaling all values

3)PCA to three dimensions

In [None]:
# One-hot encode the categorical 'sex' column. dtype=int forces 0/1 integer
# indicator columns; recent pandas versions default to bool, which would
# leave the frame with mixed dtypes for the numeric steps that follow.
df = pd.get_dummies(df, dtype=int)

In [None]:
# Preview the first rows of the frame after one-hot encoding.
df.head()

All features are now numeric

Standardize features by removing the mean and scaling to unit variance.

In [None]:
# Fit a standard scaler on the whole frame, then rebuild a DataFrame from the
# scaled values so the original column names are kept.
scaler = StandardScaler().fit(df)

scaled_values = scaler.transform(df)
df_scaled = pd.DataFrame(scaled_values, columns=df.columns)

In [None]:
# Preview the first rows of the standardized frame.
df_scaled.head()

Principal component analysis (PCA).

Linear dimensionality reduction using Singular Value Decomposition of the data to project it to a lower dimensional space. 

In [None]:
# Project the standardized features onto three principal components and keep
# the result as a DataFrame with columns col1..col3.
pca = PCA(n_components=3).fit(df_scaled)

projected = pca.transform(df_scaled)
df_PCA = pd.DataFrame(projected, columns=["col1", "col2", "col3"])
df_PCA.describe()

Visualizing the data before modeling

In [None]:
# 3-D scatter of the three principal components, drawn before clustering.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection="3d")

pc1, pc2, pc3 = (df_PCA[name] for name in ("col1", "col2", "col3"))
ax.scatter(pc1, pc2, pc3, c="red", marker="o")
ax.set_title("Data visualization PCA")

plt.show()

## Clustering

Utilizing a simple K-Means model for clustering

1)Finding k amount of clusters required

2)Adjusting dataset to model

3)Visualize results

In [None]:
# Elbow search for a suitable number of clusters on the PCA projection.
# random_state and n_init are pinned so the curve (and the k read off it) is
# reproducible between runs and across scikit-learn versions, whose default
# n_init has changed over time.
Elbow = KElbowVisualizer(KMeans(n_init=10, random_state=42), k=10)
Elbow.fit(df_PCA)
Elbow.show()

The elbow curve indicates that a good value for k is 6

In [None]:
# Cluster the PCA projection into 6 groups. random_state and n_init are pinned
# so the cluster labels are reproducible; without a seed every run can produce
# a different partition and different label numbering.
AC = KMeans(n_clusters=6, n_init=10, random_state=42)
yhat_AC = AC.fit_predict(df_PCA)

# Store the label of each row next to its principal components.
df_PCA["Clusters"] = yhat_AC

To examine the clusters formed let's have a look at the 3-D distribution of the clusters

In [None]:
# 3-D scatter of the principal components, colored by assigned cluster.
fig = plt.figure(figsize=(10, 8))
ax = plt.subplot(111, projection='3d', label="bla")

pc1, pc2, pc3 = (df_PCA[name] for name in ("col1", "col2", "col3"))
ax.scatter(
    pc1, pc2, pc3,
    c=df_PCA["Clusters"],
    cmap="Accent",
    marker='o',
    s=40,
)
ax.set_title("Visualization of Clusters")

plt.show()

In [None]:
# Bar chart with the number of rows assigned to each cluster.
# sns.countplot returns the Axes it drew on, so the title is set on that.
count_ax = sns.countplot(x=df_PCA["Clusters"], palette="Accent")
count_ax.set_title("Cluster Distribuition")
plt.show()

Let's have a look at the group distribution of the clustering

In [None]:
# Copy the cluster labels onto the unscaled frame (as integers) so the
# original feature values can be inspected per cluster.
cluster_labels = df_PCA["Clusters"]
df["Clusters"] = cluster_labels.astype(int)

In [None]:
# One box plot per numeric feature, split by cluster, on a 2x3 grid.
fig, axs = plt.subplots(ncols=3, nrows=2)

numeric_features = [
    'age', 'monthly_visits', 'total_purchase_amount',
    'distinct_products', 'monthly_purchases_avg', 'avg_purchase_price',
]
# axs.flat walks the grid row by row, so the panel order matches the list.
for panel, feature in zip(axs.flat, numeric_features):
    sns.boxplot(y=df[feature], x=df['Clusters'], color='#eea990', ax=panel)

fig.set_size_inches(20, 13)

In [None]:
# Pairwise plots of the columns in df, colored by the assigned cluster.
sns.pairplot(data = df, hue = "Clusters")

The clusters seem to be fairly distributed

This is a table containing some aggregated data of the generated clusters; we can already see some well-positioned clustering

In [None]:
# Min / max / mean of every column per cluster, transposed so that clusters
# become the columns of the displayed table.
cluster_summary = df.pivot_table(index='Clusters', aggfunc=['min', 'max', 'mean'])
cluster_summary.T

Going back to our problem: the question of which group of customers would be the best fit for the new product that is being created could be answered with the pivot table above. Looking at the data, we can see some patterns in our groups that point to a target for the product.

But we can go even further: what if the product/market team wants to know how one group differs from another? In other words, what are the most important features that impact the classification for each cluster?

## Classification

Utilizing a Random Forest with no pruning to train a model on the K-Means cluster labels. By utilizing a Random Forest, it is possible to apply a SHAP explainer and understand which variables had a bigger impact on the output for each cluster

In [None]:
# Copy the K-Means labels from df_PCA onto df again. This repeats the earlier
# assignment (without the int cast); "Clusters" stays the last column, which
# the positional X / y split below relies on.
df["Clusters"] = df_PCA.loc[:,"Clusters"]

In [None]:
# Positional split: every column except the last is a feature, and the last
# column (the cluster label) is the target.
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

In [None]:
# Hold out 20% of the rows for evaluation; random_state=0 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [None]:
# Train a random forest to reproduce the K-Means cluster labels from the
# original features. random_state is pinned so the fitted forest -- and the
# accuracy and SHAP values derived from it -- are the same on every run.
classifier = RandomForestClassifier(criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)

# Predictions on the held-out rows, used by the evaluation step.
y_pred = classifier.predict(X_test)

In [None]:
# Confusion matrix (printed) and overall accuracy (displayed as the cell
# result) on the held-out rows.
c_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(c_matrix)

accuracy_score(y_true=y_test, y_pred=y_pred)

With the random forest classifier we got an accuracy of 0.875

Applying shap to see the impact of each feature on the classification model

In [None]:
# Explain the fitted forest with SHAP and plot the per-feature impact.
explainer = shap.TreeExplainer(classifier)
# The SHAP values must be computed on the same rows that are passed to
# summary_plot. The original computed them on X (all rows) but plotted
# against X_train (80% of the rows), which is a row-count mismatch.
shap_values = explainer.shap_values(X_train)
shap.summary_plot(shap_values, X_train, feature_names=df.iloc[:, :-1].columns)

With SHAP values we can see, for example, that for Class 3 age has a bigger impact than monthly_visits. With this logic we can analyse the classification and create more impactful insights

## Conclusion

In this project, I performed unsupervised clustering. I used dimensionality reduction followed by K-Means clustering. I came up with 6 clusters and then used them to train a Random Forest Classifier to get a metric score for the K-Means segmentation, and then I applied SHAP values to better understand how each feature impacts the assignment to our clusters.