In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Customer Segmentation

In this project, I perform unsupervised clustering on customer records from a grocery firm's database. Customer segmentation is the practice of separating customers into groups so that the customers within each group (cluster) are similar to one another. Dividing customers into segments lets the business make the most of each customer's value: products can be adapted to the distinct needs and behaviours of each segment, and the business can better cater to the concerns of different types of customers.

# Library import & Dataset

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import warnings
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Load the tab-separated marketing campaign file from the Kaggle input directory.
DATA_PATH = "/kaggle/input/customer-personality-analysis/marketing_campaign.csv"
df = pd.read_csv(DATA_PATH, sep="\t")
df.head()

In [None]:
# Report (rows, columns). The braces were previously outside any f-string, which built
# a one-element set and printed "{(rows, cols)}" instead of the shape itself.
print(f"Dataset Size: {df.shape}")

In [None]:
# Install a global "ignore" filter: from this point on no Python warning is shown,
# including deprecation warnings raised by pandas / seaborn / scikit-learn.
warnings.filterwarnings('ignore')  # ignore notifications


# Features:
 
# **People**
* ID: Customer's unique identifier
* Year_Birth: Customer's birth year
* Education: Customer's education level
* Marital_Status: Customer's marital status
* Income: Customer's yearly household income
* Kidhome: Number of children in customer's household
* Teenhome: Number of teenagers in customer's household
* Dt_Customer: Date of customer's enrollment with the company
* Recency: Number of days since customer's last purchase
* Complain: 1 if the customer complained in the last 2 years, 0 otherwise
# **Products**
* MntWines: Amount spent on wine in last 2 years
* MntFruits: Amount spent on fruits in last 2 years
* MntMeatProducts: Amount spent on meat in last 2 years
* MntFishProducts: Amount spent on fish in last 2 years
* MntSweetProducts: Amount spent on sweets in last 2 years
* MntGoldProds: Amount spent on gold in last 2 years
# **Promotion**
* NumDealsPurchases: Number of purchases made with a discount
* AcceptedCmp1: 1 if customer accepted the offer in the 1st campaign, 0 otherwise
* AcceptedCmp2: 1 if customer accepted the offer in the 2nd campaign, 0 otherwise
* AcceptedCmp3: 1 if customer accepted the offer in the 3rd campaign, 0 otherwise
* AcceptedCmp4: 1 if customer accepted the offer in the 4th campaign, 0 otherwise
* AcceptedCmp5: 1 if customer accepted the offer in the 5th campaign, 0 otherwise
* Response: 1 if customer accepted the offer in the last campaign, 0 otherwise
# **Place**
* NumWebPurchases: Number of purchases made through the company’s website
* NumCatalogPurchases: Number of purchases made using a catalogue
* NumStorePurchases: Number of purchases made directly in stores
* NumWebVisitsMonth: Number of visits to company’s website in the last month

In [None]:
# Column dtypes and non-null counts of the raw data.
df.info()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Count of missing values per column.
df.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

# Feature Generation #

In [None]:
# Client age in years, measured against the fixed reference year 2023
# (a constant, so the value does not change with the date the notebook is run).
df["Age"] = 2023 - df["Year_Birth"]

In [None]:
def get_different_year(data=None, current_year=None):
    """Return the number of calendar years between enrollment and a reference year.

    Parameters
    ----------
    data : pd.DataFrame, optional
        Frame holding a ``Dt_Customer`` column of ``dd-mm-YYYY`` date strings.
        Defaults to the notebook-level ``df``, which keeps the original
        zero-argument call working.
    current_year : int, optional
        Reference year. Defaults to the year of ``datetime.now()``, so the result
        depends on when the notebook is run; pass a fixed year for reproducible values.

    Returns
    -------
    pd.Series
        ``current_year - enrollment year`` for every row, indexed like ``data``.
    """
    if data is None:
        data = df
    if current_year is None:
        current_year = datetime.now().year
    # Vectorised year extraction instead of a per-element lambda.
    registration_year = pd.to_datetime(data['Dt_Customer'], format='%d-%m-%Y').dt.year
    return current_year - registration_year

df['Years_Since_Registration'] = get_different_year()  # Calendar years between the enrollment year (Dt_Customer) and the current year

In [None]:
# Frequency of each education label (inspected before encoding it below).
df["Education"].value_counts()

In [None]:
# Ordinal encoding of the education level: 0 = Basic, 1 = Graduation,
# 2 = "2n Cycle" or Master, 3 = PhD.
# The explicit astype pins an integer dtype instead of relying on replace() silently
# downcasting an object column (behaviour pandas has deprecated); it also fails loudly
# if a label missing from the mapping is still present.
df["Education"] = (
    df["Education"]
    .replace({"Basic": 0, "Graduation": 1, "2n Cycle": 2, "Master": 2, "PhD": 3})
    .astype("int64")
)

In [None]:
# Frequency of each marital-status label (inspected before encoding it below).
df["Marital_Status"].value_counts()

In [None]:
# Recode marital status as the number of adults in the household:
# 2 for partnered customers (Married, Together), 1 for every other label.
# The explicit astype pins an integer dtype instead of relying on replace() silently
# downcasting an object column (behaviour pandas has deprecated); Family_Size adds this
# column to integer counts, so it must be numeric.
df["Marital_Status"] = (
    df["Marital_Status"]
    .replace({
        "Married": 2, "Together": 2,
        "Single": 1, "Divorced": 1, "Widow": 1, "Alone": 1, "Absurd": 1, "YOLO": 1,
    })
    .astype("int64")
)

In [None]:
# Total number of people in the family: adults + children + teenagers.
# Marital_Status was recoded above to the number of adults (2 for Married/Together,
# 1 otherwise), which is why it can be added directly to the child counts.
df["Family_Size"] = df["Marital_Status"] + df["Kidhome"] + df["Teenhome"]


In [None]:
# Total amount spent across the six product categories.
df["Sum_Mnt"] = (
    df["MntWines"]
    + df["MntFruits"]
    + df["MntMeatProducts"]
    + df["MntFishProducts"]
    + df["MntSweetProducts"]
    + df["MntGoldProds"]
)

In [None]:
# Number of campaigns in which the client accepted the offer:
# the five AcceptedCmp flags plus Response (the last campaign).
df['Num_Accepted_Cmp'] = (
    df["AcceptedCmp1"]
    + df["AcceptedCmp2"]
    + df["AcceptedCmp3"]
    + df["AcceptedCmp4"]
    + df["AcceptedCmp5"]
    + df["Response"]
)


In [None]:
# Total number of purchases: web + catalogue + store + discounted (deal) purchases.
df['Num_Total_Purchases'] = (
    df["NumWebPurchases"]
    + df["NumCatalogPurchases"]
    + df["NumStorePurchases"]
    + df["NumDealsPurchases"]
)

# Let's look at the correlation matrix:
# 

In [None]:
# Columns shown in the correlation heatmap: raw attributes plus the engineered features.
to_corr = ["Age", "Education", "Marital_Status", "Income", "Kidhome", "Teenhome", "Years_Since_Registration",
           "Recency", "MntWines", "MntFruits", "MntMeatProducts", "MntFishProducts", "MntSweetProducts",
           "MntGoldProds", "NumDealsPurchases", "NumWebPurchases", "NumCatalogPurchases", "NumStorePurchases",
           "NumWebVisitsMonth", "AcceptedCmp3", "AcceptedCmp1", "AcceptedCmp2", "AcceptedCmp4", "AcceptedCmp5",
           "Complain", "Response", "Num_Total_Purchases", "Num_Accepted_Cmp", "Sum_Mnt", "Family_Size"]

# Compute the correlation matrix once and reuse it for both the mask and the plot.
corr = df[to_corr].corr()

# Boolean mask hiding the upper triangle (diagonal included). It is built from an
# all-True array rather than from the correlation values themselves, so a coefficient
# that happens to be exactly 0 is hidden like every other upper-triangle cell.
matrix = np.triu(np.ones_like(corr, dtype=bool))

# cmap stays a notebook-level name: the 3D cluster plot further down reuses it.
cmap = sns.diverging_palette(220, 10, as_cmap=True)
plt.figure(figsize=(25, 14))
plt.title('Correlation matrix', fontsize=18)
sns.heatmap(corr, annot=True,
            fmt='.1f', vmin=-0.4, center=0, cmap=cmap, mask=matrix)

plt.show()

# Look at the distributions of quantitative variables:
# 

In [None]:
# Pairwise scatter plots / distributions of the main quantitative features.
# Marital_Status is in the list only so it can be used as the colour (hue) variable;
# at this point it holds the 1/2 encoding assigned earlier.
to_plot = ['Income', 'Recency', 'Age', 'Years_Since_Registration', 'Sum_Mnt',
           'Num_Total_Purchases', 'Marital_Status']
sns.pairplot(df[to_plot], hue='Marital_Status', palette='Set1')
plt.show()

Clearly, there are a few outliers in the Income and Age features. I will be deleting the outliers in the data.

# Data Cleaning

In [None]:
# Remove every row that has a missing value in any column
# (per the author's note, the gaps being removed are in Income).
df = df.dropna()

In [None]:
def remove_outliers(data: pd.DataFrame, column: str, factor: float = 1.5) -> pd.DataFrame:
    """Drop the rows whose ``column`` value lies outside the IQR fences.

    Parameters
    ----------
    data : pd.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the numeric column to test.
    factor : float, default 1.5
        Multiple of the inter-quartile range used for the fences.

    Returns
    -------
    pd.DataFrame
        An independent copy of the rows with
        ``q1 - factor * iqr < value < q3 + factor * iqr`` (both bounds exclusive).
        Rows where ``column`` is NaN fail both comparisons and are dropped as well.
    """
    # Quartiles ignore NaN so missing values do not distort the fences.
    q1, q3 = np.nanpercentile(data[column], [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    within_fences = (data[column] > lower_bound) & (data[column] < upper_bound)
    # Return a real copy: callers go on to add/drop columns on the result, which would
    # otherwise operate on a slice of the original frame.
    return data[within_fences].copy()

# Filter on Age first, then on Income; the Income bounds are therefore computed
# on the rows that survived the Age filter.
df = remove_outliers(df, 'Age')
df = remove_outliers(df, "Income")

In [None]:
# Drop columns that should not enter the clustering: the customer ID, the raw fields
# already replaced by engineered features (Year_Birth -> Age, Dt_Customer ->
# Years_Since_Registration) and Z_CostContact / Z_Revenue (not described in the feature
# list above; presumably bookkeeping columns — confirm against the dataset).
# A new frame is assigned instead of dropping in place, because df at this point is
# the result of row filtering.
df = df.drop(columns=["Year_Birth", "ID", "Z_CostContact", "Z_Revenue", "Dt_Customer"])

In [None]:
# First rows of the cleaned frame.
df.head()

In [None]:
# (rows, columns) after cleaning.
df.shape

In [None]:
# Dtypes and non-null counts after cleaning.
df.info()

# Data scaling (Normalization | Standardization)

Data scaling is necessary to bring all features to the same scale. If this is not done, the algorithm's attention will be drawn to the features with large values, which is undesirable because those features would dominate the result regardless of their importance.
In this work, normalization is used for data scaling: as a result, all features lie in the range from 0 to 1.

In [None]:
def scaling_func(df):
    """Min-max scale every column of ``df``.

    A fresh ``MinMaxScaler`` is fitted on ``df`` and applied to it.

    Parameters
    ----------
    df : pd.DataFrame
        Frame of numeric columns to scale.

    Returns
    -------
    pd.DataFrame
        Scaled values with the same column names and the same row index as ``df``.
        Previously the index was reset to 0..n-1 and had to be restored by the caller.
    """
    mms = MinMaxScaler()
    return pd.DataFrame(data=mms.fit_transform(df), columns=df.columns, index=df.index)


df_scaled = scaling_func(df)

# Give the scaled frame the same row labels as df, so rows of the two frames
# can be matched by index.
df_scaled.index = df.index



# Dimensionality reduction
# 

**The dimensionality reduction problem is used in the following situations:**
* There are a lot of features in the dataset, and we want to reduce their number, leaving as much information as possible
* We have many features in the dataset, and we want to visualize the data (for example, in 3D space)
* Solving the problem of multicollinearity

There are many different options for dimensionality reduction, which are based on different methods, for example:

* Principal component analysis (PCA)
* Uniform Manifold Approximation and Projection (UMAP)
* t-distributed Stochastic Neighbor Embedding (t-SNE)
* Locally-Linear Embedding (LLE)
* Multidimensional Scaling (MDS)

The most commonly used dimensionality reduction algorithm is PCA, and we also use it in this work.

Essentially, in PCA we make a transition from one variable space to another, with the new space containing fewer variables (n_component), where the new variables are uncorrelated and are the weighted sum of the old variables.

As a result we get m variables: {PC1, PC2, PC3... PCm} , where PC1 will receive the most information(maximum sample variance), PC2 - less, and so on (A variable is considered informative if it has a high sample variance).

3 components will be used in this work.

In [None]:
# Fit PCA on the scaled features and project them onto the first 3 principal components.
pca = PCA(n_components=3).fit(df_scaled)
PCA_ds = pd.DataFrame(pca.transform(df_scaled), columns=["col1", "col2", "col3"])
PCA_ds.describe().T

In [None]:
# A 3D projection of the data in the reduced dimension.
# x, y and z stay notebook-level names: the cluster plot further down reuses them.
x, y, z = (PCA_ds[component] for component in ("col1", "col2", "col3"))

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection="3d")
ax.scatter(x, y, z, c="maroon", marker="o")
ax.set_title("A 3D Projection Of Data In The Reduced Dimension")
plt.show()

# Clustering

* Feature Generation
* Data Cleaning
* Data Scaling (normalization)
* Dimensionality Reduction (PCA)
* Clustering 

Now that I have reduced the attributes to three dimensions, I will be performing clustering via Agglomerative clustering. Agglomerative clustering is a hierarchical clustering method. It involves merging examples until the desired number of clusters is achieved.

**Steps involved in the Clustering**

* Elbow Method to determine the number of clusters to be formed
* Clustering via Agglomerative Clustering
* Examining the clusters formed via scatter plot

In [None]:
print('Elbow Method to determine the number of clusters to be formed:')

# A fixed random_state makes the elbow curve, and the k read off it, the same on every
# run; KMeans initialisation is random otherwise.
# Only the three PCA components are passed, so re-running this cell after the
# clustering cell does not pick up the "Clusters" label column as a feature.
Elbow_M = KElbowVisualizer(KMeans(random_state=42), k=10)
Elbow_M.fit(PCA_ds[["col1", "col2", "col3"]])
Elbow_M.show()

The above cell indicates that five will be an optimal number of clusters for this data. Next, we will be fitting the Agglomerative Clustering Model to get the final clusters.

In [None]:
# Agglomerative (hierarchical) clustering into five groups.
AC = AgglomerativeClustering(n_clusters=5)
# Fit on the three principal components only. Selecting them explicitly keeps this cell
# idempotent: on a re-run PCA_ds already contains the "Clusters" column added below,
# and fitting on the whole frame would feed the old labels back in as a fourth feature.
yhat_AC = AC.fit_predict(PCA_ds[["col1", "col2", "col3"]])
PCA_ds["Clusters"] = yhat_AC
# Add the cluster labels to the original dataframe as well. PCA_ds was built row by row
# from df_scaled, which in turn was built from df, so the label order matches df's rows.
df["Clusters"] = yhat_AC

To examine the clusters formed let's have a look at the 3-D distribution of the clusters.

In [None]:
# Plot the clusters in the 3D PCA space.
# x, y, z are the component columns extracted in the projection cell above, and cmap is
# the diverging palette created in the correlation-matrix cell; points are coloured by
# their cluster label.
fig = plt.figure(figsize=(10,8))
ax = plt.subplot(111, projection='3d', label="bla")
ax.scatter(x, y, z, s=40, c=PCA_ds["Clusters"], marker='o', cmap = cmap )
ax.set_title("The Plot Of The Clusters")
plt.show()

In [None]:
# Number of customers in each cluster.
# pal holds five colours (one per cluster) and is reused by the later cluster plots.
pal = ["#682F2F","#B9C0C9", "#9F8A78","#F3AB60", "#E75702"]
pl = sns.countplot(x=df["Clusters"], palette= pal)
pl.set_title("Distribution Of The Clusters")
plt.show()

In [None]:
# Income against total spending, one colour per cluster.
pl = sns.scatterplot(data=df, x="Sum_Mnt", y="Income", hue="Clusters", palette=pal)
pl.set_title("Cluster's Profile Based On Income And Spending")
plt.legend()
plt.show()

Income vs spending plot shows the clusters pattern

group 0:low spending & low income

group 1: high spending & high income

group 2: low spending & low income

group 3: high spending & high income

group 4:high dispersion

Next, I will be looking at the detailed distribution of clusters as per the various products in the data. Namely: Wines, Fruits, Meat, Fish, Sweets and Gold

In [None]:
# Total spending per cluster: the individual customers are drawn first as faint swarm
# points, then the boxen plot of the same data is drawn on top of them.
plt.figure()
pl=sns.swarmplot(x=df["Clusters"], y=df["Sum_Mnt"], color= "#CBEDDD", alpha=0.5 )
pl=sns.boxenplot(x=df["Clusters"], y=df["Sum_Mnt"], palette=pal)
plt.show()

From the above plot, it can be clearly seen that cluster 1 is our biggest set of customers 

Let us next explore how did our campaigns do in the past.



In [None]:
# Num_Accepted_Cmp was created in the feature-generation section: it sums the five
# AcceptedCmp flags and Response.
# Plot how many customers accepted each number of offers, split by cluster.

plt.figure()
pl = sns.countplot(x=df["Num_Accepted_Cmp"], hue=df["Clusters"], palette=pal)
pl.set_title("Count Of Promotion Accepted")
pl.set_xlabel("Number Of Total Accepted Promotions")
plt.show()

There has not been an overwhelming response to the campaigns so far: very few customers accepted any offer, and no one took part in all of them. Perhaps better-targeted and well-planned campaigns are required to boost sales.

In [None]:
# Distribution of the number of discounted (deal) purchases within each cluster.
plt.figure()
pl = sns.boxenplot(x=df["Clusters"], y=df["NumDealsPurchases"], palette=pal)
pl.set_title("Number of Deals Purchased")
plt.show()

Unlike the campaigns, the deals offered did well. They had the best outcome with cluster 2. However, our star customers (cluster 1) are not much into the deals.

# PROFILING #

Now that we have formed the clusters and looked at their purchasing habits, let us see who is in these clusters. For that, we will profile the clusters and come to a conclusion about who our star customers are and who needs more attention from the retail store's marketing team.

To decide that I will be plotting some of the features that are indicative of the customer's personal traits in light of the cluster they are in. On the basis of the outcomes, I will be arriving at the conclusions.

In [None]:
# Personal attributes to profile against total spending, one joint KDE plot per attribute.
personal = ["Education", "Marital_Status", "Income", "Kidhome", "Teenhome", "Age", "Family_Size"]

# sns.jointplot is a figure-level function: it creates its own figure. The plt.figure()
# call that used to precede it therefore only produced an extra, empty figure on every
# iteration and has been removed.
for i in personal:
    sns.jointplot(x=df[i], y=df["Sum_Mnt"], hue=df["Clusters"], kind="kde", palette=pal)
    plt.show()

CONCLUSION

In this project, I performed unsupervised clustering. I did use dimensionality reduction followed by agglomerative clustering. I came up with 5 clusters and further used them in profiling customers in clusters according to their family structures and income/spending. This can be used in planning better marketing strategies.

# As a result of cluster analysis, we received five groups of buyers (clusters):
# 

about cluster 0:

* most of them are parents
* at the max have 4 members in the family
* most have a teenager at home
* lower than average income

about cluster 1:

* most of them are single
* at the max have 3 members in the family
* a high-income group

about cluster 2:

* a lower-income group
* they are parents
* at the max have 5 members in the family
* lower-than-average purchases

about cluster 3:

* a high-income group
* higher-than-average income
* higher-than-average purchases
* most of them are parents

about cluster 4:

* Most of the households in this group have 1 to 3 members
* higher-than-average income
* higher-than-average purchases