In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the raw marketing-campaign data from a CSV in the working directory.
df = pd.read_csv('marketing_campaign_data.csv')
# Bare last expression so the notebook renders the frame.
df

* ID: Unique identifier for each customer record in the dataset.

* Year_Birth: The customer's birth year.

* Marital_Status: The customer's marital status.

* Income: The customer's annual income.

* Kidhome: Number of children under 12 years of age in the customer's household.

* Teenhome: Number of teenagers aged 12-18 in the customer's household.

* Dt_Customer: The date the customer joined or became a customer.

* Recency: The number of days since the customer last interacted with the company.

* MntCoke: The amount of money the customer spent on Coca-Cola drinks.

* MntFruits: The amount of money the customer spent on fruits.

* MntMeatProducts: The amount of money the customer spent on meat products.

* MntFishProducts: The amount of money the customer spent on fish products.

* MntSweetProducts: The amount of money the customer spent on sweet food products.

* MntGoldProds: The amount of money the customer spent on gold or jewelry products.

* NumDealsPurchases: Number of customer purchases with special offers or discounts.

* NumWebPurchases: The number of customer purchases through the company's website.

* NumCatalogPurchases: The number of customer purchases through the company catalog.

* NumStorePurchases: The number of customer purchases through the company's physical stores.

* NumWebVisitsMonth: The number of customer visits to the company's website in a month.

* AcceptedCmp3: Whether the customer accepted the promotional offer of campaign 3.

* AcceptedCmp4: Whether the customer accepted the promotional offer of campaign 4.

* AcceptedCmp5: Whether the customer accepted the promotional offer of campaign 5.

* AcceptedCmp1: Whether the customer accepted the promotional offer of campaign 1.

* AcceptedCmp2: Whether the customer accepted the promotional offer of campaign 2.

* Complain: Did the customer file a complaint.

* Z_CostContact: The cost the company incurs for each contact with a customer.

* Z_Revenue: The revenue a company generates from each contact with a customer.

* Response: Whether the customer responded positively to the company's offer or campaign.

## EDA

### Feature Engineering

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
df.info()

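Feature engineering: derive age, number of children, total expenses, total transactions, income per transaction, visit frequency, conversion rate, and the number of accepted campaigns.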

In [None]:
# Derived customer-level features.

# Age is measured against a fixed reference year (a plain scalar, so the
# subtraction broadcasts over the whole column).
REFERENCE_YEAR = 2023
df['Age'] = REFERENCE_YEAR - df['Year_Birth']
df['Number_of_children'] = df['Kidhome'] + df['Teenhome']

spend_cols = ['MntCoke', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
              'MntSweetProducts', 'MntGoldProds']
purchase_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
                 'NumStorePurchases']
campaign_cols = ['AcceptedCmp3', 'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp4',
                 'AcceptedCmp5']

# skipna=False keeps the semantics of a chained `+`: a missing component
# makes the total missing instead of being silently treated as 0.
df['Total_Expenses'] = df[spend_cols].sum(axis=1, skipna=False)
df['Total_Transactions'] = df[purchase_cols].sum(axis=1, skipna=False)
df['Visit_Frequency'] = df['NumWebVisitsMonth'] + df['Recency']
df['Campaign_Interactions'] = df[campaign_cols].sum(axis=1, skipna=False)

# Ratios: a zero denominator is turned into NaN first, so the result is
# "missing" rather than +/-inf (inf breaks correlation, scaling and PCA).
df['Income_per_Transaction'] = df['Income'] / df['Total_Transactions'].replace(0, np.nan)
df['conversion_rate'] = df['Response'] / df['Visit_Frequency'].replace(0, np.nan)

df[['Age', 'Number_of_children', 'Total_Expenses', 'Total_Transactions', 'Income_per_Transaction', 'Visit_Frequency', 'conversion_rate',
    'Campaign_Interactions']]

Counting and classifying 'age' into several groups

In [None]:
# Bucket 'Age' into labelled bands.
# With right=False every bin is half-open [lower, upper), so the edges must be
# 30/40/50/60 for age 29 to land in '20-29' and age 59 in '50-59'.
# NOTE: the first bin starts at 0, so anyone under 20 is also labelled '20-29'.
intervals = [0, 30, 40, 50, 60, np.inf]
labels = ['20-29', '30-39', '40-49', '50-59', '60+']
df['Range_Age'] = pd.cut(df['Age'], bins=intervals, labels=labels, right=False)

# Show the new column next to the features it will be plotted against.
df[['Age', 'Range_Age', 'conversion_rate', 'Campaign_Interactions']]


## Visualization

In [None]:
# Conversion rate per age band, one line per number of accepted campaigns.
# (seaborn is imported as `sns` in the notebook's import cell.)
ax = sns.lineplot(x='Range_Age', y='conversion_rate',
                  hue='Campaign_Interactions',
                  data=df)
ax.set_title('Conversion rate by age band and accepted campaigns')


In [None]:
# Re-check dtypes and non-null counts now that the derived columns exist.
df.info()

In [None]:
# Column groups used by the exploratory plots and the outlier filter below.

# Discrete features shown as count plots. 'Income_per_Transaction' is a
# continuous ratio, so it is deliberately not listed here: a count plot of it
# would draw one bar per distinct value.
Categorical = ['Education', 'Marital_Status', 'Range_Age']
# Numeric features shown as box/KDE plots and used for IQR outlier filtering.
Numericals = ['Year_Birth', 'Income', 'Kidhome', 'Teenhome', 'Recency', 'Complain', 'Response', 'Age',
              'Number_of_children', 'Total_Expenses', 'Total_Transactions', 'Visit_Frequency', 'conversion_rate','Campaign_Interactions']
# Amount spent per product category.
Mnt = ['MntCoke', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
# Purchase / visit counters per channel.
Num = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth']
# Per-campaign acceptance flags.
campg = ['AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2']


In [None]:

# Box plot of every numeric feature, five panels per row.
# The row count is derived from the number of features so the figure holds
# exactly as many rows as are drawn (no large blank area below the plots).
n_cols = 5
n_rows = -(-len(Numericals) // n_cols)  # ceiling division
plt.figure(figsize=(12, 3 * n_rows))
for i, col in enumerate(Numericals):
    plt.subplot(n_rows, n_cols, i + 1)
    sns.boxplot(y=df[col], color='blue', orient='v')
# Lay the panels out once, after all of them exist.
plt.tight_layout()

In [None]:
# Kernel density estimate of every numeric feature, four panels per row.
# Deriving the row count keeps the grid valid if `Numericals` grows past 16.
n_cols = 4
n_rows = -(-len(Numericals) // n_cols)  # ceiling division
plt.figure(figsize=(20, 10))
for i, col in enumerate(Numericals):
    plt.subplot(n_rows, n_cols, i + 1)
    sns.kdeplot(x=df[col], color='green')
# Lay the panels out once, after all of them exist.
plt.tight_layout()


In [None]:
# One count plot per categorical feature, each on its own wide figure.
for cat_col in Categorical:
    plt.figure(figsize=(15, 5))
    sns.countplot(x=df[cat_col], data=df, color='green')

In [None]:
# Pairwise correlation heatmap.
# Only numeric columns are correlated: the frame also holds string and
# categorical columns, which `DataFrame.corr` rejects on pandas >= 2.0.
plt.figure(figsize=(20,8))
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), annot=True, fmt='.2f', cmap='YlGnBu')

#### Handle null and duplicate data

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Remove outliers with the IQR rule, because most features are not normally distributed.
def iqr_inlier_mask(frame, columns, k=1.5):
    """Return a boolean Series that is True for rows passing the IQR rule.

    A row is kept only if, for every column in `columns`, its value is finite
    (not NaN / inf) and lies within [Q1 - k*IQR, Q3 + k*IQR].

    Columns whose IQR is not positive (e.g. binary flags where at least 75% of
    the rows share one value) are exempt from the range test: applying it
    there would discard every row of the minority class.
    """
    keep = pd.Series(True, index=frame.index)
    for col in columns:
        values = frame[col]
        finite = np.isfinite(values)
        # Rows with a missing or infinite value never pass.
        keep &= finite

        q1 = values[finite].quantile(0.25)
        q3 = values[finite].quantile(0.75)
        iqr = q3 - q1
        if not iqr > 0:
            # Zero (or undefined) spread: the rule carries no information here.
            continue
        keep &= values.between(q1 - k * iqr, q3 + k * iqr)
    return keep


print(f'Number of rows before filtering outliers: {len(df)}')

# .copy() makes the filtered frame independent of the original, so later
# column assignments do not raise SettingWithCopyWarning.
df = df[iqr_inlier_mask(df, Numericals)].copy()

print(f'Number of rows after filtering outliers: {len(df)}')

In [None]:
# Fill remaining gaps and fix dtypes.
# `assign` builds a new frame instead of writing into `df`, which at this
# point is a filtered slice (in-place column assignment on it triggers
# SettingWithCopyWarning).
df = df.assign(
    Income=df['Income'].fillna(0).astype(int),
    # Stored as text; it is label-encoded together with the categorical columns later.
    Income_per_Transaction=df['Income_per_Transaction'].fillna(0).astype(str),
)

df.isnull().sum()

In [None]:
# Count rows that exactly duplicate an earlier row across all columns.

df.duplicated().sum()

Delete unnecessary data

In [None]:
# Remove the two Z_* columns in a single call, then confirm the remaining schema.
df = df.drop(columns=['Z_CostContact', 'Z_Revenue'])

df.info()

Performing feature encoding

In [None]:
# Integer-encode the categorical columns into a separate frame, `df_enc`.
# The [1:] slice skips the first entry ('Education'), which therefore keeps
# its original values in `df_enc`.

Categorical = ['Education','Marital_Status', 'Dt_Customer','Income_per_Transaction', 'Range_Age']
encoded_codes = {
    column: df[column].astype('category').cat.codes
    for column in Categorical[1:]
}
# `assign` returns a new frame, so `df` itself is left untouched.
df_enc = df.assign(**encoded_codes)

In [None]:
# Preview the first rows of the encoded frame.
df_enc.head()

Standardization of features

In [None]:
# Build the modelling frame by removing identifiers, raw inputs of derived
# features, and the non-numeric columns.
columns_to_drop = [
    'Unnamed: 0', 'Year_Birth', 'ID', 'Education', 'Marital_Status',
    'Kidhome', 'Teenhome', 'Dt_Customer', 'Recency',
    'MntCoke', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
    'MntSweetProducts', 'MntGoldProds',
    'Number_of_children', 'Income_per_Transaction', 'Range_Age',
]
df_new = df.drop(columns=columns_to_drop).copy()
df_new

In [None]:
# Columns that remain as model features.
df_new.columns

In [None]:
# Standardise the model features with StandardScaler (imported in the
# notebook's import cell).
# The feature names are read from `df_new` itself so they always match the
# column order of the array being scaled.
feature = list(df_new.columns)
X = df_new[feature].values

X_std = StandardScaler().fit_transform(X)
# Keep every row: the summary below must describe the full scaled data set.
df_std = pd.DataFrame(data=X_std, columns=feature)
df_std.describe()

In [None]:
# Display the standardised feature frame.
df_std

In [None]:
# Project the standardised features onto four principal components.
from sklearn.decomposition import PCA

pca_model = PCA(n_components=4)
pcs = pca_model.fit_transform(X_std)
pc_names = ['pc' + str(k) for k in range(1, 5)]
df_pca = pd.DataFrame(data=pcs, columns=pc_names)
df_pca.describe()

#### Finding the optimal number of clusters using the elbow method

In [None]:
# Within-cluster sum of squares (inertia) for k = 1..10 clusters.
inertia = []

for k in range(1, 11):
    # `fit` returns the fitted estimator, so construction and fitting chain.
    kmeans = KMeans(
        n_clusters=k,
        init='k-means++',
        max_iter=300,
        n_init=10,
        random_state=42,
    ).fit(df_pca.values)
    inertia.append(kmeans.inertia_)

In [None]:
# Elbow curve: inertia against the number of clusters, with a marker at each k.
k_values = range(1, 11)
sns.lineplot(x=k_values, y=inertia, linewidth=3)
sns.scatterplot(x=k_values, y=inertia, s=60)

In [None]:
# Percentage drop in inertia from each k to the next (the last entry is NaN
# because there is no following value).
inertia_series = pd.Series(inertia)
(inertia_series - inertia_series.shift(-1)) / inertia_series * 100

clustering implementation using k-means clustering

In [None]:
# Fit the final 3-cluster model on the principal components only.
# Selecting the pc columns by name keeps this cell idempotent: on a re-run
# the 'labels_cluster' column added below is not fed back in as a feature.
pc_columns = ['pc1', 'pc2', 'pc3', 'pc4']
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=42)
kmeans.fit(df_pca[pc_columns].values)
df_pca['labels_cluster'] = kmeans.labels_

In [None]:
# Principal components together with the assigned cluster label.
df_pca

Visualize the clusters, then compute the silhouette score to evaluate how well separated they are

In [None]:
# Scatter plot of the first two principal components, coloured by cluster label.
sns.scatterplot(data=df_pca, x='pc1', y='pc2', hue='labels_cluster')

In [None]:
# Mean silhouette coefficient of the clustering in principal-component space.
labels = df_pca['labels_cluster'].to_numpy()
X_sil = df_pca.drop(columns=['labels_cluster']).to_numpy()
silhouette_avg = silhouette_score(X_sil, labels)
silhouette_avg

In [None]:
# Silhouette plot: one horizontal band per cluster, each sample drawn as a
# bar of its silhouette coefficient, sorted within the cluster.
sample_silhouette_values = silhouette_samples(X_sil, labels)

# Derived from the labels actually being plotted rather than hard-coded.
n_clusters = len(np.unique(labels))
band_gap = 10  # vertical blank space between consecutive cluster bands
y_lower = band_gap
fig, ax1 = plt.subplots(1, 1)
fig.set_size_inches(6, 4)
ax1.set_xlim([-0.1, 1])
# Room for every sample plus the gaps around the bands.
ax1.set_ylim([0, len(X_sil) + (n_clusters + 1) * band_gap])

# pyplot.get_cmap replaces matplotlib.cm.get_cmap, which newer matplotlib
# releases no longer provide.
cmap = plt.get_cmap("Spectral")

for i in range(n_clusters):
    # np.sort returns a sorted copy of this cluster's coefficients.
    ith_cluster_silhouette_values = np.sort(sample_silhouette_values[labels == i])

    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i

    color = cmap(float(i) / n_clusters)
    ax1.fill_betweenx(np.arange(y_lower, y_upper),
                      0, ith_cluster_silhouette_values,
                      facecolor=color, edgecolor=color, alpha=0.7)
    # Cluster id, written at the vertical middle of its band.
    ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
    y_lower = y_upper + band_gap

# Dashed red line marks the average silhouette score over all samples.
ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
ax1.set_xlabel("Silhouette Coefficient Values")
ax1.set_ylabel("Cluster Label")
ax1.set_yticks([])
plt.title(("Silhouette analysis for KMeans clustering on sample data "
           "with n_clusters = %d" % n_clusters),
          fontsize=10, fontweight='bold')
plt.show()

In [None]:
# Restore the original (un-encoded) values of the encoded columns.
# `df_enc` was created as a copy of `df`, so both share the same index and
# the original columns can be copied back directly. This avoids rebuilding a
# code->label mapping from `unique()` order and does not overwrite the
# notebook-level `labels` array used by the silhouette cells.
for col in Categorical[1:]:
    df_enc[col] = df[col]

# Attach the cluster assigned to each customer by the fitted model.
df_enc['labels_cluster'] = kmeans.labels_
df_enc.sample(5)



In [None]:
# Per-cluster mean, median and standard deviation of every numeric feature.
# `assign` returns a new frame, so no column is written into a slice of
# `df_enc` (which would raise SettingWithCopyWarning).
df_nums = df_enc[Numericals].assign(label=df_enc['labels_cluster'])

df_nums.groupby('label').agg(['mean', 'median', 'std'])

In [None]:
# Schema of the restored frame, including the new 'labels_cluster' column.
df_enc.info()

Choose a cluster to do retargeting marketing

In [None]:
# Most frequent value of each remaining categorical feature within every cluster.
df_cats = (
    df_enc[Categorical]
    .drop(columns=['Income_per_Transaction', 'Dt_Customer'])
    .assign(label=df_enc['labels_cluster'])
)

df_cats.groupby('label').agg(pd.Series.mode)


The interpretation of these results is as follows:

* Cluster 0 has the majority of customers with undergraduate education level, marital status Married, and age range 50-59.
* Cluster 1 has the majority of customers with an undergraduate education level, marital status is Married, and the age range is 60 and above.
* Cluster 2 also has the majority of customers with an undergraduate education level, marital status is Married, and the age range is 60 and over.