In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the marketing campaign export inside the Kaggle input directory.
path = '/kaggle/input/customer-personality-analysis/marketing_campaign.csv'
# The file is read with a tab as the field separator.
df = pd.read_csv(filepath_or_buffer=path, sep="\t")

 # 1. Data Exploration and Preparation

In [None]:
# Preview the first five rows to get a feel for the columns and their values.
df.head(n=5)

In [None]:
# Inspect dtypes and non-null counts (needed later to decide which columns can
# enter the correlation calculation), then drop rows with missing values.
df.info()
# A bare `df.shape` in the middle of a cell is evaluated and thrown away, so
# the shape is printed explicitly, before and after the drop.
print("Shape before dropping missing values:", df.shape)
# Reassign instead of mutating in place so the lineage of `df` stays explicit.
df = df.dropna()
print("Shape after dropping missing values:", df.shape)

In [None]:
# Cardinality of the Education column. The author judged it suitable for
# encoding; it is one-hot encoded further down.
education_column = df['Education']
num_unique_education = education_column.nunique()
print("Number of unique values in Education column:", num_unique_education)

In [None]:
# Frequency of each education level (header line followed by the counts).
education_value_counts = df['Education'].value_counts()
print("Unique values and their counts in Education column:", education_value_counts, sep="\n")

In [None]:
# Cardinality of the Marital_Status column. The author judged it suitable for
# encoding; it is one-hot encoded further down.
marital_status_column = df['Marital_Status']
num_unique_Marital_Status = marital_status_column.nunique()
print("Number of unique values in Marital_Status column:", num_unique_Marital_Status)

In [None]:
# Frequency of each marital status (header line followed by the counts).
Marital_Status_value_counts = df['Marital_Status'].value_counts()
print("Unique values and their counts in Marital_Status column:", Marital_Status_value_counts, sep="\n")

In [None]:
# Cardinality of the Dt_Customer (date) column.
dt_customer_column = df['Dt_Customer']
num_unique_Dt_Customer = dt_customer_column.nunique()
print("Number of unique values in Dt_Customer column:", num_unique_Dt_Customer)

In [None]:
# Frequency of each Dt_Customer value (header line followed by the counts).
# The author's conclusion: the date column can be ignored for this analysis.
Dt_Customer_value_counts = df['Dt_Customer'].value_counts()
print("Unique values and their counts in Dt_Customer column:", Dt_Customer_value_counts, sep="\n")

In [None]:
# One-hot encode marital status and education. The generated dummy columns are
# named "Marital_<value>" and "Education_<value>".
df_encoded = pd.get_dummies(
    df,
    columns=['Marital_Status', 'Education'],
    prefix={'Marital_Status': 'Marital', 'Education': 'Education'},
)
df_encoded.head()

# 2. Correlation Analysis

**TODO:** change the columns list.

In [None]:
# Columns left out of the Pearson correlation: they are categorical (dummy
# columns, flags, the ID and the raw date) and are not meaningful inputs for it.
exclude_columns = [
    # marital-status dummies
    "Marital_Absurd",
    "Marital_Alone",
    "Marital_Divorced",
    "Marital_Married",
    "Marital_Single",
    "Marital_Together",
    "Marital_Widow",
    "Marital_YOLO",
    # education dummies
    "Education_Basic",
    "Education_2n Cycle",
    "Education_Graduation",
    "Education_Master",
    "Education_PhD",
    # date and identifier
    "Dt_Customer",
    "ID",
    # complaint / campaign flags
    "Complain",
    "AcceptedCmp1",
    "AcceptedCmp2",
    "AcceptedCmp3",
    "AcceptedCmp4",
    "AcceptedCmp5",
    "Response",
]

# `drop` returns a new frame, so df_encoded itself keeps all of its columns.
filtered_df = df_encoded.drop(columns=exclude_columns)

# Pairwise correlation of the remaining columns; shown as the cell output.
correlation_matrix = filtered_df.corr()
correlation_matrix

# Z_Revenue and Z_CostContact hold the same value in every row, hence the NaN entries: a constant column has a standard deviation of zero, which leads to a division by zero in the correlation formula.

In [None]:
# Plot the correlation matrix as an annotated heatmap.
# Z_CostContact and Z_Revenue are dropped first: they are constant, so their
# correlations are undefined and would only add empty rows/columns.
corr_df_plot = filtered_df.drop(columns=['Z_CostContact', 'Z_Revenue'])
plt.figure(figsize=(20, 20))
cmap = sns.color_palette("Blues", as_cmap=True)
sns.heatmap(corr_df_plot.corr(), annot=True, cmap=cmap, center=0)
plt.title("Pearson correlation between numeric attributes")
# Render explicitly, like the other plotting cells, so the cell output is the
# figure rather than the repr of the Axes object returned by seaborn.
plt.show()

In [None]:
# Flatten the correlation matrix into one row per ordered attribute pair.
correlations = correlation_matrix.unstack().reset_index()
correlations.columns = ['Attribute 1', 'Attribute 2', 'Correlation']

# Discard the diagonal: an attribute paired with itself is not informative.
is_self_pair = correlations['Attribute 1'].eq(correlations['Attribute 2'])
correlations = correlations[~is_self_pair]

# Strongest positive and strongest negative pair. idxmax/idxmin return index
# labels, so the rows are looked up with .loc on the filtered frame.
pair_strength = correlations['Correlation']
highest_positive = correlations.loc[pair_strength.idxmax()]
highest_negative = correlations.loc[pair_strength.idxmin()]

print("Highest Positive Correlation:", highest_positive, sep="\n")
print("\nHighest Negative Correlation:", highest_negative, sep="\n")

In [None]:
# Summary statistics of the columns kept for the correlation analysis.
filtered_df.describe()

# 3. Standardization

In [None]:
# Collect every column of df_encoded whose values are all 0/1 (or True/False).
binary_columns = []
for col in df_encoded:
    only_binary_values = df_encoded[col].isin([0, 1, True, False]).all()
    if only_binary_values:
        binary_columns.append(col)

In [None]:
# Store the binary columns as integers so they can be summed and plotted
# uniformly later on.
for col in binary_columns:
    column = df_encoded[col]
    is_bool_dtype = column.dtype == bool
    # The value check is only evaluated when the dtype check fails.
    if is_bool_dtype or set(column.dropna().unique()) <= {True, False}:
        df_encoded[col] = column.astype(int)

In [None]:
# Show which columns were detected as binary.
binary_columns

In [None]:
# Standardize the non-excluded columns before applying the clustering
# algorithms.
from sklearn.preprocessing import StandardScaler

# NOTE: df_std is another name for df_encoded, not a copy, so the scaled values
# are written into df_encoded as well.
df_std = df_encoded

# Everything that is not in exclude_columns gets scaled.
excluded = set(exclude_columns)
columns_to_scale = [name for name in df_std.columns if name not in excluded]

# The fitted scaler is kept so the transformation can be inverted later.
scaler = StandardScaler().fit(df_std[columns_to_scale])
scaled = scaler.transform(df_std[columns_to_scale])
df_std[columns_to_scale] = scaled
df_std.head()


# 4. K-means Clustering 

In [None]:
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
import numpy as np
from sklearn.decomposition import PCA

In [None]:
# Attribute families used to group the per-cluster statistics.

# Who the customer is.
people_columns = [
    'Year_Birth',
    'Income',
    'Kidhome',
    'Teenhome',
    'Recency',
    'Complain',
]

# Amounts spent per product family.
product_columns = [
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
    'MntGoldProds',
]

# Deals and campaign acceptance.
promotion_columns = [
    'NumDealsPurchases',
    'AcceptedCmp1',
    'AcceptedCmp2',
    'AcceptedCmp3',
    'AcceptedCmp4',
    'AcceptedCmp5',
    'Response',
]

# Purchase channels.
place_columns = [
    'NumWebPurchases',
    'NumCatalogPurchases',
    'NumStorePurchases',
    'NumWebVisitsMonth',
]


In [None]:
# Build the clustering input by removing columns considered irrelevant for it:
# the identifier, the raw date and the education dummies.
irrelevant_columns = [
    'ID',
    'Dt_Customer',
    'Education_2n Cycle',
    'Education_Basic',
    'Education_Graduation',
    'Education_Master',
    'Education_PhD',
]
# `drop` returns a new frame; df_std is left untouched.
df_cluster = df_std.drop(columns=irrelevant_columns)



In [None]:
# Display the frame that is fed to the clustering algorithms.
df_cluster

In [None]:
# K-means input; this is another name for df_cluster, not a copy.
k_means_clustering = df_cluster

# Fit four clusters; the fixed random_state makes the run repeatable.
kmeans = KMeans(n_clusters=4, random_state=42).fit(k_means_clustering)

# Keep the centroids and the per-row assignment for the cells below.
centers, labels = kmeans.cluster_centers_, kmeans.labels_

print("Cluster Centers:\n", centers)
print("Labels:\n", labels)


In [None]:
# Cluster sizes: entry i is the number of rows assigned to K-means cluster i.
kmeans_assignment = kmeans.labels_
cluster_counts = np.bincount(kmeans_assignment)
print(cluster_counts)

In [None]:
# Project the clustering input onto two principal components so the K-means
# assignment can be drawn in a plane.
pca = PCA(n_components=2)
cluster_pca = pca.fit_transform(k_means_clustering)

first_component = cluster_pca[:, 0]
second_component = cluster_pca[:, 1]

# One point per row, coloured by its K-means cluster label.
plt.figure(figsize=(8, 6))
plt.scatter(
    first_component,
    second_component,
    c=kmeans.labels_,
    cmap='viridis',
    s=50,
    alpha=0.7,
)
plt.title('K-Means Clustering (PCA-reduced data)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.colorbar(label='Cluster Label')
plt.show()




In [None]:
# Attach the K-means assignment to the encoded frame so that the clusters can
# be profiled.
# NOTE: k_m is another name for df_encoded, not a copy, so the 'Cluster' column
# is added to df_encoded as well.
k_m = df_encoded
# Read the assignment from the fitted estimator rather than from the shared
# `labels` variable: the agglomerative and DBSCAN sections reassign `labels`,
# so re-running this cell later would otherwise attach the wrong algorithm's
# labels without any error.
k_m['Cluster'] = kmeans.labels_

clusters = k_m.groupby('Cluster')

k_m

In [None]:
# List the columns of the labelled frame (now including 'Cluster').
k_m.columns

In [None]:
# Map the standardized columns back to their original units so the per-cluster
# statistics and plots are interpretable.
# The scaled values are taken from df_cluster, which still holds them, rather
# than from k_m itself. Re-running this cell therefore yields the same result
# instead of applying the inverse transform a second time to values that were
# already restored.
k_m[columns_to_scale] = scaler.inverse_transform(df_cluster[columns_to_scale])

k_m.head()


In [None]:
# Shape of the encoded frame, for comparison with k_m in the next cell.
df_encoded.shape


In [None]:
# k_m was bound to df_encoded without copying, so this matches df_encoded.shape.
k_m.shape

In [None]:
# Per-cluster mean and standard deviation for each attribute family, computed
# from a single grouping of the K-means-labelled frame.
by_cluster = k_m.groupby('Cluster')
summary_functions = ['mean', 'std']

people_stats = by_cluster[people_columns].agg(summary_functions)
product_stats = by_cluster[product_columns].agg(summary_functions)
promotion_stats = by_cluster[promotion_columns].agg(summary_functions)
place_stats = by_cluster[place_columns].agg(summary_functions)

# Summing a 0/1 column gives the number of customers per cluster with the flag set.
binary_frequencies = by_cluster[binary_columns].sum()

# Collect everything under readable section names.
all_stats = {
    'People': people_stats,
    'Products': product_stats,
    'Promotions': promotion_stats,
    'Place': place_stats,
    'Binary Frequencies': binary_frequencies,
}


In [None]:
# Display the K-means per-cluster statistics collected above.
all_stats

In [None]:
# Numeric attributes whose distribution is compared across K-means clusters.
continuous_columns = [
    'Year_Birth', 'Income', 'Kidhome', 'Teenhome', 'Recency',
    'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
    'MntSweetProducts', 'MntGoldProds',
    'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
    'NumStorePurchases', 'NumWebVisitsMonth',
]

# One boxplot per attribute, laid out on a 6x3 grid.
plt.figure(figsize=(15, 15))
for position, column_name in enumerate(continuous_columns, start=1):
    ax = plt.subplot(6, 3, position)
    sns.boxplot(data=k_m, x='Cluster', y=column_name, ax=ax)
    ax.set_title(f'{column_name} by Cluster')

plt.tight_layout()
plt.show()


In [None]:
# Flag/dummy attributes whose counts are compared across K-means clusters.
categorical_columns = [
    # complaint and campaign flags
    'Complain', 'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3',
    'AcceptedCmp4', 'AcceptedCmp5', 'Response',
    # marital-status dummies
    'Marital_Absurd', 'Marital_Alone', 'Marital_Divorced', 'Marital_Married',
    'Marital_Single', 'Marital_Together', 'Marital_Widow', 'Marital_YOLO',
    # education dummies
    'Education_Basic', 'Education_2n Cycle', 'Education_Graduation',
    'Education_Master', 'Education_PhD',
]

# One count plot per attribute on a 5x4 grid (20 panels), split by cluster.
plt.figure(figsize=(15, 15))
for position, column_name in enumerate(categorical_columns, start=1):
    ax = plt.subplot(5, 4, position)
    sns.countplot(data=k_m, x=column_name, hue='Cluster', ax=ax)
    ax.set_title(f'{column_name} by Cluster')

plt.tight_layout()
plt.show()



# 5. Agglomerative Hierarchical Clustering


In [None]:
# Agglomerative input; this is another name for df_cluster, not a copy.
agg_clustering = df_cluster

# Fit four clusters and keep the per-row assignment.
agg = AgglomerativeClustering(n_clusters=4)
labels = agg.fit_predict(agg_clustering)

# The estimator exposes no centroids here, so each centre is computed as the
# column-wise mean of the rows assigned to that cluster.
unique_labels = np.unique(labels)
per_cluster_means = []
for label in unique_labels:
    members = agg_clustering[labels == label]
    per_cluster_means.append(members.mean(axis=0))
centers = np.array(per_cluster_means)

print("Cluster Centers (calculated manually):\n", centers)
print("Labels:\n", labels)


In [None]:
# Cluster sizes: entry i is the number of rows assigned to agglomerative
# cluster i. The assignment is read from the fitted estimator (as the K-means
# section does with kmeans.labels_) instead of the shared `labels` variable,
# which other sections reassign.
cluster_counts = np.bincount(agg.labels_)
print(cluster_counts)

In [None]:
# Reuse the assignment stored on the already-fitted estimator; calling
# fit_predict again would repeat the whole hierarchical fit on the same input.
labels = agg.labels_

# Reduce to 2D for visualization.
# df_cluster is projected directly: `agg_clustering` is rebound to df_encoded
# in a later cell, which would make this cell fail when re-run afterwards.
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(df_cluster)

# One scatter call per cluster so that each gets its own legend entry.
plt.figure(figsize=(8, 6))
for cluster in set(labels):
    in_cluster = labels == cluster
    plt.scatter(reduced_data[in_cluster, 0], reduced_data[in_cluster, 1], label=f"Cluster {cluster}", s=50)

plt.title("Agglomerative Clustering Results")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend()
plt.show()


In [None]:
# Attach the agglomerative assignment to the encoded frame for profiling.
# NOTE: agg_clustering now refers to df_encoded itself (no copy), so this
# overwrites the 'Cluster' column written by the K-means section.
agg_clustering = df_encoded
# Read the assignment from the fitted estimator rather than from the shared
# `labels` variable, which the K-means and DBSCAN sections also assign.
agg_clustering['Cluster'] = agg.labels_

clusters = agg_clustering.groupby('Cluster')

agg_clustering

In [None]:
# Per-cluster mean and standard deviation for each attribute family, computed
# from a single grouping of the agglomerative-labelled frame.
by_cluster = agg_clustering.groupby('Cluster')
summary_functions = ['mean', 'std']

people_stats = by_cluster[people_columns].agg(summary_functions)
product_stats = by_cluster[product_columns].agg(summary_functions)
promotion_stats = by_cluster[promotion_columns].agg(summary_functions)
place_stats = by_cluster[place_columns].agg(summary_functions)

# Summing a 0/1 column gives the number of customers per cluster with the flag set.
binary_frequencies = by_cluster[binary_columns].sum()

# Collect everything under readable section names.
all_stats = {
    'People': people_stats,
    'Products': product_stats,
    'Promotions': promotion_stats,
    'Place': place_stats,
    'Binary Frequencies': binary_frequencies,
}


In [None]:
# Display the agglomerative per-cluster statistics collected above.
all_stats

In [None]:
# Numeric attributes whose distribution is compared across agglomerative clusters.
continuous_columns = [
    'Year_Birth', 'Income', 'Kidhome', 'Teenhome', 'Recency',
    'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
    'MntSweetProducts', 'MntGoldProds',
    'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
    'NumStorePurchases', 'NumWebVisitsMonth',
]

# One boxplot per attribute, laid out on a 6x3 grid.
plt.figure(figsize=(15, 15))
for position, column_name in enumerate(continuous_columns, start=1):
    ax = plt.subplot(6, 3, position)
    sns.boxplot(data=agg_clustering, x='Cluster', y=column_name, ax=ax)
    ax.set_title(f'{column_name} by Cluster')

plt.tight_layout()
plt.show()


In [None]:
# Flag/dummy attributes whose counts are compared across agglomerative clusters.
categorical_columns = [
    # complaint and campaign flags
    'Complain', 'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3',
    'AcceptedCmp4', 'AcceptedCmp5', 'Response',
    # marital-status dummies
    'Marital_Absurd', 'Marital_Alone', 'Marital_Divorced', 'Marital_Married',
    'Marital_Single', 'Marital_Together', 'Marital_Widow', 'Marital_YOLO',
    # education dummies
    'Education_Basic', 'Education_2n Cycle', 'Education_Graduation',
    'Education_Master', 'Education_PhD',
]

# One count plot per attribute on a 5x4 grid (20 panels), split by cluster.
plt.figure(figsize=(15, 15))
for position, column_name in enumerate(categorical_columns, start=1):
    ax = plt.subplot(5, 4, position)
    sns.countplot(data=agg_clustering, x=column_name, hue='Cluster', ax=ax)
    ax.set_title(f'{column_name} by Cluster')

plt.tight_layout()
plt.show()


# 6. DBSCAN (Density-Based Spatial Clustering)

In [None]:
# DBSCAN input; this is another name for df_cluster, not a copy.
dbscan_cluster = df_cluster

# Neighbourhood radius and minimum neighbourhood size; adjust as needed.
eps = 1.8
min_samples = 5

# Fit DBSCAN and keep the per-row assignment stored on the estimator.
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
labels = dbscan.fit(dbscan_cluster).labels_
print("Labels:\n", labels)

In [None]:
from sklearn.decomposition import PCA

# Two independent projections of the DBSCAN input: one onto two components
# for the 2D plot, one onto three components for the 3D plot.
pca = PCA(n_components=2)
pca_3d = PCA(n_components=3)

reduced_data_2d = pca.fit_transform(dbscan_cluster)
reduced_data_3d = pca_3d.fit_transform(dbscan_cluster)


In [None]:
# 2D view of the DBSCAN result: one scatter call per label so each gets a
# legend entry. Points labelled -1 are drawn in black and labelled "Noise".
plt.figure(figsize=(8, 6))
for cluster in set(labels):
    mask = (labels == cluster)
    if cluster == -1:
        appearance = {"label": "Noise", "color": "black"}
    else:
        appearance = {"label": f"Cluster {cluster}"}
    plt.scatter(reduced_data_2d[mask, 0], reduced_data_2d[mask, 1], s=50, **appearance)

plt.title(f"DBSCAN Clustering 2D (eps={eps}, min_samples={min_samples})", fontsize=14)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend()
plt.show()


In [None]:
# 3D view of the DBSCAN result; every point (including label -1) is coloured
# by its label through the viridis colormap.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(1, 1, 1, projection="3d")

pc1, pc2, pc3 = reduced_data_3d[:, 0], reduced_data_3d[:, 1], reduced_data_3d[:, 2]
ax.scatter(pc1, pc2, pc3, c=labels, cmap="viridis", s=50)

ax.set_title("DBSCAN Clustering Results in 3D")
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.set_zlabel("Principal Component 3")

plt.show()


In [None]:
# Cluster sizes, leaving out negative labels: np.bincount only accepts
# non-negative integers, so the -1 entries are filtered out first.
labels = np.array(labels)
is_non_negative = labels >= 0
positive_labels = labels[is_non_negative]

cluster_counts = np.bincount(positive_labels)
print("Cluster Counts:", cluster_counts)

In [None]:
# Attach the DBSCAN assignment to the encoded frame for profiling. This
# overwrites whatever 'Cluster' column df_encoded already carries.
df_encoded['Cluster'] = labels
# NOTE: from here on `dbscan` names this frame (df_encoded itself, no copy) and
# no longer the DBSCAN estimator created earlier.
dbscan = df_encoded

clusters = dbscan.groupby('Cluster')

dbscan

In [None]:
# Keep only the rows whose Cluster label is not -1.
has_cluster = dbscan['Cluster'].ne(-1)
dbscan_filtered = dbscan[has_cluster]

clusters_filtered = dbscan_filtered.groupby('Cluster')
dbscan_filtered


In [None]:
# Per-cluster mean and standard deviation for each attribute family, computed
# from a single grouping of the DBSCAN-labelled frame (label -1 removed).
by_cluster = dbscan_filtered.groupby('Cluster')
summary_functions = ['mean', 'std']

people_stats = by_cluster[people_columns].agg(summary_functions)
product_stats = by_cluster[product_columns].agg(summary_functions)
promotion_stats = by_cluster[promotion_columns].agg(summary_functions)
place_stats = by_cluster[place_columns].agg(summary_functions)

# Summing a 0/1 column gives the number of customers per cluster with the flag set.
binary_frequencies = by_cluster[binary_columns].sum()

# Collect everything under readable section names.
all_stats = {
    'People': people_stats,
    'Products': product_stats,
    'Promotions': promotion_stats,
    'Place': place_stats,
    'Binary Frequencies': binary_frequencies,
}


In [None]:
# Display the DBSCAN per-cluster statistics collected above.
all_stats

In [None]:
# Numeric attributes whose distribution is compared across DBSCAN clusters.
continuous_columns = [
    'Year_Birth', 'Income', 'Kidhome', 'Teenhome', 'Recency',
    'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
    'MntSweetProducts', 'MntGoldProds',
    'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
    'NumStorePurchases', 'NumWebVisitsMonth',
]

# One boxplot per attribute, laid out on a 6x3 grid.
plt.figure(figsize=(15, 15))
for position, column_name in enumerate(continuous_columns, start=1):
    ax = plt.subplot(6, 3, position)
    sns.boxplot(data=dbscan_filtered, x='Cluster', y=column_name, ax=ax)
    ax.set_title(f'{column_name} by Cluster')

plt.tight_layout()
plt.show()

In [None]:
# Flag/dummy attributes whose counts are compared across DBSCAN clusters.
categorical_columns = [
    # complaint and campaign flags
    'Complain', 'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3',
    'AcceptedCmp4', 'AcceptedCmp5', 'Response',
    # marital-status dummies
    'Marital_Absurd', 'Marital_Alone', 'Marital_Divorced', 'Marital_Married',
    'Marital_Single', 'Marital_Together', 'Marital_Widow', 'Marital_YOLO',
    # education dummies
    'Education_Basic', 'Education_2n Cycle', 'Education_Graduation',
    'Education_Master', 'Education_PhD',
]

# One count plot per attribute on a 5x4 grid (20 panels), split by cluster;
# the figure is larger than in the earlier sections for readability.
plt.figure(figsize=(20, 20))
for position, column_name in enumerate(categorical_columns, start=1):
    ax = plt.subplot(5, 4, position)
    sns.countplot(data=dbscan_filtered, x=column_name, hue='Cluster', ax=ax)
    ax.set_title(f'{column_name} by Cluster')
    ax.set_xlabel(column_name)
    ax.set_ylabel('Count')

plt.tight_layout()
plt.show()