# Main objective: segment customers into clusters based on their patterns and behaviours, so that the segments can later be used for targeted advertising campaigns and for improving customer reviews.

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import DBSCAN, KMeans, AgglomerativeClustering ,kmeans_plusplus
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.neighbors import NearestNeighbors

In [None]:
# Load the mall-customers CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/mall-customers-segmentation/Mall_Customers.csv')

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# Print column names, dtypes and non-null counts.
df.info()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

# EDA - Univariate Analysis

In [None]:
# Histogram of the Age column (25 bins) with a KDE curve overlaid.
age_ax = sns.histplot(df['Age'], bins=25, kde=True, color='black')
age_ax.set_title('Age Distribution')
plt.show()

In [None]:
# Histogram of the Annual Income (k$) column (25 bins) with a KDE curve overlaid.
income_ax = sns.histplot(df['Annual Income (k$)'], bins=25, kde=True, color='red')
income_ax.set_title('Annual Income (k$) Distribution')
plt.show()

In [None]:
# Histogram of the Spending Score (1-100) column (25 bins) with a KDE curve overlaid.
sns.histplot(data=df['Spending Score (1-100)'], kde=True, bins=25, color='blue')
# Title now follows the "<column> Distribution" pattern of the Age and
# Annual Income histograms.
plt.title('Spending Score (1-100) Distribution')
plt.show()

# EDA - Bivariate Analysis

In [None]:
# Age against annual income, coloured by the Genre column.
age_income_ax = sns.scatterplot(
    x='Age',
    y='Annual Income (k$)',
    hue='Genre',
    data=df,
    palette='deep',
    markers=True,
    legend='auto',
)
age_income_ax.set_title('Age vs Annual Income')
plt.show()

In [None]:
# Age against spending score, coloured by the Genre column.
age_spending_ax = sns.scatterplot(
    x='Age',
    y='Spending Score (1-100)',
    hue='Genre',
    data=df,
    palette='flare',
    markers=True,
    legend='auto',
)
age_spending_ax.set_title('Age vs Spending Score')
plt.show()

In [None]:
# Annual income against spending score, coloured by the Genre column.
income_spending_ax = sns.scatterplot(
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue='Genre',
    data=df,
    palette='husl',
    markers=True,
    legend='auto',
)
income_spending_ax.set_title('Annual Income (k$) vs Spending Score')
plt.show()

In [None]:
# Point plot of spending score per Genre value.
# catplot is figure-level and returns a FacetGrid; with no faceting variables
# its single Axes is available as `.ax`.
genre_grid = sns.catplot(kind="point", x="Genre", y="Spending Score (1-100)", data=df)
genre_grid.ax.set_title('Genre vs Spending Score')
plt.show()

# EDA - Multivariate Analysis

In [None]:
# Replace the Genre column with integer codes so that it can take part in
# the correlation matrix and the clustering steps below.
LE = LabelEncoder()
genre_codes = LE.fit_transform(df['Genre'])
df['Genre'] = genre_codes
df.head()

In [None]:
# Re-inspect dtypes now that Genre holds its label-encoded values.
df.info()

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap.
# NOTE(review): CustomerID is still present at this point, so it appears in the matrix.
corr = df.corr()
heatmap_ax = sns.heatmap(corr, cmap='flare', annot=True, fmt=".2f")
heatmap_ax.set_title("Correlation Matrix Heatmap")
plt.show()

# Preprocessing

In [None]:
# Remove CustomerID in place before clustering (presumably a row identifier
# with no behavioural signal -- confirm against the dataset description).
# Re-running this cell raises KeyError because the column is already gone.
df.drop(columns = ['CustomerID'], inplace = True)

# KMeans Clustering

In [None]:
# Elbow method: fit KMeans for k = 1..10 and plot the inertia (within-cluster
# sum of squares) of each fit to choose the number of clusters.

# Cluster on the feature columns only. Label columns written by later cells
# are excluded, so re-running this cell after them gives the same curve.
label_cols = ['KMeanClusters', 'Agglomerative_Cluster', 'DBSCAN_Cluster']
features = df.drop(columns=label_cols, errors='ignore')

inertia = []
k_range = range(1, 11)
for k in k_range:
    # n_init is pinned explicitly: its default differs between scikit-learn
    # releases, which would otherwise change the curve (and emit a warning).
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(features)
    inertia.append(kmeans.inertia_)

plt.plot(k_range, inertia, marker='x')
plt.title('Elbow Method to Determine Number of Clusters')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.show()

In [None]:
# Final KMeans model with 5 clusters; the labels are stored on df.
# Fit on the feature columns only: if this cell is re-run, the previously
# written label columns must not be fed back in as features.
label_cols = ['KMeanClusters', 'Agglomerative_Cluster', 'DBSCAN_Cluster']
features = df.drop(columns=label_cols, errors='ignore')

# n_init is pinned because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
df['KMeanClusters'] = kmeans.fit_predict(features)

In [None]:
# Silhouette score of the KMeans labelling.
# df already contains the KMeanClusters column at this point; scoring on it
# would leak the labels into the distance computation and inflate the score,
# so every cluster-label column is removed first.
label_cols = ['KMeanClusters', 'Agglomerative_Cluster', 'DBSCAN_Cluster']
features = df.drop(columns=label_cols, errors='ignore')

KMeansScore = silhouette_score(features, kmeans.labels_)
print(KMeansScore)

In [None]:
# Preview the data, which now includes the KMeanClusters label column.
df.head()

In [None]:
# Income against spending score, coloured by the KMeans cluster label.
kmeans_ax = sns.scatterplot(
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue='KMeanClusters',
    data=df,
    palette='dark',
)
kmeans_ax.set_title('Customer Segmentation Based on Spending Score and Annual Income')
plt.show()

In [None]:
# Agglomerative (hierarchical) clustering: Ward linkage and a truncated
# dendrogram to help choose the number of clusters.
# Cluster-label columns from earlier cells are excluded so that the hierarchy
# is built from the customer features only.
label_cols = ['KMeanClusters', 'Agglomerative_Cluster', 'DBSCAN_Cluster']
features = df.drop(columns=label_cols, errors='ignore')

AC = linkage(features, method='ward')
dendrogram(AC, truncate_mode='level', p=5)
plt.title('Dendrogram')
# Render explicitly, as every other plotting cell does.
plt.show()

In [None]:
# Fit Ward agglomerative clustering with 3 clusters and store the labels on df.

# The class is re-imported under a private alias because this cell rebinds the
# name `AgglomerativeClustering` to the fitted model (see the end of the cell).
# Without the alias, a second run would try to call the model instance
# instead of the class and fail.
from sklearn.cluster import AgglomerativeClustering as _AgglomerativeClusteringCls

# Fit on the feature columns only; labels from other clusterers are not features.
label_cols = ['KMeanClusters', 'Agglomerative_Cluster', 'DBSCAN_Cluster']
features = df.drop(columns=label_cols, errors='ignore')

agglomerative_model = _AgglomerativeClusteringCls(n_clusters=3, linkage='ward')
df['Agglomerative_Cluster'] = agglomerative_model.fit_predict(features)

# Backward-compatible alias: the scoring cell below reads
# `AgglomerativeClustering.labels_`, so the fitted model stays reachable there.
AgglomerativeClustering = agglomerative_model

In [None]:
# Silhouette score of the agglomerative labelling.
# Score on the feature columns only: df holds the KMeans and agglomerative
# label columns by now, and including them would leak the labels into the
# distances and make the score incomparable with the other methods.
label_cols = ['KMeanClusters', 'Agglomerative_Cluster', 'DBSCAN_Cluster']
features = df.drop(columns=label_cols, errors='ignore')

# The labels are read from the column written by the fitting cell, so this
# does not depend on which variable name holds the fitted model.
AgglomerativeClusteringScore = silhouette_score(features, df['Agglomerative_Cluster'])
print(AgglomerativeClusteringScore)

In [None]:
# Preview the data, which now includes the Agglomerative_Cluster label column.
df.head()

In [None]:
# Income against spending score, coloured by the agglomerative cluster label.
agglomerative_ax = sns.scatterplot(
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue='Agglomerative_Cluster',
    data=df,
    palette='bright',
)
agglomerative_ax.set_title('Customer Segmentation Based on Spending Score and Annual Income')
plt.show()

# DBSCAN Clustering

In [None]:
# k-distance plot used to choose DBSCAN's eps: sort every point's distance to
# its k-th neighbour and look for the elbow in the curve.

# Measure distances on the feature columns only; the cluster-label columns
# added by earlier cells would otherwise distort them.
label_cols = ['KMeanClusters', 'Agglomerative_Cluster', 'DBSCAN_Cluster']
features = df.drop(columns=label_cols, errors='ignore')

k = 5
nbrs = NearestNeighbors(n_neighbors=k).fit(features)
# Querying with the fitted data returns each point as its own nearest
# neighbour (distance 0), so column k-1 counts the point itself. DBSCAN's
# min_samples counts the point itself as well.
distances, indices = nbrs.kneighbors(features)
k_distances = np.sort(distances[:, k-1])

plt.plot(k_distances, marker='o', linestyle='--', color='b')
plt.title('Elbow Method for Determining eps in DBSCAN')
plt.xlabel('Data Points (sorted)')
plt.ylabel(f'{k}-th Nearest Neighbor Distance')
plt.grid()
plt.show()

In [None]:
# Apply DBSCAN and store the labels on df.
# Fit on the feature columns only; labels from the other clusterers are not features.
label_cols = ['KMeanClusters', 'Agglomerative_Cluster', 'DBSCAN_Cluster']
features = df.drop(columns=label_cols, errors='ignore')

# The model is bound to `dbscan`, the name the next line uses. Binding it to
# `DBSCAN` left `dbscan` undefined (NameError) and shadowed the imported class.
# NOTE(review): eps=14 was picked from a k-distance plot -- re-check it against
# the plot computed on the feature columns only.
dbscan = DBSCAN(eps=14, min_samples=5)
df['DBSCAN_Cluster'] = dbscan.fit_predict(features)

In [None]:
# Income against spending score, coloured by the DBSCAN cluster label.
dbscan_ax = sns.scatterplot(
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue='DBSCAN_Cluster',
    data=df,
    palette='deep',
)
dbscan_ax.set_title('Customer Segmentation Based on Spending Score and Annual Income')
plt.show()

# It appears that the KMeans method outputs the best results for Customer Segmentation