
# Import libraries



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.manifold import TSNE
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering

# Load data

In [None]:
# Load the mall-customer dataset from its CSV file.
csv_path = '/content/Mall_Customers.csv'
customer = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the dataset.
customer.head()

In [None]:
# Preview the last rows of the dataset.
customer.tail()

In [None]:
# Dimensions of the dataset as (rows, columns).
customer.shape

In [None]:
# Column names, dtypes and non-null counts.
customer.info()

# Statistical Summary

In [None]:
# Descriptive statistics for the numeric columns.
customer.describe()

In [None]:
# Number of missing values in each column.
customer.isnull().sum()

In [None]:
# Number of rows that exactly duplicate an earlier row.
customer.duplicated().sum()

In [None]:
# Number of customers for each value of the Gender column.
customer['Gender'].value_counts()

# Visualize

In [None]:
# Show the first rows again before plotting.
customer.head()

In [None]:
from matplotlib import pyplot as plt

# Scatter annual income against spending score, then hide the top/right spines
# on the Axes that pandas returns.
income_vs_spending_ax = customer.plot(
    kind='scatter',
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    s=32,
    alpha=.8,
)
income_vs_spending_ax.spines[['top', 'right']].set_visible(False)

In [None]:
# Pairwise relationships between all columns except the ID, coloured by gender.
pairplot_frame = customer.drop(columns='CustomerID')
sns.pairplot(pairplot_frame, hue='Gender', diag_kind='kde')
plt.show()

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns


def _plot_series(series, series_name, series_index=0):
  """Draw one Age-vs-Annual-Income line on the current axes.

  Args:
    series: DataFrame slice holding 'Annual Income (k$)' and 'Age' columns.
    series_name: Label attached to the line (used by the legend).
    series_index: Position of the series; selects the colour from the
      'Dark2' palette, wrapping around when the palette is exhausted.
  """
  palette = list(sns.palettes.mpl_palette('Dark2'))
  xs = series['Annual Income (k$)']
  ys = series['Age']

  plt.plot(xs, ys, label=series_name, color=palette[series_index % len(palette)])


fig, ax = plt.subplots(figsize=(10, 5.2), layout='constrained')
# Sort by income so each line is drawn left to right.
df_sorted = customer.sort_values('Annual Income (k$)', ascending=True)
for i, (series_name, series) in enumerate(df_sorted.groupby('Gender')):
  _plot_series(series, series_name, i)
# Build the legend once, after every series has been drawn. Calling
# fig.legend() inside the loop adds a new legend artist per group, stacking
# partial legends on top of each other.
fig.legend(title='Gender', bbox_to_anchor=(1, 1), loc='upper left')
sns.despine(fig=fig, ax=ax)
plt.xlabel('Annual Income (k$)')
_ = plt.ylabel('Age')

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Horizontal bar chart of the number of customers per gender.
gender_sizes = customer.groupby('Gender').size()
gender_ax = gender_sizes.plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
gender_ax.spines[['top', 'right']].set_visible(False)

In [None]:

# Encode gender numerically (Male -> 0, Female -> 1) in a new 'genderr' column.
gender_codes = {'Male': 0, 'Female': 1}
customer['genderr'] = customer['Gender'].map(gender_codes)

In [None]:
# Number of customers for each encoded gender value.
customer['genderr'].value_counts()

# Feature Selection

In [None]:
# Cluster on annual income and spending score only, standardised per column.
feature_columns = ['Annual Income (k$)', 'Spending Score (1-100)']
X = customer[feature_columns]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


# Clustering Algorithms

K-Means Clustering

In [None]:
# Elbow method: fit K-Means for k = 1..10 and keep each model's inertia (WCSS).
k_values = range(1, 11)
wcss = []
for n_clusters in k_values:
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42).fit(X_scaled)
    wcss.append(kmeans.inertia_)

# Plot WCSS against the number of clusters.
plt.plot(k_values, wcss, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()

Based on the elbow plot above, the optimal number of clusters appears to be around 3 or 4: the Within-Cluster Sum of Squares (WCSS) decreases sharply up to about 3 or 4 clusters and then levels off, forming an "elbow" shape.

In [None]:
# Fit K-Means with three clusters and store each customer's label.
kmeans = KMeans(n_clusters=3, init='k-means++', random_state=42).fit(X_scaled)
customer['Cluster'] = kmeans.labels_

In [None]:
# Scatter the scaled features, coloured by the current K-Means label.
income_scaled = X_scaled[:, 0]
spending_scaled = X_scaled[:, 1]
plt.scatter(
    income_scaled,
    spending_scaled,
    c=customer['Cluster'],
    cmap='viridis',
    s=50,
)
plt.title('Customer Segments')
plt.xlabel('Annual Income (scaled)')
plt.ylabel('Spending Score (scaled)')
plt.show()

In [None]:
# Refit K-Means with four clusters; this overwrites the 'Cluster' column.
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=42).fit(X_scaled)
customer['Cluster'] = kmeans.labels_

In [None]:
# Scatter the scaled features, coloured by the current K-Means label.
income_scaled = X_scaled[:, 0]
spending_scaled = X_scaled[:, 1]
plt.scatter(
    income_scaled,
    spending_scaled,
    c=customer['Cluster'],
    cmap='viridis',
    s=50,
)
plt.title('Customer Segments')
plt.xlabel('Annual Income (scaled)')
plt.ylabel('Spending Score (scaled)')
plt.show()

Dimensionality Reduction (PCA)

In [None]:
# Project the scaled features onto two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Plot the projection, coloured by the K-Means label.
pca_first = X_pca[:, 0]
pca_second = X_pca[:, 1]
plt.scatter(pca_first, pca_second, c=customer['Cluster'], cmap='viridis', s=50)
plt.title('PCA of Customer Data')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()

Group by clusters and analyze


In [None]:
# Per-cluster mean of each listed column ('genderr' must be numeric for this).
summary_columns = [
    'Annual Income (k$)',
    'Spending Score (1-100)',
    'Age',
    'genderr',
]
mean_per_column = dict.fromkeys(summary_columns, 'mean')
cluster_summary = customer.groupby('Cluster').agg(mean_per_column)
print(cluster_summary)

In [None]:
# Grouped bar chart of the per-cluster averages, legend placed outside the axes.
cluster_summary.plot.bar(figsize=(10, 6))
plt.title('Cluster Summary')
plt.xlabel('Cluster')
plt.ylabel('Average Values')
plt.xticks(rotation=0)
plt.legend(
    title='Features',
    bbox_to_anchor=(1, 1),
    loc='upper left',
)
plt.tight_layout()
plt.show()

DBSCAN (Density-Based Spatial Clustering of Applications with Noise)

In [None]:
# Density-based clustering on the scaled features; store each customer's label.
dbscan = DBSCAN(eps=0.5, min_samples=4).fit(X_scaled)
customer['DBSCAN_Cluster'] = dbscan.labels_

# Scatter the scaled features, coloured by DBSCAN label.
income_scaled = X_scaled[:, 0]
spending_scaled = X_scaled[:, 1]
plt.scatter(
    income_scaled,
    spending_scaled,
    c=customer['DBSCAN_Cluster'],
    cmap='viridis',
    s=50,
)
plt.title('DBSCAN Clustering')
plt.xlabel('Annual Income (scaled)')
plt.ylabel('Spending Score (scaled)')
plt.show()

In [None]:
# Rows labelled -1 by DBSCAN are the points it treats as noise.
is_noise = customer['DBSCAN_Cluster'].eq(-1)
outliers = customer[is_noise]
print("Outliers detected by DBSCAN:")
print(outliers)

Hierarchical Clustering with Dendrograms

In [None]:
# Ward-linkage hierarchy over the scaled features, drawn as a dendrogram.
linked = linkage(X_scaled, method='ward')
plt.figure(figsize=(10, 7))
dendrogram(
    linked,
    orientation='top',
    distance_sort='descending',
    show_leaf_counts=True,
)
plt.title('Dendrogram')
plt.show()

In [None]:
# Agglomerative (Ward) clustering into four groups; store each customer's label.
agg_cluster = AgglomerativeClustering(n_clusters=4, linkage='ward')
hierarchical_labels = agg_cluster.fit_predict(X_scaled)
customer['Hierarchical_Cluster'] = hierarchical_labels

# Scatter the scaled features, coloured by hierarchical label.
income_scaled = X_scaled[:, 0]
spending_scaled = X_scaled[:, 1]
plt.scatter(
    income_scaled,
    spending_scaled,
    c=customer['Hierarchical_Cluster'],
    cmap='viridis',
    s=50,
)
plt.title('Hierarchical Clustering')
plt.xlabel('Annual Income (scaled)')
plt.ylabel('Spending Score (scaled)')
plt.show()

Gaussian Mixture Models (GMM)

In [None]:
# Four-component Gaussian mixture; store each customer's assigned component.
gmm = GaussianMixture(n_components=4, random_state=42)
gmm_labels = gmm.fit_predict(X_scaled)
customer['GMM_Cluster'] = gmm_labels

# Scatter the scaled features, coloured by mixture component.
income_scaled = X_scaled[:, 0]
spending_scaled = X_scaled[:, 1]
plt.scatter(
    income_scaled,
    spending_scaled,
    c=customer['GMM_Cluster'],
    cmap='viridis',
    s=50,
)
plt.title('Gaussian Mixture Model Clustering')
plt.xlabel('Annual Income (scaled)')
plt.ylabel('Spending Score (scaled)')
plt.show()

In [None]:
# Per-component membership probabilities; print only the first five rows.
probabilities = gmm.predict_proba(X_scaled)
first_five = probabilities[:5]
print("Cluster Probabilities:")
print(first_five)

t-SNE for Dimensionality Reduction

In [None]:

# Two-dimensional t-SNE embedding of the scaled features.
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)

# Plot the embedding, coloured by the K-Means label.
tsne_first = X_tsne[:, 0]
tsne_second = X_tsne[:, 1]
plt.scatter(tsne_first, tsne_second, c=customer['Cluster'], cmap='viridis', s=50)
plt.title('t-SNE Visualization')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.show()

Autoencoders for Feature Extraction

In [None]:
input_dim = X_scaled.shape[1]
encoding_dim = 2

# Autoencoder: input -> 2-unit ReLU bottleneck -> reconstruction of the input.
input_layer = Input(shape=(input_dim,))
encoder = Dense(encoding_dim, activation="relu")(input_layer)
# The reconstruction target is X_scaled, i.e. StandardScaler output (zero-mean
# features with negative values and values above 1). A sigmoid output is bounded
# to (0, 1) and can never match those targets, so the output layer is linear.
decoder = Dense(input_dim, activation="linear")(encoder)

# Full model for training; encoder-only model for extracting the bottleneck.
autoencoder = Model(inputs=input_layer, outputs=decoder)
encoder_model = Model(inputs=input_layer, outputs=encoder)

# Train the network to reproduce its own input (mean squared error).
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
autoencoder.fit(X_scaled, X_scaled, epochs=50, batch_size=16, shuffle=True, verbose=0)

# Extract encoded features
X_encoded = encoder_model.predict(X_scaled)

# Plot the two bottleneck activations, coloured by the K-Means label.
plt.scatter(X_encoded[:, 0], X_encoded[:, 1], c=customer['Cluster'], cmap='viridis', s=50)
plt.title('Autoencoder Feature Extraction')
plt.xlabel('Encoded Feature 1')
plt.ylabel('Encoded Feature 2')
plt.show()