In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

In [None]:
# Load the CSV into a pandas DataFrame. The relative path resolves against the
# kernel's working directory. The frame is previewed with head()/shape in the
# following cells instead of being dumped in full here.
df = pd.read_csv("datasets/Mall_Customers.csv")

In [None]:
# Preview the first rows (head() defaults to 5) to sanity-check the load
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# Concise summary of the frame: column names, non-null counts, dtypes and memory usage
df.info()

In [None]:
# One way to access the annual-income and spending-score columns: by position.
# NOTE(review): assumes those columns sit at positions 3 and 4 — confirm against
# df.columns. The label-list selection used for `X` later does not depend on
# column order.
df.iloc[:, [3, 4]]

In [None]:
# Label slice: .loc includes both endpoints and every column between them, so
# this yields exactly two columns only if the two are adjacent in the frame
# (presumably they are — confirm against df.columns).
x = df.loc[:, "Annual Income (k$)":"Spending Score (1-100)"].values

In [None]:
# Preview the first rows only; avoids printing the whole array
x[:5]

# Exploratory Data Analysis (EDA)

In [None]:
# Rename the second column from "Genre" to "Gender".
# Reassigning the result (instead of inplace=True) follows the pandas
# recommendation; rename() ignores missing keys by default, so re-running
# this cell is harmless.
df = df.rename(columns={"Genre": "Gender"})
df.head()  # confirm the corrected column name

In [None]:
# Checking data types
df.dtypes  # data type of each column

In [None]:
# Descriptive statistics
df.describe()  # count, mean, std, min, quartiles and max of the numeric columns

In [None]:
# Looking for null or missing values
df.isnull().sum()  # number of missing values in each column

In [None]:
# Looking for duplicated rows.
# duplicated() alone returns one boolean per row; summing it gives the number
# of rows that repeat an earlier row (0 means there are no duplicates).
df.duplicated().sum()

# Bivariate Analysis — Scatterplot

In [None]:
# Annual income against spending score for every row of df.
sns.set_style("dark")  # global seaborn setting: also affects every later plot
sns.scatterplot(x="Annual Income (k$)", y="Spending Score (1-100)", data=df)
plt.xlabel("Annual Income (k$)")
plt.ylabel("Spending Score (1-100)")
plt.title("Scatterplot Between Annual Income (k$) and Spending Score (1-100)")
plt.show()  # render the figure; also keeps the Text(...) repr of plt.title out of the output

In [None]:
# Feature selection: the two columns of interest for clustering, as a NumPy array
X = df.loc[:, ["Annual Income (k$)", "Spending Score (1-100)"]].values
X[:5]  # preview only; avoids printing the whole array

# Step 2: Use the Elbow Method to Find the Optimal Number of Clusters

In [None]:
# WCSS (within-cluster sum of squares) for each candidate cluster count;
# the elbow loop appends one value per fitted model.
wcss = []

In [None]:
# Fit K-Means for 1..10 clusters and record each model's WCSS (inertia_).
# The list is rebuilt here so that re-running this cell never appends a second
# batch of values, which would break the 10-point elbow plot.
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init="k-means++", random_state=0)
    # Fit on X, the same feature matrix the final model is trained on.
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

In [None]:
# Elbow plot: WCSS against the number of clusters tried (1-10).
fig, ax = plt.subplots(figsize=(12, 6))
ax.grid()
ax.plot(
    range(1, 11),
    wcss,
    color="green",
    linestyle="dashed",
    linewidth=3,
    marker="o",
    markerfacecolor="blue",
    markersize=12,
)
ax.set_title("The Elbow Point Graph")
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("WCSS")
plt.show()

# Training the K-Means Clustering Model

In [None]:
# Fit the final model with 5 clusters (presumably the elbow read off the plot above).
# random_state is fixed to the same seed as the elbow loop, so the cluster
# numbering, and therefore the colours in the cluster plot, is identical on every run.
kmeans = KMeans(n_clusters=5, init="k-means++", random_state=0)
label = kmeans.fit_predict(X)  # cluster index (0-4) for each row of X
print(label)

# Checking the centers of our clusters (also known as centroids)

In [None]:
# One row per cluster: centroid coordinates in the column order of X
# (annual income, spending score).
print(kmeans.cluster_centers_)

# Visualizing all the clusters

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 8))

# One scatter call per cluster. The position of a colour in this list is the
# cluster index it is drawn for (cluster 0 -> green, 1 -> yellow, ...).
cluster_colors = ["green", "yellow", "red", "purple", "blue"]
for cluster_id, color in enumerate(cluster_colors):
    members = label == cluster_id  # boolean mask over the rows of X
    plt.scatter(
        X[members, 0],
        X[members, 1],
        s=50,
        c=color,
        label=f"Cluster {cluster_id + 1}",  # legend is 1-based, labels are 0-based
    )

# Cluster centres, drawn last so they sit on top of the points
plt.scatter(
    kmeans.cluster_centers_[:, 0],
    kmeans.cluster_centers_[:, 1],
    s=100,
    c="black",
    label="Centroids",
    marker="*",
)

plt.title("Customer groups")
plt.xlabel("Annual Income (k$)")  # unit added to match the earlier scatterplot
plt.ylabel("Spending Score (1-100)")
plt.legend()

plt.show()