# 5. Clustering

This Jupyter notebook is part of an exercise series titled *Clustering*.
The series itself is based on lecture *8. Cluster Analysis*.

There are two parts:

- Part One: Implementing k-means and DBSCAN
- Part Two: Clustering in the AdventureWorks Database

Recall that we have two exercise groups.
Depending on how each group progresses, some parts of these exercises may not be discussed in their entirety.
If questions arise, ask them in your study group or in our StudOn forum.

## Part One: Implementing k-means and DBSCAN

<div class="alert alert-block alert-warning">

TODO

</div>

- [ ] Rework the cluster/partition naming scheme
- [ ] Think of more pythonic solutions/whether they make sense in that case or confuse the students

#### Preparations

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np

In [None]:
# Toy dataset of 14 two-dimensional samples, given column by column
# (the i-th entries of "x" and "y" together form the i-th sample)
dataset = pd.DataFrame(
    {
        "x": [6, 8, 4, 5, 6, 6, 2, 3, 4, 5, 6, 7, 8, 8],
        "y": [5, 5, 3, 6, 2, 3, 2, 3, 4, 5, 6, 7, 7, 4],
    }
)
dataset

In [None]:
# Plot the raw samples (x against y) before any clustering is applied
sns.scatterplot(x=dataset["x"], y=dataset["y"])

#### Stepwise Implementation

##### 1. Step: Partition

In [None]:
def partition_dataset(dataset, k):
    """
    Arbitrarily partition the dataset into k non-empty partitions.
    Adds a third column to indicate the affiliation to a specific cluster.

    Parameters:
        dataset: DataFrame with (at least) a column "y"; it is not modified.
        k: Number of partitions, must satisfy 1 <= k <= number of samples.

    Returns:
        A copy of the dataset, sorted by "y" with a fresh 0..n-1 index and an
        additional integer column "cluster" holding the ids 0..k-1.

    Raises:
        ValueError: If k is smaller than 1 or larger than the number of samples,
            because then k non-empty partitions cannot be built.
    """
    n_samples = dataset.shape[0]

    if not 1 <= k <= n_samples:
        raise ValueError(
            "k must be between 1 and the number of samples ("
            + str(n_samples)
            + "), got "
            + str(k)
        )

    # The method that was used in the lecture example is to sort the samples regarding to their y values
    # (sort_values returns a new DataFrame, so the original dataset stays untouched)
    dataset_copy = dataset.sort_values(by=["y"]).reset_index(drop=True)

    # Then to assign the sorted samples to k consecutive, (nearly) equally sized clusters.
    # The integer division guarantees exactly k clusters with integer ids, whereas a
    # rounded chunk size could yield more than k clusters (e.g. 10 samples and k = 4).
    dataset_copy["cluster"] = [(i * k) // n_samples for i in range(n_samples)]

    # Return the dataset
    return dataset_copy


# Build the initial partitioning with k = 2 and display it
partitioned_dataset = partition_dataset(dataset, 2)
partitioned_dataset

In [None]:
# Plot the initial partitioning; the hue encodes the cluster each sample was assigned to
sns.scatterplot(
    x=partitioned_dataset["x"],
    y=partitioned_dataset["y"],
    hue=partitioned_dataset["cluster"],
)

##### 2. Step: Compute the centroids

In [None]:
def compute_centroids(partitioned_dataset, k):
    """
    Determine the centroid (mean x, mean y) of every partition 0 .. k-1.

    Returns a DataFrame with k rows and the columns "x" and "y"; row i holds the
    centroid of all samples whose "cluster" value equals i. A partition without
    any samples yields NaN for both coordinates.
    """
    centroid_rows = []

    for cluster_id in range(k):
        # Select the samples belonging to this partition once
        members = partitioned_dataset[partitioned_dataset["cluster"] == cluster_id]

        # The centroid is the per-coordinate mean of the members
        centroid_rows.append([members["x"].mean(), members["y"].mean()])

    # Row position equals the cluster id
    return pd.DataFrame(centroid_rows, columns=["x", "y"])


# Compute the centroids of the two initial partitions and display them
centroids = compute_centroids(partitioned_dataset, 2)
centroids

##### 3. Step: Assign each sample to the cluster of the nearest centroid

In [None]:
def euclidean_distance(a, b):
    """
    Return the euclidean distance between the two points a and b.

    a and b must support elementwise subtraction, abs(), ** and .sum()
    (e.g. two pandas Series holding the coordinates of one point each).
    """
    # Coordinate-wise difference between the two points
    difference = a - b

    # Sum of the squared coordinate differences
    squared_sum = (abs(difference) ** 2).sum()

    # The distance is the square root of that sum
    return squared_sum ** 0.5


# Try the distance function on two example points
a = pd.Series([1, 9])
b = pd.Series([9, 5])
euclidean_distance(a, b)

In [None]:
def reassign(partitioned_dataset, centroids, k):
    """
    (Re)assigns each sample to the cluster of its nearest centroid.
    Also returns a boolean that signals whether there was at least one sample reassigned.

    Parameters:
        partitioned_dataset: DataFrame with the columns "x", "y" and "cluster";
            it is not modified.
        centroids: DataFrame with the columns "x" and "y" whose row label i holds
            the centroid of cluster i.
        k: Number of clusters; the centroids 0 .. k-1 are considered.

    Returns:
        A tuple (reassign_indicator, dataset_copy), where dataset_copy is a copy
        of partitioned_dataset with the updated "cluster" column.

    A sample only moves if another centroid is strictly closer than the centroid
    of its current cluster, so ties keep the current assignment.
    """
    # Indicator to show whether there was at least one tuple reassigned
    reassign_indicator = False

    # Copy the original partitioned_dataset
    dataset_copy = partitioned_dataset.copy()

    # Rows are addressed by position (iloc), so the function also works if the
    # index of the dataset is not the default 0 .. n-1
    coordinates = dataset_copy[["x", "y"]]
    cluster_column = dataset_copy.columns.get_loc("cluster")

    # Check for each sample whether it has to be reassigned
    for position in range(dataset_copy.shape[0]):
        # Get the coordinates of the sample for easier access
        sample = coordinates.iloc[position]

        # Distance to the centroid of the cluster the sample currently belongs to
        current_cluster = dataset_copy.iloc[position, cluster_column]
        current_distance = euclidean_distance(sample, centroids.loc[current_cluster])

        # Iterate through the centroids and check whether the distance is lower than the current distance
        # NOTE: We do not skip the current centroid, as this would complicate the code and isn't a big performance problem
        for j in range(k):
            # Compute the distance
            distance = euclidean_distance(sample, centroids.loc[j])

            # If the distance is lower than the current_distance we have to reassign
            if distance < current_distance:
                # Set the cluster and remember the new best distance
                dataset_copy.iloc[position, cluster_column] = j
                current_distance = distance

                # Set the reassign_indicator
                reassign_indicator = True

    return reassign_indicator, dataset_copy


# Run a single reassignment step on the initial partitioning and show the result
reassign_indicator, reassigned_dataset = reassign(partitioned_dataset, centroids, 2)

print("Was there at least one sample reassigned? - " + str(reassign_indicator))
reassigned_dataset

##### 4. Step: Merge the functions into a wrapper function

In [None]:
def k_means(dataset, k):
    """
    Cluster the dataset into k clusters with the k-means procedure.

    Combines the previous steps: an initial partitioning, followed by repeatedly
    computing the centroids and reassigning the samples until a reassignment
    step leaves every sample in its cluster. Returns the clustered dataset.
    """
    # Start from the initial partitioning
    clustered = partition_dataset(dataset, k)

    # Centroid computation and reassignment are always executed at least once
    while True:
        # Compute the centroids of the current clusters
        centroids = compute_centroids(clustered, k)

        # Move each sample to the cluster of its nearest centroid
        changed, clustered = reassign(clustered, centroids, k)

        # No sample changed its cluster: the clustering is final
        if not changed:
            return clustered


# Cluster our dataset into k = 2 clusters and display the result
clustered_dataset = k_means(dataset, 2)
clustered_dataset

In [None]:
# Plot the final clustering; the hue encodes the cluster of each sample
sns.scatterplot(
    x=clustered_dataset["x"], y=clustered_dataset["y"], hue=clustered_dataset["cluster"]
)

#### Solution found on the Internet (copied for comparison only; it will not be part of the final exercise)

Source: https://towardsdatascience.com/create-your-own-k-means-clustering-algorithm-in-python-d7d4c9077670

In [None]:
import seaborn as sns
import numpy as np
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Generate 100000 samples around 5 blob centers (fixed random_state for reproducibility)
centers = 5
X_train, true_labels = make_blobs(n_samples=100000, centers=centers, random_state=2306)
# Rescale the features with scikit-learn's StandardScaler
X_train = StandardScaler().fit_transform(X_train)
# Plot the samples (first feature against second feature), coloured by their true blob label
sns.scatterplot(
    x=[X[0] for X in X_train],
    y=[X[1] for X in X_train],
    hue=true_labels,
    palette="deep",
    legend=None,
)
plt.xlabel("x")
plt.ylabel("y")
plt.show()

In [None]:
# Display the labels that make_blobs returned alongside the samples
true_labels

In [None]:
def euclidean(point, data):
    """
    Compute the euclidean distance from one point to every row of data.

    point has dimensions (m,), data has dimensions (n, m); the result has
    size (n,) and holds one distance per row of data.
    """
    # Squared coordinate differences, broadcast over the rows of data
    squared_differences = (point - data) ** 2

    # Sum over the coordinates of each row, then take the square root
    row_sums = np.sum(squared_differences, axis=1)
    return np.sqrt(row_sums)

In [None]:
class KMeans:
    """
    Simple k-means implementation operating on array-like training data.

    After fit() the attribute `centroids` holds a list with one centroid per
    cluster. Distances are computed with the module-level function euclidean().
    """

    def __init__(self, n_clusters=8, max_iter=300):
        # Number of clusters (and therefore centroids) to fit
        self.n_clusters = n_clusters
        # Upper bound on the number of fitting iterations
        self.max_iter = max_iter

    def fit(self, X_train):
        """
        Fit the centroids to X_train.

        Starts from randomly drawn centroids and alternates between assigning
        every sample to its nearest centroid and recomputing the centroids,
        until the centroids no longer change or max_iter iterations were done.
        """
        # Randomly select centroid start points, uniformly distributed across the domain of the dataset
        min_, max_ = np.min(X_train, axis=0), np.max(X_train, axis=0)
        self.centroids = [np.random.uniform(min_, max_) for _ in range(self.n_clusters)]
        # Iterate, adjusting centroids until converged or until passed max_iter
        iteration = 0
        prev_centroids = None
        while (
            np.not_equal(self.centroids, prev_centroids).any()
            and iteration < self.max_iter
        ):
            # Sort each datapoint, assigning to nearest centroid
            sorted_points = [[] for _ in range(self.n_clusters)]
            for x in X_train:
                dists = euclidean(x, self.centroids)
                centroid_idx = np.argmin(dists)
                sorted_points[centroid_idx].append(x)
            # Push current centroids to previous, reassign centroids as mean of the points belonging to them
            prev_centroids = self.centroids
            self.centroids = [np.mean(cluster, axis=0) for cluster in sorted_points]
            for i, centroid in enumerate(self.centroids):
                if np.isnan(
                    centroid
                ).any():  # Catch any np.nans, resulting from a centroid having no points
                    self.centroids[i] = prev_centroids[i]
            iteration += 1

    def evaluate(self, X):
        """
        Assign every sample in X to its nearest fitted centroid.

        Returns a tuple (centroids, centroid_idxs): two lists with one entry
        per sample, holding the nearest centroid and its index in
        self.centroids. Both lists are empty if X is empty.
        """
        centroids = []
        centroid_idxs = []
        for x in X:
            dists = euclidean(x, self.centroids)
            centroid_idx = np.argmin(dists)
            centroids.append(self.centroids[centroid_idx])
            centroid_idxs.append(centroid_idx)
        # Return the index of every sample, not just the index of the last one
        return centroids, centroid_idxs

In [None]:
# Fit one centroid per generated blob center
kmeans = KMeans(n_clusters=centers)
kmeans.fit(X_train)
# View results
# `classification` is the second value returned by evaluate(); it is used as the marker style below
class_centers, classification = kmeans.evaluate(X_train)
sns.scatterplot(
    x=[X[0] for X in X_train],
    y=[X[1] for X in X_train],
    hue=true_labels,
    style=classification,
    palette="deep",
    legend=None,
)
# Mark the fitted centroids with "+" symbols on top of the samples
plt.plot(
    [x for x, _ in kmeans.centroids],
    [y for _, y in kmeans.centroids],
    "+",
    markersize=10,
)
plt.show()

## Part Two: Clustering in the AdventureWorks Database

<div class="alert alert-block alert-warning">

TODO

</div>