# Clustering of RedPy

This notebook clusters RedPy feature data to look for patterns.

In [None]:
import os
import random

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn import preprocessing
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.preprocessing import StandardScaler


### Data
Read TSFEL features for Mt Hood

In [None]:
# Load the precomputed TSFEL feature table for Mt Hood.
mt_hood = pd.read_csv('../data/Hood_tsfel_features.csv')

In [None]:
# Work on an independent, NaN-free copy so the table in `mt_hood` is not modified here.
df = mt_hood.dropna().copy()
df.head()

In [None]:
# Remove the 'template' column before the features are analysed and clustered.
df = df.drop(columns=['template'])


In [None]:
# True if any missing value is left anywhere in the feature table.
df.isna().any().any()

In [None]:
# Pairwise feature correlations, displayed as a table shaded with the 'coolwarm' colormap.
corr_matrix = df.corr()
corr_matrix.style.background_gradient(cmap='coolwarm')

Are some of the features log-normal? Log-normal features are poorly scaled for a Euclidean distance, because a few very large values dominate the distance.

We will select the features of high skewness.

In [None]:

# Sample skewness of every feature (one value per column).
skewness = df.apply(pd.Series.skew)

# Features with skewness above 1.0 (strongly right-skewed) are the ones that
# will be log-transformed.
log_normal_features = skewness.index[skewness > 1.0].tolist()


In [None]:
# Number of strongly skewed features, followed by the total number of features.
print(len(log_normal_features))
print(df.shape[1])

Most features are strongly right-skewed (approximately log-normal), so we log-transform all of the skewed features.


In [None]:
df_log = df.copy()

# Log-transform only the strongly right-skewed features. Zeros become -inf and
# negative values become NaN under the log; those columns are removed below, so
# the corresponding NumPy warnings are expected and silenced here.
with np.errstate(divide='ignore', invalid='ignore'):
    df_log[log_normal_features] = np.log(df_log[log_normal_features])

# Drop every feature (column) that contains a NaN or an infinite value.
# The original cell also called `df_log.replace(0, np.nan)` without keeping the
# result, so zeros were never removed; that no-op is dropped and the set of
# retained columns is unchanged.
df_log = df_log.replace([np.inf, -np.inf], np.nan).dropna(axis=1)

# Sanity check: should display False.
df_log.isna().sum().any()

In [None]:
# Summary statistics of the log-transformed feature table.
df_log.describe()

Here we notice that the features span extremely different ranges of values. Plain k-means will struggle on these data because the Euclidean distance is dominated by the features with the largest values.

Let's try some standard scaling first

In [None]:
# Standardize every log-transformed feature with StandardScaler.
scaler = StandardScaler()
df_log_scaled = pd.DataFrame(
    scaler.fit_transform(df_log),
    columns=df_log.columns,
    # Keep the row labels of df_log so the scaled table stays aligned with
    # `df` / `mt_hood` (rows may have been removed by dropna).
    index=df_log.index,
)
X_scaled = df_log_scaled.to_numpy()

## 3. K-means

In [None]:
# K-means with 4 clusters on the untransformed features, scored with the silhouette coefficient.
ncluster = 4
X = df.to_numpy()
kmeans_model = KMeans(n_clusters=ncluster, random_state=1)
labels = kmeans_model.fit_predict(X)
sc = silhouette_score(X, labels, metric='euclidean')
print(f"Silhouette score for {ncluster} clusters: {sc:.3f}")

Example on log-transformed data with silhouette score

In [None]:
# Same experiment as above, now on the log-transformed features.
ncluster = 4
X = df_log.to_numpy()
kmeans_model = KMeans(n_clusters=ncluster, random_state=1)
labels = kmeans_model.fit_predict(X)
sc = silhouette_score(X, labels, metric='euclidean')
print(f"Silhouette score for {ncluster} clusters: {sc:.3f}")

In [None]:
import matplotlib.cm as cm

ncluster = 4

# Cluster `X` (as left by the previous cell) with a fixed seed for reproducibility.
clusterer = KMeans(n_clusters=ncluster, random_state=10)
cluster_labels = clusterer.fit_predict(X)

# Mean silhouette over all samples, then the per-sample values used for the plot.
silhouette_avg = silhouette_score(X, cluster_labels)
print(f"For n_clusters = {ncluster}, the average silhouette_score is : {silhouette_avg:.3f}")
sample_silhouette_values = silhouette_samples(X, cluster_labels)

fig, ax1 = plt.subplots(1, 1)
fig.set_size_inches(7, 7)
ax1.set_xlim([-0.1, 1])
# Leave 10 blank rows between (and around) the per-cluster silhouette bands.
ax1.set_ylim([0, len(X) + (ncluster + 1) * 10])

y_lower = 10
for i in range(ncluster):
    # Sorted silhouette values of the samples assigned to cluster i.
    ith_cluster_silhouette_values = np.sort(sample_silhouette_values[cluster_labels == i])
    size_cluster_i = len(ith_cluster_silhouette_values)
    y_upper = y_lower + size_cluster_i

    color = cm.nipy_spectral(float(i) / ncluster)
    ax1.fill_betweenx(
        np.arange(y_lower, y_upper),
        0,
        ith_cluster_silhouette_values,
        facecolor=color,
        edgecolor=color,
        alpha=0.7,
    )

    # Cluster number, written at mid-height of its band.
    ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

    # Start of the next band, after a 10-row gap.
    y_lower = y_upper + 10

ax1.set_title("The silhouette plot for the various clusters.")
ax1.set_xlabel("The silhouette coefficient values")
ax1.set_ylabel("Cluster label")

# Dashed red line: average silhouette score over all samples.
ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
ax1.set_yticks([])  # the y axis only stacks the bands; ticks carry no meaning
plt.suptitle(
    "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
    % ncluster,
    fontsize=14,
    fontweight="bold",
)



In [None]:
# Store the K-means labels computed in the silhouette cell above.
# (`clusterID` is only defined further down in the notebook, so using it here
# raised a NameError on a fresh kernel.)
# The labels are aligned on the row index of the clustered table, because rows
# removed by dropna have no label; those rows are left as <NA>.
mt_hood['clusterID'] = pd.Series(cluster_labels, index=df_log.index, dtype='Int64')
mt_hood.to_csv('../data/Hood_tsfel_features_clustered_kmeans.csv')

### 3. Choice of number of clusters: The Elbow Method


Compute the inertia $E$ (the within-cluster sum of squared distances) and the average silhouette score for different numbers of clusters $k$.

In [None]:
# Elbow and silhouette curves for the log-transformed, standardized features.
X = df_log_scaled.to_numpy()
k_values = range(1, 11)

inertia = []
silhouette_avg = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X)
    cluster_labels = kmeans.labels_
    inertia.append(kmeans.inertia_)
    if k < 2:
        # The silhouette score needs at least two clusters.
        continue
    silhouette_avg.append(silhouette_score(X, cluster_labels))
    print(f"For n_clusters = {k}, the average silhouette_score is : {silhouette_avg[-1]:.3f}")


fig, ax = plt.subplots(1, 2, figsize=(12, 4))
ax[0].plot(k_values, inertia, marker='o')
ax[0].set_title('Elbow Curve for KMeans - log-scaled data')
ax[0].set_xlabel('Number of Clusters (k)')
ax[0].set_ylabel('Inertia')
ax[1].plot(k_values[1:], silhouette_avg, marker='o')
ax[1].set_title('Silhouette Curve for KMeans - log-scaled data')
ax[1].set_xlabel('Number of Clusters (k)')
ax[1].set_ylabel('Silhouette Score')
plt.show()

In [None]:
# Elbow and silhouette curves for the log-transformed (unscaled) features.
X = df_log.to_numpy()
k_values = range(1, 11)

inertia = []
silhouette_avg = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X)
    cluster_labels = kmeans.labels_
    inertia.append(kmeans.inertia_)
    if k < 2:
        # The silhouette score needs at least two clusters.
        continue
    silhouette_avg.append(silhouette_score(X, cluster_labels))
    print(f"For n_clusters = {k}, the average silhouette_score is : {silhouette_avg[-1]:.3f}")


fig, ax = plt.subplots(1, 2, figsize=(12, 4))
ax[0].plot(k_values, inertia, marker='o')
ax[0].set_title('Elbow Curve for KMeans - log data')
ax[0].set_xlabel('Number of Clusters (k)')
ax[0].set_ylabel('Inertia')
ax[1].plot(k_values[1:], silhouette_avg, marker='o')
ax[1].set_title('Silhouette Curve for KMeans - log data')
ax[1].set_xlabel('Number of Clusters (k)')
ax[1].set_ylabel('Silhouette Score')
plt.show()

In [None]:
# Elbow and silhouette curves for the untransformed features.
X = df.to_numpy()
k_values = range(1, 11)

inertia = []
silhouette_avg = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X)
    cluster_labels = kmeans.labels_
    inertia.append(kmeans.inertia_)
    if k < 2:
        # The silhouette score needs at least two clusters.
        continue
    silhouette_avg.append(silhouette_score(X, cluster_labels))
    print(f"For n_clusters = {k}, the average silhouette_score is : {silhouette_avg[-1]:.3f}")


fig, ax = plt.subplots(1, 2, figsize=(12, 4))
ax[0].plot(k_values, inertia, marker='o')
ax[0].set_title('Elbow Curve for KMeans - raw data')
ax[0].set_xlabel('Number of Clusters (k)')
ax[0].set_ylabel('Inertia')
ax[1].plot(k_values[1:], silhouette_avg, marker='o')
ax[1].set_title('Silhouette Curve for KMeans - raw data')
ax[1].set_xlabel('Number of Clusters (k)')
ax[1].set_ylabel('Silhouette Score')
plt.show()

## 4. Hierarchical Clustering

In K-means, we use the Euclidean distance and prescribe the number of clusters K.

In hierarchical clustering, we can choose different distance metrics, visualize the data structure, and then decide on the number of clusters. There are two approaches to building the hierarchy of clusters:

* **Agglomerative**: each point starts in its own cluster. Clusters are then merged in pairs, which creates a hierarchy of clusters.
* **Divisive**: initially, all data are in a single cluster. The data are then recursively split into smaller and smaller clusters.


There are several types of *linkages*. sklearn has detailed [documentation](https://scikit-learn.org/stable/modules/clustering.html#hierarchical-clustering), mostly for agglomerative clustering. The different linkage methods are:

* **Ward** minimizes the sum of squared differences within all clusters. It is a variance-minimizing approach and in this sense is similar to the k-means objective function but tackled with an agglomerative hierarchical approach.
* **Maximum** or complete linkage minimizes the maximum distance between observations of pairs of clusters.
* **Average** linkage minimizes the average of the distances between all observations of pairs of clusters.
* **Single** linkage minimizes the distance between the closest observations of pairs of clusters.

We first import relevant packages



In [None]:

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
from scipy.cluster import hierarchy  #
from scipy.spatial.distance import pdist

# Larger font and a 12 x 12 default figure size for the figures that follow.
rcParams.update({'font.size': 18, 'figure.figsize': [12, 12]})

First we explore the dendrograms.

In [None]:
## Dendrograms on raw data
X = df.to_numpy()
# Condensed pairwise Euclidean distances, then Ward linkage on them.
Y = pdist(X, metric='euclidean')
Z = hierarchy.linkage(Y, method='ward')
# Colour the branches below 85 % of the largest merge distance.
thresh = 0.85 * Z[:, 2].max()

plt.figure()
# NOTE(review): per the SciPy docs `p` only takes effect together with
# `truncate_mode`; confirm whether a truncated dendrogram was intended.
dn = hierarchy.dendrogram(Z, p=100, color_threshold=thresh)
plt.title('Dendrogram with Ward linkage')
plt.xlabel('Data Sample Index')
plt.ylabel('Distance')
plt.show()

In [None]:
## Dendrograms on log data
X = df_log.to_numpy()
# Condensed pairwise Euclidean distances, then average linkage on them.
Y = pdist(X, metric='euclidean')
Z = hierarchy.linkage(Y, method='average')
# Colour the branches below 85 % of the largest merge distance.
thresh = 0.85 * Z[:, 2].max()

plt.figure()
# NOTE(review): per the SciPy docs `p` only takes effect together with
# `truncate_mode`; confirm whether a truncated dendrogram was intended.
dn = hierarchy.dendrogram(Z, p=100, color_threshold=thresh)
plt.title('Dendrogram with average linkage')
plt.xlabel('Data Sample Index')
plt.ylabel('Distance')
plt.show()

## Agglomerative clustering with Ward linkage

Now let's cluster:

* Ward linkage on raw features
* Ward linkage on log-transformed features
* Ward linkage on log-transformed and scaled features


In [None]:
X = df.to_numpy()
# Precompute the Ward linkage matrix to pick a reasonable distance threshold:
# here 40 % of the largest merge distance.
Z = hierarchy.linkage(X, method='ward')
thresh = 0.4 * np.max(Z[:, 2])
# With n_clusters=None the number of clusters follows from the distance threshold.
model = AgglomerativeClustering(distance_threshold=thresh, linkage="ward", n_clusters=None)
# Fit the model and assign a cluster to every sample.
clusterID = model.fit_predict(X)
ncluster = len(np.unique(clusterID))
silhouette_avg = silhouette_score(X, clusterID)
print(f"For n_clusters = {ncluster}, the average silhouette_score is : {silhouette_avg:.3f}")

# Align the labels on the row index of the clustered table: `df` may contain
# fewer rows than `mt_hood` (dropna), and assigning the bare array would then
# fail with a length mismatch. Rows that were not clustered are left as <NA>.
mt_hood['clusterID'] = pd.Series(clusterID, index=df.index, dtype='Int64')
mt_hood.to_csv('../data/Hood_tsfel_features_clustered_agg.csv')

* Ward on log data

In [None]:
X = df_log.to_numpy()
# Precompute the Ward linkage matrix to pick a reasonable distance threshold:
# here 40 % of the largest merge distance.
Z = hierarchy.linkage(X, method='ward')
thresh = 0.4 * np.max(Z[:, 2])
# With n_clusters=None the number of clusters follows from the distance threshold.
model = AgglomerativeClustering(distance_threshold=thresh, linkage="ward", n_clusters=None)
# Fit the model and assign a cluster to every sample.
clusterID = model.fit_predict(X)
ncluster = len(np.unique(clusterID))
silhouette_avg = silhouette_score(X, clusterID)
print(f"For n_clusters = {ncluster}, the average silhouette_score is : {silhouette_avg:.3f}")

# Align the labels on the row index of the clustered table: `df_log` may contain
# fewer rows than `mt_hood` (dropna), and assigning the bare array would then
# fail with a length mismatch. Rows that were not clustered are left as <NA>.
mt_hood['clusterID'] = pd.Series(clusterID, index=df_log.index, dtype='Int64')
mt_hood.to_csv('../data/Hood_tsfel_features_clustered_agg_log.csv')

* Ward on log-transformed and scaled data

In [None]:
X = df_log_scaled.to_numpy()
# Precompute the Ward linkage matrix to pick a reasonable distance threshold:
# here 40 % of the largest merge distance.
Z = hierarchy.linkage(X, method='ward')
thresh = 0.4 * np.max(Z[:, 2])
# With n_clusters=None the number of clusters follows from the distance threshold.
model = AgglomerativeClustering(distance_threshold=thresh, linkage="ward", n_clusters=None)
# Fit the model and assign a cluster to every sample.
clusterID = model.fit_predict(X)
ncluster = len(np.unique(clusterID))
silhouette_avg = silhouette_score(X, clusterID)
print(f"For n_clusters = {ncluster}, the average silhouette_score is : {silhouette_avg:.3f}")

# The scaled table has the same rows, in the same order, as `df_log`. Align the
# labels on that row index: it may contain fewer rows than `mt_hood` (dropna),
# and assigning the bare array would then fail with a length mismatch.
# Rows that were not clustered are left as <NA>.
mt_hood['clusterID'] = pd.Series(clusterID, index=df_log.index, dtype='Int64')
mt_hood.to_csv('../data/Hood_tsfel_features_clustered_agg_log_scaled.csv')

## Try PCA+normalization before clustering

What happens if we apply PCA + normalization before the clustering?

In [None]:
# Project the raw features onto their first 5 principal components.
# NOTE(review): PCA is applied to the unscaled features, so components are
# driven by the largest-variance features; confirm this is intended.
X = df.to_numpy()
# random_state pins the result when the 'auto' solver picks the randomized SVD,
# so that Restart & Run All reproduces the same projection.
pca = PCA(n_components=5, random_state=42)
X_pca = pca.fit_transform(X)

In [None]:
# Standardize each principal-component score column.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_pca)

In [None]:
# Precompute the Ward linkage matrix on the standardized PCA scores to pick a
# distance threshold: here 30 % of the largest merge distance.
Z = hierarchy.linkage(X_scaled, method='ward')
thresh = 0.3 * np.max(Z[:, 2])

plt.figure()
# NOTE(review): per the SciPy docs `p` only takes effect together with
# `truncate_mode`; confirm whether a truncated dendrogram was intended.
dn = hierarchy.dendrogram(Z, p=100, color_threshold=thresh)
plt.xlabel('Data Sample Index')
plt.ylabel('Distance')
# The linkage above is Ward; the title previously said "average linkage".
plt.title('Dendrogram with Ward linkage')
plt.show()

# With n_clusters=None the number of clusters follows from the distance threshold.
model = AgglomerativeClustering(distance_threshold=thresh, linkage="ward", n_clusters=None)
# Fit the model and assign a cluster to every sample.
clusterID = model.fit_predict(X_scaled)
# Histogram of the assigned cluster labels.
plt.hist(clusterID);


In [None]:
# Silhouette plot for the agglomerative (Ward) clusters found on the
# standardized PCA scores.
ncluster = len(np.unique(clusterID))
fig, ax1 = plt.subplots(1, 1)
fig.set_size_inches(18, 7)
ax1.set_xlim([-0.1, 1])
# Size the y axis from the array that was actually clustered, leaving 10 blank
# rows between (and around) the per-cluster silhouette bands.
ax1.set_ylim([0, len(X_scaled) + (ncluster + 1) * 10])

silhouette_avg = silhouette_score(X_scaled, clusterID)
print(
    "For n_clusters =",
    ncluster,
    "The average silhouette_score is :",
    silhouette_avg,
)

# Silhouette value of every individual sample.
sample_silhouette_values = silhouette_samples(X_scaled, clusterID)

y_lower = 10
for i in range(ncluster):
    # Sorted silhouette values of the samples assigned to cluster i.
    ith_cluster_silhouette_values = np.sort(sample_silhouette_values[clusterID == i])
    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i

    color = cm.nipy_spectral(float(i) / ncluster)
    ax1.fill_betweenx(
        np.arange(y_lower, y_upper),
        0,
        ith_cluster_silhouette_values,
        facecolor=color,
        edgecolor=color,
        alpha=0.7,
    )

    # Cluster number, written at mid-height of its band.
    ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

    # Start of the next band, after a 10-row gap.
    y_lower = y_upper + 10

ax1.set_title("The silhouette plot for the various clusters.")
ax1.set_xlabel("The silhouette coefficient values")
ax1.set_ylabel("Cluster label")

# Dashed red line: average silhouette score over all samples.
ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

ax1.set_yticks([])  # the y axis only stacks the bands; ticks carry no meaning
ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

# The labels come from AgglomerativeClustering, not KMeans; the title
# previously said "KMeans clustering on sample data".
plt.suptitle(
    "Silhouette analysis for agglomerative (Ward) clustering on PCA-scaled data with n_clusters = %d"
    % ncluster,
    fontsize=14,
    fontweight="bold",
)

