In [9]:
import os
import urllib.request
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [10]:
# Remote archive of the UCI HAR dataset, the folder it is expected to unpack to,
# and the local filename used for the downloaded archive.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip'
dataset_path = 'UCI HAR Dataset'
zip_path = 'UCI_HAR_Dataset.zip'

# Fetch and unpack only when the extracted folder is missing, so re-running
# this cell does not hit the network again.
# (os, urllib.request and zipfile come from the notebook's import cell.)
if not os.path.exists(dataset_path):
    urllib.request.urlretrieve(url, zip_path)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall('.')


In [11]:
# Load the train and test feature matrices (whitespace-separated, no header row).
# sep=r'\s+' is the supported replacement for the deprecated delim_whitespace=True.
train_data = pd.read_csv(f'{dataset_path}/train/X_train.txt', sep=r'\s+', header=None)
test_data = pd.read_csv(f'{dataset_path}/test/X_test.txt', sep=r'\s+', header=None)

# Stack both splits into one frame. ignore_index=True gives every row a unique
# 0..n-1 label instead of repeating each split's own row labels.
data = pd.concat([train_data, test_data], ignore_index=True)

# Normalize the features
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

In [12]:
# Search space: every linkage criterion to try, and candidate cluster counts 2..29
linkage_values = ['ward', 'complete', 'average', 'single']
n_clusters_values = np.arange(2, 30)  # Example range, adjust as needed

# Running record of the best configuration seen so far.
# A score of -1 acts as the "nothing found yet" sentinel.
best_score = -1
best_n_clusters = best_linkage = best_labels = None

# Fit one agglomerative clustering configuration and score it
def apply_hierarchical(n_clusters, linkage, data):
    """Cluster `data` agglomeratively and score the resulting labelling.

    Parameters
    ----------
    n_clusters : int
        Number of clusters requested from AgglomerativeClustering.
    linkage : str
        Linkage criterion passed to AgglomerativeClustering.
    data : array-like
        Samples to cluster; also used to compute the silhouette score.

    Returns
    -------
    unique_clusters : int
        Number of distinct labels actually produced.
    score : float
        Silhouette score, or -1 when it is undefined for the labelling.
    clusters : array-like
        Cluster label for each sample, as returned by fit_predict.
    """
    hierarchical = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage)
    clusters = hierarchical.fit_predict(data)
    unique_clusters = len(set(clusters))
    # silhouette_score requires 2 <= n_labels <= n_samples - 1 and raises
    # ValueError otherwise, so both bounds are checked before calling it.
    if 1 < unique_clusters < len(clusters):
        score = silhouette_score(data, clusters)
    else:
        score = -1
    return unique_clusters, score, clusters

# Exhaustive sweep over the grid, remembering the highest-scoring configuration
for n_clusters in n_clusters_values:
    for linkage in linkage_values:
        # `clusters` is the count of distinct labels, `labels` the per-sample assignment
        clusters, score, labels = apply_hierarchical(n_clusters, linkage, data_scaled)
        # Skip single-cluster results and anything that does not beat the current best
        if not (clusters > 1 and score > best_score):
            continue
        best_n_clusters, best_linkage = n_clusters, linkage
        best_score, best_labels = score, labels

# Fail loudly if the sweep never selected a configuration; otherwise the code
# below would silently attach a column of None to the dataframe.
if best_labels is None:
    raise RuntimeError("Grid search found no clustering with a valid silhouette score")

# Display the best results
print("Best parameters found:")
print(f"n_clusters: {best_n_clusters}, linkage: {best_linkage}")
print(f"Silhouette Score: {best_score}")

# Display the original output of hierarchical clustering with the best parameters
print("Cluster labels for each point in the dataset:")
print(best_labels)

# Count members per cluster. Cast to built-in ints so the dict prints as
# {0: 123, ...} rather than with NumPy scalar wrappers under NumPy >= 2.
unique, counts = np.unique(best_labels, return_counts=True)
cluster_counts = {int(label): int(count) for label, count in zip(unique, counts)}
print("\nNumber of points in each cluster:")
print(cluster_counts)

# Adding the cluster labels to the dataframe
data['cluster'] = best_labels