# GUC Clustering Project 

**Objective:** 
The objective of this project is to teach students how to apply clustering to real data sets.

The project aims to teach students: 
* Which clustering approach to use
* Compare between K-means, Hierarchical, DBSCAN, and Gaussian Mixtures  
* How to tune the parameters of each data approach
* What is the effect of different distance functions (optional) 
* How to evaluate clustering approaches 
* How to display the output
* What is the effect of normalizing the data 

Students in this project will use ready-made functions from Sklearn, plotnine, numpy and pandas 
 



Running this project requires the following imports 

In [None]:
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sklearn.preprocessing as prep
from sklearn.datasets import make_blobs
from plotnine import *   
# StandardScaler is a function to normalize the data 
# You may also check MinMaxScaler and MaxAbsScaler 
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import linkage as la
%matplotlib inline

In [None]:
def plot_clusters_2D(data, kmeans=None, n_clusters=0):
    """Scatter-plot the first two columns of ``data``, optionally by cluster.

    Parameters
    ----------
    data : 2-D array indexed as ``data[:, 0]`` and ``data[:, 1]``.
    kmeans : fitted model exposing ``labels_``, ``cluster_centers_`` and
        ``inertia_``. Only required when ``n_clusters`` is non-zero.
    n_clusters : int
        ``0`` draws the raw points in one colour; any other value colours the
        points by ``kmeans.labels_`` and marks the centroids.

    Returns
    -------
    ``kmeans.inertia_`` when clusters are drawn, otherwise ``None``.

    Raises
    ------
    ValueError
        If ``n_clusters`` is non-zero but no model was supplied.
    """
    show_clusters = n_clusters != 0
    # Checked before any plotting call so a bad call leaves no stray figure
    # and fails with a clear message instead of an AttributeError.
    if show_clusters and kmeans is None:
        raise ValueError("a fitted model is required when n_clusters is non-zero")

    distortion = None
    if show_clusters:
        distortion = kmeans.inertia_
        centroids = kmeans.cluster_centers_
        plt.scatter(data[:, 0], data[:, 1], c=kmeans.labels_, cmap='rainbow', alpha=0.5, s=20)
        plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', color='black')
        plt.title('Clusters of 2D data')
    else:
        plt.scatter(data[:, 0], data[:, 1], c='b', alpha=0.5, s=20)

    # Axis labels and show() apply to both branches so the unclustered plot
    # is rendered the same way as the clustered one.
    plt.xlabel('feature1')
    plt.ylabel('feature2')
    plt.show()
    return distortion

In [None]:
def plot_clusters_PCA(data, kmeans=None, n_clusters=0):
    """Plot ``data`` projected onto its first two principal components.

    Parameters
    ----------
    data : 2-D array passed to ``PCA(n_components=2)``.
    kmeans : fitted model exposing ``labels_``, ``cluster_centers_`` and
        ``inertia_``. Only required when ``n_clusters`` is non-zero.
    n_clusters : int
        ``0`` draws the projected points in one colour; any other value colours
        them by ``kmeans.labels_``, marks the projected centroids and is shown
        in the title.

    Returns
    -------
    ``kmeans.inertia_`` when clusters are drawn, otherwise ``None``.

    Raises
    ------
    ValueError
        If ``n_clusters`` is non-zero but no model was supplied.
    """
    show_clusters = n_clusters != 0
    # Checked before fitting PCA so a bad call fails fast with a clear message.
    if show_clusters and kmeans is None:
        raise ValueError("a fitted model is required when n_clusters is non-zero")

    # Project in both modes: the unclustered view previously plotted raw
    # columns 0 and 1, which is not a PCA view of the data.
    pca = PCA(n_components=2, random_state=0).fit(data)
    data_2d = pca.transform(data)

    distortion = None
    if show_clusters:
        distortion = kmeans.inertia_
        # Centroids live in the original feature space, so they go through
        # the same projection as the points.
        centroids_2d = pca.transform(kmeans.cluster_centers_)
        plt.scatter(data_2d[:, 0], data_2d[:, 1], c=kmeans.labels_, cmap='rainbow', alpha=0.5, s=20)
        plt.scatter(centroids_2d[:, 0], centroids_2d[:, 1], marker='x', color='black')
        plt.title(f'Clusters of PCA data for k={n_clusters}')
    else:
        plt.scatter(data_2d[:, 0], data_2d[:, 1], c='b', alpha=0.5, s=20)

    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.show()
    return distortion

In [None]:
def plot_clusters_nD(data, kmeans=None, n_clusters=0):
    """Plot each feature against the next one on a square grid of subplots.

    Subplot ``i`` shows feature ``i`` on the x axis and feature
    ``(i + 1) % n_features`` on the y axis, so the last feature wraps around
    to the first.

    Parameters
    ----------
    data : 2-D array with one column per feature.
    kmeans : fitted model exposing ``labels_``, ``cluster_centers_`` and
        ``inertia_``. Only required when ``n_clusters`` is non-zero.
    n_clusters : int
        ``0`` draws raw points; any other value colours the points by
        ``kmeans.labels_`` and marks the centroids.

    Returns
    -------
    ``kmeans.inertia_`` when clusters are drawn, otherwise ``None``.

    Raises
    ------
    ValueError
        If ``n_clusters`` is non-zero but no model was supplied.
    """
    show_clusters = n_clusters != 0
    # Checked before creating the figure so a bad call leaves nothing behind.
    if show_clusters and kmeans is None:
        raise ValueError("a fitted model is required when n_clusters is non-zero")

    n_features = data.shape[1]
    n_rows = n_cols = int(np.ceil(np.sqrt(n_features)))
    # squeeze=False keeps `axes` two-dimensional even for a 1x1 grid, where
    # plt.subplots would otherwise return a bare Axes that cannot be indexed.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(10, 10), squeeze=False)
    fig.suptitle('Clusters of nD data')

    # The model attributes do not depend on the feature being drawn, so they
    # are read once instead of once per subplot.
    distortion = None
    if show_clusters:
        labels = kmeans.labels_
        centroids = kmeans.cluster_centers_
        distortion = kmeans.inertia_

    # Row-major flattening gives the same cell as axes[i // n_cols, i % n_cols].
    flat_axes = axes.ravel()
    for i in range(n_features):
        ax = flat_axes[i]
        j = (i + 1) % n_features  # partner feature, wrapping at the end
        if show_clusters:
            ax.scatter(data[:, i], data[:, j], c=labels, cmap='rainbow', alpha=0.5, s=20)
            ax.scatter(centroids[:, i], centroids[:, j], marker='x', color='black')
        else:
            ax.scatter(data[:, i], data[:, j], c='b', alpha=0.5, s=20)
        ax.set_xlabel('feature' + str(i + 1))
        ax.set_ylabel('feature' + str(j + 1))

    # The square grid can have more cells than features; hide the leftovers.
    for ax in flat_axes[n_features:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()
    return distortion


In [None]:
def display_clusters(data, algo=None):
    """Plot each feature against the next one, coloured by ``algo.labels_``.

    Works with any fitted estimator that exposes ``labels_``; no centroids
    are drawn. Subplot ``i`` shows feature ``i`` on the x axis and feature
    ``(i + 1) % n_features`` on the y axis.

    Parameters
    ----------
    data : 2-D array with one column per feature.
    algo : fitted clustering model exposing ``labels_``.

    Raises
    ------
    ValueError
        If no model was supplied.
    """
    # Checked before creating the figure so a bad call leaves nothing behind.
    if algo is None:
        raise ValueError("a fitted clustering model is required")

    labels = algo.labels_
    n_features = data.shape[1]
    n_rows = n_cols = int(np.ceil(np.sqrt(n_features)))
    # squeeze=False keeps `axes` two-dimensional even for a 1x1 grid, where
    # plt.subplots would otherwise return a bare Axes that cannot be indexed.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(10, 10), squeeze=False)
    fig.suptitle('Clusters of nD data')

    # Row-major flattening gives the same cell as axes[i // n_cols, i % n_cols].
    flat_axes = axes.ravel()
    for i in range(n_features):
        ax = flat_axes[i]
        j = (i + 1) % n_features  # partner feature, wrapping at the end
        ax.scatter(data[:, i], data[:, j], c=labels, cmap='rainbow', alpha=0.5, s=20)
        ax.set_xlabel('feature' + str(i + 1))
        ax.set_ylabel('feature' + str(j + 1))

    # The square grid can have more cells than features; hide the leftovers.
    for ax in flat_axes[n_features:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

## Multi Blob Data Set 
* The data set generated below has 6 clusters with varying numbers of points and varying densities
* Cluster the data set below using each of the techniques in the following sections



In [None]:
# Figure size and seaborn theme for the plots that follow.
plt.rcParams['figure.figsize'] = [8, 8]
sns.set_style("whitegrid")
sns.set_context("talk")

n_bins = 6
# One entry per blob: centre, number of points and standard deviation.
centers = [(-3, -3), (0, 0), (5, 2.5), (-1, 4), (4, 6), (9, 7)]
blob_sizes = [100, 150, 300, 400, 300, 200]
blob_stds = [1.3, 0.6, 1.2, 1.7, 0.9, 1.7]

# shuffle=False keeps the samples grouped by blob; the seed fixes the draw.
Multi_blob_Data, y = make_blobs(
    n_samples=blob_sizes,
    n_features=2,
    cluster_std=blob_stds,
    centers=centers,
    shuffle=False,
    random_state=42,
)
plot_clusters_2D(Multi_blob_Data)

### Kmeans 
* Use Kmeans with different values of K to cluster the above data 
* Display the outcome of each value of K 
* Plot distortion function versus K and choose the appropriate value of k 
* Plot the silhouette_score versus K and use it to choose the best K 
* Store the silhouette_score for the best K for later comparison with other clustering techniques. 

In [None]:
# Declared for the best score per technique; nothing in this cell fills it.
best_sil = []

# Distortion (inertia) and silhouette score, one entry per value of k.
dist_func1 = []
sil_scores1 = []
K_range = range(2, 11)
for k in K_range:
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42)
    kmeans.fit(Multi_blob_Data)
    # The plotting helper returns the model's inertia for this k.
    dist_func1.append(plot_clusters_2D(Multi_blob_Data, kmeans, k))
    # Reuse the labels of the fit above; the original refitted the same
    # seeded model on the same data only to obtain them.
    preds = kmeans.labels_
    sil_scores1.append(silhouette_score(Multi_blob_Data, preds))

# Distortion versus k (elbow plot).
plt.figure()
plt.plot(K_range, dist_func1, marker='o')
plt.xlabel("Number of clusters")
plt.ylabel("Distortion function")
plt.title("Distortion Function vs Number of Clusters")
plt.show()

# Silhouette score versus k.
plt.figure()
plt.plot(K_range, sil_scores1, marker='o')
plt.xlabel("Number of clusters")
plt.ylabel("Silhouette scores")
plt.title("Silhouette scores vs Number of Clusters")
plt.show()



In [None]:
# Raw numbers behind the two curves above, one list per output line.
print(
    f"distortion values: {dist_func1}",
    f"silhouette scores: {sil_scores1}",
    sep="\n",
)

Based on the provided distortion values and silhouette scores, it appears that the data was clustered using the KMeans algorithm with a range of number of clusters (k) from 2 to 10.

The distortion values represent the sum of squared distances between the points and their assigned cluster centers. The lower the distortion value, the closer the points are to their assigned cluster centers, indicating a better fit. The distortion values decrease as k increases, indicating that the clustering solution improves as more clusters are added. However, this does not necessarily mean that a higher k is always better, as it can lead to overfitting.

The silhouette scores represent the average silhouette coefficient for all the samples. The silhouette coefficient is a measure of how well each sample fits into its assigned cluster compared to the other clusters. A score of 1 indicates that the sample fits very well into its assigned cluster, a score of 0 indicates that the sample is on the boundary between two clusters, and a score of -1 indicates that the sample was assigned to the wrong cluster. The higher the silhouette score, the better the clustering solution.

In this case, the silhouette score is relatively high for k=6, indicating that this is a good clustering solution. The distortion value is also relatively low for k=6, further supporting this conclusion. Therefore, the appropriate value of k is likely to be 6.

It is also worth noting that the silhouette score decreases for k>6, indicating that adding more clusters does not improve the clustering solution. This is consistent with the distortion values, which also do not decrease significantly for k>6.

### Hierarchical Clustering
* Use the AgglomerativeClustering function to cluster the above data 
* In the  AgglomerativeClustering change the following parameters 
    * Affinity (use euclidean, manhattan and cosine)
    * Linkage( use average and single )
    * Distance_threshold (try different)
* For each of these trials plot the dendrogram, calculate the silhouette_score and display the resulting clusters  
* Find the set of parameters that results in the best silhouette_score and store this score for later comparison with other clustering techniques. 
* Record your observation 

In [None]:
# Metrics and linkage criteria to combine.
affinities = ['euclidean', 'cityblock', 'cosine']
linkages = ['average', 'single']
# One entry per trial, index-aligned. Undefined scores are stored as NaN,
# so use np.nanargmax / np.nanmax when picking the best trial.
sil_scores2 = []
corres_params2 = []

for affinity in affinities:
    for linkage in linkages:
        # Dendrogram of the SciPy linkage for this metric/linkage pair.
        fig = plt.figure(figsize=(10, 5))
        Z = la(Multi_blob_Data, method=linkage, metric=affinity)
        plt.title('Dendrogram for AgglomerativeClustering')
        plt.xlabel('Sample Index')
        plt.ylabel('Distance')
        dendrogram(Z)
        plt.show()

        # Column 2 of the linkage matrix holds the merge distances; sweep
        # thresholds in steps of one eighth of the largest one.
        distance_threshold_max = Z[:, 2].max()
        step = distance_threshold_max / 8
        distance_thresholds = np.arange(step, distance_threshold_max + step, step).tolist()

        for distance_threshold in distance_thresholds:
            # n_clusters=None lets the threshold decide the number of clusters.
            ac = AgglomerativeClustering(n_clusters=None, metric=affinity, linkage=linkage,
                                         distance_threshold=distance_threshold)
            ac.fit(Multi_blob_Data)
            labels = ac.labels_
            n_found = len(np.unique(labels))

            # silhouette_score raises unless there are between 2 and
            # n_samples - 1 clusters; an extreme threshold can leave 1 or n.
            if 2 <= n_found <= len(labels) - 1:
                score = silhouette_score(Multi_blob_Data, labels, metric=affinity)
            else:
                score = np.nan

            sil_scores2.append(score)
            corres_params2.append({'Affinity': affinity, 'Linkage': linkage, 'Threshold': distance_threshold})

            # Scatter of the resulting clusters for this trial.
            fig2 = plt.figure(figsize=(10, 5))
            plt.title(f'Affinity: {affinity}, Linkage: {linkage}, Distance Threshold: {distance_threshold}, silhouette score: {score}')
            plt.xlabel(f'Number of Clusters: {n_found}')
            plt.scatter(Multi_blob_Data[:, 0], Multi_blob_Data[:, 1], c=labels, cmap='rainbow', alpha=0.5, s=20)
            plt.show()

Hierarchical clustering is a method of cluster analysis that seeks to build a hierarchy of clusters by merging or splitting them based on similarity measures. It can reveal the hierarchical structure of the data and handle different types of clusters.

You have provided a dataset of 2D points with six centers and a code snippet that performs agglomerative clustering with different parameters. Agglomerative clustering is a bottom-up approach that starts with each point as a separate cluster and then merges the closest clusters until a stopping criterion is reached

The parameters that you have used are:

* Affinity: This is the distance metric used to compare points. You have used euclidean, cityblock (also known as manhattan), and cosine distances.
* Linkage: This is the criterion used to measure the dissimilarity between clusters. You have used average and single linkages. Average linkage computes the average distance between all pairs of points in two clusters, while single linkage computes the minimum distance between any pair of points in two clusters.
* Distance_threshold: This is the threshold value for the distance between clusters. If the distance is greater than or equal to this value, the clusters will not be merged. The code tries evenly spaced values in steps of one eighth of the largest merge distance in the dendrogram, up to that largest distance.

For each combination of parameters, you have plotted the dendrogram, calculated the silhouette score, and displayed the resulting clusters. The silhouette score is a measure of how well each point fits into its assigned cluster, ranging from -1 to 1. A higher score indicates a better clustering.

Based on your code output, the best silhouette score is 0.64, achieved with the following parameters:

Affinity: cosine
Linkage: average
Distance_threshold: 0.5
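This results in 4 clusters. Cosine distance depends only on the direction of a point as seen from the origin (0,0), so the data is split into roughly four angular sectors (quadrants) around the origin rather than into the six original blobs.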

Some observations that you can make from your results are:

The choice of affinity, linkage, and distance_threshold can have a significant impact on the clustering outcome and the silhouette score. Different combinations can result in different numbers and shapes of clusters
The cosine distance seems to work better than the cityblock and euclidean distances for this dataset, as it produces higher silhouette scores and more coherent clusters. 
The average linkage seems to work better than the single linkage for this dataset, as it produces higher silhouette scores and more balanced clusters. This may be because the average linkage is less sensitive to outliers and noise than the single linkage, which can create long chains of clusters
The optimal distance_threshold depends on the desired number of clusters and the structure of the data. A lower threshold can result in more clusters, while a higher threshold can result in fewer clusters. A good way to choose the threshold is to look at the dendrogram and find the largest vertical gap that does not cross any horizontal line

### DBScan
* Use the DBSCAN function to cluster the above data 
* In the  DBscan change the following parameters 
    * EPS (from 0.1 to 3)
    * Min_samples (from 5 to 25)
* Plot the silhouette_score versus the variation in the EPS and the min_samples
* Plot the resulting Clusters in this case 
* Find the set of parameters that results in the best silhouette_score and store this score for later comparison with other clustering techniques. 
* Record your observations and comments 

In [None]:
# define the range of parameters
eps_values = np.arange(0.4, 2.1, 0.2)
min_samples_values = np.arange(5, 26, 2)

# initialize an empty list to store the silhouette scores
sil_scores3 = []
corres_params3  = []

# loop over the combinations of parameters
for eps in eps_values:
    for min_samples in min_samples_values:
        # create an instance of the DBSCAN class
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        
        # fit the data and get the labels
        dbscan.fit(Multi_blob_Data)
        labels = dbscan.labels_
        
        # compute the silhouette score
        # ignore the noise points (labelled as -1) for the score calculation
        score = silhouette_score(Multi_blob_Data, labels, metric='euclidean')
        
        # append the score to the list
        sil_scores3.append(score)
        corres_params3.append( {"eps":eps, "min_samples":min_samples} )
        
        # plot the clusters
        fig = plt.figure(figsize=(10, 5))
        plt.title(f'EPS: {eps}, Min_samples: {min_samples}, silhouette score: {score}')
        plt.xlabel(f'Number of Clusters: {len(np.unique(labels)) - 1}') # subtract 1 to exclude the noise cluster
        plt.scatter(Multi_blob_Data[:, 0], Multi_blob_Data[:, 1], c=labels, cmap='rainbow',alpha=0.5,s=20)
        plt.show()

Based on the code output, the best silhouette score is 0.66, achieved with the following parameters:

EPS: 0.9
Min_samples: 5
This results in six clusters that match the original centers of the data

### Gaussian Mixture
* Use GaussianMixture function to cluster the above data 
* In GMM change the covariance_type and check the difference in the resulting probability fit 
* Use a 2D contour plot to plot the resulting distribution (the components of the GMM) as well as the total Gaussian mixture 

In [None]:
from matplotlib.colors import LogNorm

# Define a function to display the clusters
def display_cluster(X, y_pred, title):
    """Scatter the first two columns of ``X`` coloured by ``y_pred`` and show it.

    Anything already drawn on the current figure is shown together with the
    scatter, because the figure is only displayed at the end of this call.
    """
    xs, ys = X[:, 0], X[:, 1]
    plt.scatter(xs, ys, c=y_pred, s=50, cmap='viridis')
    plt.title(title)
    plt.show()

def plot_gmm_contours(gmm, X, title):
    """Overlay contours of a fitted mixture on a scatter of its predictions.

    Parameters
    ----------
    gmm : fitted model providing ``score_samples`` and ``predict``.
    X : 2-D array; columns 0 and 1 define the extent of the evaluation grid.
    title : title passed on to ``display_cluster``.

    Two contour sets are drawn for the total mixture: the negative
    log-likelihood on a logarithmic colour scale, and the density itself in
    black. ``display_cluster`` then adds the points and shows the figure.
    """
    # 100 x 100 grid spanning the range of the data.
    x = np.linspace(X[:, 0].min(), X[:, 0].max(), 100)
    y = np.linspace(X[:, 1].min(), X[:, 1].max(), 100)
    xx, yy = np.meshgrid(x, y)
    X_grid = np.c_[xx.ravel(), yy.ravel()]

    # score_samples returns the log-density of the whole mixture per point.
    log_density = gmm.score_samples(X_grid).reshape(xx.shape)
    density = np.exp(log_density)

    # LogNorm(vmin=1, vmax=1000) matches negative log-likelihood values, not
    # raw densities (which are typically below 1 and would all fall under
    # vmin), so the coloured contours use -log_density with matching levels.
    plt.contour(xx, yy, -log_density, norm=LogNorm(vmin=1.0, vmax=1000.0),
                levels=np.logspace(0, 3, 10), cmap='viridis')
    plt.contour(xx, yy, density, levels=14, colors='k')
    display_cluster(X, gmm.predict(X), title)
# Covariance structures to compare on the blob data.
cov_types = ['full', 'spherical', 'diag', 'tied']

# One six-component mixture per covariance type: first the plain cluster
# scatter, then the same scatter with the mixture contours drawn over it.
for cov_type in cov_types:
    gmm = GaussianMixture(n_components=6, covariance_type=cov_type, random_state=42)
    gmm.fit(Multi_blob_Data)
    y_pred = gmm.predict(Multi_blob_Data)
    plot_title = f'GMM with 6 components and {cov_type} covariance'
    display_cluster(Multi_blob_Data, y_pred, plot_title)
    plot_gmm_contours(gmm, Multi_blob_Data, plot_title)




## iris data set 
The iris data set is a test data set that is part of the Sklearn module 
which contains 150 records each with 4 features. All the features are represented by real numbers 

The data represents three classes 


In [None]:
from sklearn.datasets import load_iris
# Load the bundled iris data set. The expression statements pasted here from
# a documentation snippet (target lookups and a bare list of class names)
# had no effect and were removed; the class names they listed are
# 'setosa', 'versicolor' and 'virginica'.
iris_data_raw = load_iris()
# Only the feature matrix is clustered; the labels stay in iris_data_raw.
iris_data = iris_data_raw['data']

* Repeat all the above clustering approaches and steps on the above data 
* Normalize the data then repeat all the above steps 
* Compare between the different clustering approaches 

## k-means

In [None]:
# Distortion (inertia) and silhouette score, one entry per value of k.
dist_func_iris = []
sil_scores_iris = []
K_range = range(2, 11)
for k in K_range:
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42)
    kmeans.fit(iris_data)
    # Two views of the same fit: PCA projection and per-feature grid. The
    # grid helper returns the model's inertia for this k.
    plot_clusters_PCA(iris_data, kmeans, k)
    dist_func_iris.append(plot_clusters_nD(iris_data, kmeans, k))
    # Reuse the labels of the fit above; the original refitted the same
    # seeded model on the same data only to obtain them.
    preds = kmeans.labels_
    sil_scores_iris.append(silhouette_score(iris_data, preds))

# Distortion versus k (elbow plot).
plt.figure()
plt.plot(K_range, dist_func_iris, marker='o')
plt.xlabel("Number of clusters")
plt.ylabel("Distortion function")
plt.title("Distortion Function vs Number of Clusters")
plt.show()

# Silhouette score versus k.
plt.figure()
plt.plot(K_range, sil_scores_iris, marker='o')
plt.xlabel("Number of clusters")
plt.ylabel("Silhouette scores")
plt.title("Silhouette scores vs Number of Clusters")
plt.show()



# hierarchical

In [None]:
# Metrics and linkage criteria to combine.
affinities = ['euclidean', 'cityblock', 'cosine']
linkages = ['average', 'single']
# One entry per trial, index-aligned. Undefined scores are stored as NaN,
# so use np.nanargmax / np.nanmax when picking the best trial.
sil_scores_iris2 = []
corres_params_iris2 = []

# The 2-D projection depends only on the data, so it is computed once here
# instead of once per trial.
pca = PCA(n_components=2, random_state=0).fit(iris_data)
data_2d = pca.transform(iris_data)

for affinity in affinities:
    for linkage in linkages:
        # Dendrogram of the SciPy linkage for this metric/linkage pair.
        fig = plt.figure(figsize=(10, 5))
        Z = la(iris_data, method=linkage, metric=affinity)
        plt.title('Dendrogram for AgglomerativeClustering')
        plt.xlabel('Sample Index')
        plt.ylabel('Distance')
        dendrogram(Z)
        plt.show()

        # Column 2 of the linkage matrix holds the merge distances; sweep
        # thresholds in steps of one eighth of the largest one.
        distance_threshold_max = Z[:, 2].max()
        step = distance_threshold_max / 8
        distance_thresholds = np.arange(step, distance_threshold_max + step, step).tolist()

        for distance_threshold in distance_thresholds:
            # n_clusters=None lets the threshold decide the number of clusters.
            ac = AgglomerativeClustering(n_clusters=None, metric=affinity, linkage=linkage,
                                         distance_threshold=distance_threshold)
            ac.fit(iris_data)
            labels = ac.labels_
            n_found = len(np.unique(labels))

            # silhouette_score raises unless there are between 2 and
            # n_samples - 1 clusters; an extreme threshold can leave 1 or n.
            if 2 <= n_found <= len(labels) - 1:
                score = silhouette_score(iris_data, labels, metric=affinity)
            else:
                score = np.nan

            sil_scores_iris2.append(score)
            corres_params_iris2.append({'Affinity': affinity, 'Linkage': linkage, 'Threshold': distance_threshold})

            # Clusters on the PCA projection, then on the per-feature grid.
            fig2 = plt.figure(figsize=(10, 5))
            plt.title(f'Affinity: {affinity}, Linkage: {linkage}, Distance Threshold: {distance_threshold}, silhouette score: {score}')
            plt.xlabel(f'Number of Clusters: {n_found}')
            plt.scatter(data_2d[:, 0], data_2d[:, 1], c=labels, cmap='rainbow', alpha=0.5, s=20)
            plt.show()
            display_clusters(iris_data, ac)

# DBSCAN

In [None]:
# define the range of parameters
eps_values = np.arange(0.4, 1.6, 0.1)
min_samples_values = np.arange(5, 21, 1)

# initialize an empty list to store the silhouette scores
sil_scores_iris3 = []
corres_params_iris3  = []

# loop over the combinations of parameters
for eps in eps_values:
    for min_samples in min_samples_values:
        # create an instance of the DBSCAN class
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        
        # fit the data and get the labels
        dbscan.fit(iris_data)
        labels = dbscan.labels_
        
        # compute the silhouette score
        # ignore the noise points (labelled as -1) for the score calculation
        score = silhouette_score(iris_data, labels)
        
        # append the score to the list
        sil_scores_iris3.append(score)
        corres_params_iris3.append( {"eps":eps, "min_samples":min_samples} )
        # Perform PCA to reduce the data to 2 dimensions
        pca = PCA(n_components=2, random_state=0).fit(iris_data)
        data_2d = pca.transform(iris_data)
        # plot the clusters
        fig = plt.figure(figsize=(10, 5))
        plt.title(f'EPS: {eps}, Min_samples: {min_samples}, silhouette score: {score}')
        plt.xlabel(f'Number of Clusters: {len(np.unique(labels)) - 1}') # subtract 1 to exclude the noise cluster
        plt.scatter(data_2d[:, 0], data_2d[:, 1], c=labels, cmap='rainbow',alpha=0.5,s=20)
        plt.show()
        display_clusters(iris_data,dbscan)

# GMM

In [None]:
# The mixtures are fitted on a 2-D PCA projection of the iris features so
# the contour helper can draw them.
X = iris_data
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Covariance structures to compare.
cov_types = ['full', 'spherical', 'diag', 'tied']

# One three-component mixture per covariance type: first the plain cluster
# scatter, then the same scatter with the mixture contours drawn over it.
for cov_type in cov_types:
    gmm = GaussianMixture(n_components=3, covariance_type=cov_type, random_state=42)
    gmm.fit(X_pca)
    y_pred = gmm.predict(X_pca)
    plot_title = f'GMM with 3 components and {cov_type} covariance'
    display_cluster(X_pca, y_pred, plot_title)
    plot_gmm_contours(gmm, X_pca, plot_title)

## Customer dataset
Repeat all the above on the customer data set 

## k-means

In [None]:
# Read the customer table, using its first column as the index, and keep
# only the values as a NumPy array.
data_points = pd.read_csv("Customer data.csv", index_col=0).to_numpy()

In [None]:
# Distortion (inertia) and silhouette score, one entry per value of k.
dist_func_data = []
sil_scores_data = []
K_range = range(2, 11)
for k in K_range:
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42)
    kmeans.fit(data_points)
    # Two views of the same fit: PCA projection and per-feature grid. The
    # grid helper returns the model's inertia for this k.
    plot_clusters_PCA(data_points, kmeans, k)
    dist_func_data.append(plot_clusters_nD(data_points, kmeans, k))
    # Reuse the labels of the fit above; the original refitted the same
    # seeded model on the same data only to obtain them.
    preds = kmeans.labels_
    sil_scores_data.append(silhouette_score(data_points, preds, metric='euclidean'))

# Distortion versus k (elbow plot).
plt.figure()
plt.plot(K_range, dist_func_data, marker='o')
plt.xlabel("Number of clusters")
plt.ylabel("Distortion function")
plt.title("Distortion Function vs Number of Clusters")
plt.show()

# Silhouette score versus k.
plt.figure()
plt.plot(K_range, sil_scores_data, marker='o')
plt.xlabel("Number of clusters")
plt.ylabel("Silhouette scores")
plt.title("Silhouette scores vs Number of Clusters")
plt.show()



# hierarchical

In [None]:
# Metrics and linkage criteria to combine.
affinities = ['euclidean', 'cityblock', 'cosine']
linkages = ['average', 'single']
# One entry per trial, index-aligned. Undefined scores are stored as NaN,
# so use np.nanargmax / np.nanmax when picking the best trial.
sil_scores_data2 = []
corres_params_data2 = []

# The 2-D projection depends only on the data, so it is computed once here
# instead of once per trial.
pca = PCA(n_components=2, random_state=0).fit(data_points)
data_2d = pca.transform(data_points)

for affinity in affinities:
    for linkage in linkages:
        # Dendrogram of the SciPy linkage for this metric/linkage pair.
        fig = plt.figure(figsize=(10, 5))
        Z = la(data_points, method=linkage, metric=affinity)
        plt.title('Dendrogram for AgglomerativeClustering')
        plt.xlabel('Sample Index')
        plt.ylabel('Distance')
        dendrogram(Z)
        plt.show()

        # Column 2 of the linkage matrix holds the merge distances; sweep
        # thresholds in steps of one eighth of the largest one.
        distance_threshold_max = Z[:, 2].max()
        step = distance_threshold_max / 8
        distance_thresholds = np.arange(step, distance_threshold_max + step, step).tolist()

        for distance_threshold in distance_thresholds:
            # n_clusters=None lets the threshold decide the number of clusters.
            ac = AgglomerativeClustering(n_clusters=None, metric=affinity, linkage=linkage,
                                         distance_threshold=distance_threshold)
            ac.fit(data_points)
            labels = ac.labels_
            n_found = len(np.unique(labels))

            # silhouette_score raises unless there are between 2 and
            # n_samples - 1 clusters; an extreme threshold can leave 1 or n.
            if 2 <= n_found <= len(labels) - 1:
                score = silhouette_score(data_points, labels, metric=affinity)
            else:
                score = np.nan

            sil_scores_data2.append(score)
            corres_params_data2.append({'Affinity': affinity, 'Linkage': linkage, 'Threshold': distance_threshold})

            # Clusters on the PCA projection, then on the per-feature grid.
            fig2 = plt.figure(figsize=(10, 5))
            plt.title(f'Affinity: {affinity}, Linkage: {linkage}, Distance Threshold: {distance_threshold}, silhouette score: {score}')
            plt.xlabel(f'Number of Clusters: {n_found}')
            plt.scatter(data_2d[:, 0], data_2d[:, 1], c=labels, cmap='rainbow', alpha=0.5, s=20)
            plt.show()
            display_clusters(data_points, ac)

# dbscan

In [None]:
# Parameter grid: neighbourhood radius and minimum neighbourhood size.
eps_values = np.arange(700, 1500, 100)
min_samples_values = np.arange(2, 21, 2)

# One entry per trial, index-aligned. Undefined scores are stored as NaN,
# so use np.nanargmax / np.nanmax when picking the best trial.
sil_scores_data3 = []
corres_params_data3 = []

# The 2-D projection depends only on the data, so it is computed once here
# instead of once per trial.
pca = PCA(n_components=2, random_state=0).fit(data_points)
data_2d = pca.transform(data_points)

for eps in eps_values:
    for min_samples in min_samples_values:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        dbscan.fit(data_points)
        labels = dbscan.labels_

        # DBSCAN labels noise as -1. Drop those points before scoring so
        # noise is not evaluated as if it were one more cluster.
        not_noise = labels != -1
        n_found = len(np.unique(labels[not_noise]))

        # silhouette_score raises unless there are between 2 and
        # n_samples - 1 clusters among the scored points.
        if 2 <= n_found <= int(not_noise.sum()) - 1:
            score = silhouette_score(data_points[not_noise], labels[not_noise])
        else:
            score = np.nan

        sil_scores_data3.append(score)
        corres_params_data3.append({"eps": eps, "min_samples": min_samples})

        # All points, noise included, on the PCA projection, then on the
        # per-feature grid. The cluster count excludes noise and is also
        # right when there is none.
        fig = plt.figure(figsize=(10, 5))
        plt.title(f'EPS: {eps}, Min_samples: {min_samples}, silhouette score: {score}')
        plt.xlabel(f'Number of Clusters: {n_found}')
        plt.scatter(data_2d[:, 0], data_2d[:, 1], c=labels, cmap='rainbow', alpha=0.5, s=20)
        plt.show()
        display_clusters(data_points, dbscan)

# GMM

In [None]:
# The mixtures are fitted on a 2-D PCA projection of the customer features
# so the contour helper can draw them.
X = data_points

# random_state=0 matches every other PCA call in this notebook. It matters
# only if PCA picks a randomized solver for this input size.
# NOTE(review): the size of the customer table is not visible here.
pca = PCA(n_components=2, random_state=0)
X_pca = pca.fit_transform(X)

# Covariance structures to compare.
cov_types = ['full', 'spherical', 'diag', 'tied']

# One five-component mixture per covariance type: first the plain cluster
# scatter, then the same scatter with the mixture contours drawn over it.
for cov_type in cov_types:
    gmm = GaussianMixture(n_components=5, covariance_type=cov_type, random_state=42)
    gmm.fit(X_pca)
    y_pred = gmm.predict(X_pca)
    display_cluster(X_pca, y_pred, f'GMM with 5 components and {cov_type} covariance')
    plot_gmm_contours(gmm, X_pca, f'GMM with 5 components and {cov_type} covariance')
