In [4]:
#Import library 
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import math
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering
from sklearn.metrics import rand_score, silhouette_score, accuracy_score, adjusted_rand_score
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

import umap.umap_ as umap
from sklearn.manifold import TSNE

import warnings
warnings.filterwarnings('ignore')

sns.set(context="notebook", palette="viridis", style="darkgrid", font_scale=1.0, color_codes=True,rc={'figure.figsize':(14,10)})

In [5]:
workingDir = 'E:\\Document\\M-Tech\\M-Tech Class\\2nd Trimester\\Assignment\\AI Lab\\Assignment 5\\Datasets'
dataset_fileName = '20.csv'
random_state = 42
cluster = 3

In [63]:
def get_num_clusters(input_data, linkage_method, distance_metric):
    '''# Define a function to calculate the number of clusters for each
    combination of linkage method and distance metric''' 
    agg_clustering = AgglomerativeClustering(linkage=linkage_method,distance_threshold = 0.1)
    agg_clustering.fit(input_data)

    # Get the number of clusters
    n_clusters = agg_clustering.n_clusters_
    return n_clusters
    


def evaluation_hierarchical_cluster(input_data, linkage_method, distance_metric, dataset,clusters=3):
    '''Evaluation of hierarchical clustering'''
    print(f'============Linkage method: {linkage_method}, Distance metric: {distance_metric}============')
    linkage_init = linkage(input_data, method=linkage_method, metric=distance_metric)
    y_pred_hierarchical = pd.Series(fcluster(linkage_init, t=clusters, criterion='maxclust'), index=dataset.index) 
    
    clust_agglo = AgglomerativeClustering(n_clusters=None, metric = distance_metric, linkage = linkage_method, distance_threshold = 0.1)
    y_pred_agglo_hierarchical = clust_agglo.fit_predict(input_data, dataset["y"])
    
    print('Hierarchical Clustering using fcluster:')
    print(f'Accuracy Score: {accuracy_score(dataset["y"], y_pred_hierarchical):.4f}')
    print(f'Silhouette score: {silhouette_score(input_data, y_pred_hierarchical):.4f}')
    print(f'Adjusted Rand score: {adjusted_rand_score(dataset["y"], y_pred_hierarchical):.4f}') 
    print(f'Rand score: {rand_score(dataset["y"], y_pred_hierarchical):.4f}') 
    
    print('Hierarchical Clustering using Agglomerative Clustering:')
    print(f'Accuracy Score: {accuracy_score(dataset["y"], y_pred_agglo_hierarchical):.4f}')
    print(f'Silhouette score: {silhouette_score(input_data, y_pred_agglo_hierarchical):.4f}')
    print(f'Adjusted Rand score: {adjusted_rand_score(dataset["y"], y_pred_agglo_hierarchical):.4f}') 
    print(f'Rand score: {rand_score(dataset["y"], y_pred_agglo_hierarchical):.4f}') 

def evaluation_spectral_cluster(input_data, dataset, clusters=3, neighbors=10, random_state=42, affinity='nearest_neighbors'):
    '''Evaluation of spectral clustering'''    
    spectral = SpectralClustering(n_clusters=clusters, affinity=affinity, n_neighbors=neighbors, random_state=random_state)
    y_pred_spectral = spectral.fit_predict(input_data)
    print('Spectral Clustering:')
    print(f'Accuracy Score: {accuracy_score(dataset["y"], y_pred_spectral):.4f}')
    print(f'Silhouette score: {silhouette_score(input_data, y_pred_spectral):.4f}')
    print(f'Adjusted Rand score: {adjusted_rand_score(dataset["y"], y_pred_spectral):.4f}')
    print(f'Rand score: {rand_score(dataset["y"], y_pred_spectral):.4f}')
    
def plotUMAPGraph(data,output_data, title, pallet_name,n_neighbors,min_dist,metric_val):
    '''Plot UMAP graph'''
    plt.figure(figsize=(10, 5))
    sns.scatterplot(
        x=data[:, 0], y=data[:, 1],
        hue= output_data,
        palette=sns.color_palette(pallet_name, len(np.unique(output_data))),
        legend="full",
        alpha=0.3
    )
    plt.title(f'{title} : neighbour: {n_neighbors}, metric : {metric_val} & minimum distance: {min_dist}')
    plt.show()
              
def plotTSNEGraph(data,output_data, title, pallet_name,perplexity):
    '''Plot TSNE graph'''
    plt.figure(figsize=(10, 5))
    sns.scatterplot(
        x=data[:, 0], y=data[:, 1],
        hue= output_data,
        palette=sns.color_palette(pallet_name, len(np.unique(output_data))),
        legend="full",
        alpha=0.3
    )
    plt.title(f'{title} : perplexity: {perplexity}')
    plt.show()

In [59]:
#Change the working directory 
os.chdir(workingDir)
#Load the data
dataset = pd.read_csv(dataset_fileName)

In [60]:
input_data = dataset.drop('y', axis=1)
output_data = dataset['y']

In [61]:
# List of linkage methods and distance metrics  '
linkage_methods = ['average', 'complete', 'ward', 'single']
distance_metrics = ['manhattan', 'cosine', 'euclidean']


In [64]:
# Calculate the number of clusters for each combination of linkage method and distance metric
for linkage_method in linkage_methods:
    for distance_metric in distance_metrics:
        num_clusters = get_num_clusters(input_data, linkage_method, distance_metric)
        print(f'Linkage method: {linkage_method}, Distance metric: {distance_metric}, Number of clusters: {num_clusters}')
        

ValueError: Exactly one of n_clusters and distance_threshold has to be set, and the other needs to be None.

In [None]:
, 