# GUC Clustering Project 

**Objective:** 
The objective of this project is to teach students how to apply clustering to real data sets.

The project aims to teach students:
* Which clustering approach to use
* How to compare K-means, Hierarchical, DBSCAN, and Gaussian Mixture clustering
* How to tune the parameters of each data approach
* What is the effect of different distance functions (optional) 
* How to evaluate clustering approaches
* How to display the output
* What is the effect of normalizing the data 

Students in this project will use ready-made functions from Sklearn, plotnine, numpy and pandas 
 



In [1]:
# If plotnine is not installed in Jupyter, use the following command to install it
!pip install plotnine





DEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063

[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Running this project requires the following imports

In [2]:
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sklearn.preprocessing as prep
from sklearn.datasets import make_blobs
from plotnine import *   
# StandardScaler is a function to normalize the data 
# You may also check MinMaxScaler and MaxAbsScaler 
#from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage


from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

from sklearn.metrics import silhouette_score

%matplotlib inline




In [3]:
#Cluster_Distance: numpy array of shape (num_points, K), containing distances between each data point and each cluster centroid.

def euclidean_distance(point1, point2):
    """Return the Euclidean (L2) distance between two points.

    Both arguments must support element-wise subtraction with each other
    (for example two NumPy arrays of the same shape).
    """
    offset = point1 - point2
    squared_length = np.sum(offset ** 2)
    return np.sqrt(squared_length)

def pearson_correlation_distance(data_array, centroid,mean_points,mean_centroid,new_points):
    """Return ``1 - similarity`` between each centred point and one centred centroid.

    The similarity of a row of ``new_points`` with ``centroid - mean_centroid``
    is their dot product divided by the product of their Euclidean norms.

    Parameters
    ----------
    data_array, mean_points
        Not used by the computation; accepted so the call signature stays stable.
    centroid
        Coordinates of a single cluster centroid.
    mean_centroid
        Value subtracted from ``centroid`` before the comparison.
    new_points
        2-D collection of already-centred points, one point per row.

    Returns
    -------
    One value per row of ``new_points``.

    NOTE(review): a zero-norm row or centroid leads to a division by zero;
    nothing here guards against it.
    """
    centred_centroid = centroid - mean_centroid
    dot_products = np.sum(new_points * centred_centroid, axis=1)
    point_norms = np.sqrt(np.sum(new_points ** 2, axis=1))
    centroid_norm = np.sqrt(np.sum(centred_centroid ** 2))
    similarity = dot_products / (point_norms * centroid_norm)
    return 1 - similarity

def GUC_Distance ( Cluster_Centroids, Data_points, Distance_Type ):
    """Compute the distance from every data point to every cluster centroid.

    Parameters
    ----------
    Cluster_Centroids : DataFrame or array-like, shape (K, n_features)
        One centroid per row.
    Data_points : DataFrame or array-like, shape (num_points, n_features)
        One data point per row. Columns are matched to the centroid columns
        by position.
    Distance_Type : {'euclidean', 'pearson'}
        'euclidean' is the ordinary L2 distance. 'pearson' is
        ``1 - similarity`` where points are centred on the mean of all points,
        centroids are centred on the mean of all centroids, and the similarity
        is the normalised dot product of the centred vectors.

    Returns
    -------
    numpy.ndarray of shape (num_points, K)
        Entry ``[p, k]`` is the distance between point ``p`` and centroid ``k``.
        For 'pearson', a zero-norm centred vector yields ``nan``/``inf``.

    Raises
    ------
    ValueError
        If ``Distance_Type`` is not supported, or the inputs are not 2-D with
        the same number of features. (Previously an unknown type silently
        produced an all-zero matrix.)
    """
    if Distance_Type not in ('euclidean', 'pearson'):
        raise ValueError(
            f"Unsupported Distance_Type {Distance_Type!r}; expected 'euclidean' or 'pearson'"
        )

    points = np.asarray(Data_points, dtype=float)
    centroids = np.asarray(Cluster_Centroids, dtype=float)
    if points.ndim != 2 or centroids.ndim != 2 or points.shape[1] != centroids.shape[1]:
        raise ValueError("Data_points and Cluster_Centroids must be 2-D with the same number of features")

    num_data_points = points.shape[0]
    num_clusters = centroids.shape[0]
    distances = np.zeros((num_data_points, num_clusters))

    if Distance_Type == 'euclidean':
        # One vectorised pass per centroid instead of a Python call per row.
        for i in range(num_clusters):
            distances[:, i] = np.sqrt(np.sum((points - centroids[i]) ** 2, axis=1))
        return distances

    # 'pearson': the centring and the point norms do not depend on the
    # centroid index, so compute them once.
    centred_points = points - points.mean(axis=0)
    centred_centroids = centroids - centroids.mean(axis=0)
    point_norms = np.sqrt(np.sum(centred_points ** 2, axis=1))
    # Zero norms give nan/inf; silence the NumPy warning rather than invent a value.
    with np.errstate(divide='ignore', invalid='ignore'):
        for i in range(num_clusters):
            centroid_norm = np.sqrt(np.sum(centred_centroids[i] ** 2))
            dot_products = np.sum(centred_points * centred_centroids[i], axis=1)
            distances[:, i] = 1 - dot_products / (point_norms * centroid_norm)

    return distances




In [4]:
def display_cluster(X, km, num_clusters):
    """Scatter-plot the clustered data for every pair of features.

    Parameters
    ----------
    X : pandas.DataFrame
        Data points, one row per point.
    km : list of (cluster_assignments, cluster_centers) tuples
        Clustering states. Only the LAST entry (the most recent state) is
        drawn. ``cluster_assignments`` holds one cluster index per row of
        ``X``; ``cluster_centers`` has one row per cluster.
    num_clusters : int
        Number of clusters to draw. With ``0`` (or an empty ``km``) the first
        two columns of ``X`` are drawn in a single colour instead.

    Notes
    -----
    Cluster ``i`` used to be drawn from ``km[i]``. When ``km`` holds one entry
    per k-means iteration, that mixed assignments from different iterations
    and dropped clusters whenever there were fewer entries than clusters.
    """
    color = ['b', 'r', 'g', 'c', 'm', 'y', 'k', 'Turquoise', 'LimeGreen', 'pink']  # List colors
    alpha = 0.5  # marker transparency
    s = 100      # marker size

    if num_clusters == 0 or not km:
        plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=color[0], alpha=alpha, s=s)
        return

    cluster_assignments, cluster_centers = km[-1]
    cluster_assignments = np.asarray(cluster_assignments)
    centers = np.asarray(cluster_centers)
    arr = X.to_numpy()
    num_columns = arr.shape[1]

    # squeeze=False keeps `ax` two-dimensional even for a single feature.
    fig, ax = plt.subplots(num_columns, num_columns, figsize=(30, 30), squeeze=False)
    for i in range(min(num_clusters, len(centers))):
        members = arr[cluster_assignments == i]
        cluster_color = color[i % len(color)]  # wrap around instead of overflowing the palette
        for m in range(num_columns):
            for n in range(num_columns):
                ax[m, n].scatter(members[:, m], members[:, n], c=cluster_color, alpha=alpha, marker='o', s=s)
                ax[m, n].scatter(centers[i, m], centers[i, n], c=cluster_color, marker='x', s=200)

    plt.show()

In [5]:
def GUC_Kmean ( Data_points, Number_of_Clusters,  Distance_Type, max_iterations=300 ):
    """Cluster ``Data_points`` with k-means and plot the resulting clusters.

    Parameters
    ----------
    Data_points : pandas.DataFrame
        Numeric data, one row per point.
    Number_of_Clusters : int
        Number of clusters (K); must be at least 1.
    Distance_Type : str
        Distance used for the assignment step; forwarded to ``GUC_Distance``.
    max_iterations : int, default 300
        Upper bound on assignment/update rounds, so that a non-converging run
        (possible with a non-Euclidean assignment distance) still terminates.

    Returns
    -------
    list
        ``[Final_Cluster_Distance, Cluster_Metric]``: the (num_points, K)
        distance matrix for the final centroids, and the distortion, i.e. the
        total squared Euclidean distance of every point to its centroid.

    Raises
    ------
    ValueError
        For an empty data set, ``Number_of_Clusters < 1`` or
        ``max_iterations < 1``.

    Notes
    -----
    * Initial centroids are drawn uniformly from the bounding box of the data
      (minimum + random * range). Previously the minimum was ignored, so data
      with negative coordinates started with badly placed centroids.
    * Iteration stops when the summed per-cluster mean square distance is
      below ``tolerance``, when the per-cluster mean square distances stop
      changing, or when the total distortion changes by less than ``eps``.
    * The distortion is always measured with squared Euclidean distance,
      whatever ``Distance_Type`` is.
    * Side effect: calls ``display_cluster`` to draw the final clusters.
    """
    if Number_of_Clusters < 1:
        raise ValueError("Number_of_Clusters must be at least 1")
    if max_iterations < 1:
        raise ValueError("max_iterations must be at least 1")
    data_values = Data_points.to_numpy(dtype=float)
    if data_values.shape[0] == 0:
        raise ValueError("Data_points must contain at least one row")

    tolerance = 0.001  # threshold on the summed per-cluster mean square distance
    eps = 0.0001       # threshold on the change of the total distortion

    # Step 1: Initialize cluster heads inside the bounding box of the data
    data_min = data_values.min(axis=0)
    data_ranges = np.ptp(data_values, axis=0)
    initial_heads = data_min + np.random.rand(Number_of_Clusters, data_values.shape[1]) * data_ranges
    cluster_heads = pd.DataFrame(initial_heads, columns=Data_points.columns)

    previous_mean_square = None
    previous_sse = np.inf
    iteration = 0
    while True:
        iteration += 1

        # Step 2: Cluster assignment
        Final_Cluster_Distance = GUC_Distance(cluster_heads, Data_points, Distance_Type)
        cluster_assignments = np.argmin(Final_Cluster_Distance, axis=1)

        # Step 3: Squared distance of every point to its own cluster head
        head_values = cluster_heads.to_numpy(dtype=float)
        squared_distances = np.sum((data_values - head_values[cluster_assignments]) ** 2, axis=1)
        total_sse = float(squared_distances.sum())
        mean_square_distances = np.zeros(Number_of_Clusters)  # empty clusters stay at 0
        for i in range(Number_of_Clusters):
            members = cluster_assignments == i
            if members.any():
                mean_square_distances[i] = squared_distances[members].mean()

        # Step 4: Stopping condition
        converged = (
            mean_square_distances.sum() < tolerance
            or (previous_mean_square is not None
                and np.allclose(mean_square_distances, previous_mean_square))
            or abs(total_sse - previous_sse) < eps
        )
        # Break BEFORE updating so the returned distances, assignments and
        # metric all describe the same set of centroids.
        if converged or iteration >= max_iterations:
            break
        previous_mean_square = mean_square_distances
        previous_sse = total_sse

        # Step 5: Update centroids (an empty cluster keeps its previous head)
        for i in range(Number_of_Clusters):
            members = cluster_assignments == i
            if members.any():
                cluster_heads.iloc[i, :] = data_values[members].mean(axis=0)

    # Step 6: Cluster metric (distortion function)
    Cluster_Metric = total_sse

    # display_cluster may index `km` by cluster number, so supply the final
    # state once per cluster; every entry describes the converged clustering.
    km = [(cluster_assignments, cluster_heads)] * Number_of_Clusters
    display_cluster(Data_points,km,Number_of_Clusters)
    return [ Final_Cluster_Distance , Cluster_Metric ]






In [6]:
def plotting_Examples(example_data,distance_type):
    """Run ``GUC_Kmean`` for K = 2..10 and plot the cluster metric against K.

    ``example_data`` may be a DataFrame; anything else is wrapped in a
    DataFrame with the two columns 'X' and 'Y'. ``distance_type`` is passed
    through to ``GUC_Kmean``. The running list of metrics is printed after
    every K. Nothing is returned.
    """
    # Anything that is not already a DataFrame is treated as two-column data.
    if not isinstance(example_data, pd.DataFrame):
        example_data = pd.DataFrame(example_data, columns=['X', 'Y'])

    cluster_counts = []
    metric_per_k = []
    for k in range(2, 11):
        print("Graphs_For_Cluster=", k)
        _, metric = GUC_Kmean(example_data, k, distance_type)
        metric_per_k.append(metric)
        cluster_counts.append(k)
        print("cluster_Performance", metric_per_k)

    plt.plot(cluster_counts, metric_per_k, marker='o', linestyle='-')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Performance Metric')
    plt.title('Cluster Performance Metric vs Number of Clusters')
    plt.show()

In [10]:
def plotting_silhouette_scoreeeeeeee(example_data,distance_type):
    """Run ``GUC_Kmean`` for K = 2..10 and plot the silhouette score against K.

    Parameters
    ----------
    example_data : pandas.DataFrame or 2-column array-like
        Data to cluster; non-DataFrame input is wrapped in a DataFrame with
        the columns 'X' and 'Y'.
    distance_type : str
        Passed through to ``GUC_Kmean``.

    Notes
    -----
    * Labels are taken as the arg-min of the distance matrix returned by
      ``GUC_Kmean``.
    * A K is skipped when the silhouette score is undefined for its labels
      (it needs at least 2 and at most ``n_samples - 1`` distinct labels).
    * If no K can be scored, a message is printed and nothing is plotted.
      Previously this case crashed in ``np.argmax`` on an empty list.
    * ``silhouette_score`` is called with its default metric, whatever
      ``distance_type`` is.
    """
    # Check if example_data is already a DataFrame
    if not isinstance(example_data, pd.DataFrame):
        # Convert example_data to DataFrame with columns 'X' and 'Y'
        example_data = pd.DataFrame(example_data, columns=['X', 'Y'])

    scored_k = []
    scores = []
    for k in range(2, 11):
        print("Graphs_For_Cluster=", k)
        Final_Cluster_Distance, _ = GUC_Kmean(example_data, k, distance_type)

        cluster_labels = Final_Cluster_Distance.argmin(axis=1)
        unique_labels = len(set(cluster_labels))
        if 1 < unique_labels < len(cluster_labels):
            scored_k.append(k)
            scores.append(silhouette_score(example_data, cluster_labels))

    if not scores:
        print("Silhouette score could not be computed for any K.")
        return

    best_k = scored_k[int(np.argmax(scores))]
    print("Best K using silhouette score:", best_k)

    # Plot silhouette scores versus K
    plt.plot(scored_k, scores, marker='o')
    plt.xlabel('Number of clusters (K)')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Score vs Number of Clusters')
    plt.xticks(scored_k)
    plt.grid(True)
    plt.show()
    
  

## Multi Blob Data Set 
* The data set generated below has 6 clusters with varying numbers of points and varying densities
* Cluster the data set below using the clustering approaches described in the following sections



In [121]:
# Global plot appearance: 8x8 default figure size, seaborn "whitegrid" style, "talk" context.
plt.rcParams['figure.figsize'] = [8,8]
sns.set_style("whitegrid")
sns.set_context("talk")

# Synthetic 2-D data set made of six Gaussian blobs.
# NOTE(review): n_bins is not referenced by the code shown in this cell.
n_bins = 6  
# One (x, y) centre per blob.
centers = [(-3, -3), (0, 0), (5,2.5),(-1, 4), (4, 6), (9,7)]
# n_samples and cluster_std give, per centre, the number of points and the standard deviation.
# shuffle=False keeps the samples grouped by blob; random_state=42 makes the draw repeatable.
# `y` receives the second value returned by make_blobs (the blob label of each sample).
Multi_blob_Data, y = make_blobs(n_samples=[100,150, 300, 400,300, 200], n_features=2, cluster_std=[1.3,0.6, 1.2, 1.7,0.9,1.7],
                  centers=centers, shuffle=False, random_state=42)


### Kmeans 
* Use Kmeans with different values of K to cluster the above data 
* Display the outcome of each value of K 
* Plot the distortion function versus K and choose the appropriate value of K
* Plot the silhouette_score versus K and use it to choose the best K 
* Store the silhouette_score for the best K for later comparison with other clustering techniques. 

In [63]:
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt

def kmeans_clustering(example_data, k_values):
    """Run scikit-learn K-means for several K and plot SSE and silhouette score.

    Parameters
    ----------
    example_data : array-like or DataFrame, shape (n_samples, n_features)
        Data to cluster. The first two features are used for the scatter plots.
    k_values : iterable of int
        Values of K to try. Any iterable is accepted, including one-shot ones.

    Raises
    ------
    ValueError
        If ``k_values`` is empty.

    Notes
    -----
    For every K the clustering is drawn; afterwards SSE (``inertia_``) versus
    K and silhouette score versus K are plotted, and the K with the highest
    silhouette score is printed. Nothing is returned.
    """
    # Materialise once: k_values is iterated, indexed and plotted below.
    k_values = list(k_values)
    if not k_values:
        raise ValueError("k_values must contain at least one value of K")

    # Positional indexing for the scatter plots must also work for DataFrames.
    points = np.asarray(example_data)

    def plot_versus_k(values, title, ylabel):
        """Draw one metric as a line plot over the tried K values."""
        plt.figure(figsize=(10, 6))
        plt.plot(k_values, values, marker='o', linestyle='-')
        plt.title(title)
        plt.xlabel('Number of Clusters (K)')
        plt.ylabel(ylabel)
        plt.grid(True)
        plt.tight_layout()
        plt.show()

    sse_values = []
    silhouette_scores = []
    for k in k_values:
        # Perform K-means clustering
        kmeans = KMeans(n_clusters=k)
        cluster_labels = kmeans.fit_predict(example_data)

        # Sum of Squared Errors (SSE) and silhouette score of this clustering
        sse_values.append(kmeans.inertia_)
        silhouette_scores.append(silhouette_score(example_data, cluster_labels))

        # Plot clusters
        plt.figure(figsize=(8, 6))
        plt.scatter(points[:, 0], points[:, 1], c=cluster_labels, cmap='viridis', marker='o', edgecolors='k')
        plt.title(f'K-means Clustering with K={k}')
        plt.xlabel('Feature 1')
        plt.ylabel('Feature 2')
        plt.show()

    # K with the maximum silhouette score
    best_k = k_values[int(np.argmax(silhouette_scores))]

    plot_versus_k(sse_values, 'Sum of Squared Errors (SSE) versus Number of Clusters (K)', 'SSE')
    print("Best K using silhouette score:", best_k)
    plot_versus_k(silhouette_scores, 'Silhouette Score versus Number of Clusters (K)', 'Silhouette Score')

    



### Hierarchical Clustering
* Use the AgglomerativeClustering function to cluster the above data
* In the  AgglomerativeClustering change the following parameters 
    * Affinity (use euclidean, manhattan and cosine)
    * Linkage( use average and single )
    * Distance_threshold (try different)
* For each of these trials, plot the dendrogram, calculate the silhouette_score and display the resulting clusters
* Find the set of parameters that results in the best silhouette_score and store this score for later comparison with other clustering techniques.
* Record your observation 

In [65]:
import numpy as np
import matplotlib.pyplot as plt  # Importing plt from matplotlib.pyplot
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster
import seaborn as sns

def Hierarchal_clustering(example_data):
    """Try several hierarchical-clustering settings and report the best one.

    Every combination of metric (euclidean, cityblock, cosine), linkage
    (average, single) and distance threshold (None, 2, 5) is evaluated:

    * threshold ``None``: ``AgglomerativeClustering`` with 2 clusters;
    * numeric threshold: the SciPy linkage tree is cut at that distance with
      ``fcluster``.

    For each trial the dendrogram and the clusters (first two features) are
    plotted and the silhouette score is printed when it is defined. Finally
    the best score and its parameters are printed. Nothing is returned.

    Parameters
    ----------
    example_data : array-like or DataFrame, shape (n_samples, n_features)

    Notes
    -----
    * ``silhouette_score`` is called with its default metric for every trial,
      whatever metric was used to cluster.
    * The silhouette score is skipped unless 2..n_samples-1 distinct labels
      are found, the range for which it is defined.
    """
    affinities = ['euclidean', 'manhattan', 'cosine']
    linkages = ['average', 'single']
    distance_thresholds = [None, 2, 5]  # Adjust threshold values as needed

    points = np.asarray(example_data)
    n_samples = points.shape[0]

    best_silhouette_score = -1  # lowest possible silhouette value
    best_params = None

    for affinity in affinities:
        if affinity == 'manhattan':
            affinity = 'cityblock'  # SciPy's name for the Manhattan metric
        for linkage_type in linkages:
            # The linkage tree depends only on metric and method: build it once per pair.
            Z = linkage(points, metric=affinity, method=linkage_type)
            for distance_threshold in distance_thresholds:
                if distance_threshold is not None:
                    cluster_labels = fcluster(Z, t=distance_threshold, criterion='distance')
                    dendrogram_title = f"Dendrogram ({affinity}, {linkage_type}, Threshold={distance_threshold})"
                else:
                    # Newer scikit-learn names this keyword `metric`; older
                    # releases only know `affinity`. Try the new name first.
                    try:
                        clustering = AgglomerativeClustering(n_clusters=2, metric=affinity, linkage=linkage_type)
                    except TypeError:
                        clustering = AgglomerativeClustering(n_clusters=2, affinity=affinity, linkage=linkage_type)
                    cluster_labels = clustering.fit_predict(points)
                    dendrogram_title = f"Dendrogram ({affinity}, {linkage_type}, n_clusters=2)"

                # Plot dendrogram
                plt.figure(figsize=(20, 18))
                plt.title(dendrogram_title)
                dendrogram(Z)
                plt.xlabel('Data points')
                plt.ylabel('Distance')
                plt.show()

                # Plot resulting clusters
                plt.figure(figsize=(8, 6))
                plt.scatter(points[:, 0], points[:, 1], c=cluster_labels, cmap='rainbow', marker='o', edgecolors='k')
                plt.title(f"Clusters - Affinity: {affinity}, Linkage: {linkage_type}, Distance Threshold: {distance_threshold}")
                plt.xlabel('Feature 1')
                plt.ylabel('Feature 2')
                plt.colorbar(label='Cluster')
                plt.grid(True)
                plt.show()

                # Silhouette score, only where it is defined
                n_labels = len(np.unique(cluster_labels))
                if 1 < n_labels < n_samples:
                    silhouette_avg = silhouette_score(points, cluster_labels)
                    print(f"Silhouette Score: {silhouette_avg}")
                    if silhouette_avg > best_silhouette_score:
                        best_silhouette_score = silhouette_avg
                        best_params = {'Affinity': affinity, 'Linkage': linkage_type, 'Distance Threshold': distance_threshold}

    # Print the best silhouette score and its corresponding parameters
    print("Best Silhouette Score:", best_silhouette_score)
    print("Best Parameters:", best_params)


### DBScan
* Use the DBSCAN function to cluster the above data
* In the  DBscan change the following parameters 
    * EPS (from 0.1 to 3)
    * Min_samples (from 5 to 25)
* Plot the silhouette_score versus the variation in the EPS and the min_samples
* Plot the resulting Clusters in this case 
* Find the set of parameters that results in the best silhouette_score and store this score for later comparison with other clustering techniques.
* Record your observations and comments 

In [122]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
import numpy as np
import matplotlib.pyplot as plt

def DBSCan(example_data):
    """Grid-search DBSCAN over eps and min_samples and report the best silhouette.

    eps takes 10 evenly spaced values in [0.1, 3]; min_samples runs from 5 to
    25. Combinations that yield a single label are skipped. Every remaining
    combination is scored with ``silhouette_score`` and drawn as a scatter
    plot of the first two features. At the end the scores are plotted against
    the tried combinations and the best score and parameters are printed.
    Nothing is returned.
    """
    eps_grid = np.linspace(0.1, 3, num=10)
    min_samples_grid = range(5, 26)

    best_silhouette_score = -1
    best_params = {}
    scores = []           # silhouette score of each scored combination
    tried_settings = []   # the parameters behind each entry of `scores`

    for eps in eps_grid:
        for min_samples in min_samples_grid:
            cluster_labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(example_data)

            # Only combinations with more than one distinct label are scored.
            if len(np.unique(cluster_labels)) > 1:
                score = silhouette_score(example_data, cluster_labels)
                scores.append(score)
                tried_settings.append({'EPS': eps, 'Min Samples': min_samples})

                if score > best_silhouette_score:
                    best_silhouette_score = score
                    best_params = {'EPS': eps, 'Min Samples': min_samples}

                # Draw this clustering
                plt.figure(figsize=(8, 6))
                plt.scatter(example_data[:, 0], example_data[:, 1], c=cluster_labels, cmap='viridis', marker='o', edgecolors='k')
                plt.title(f'DBSCAN Clustering - EPS: {eps}, Min Samples: {min_samples}')
                plt.xlabel('Feature 1')
                plt.ylabel('Feature 2')
                plt.colorbar(label='Cluster')
                plt.grid(True)
                plt.show()

    # Silhouette score for each scored parameter combination
    positions = range(len(scores))
    plt.figure(figsize=(10, 6))
    plt.plot(positions, scores, marker='o', linestyle='-')
    plt.title('Silhouette Score versus Parameter Variation')
    plt.xlabel('Parameter Combination')
    plt.ylabel('Silhouette Score')
    plt.xticks(positions, tried_settings, rotation=90)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    print("Best Silhouette Score:", best_silhouette_score)
    print("Best Parameters:", best_params)


### Gaussian Mixture
 * Use GaussianMixture function to cluster the above data 
 * In GMM change the covariance_type and check the difference in the resulting probability fit 
 * Use a 2D contour plot to plot the resulting distribution (the components of the GMM) as well as the total Gaussian mixture 

In [69]:
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LogNorm

def GMMClustering(example_data, n_components=3, random_state=None):
    """Fit Gaussian mixtures with every covariance type and compare them.

    Parameters
    ----------
    example_data : array-like or DataFrame, shape (n_samples, 2)
        Data to cluster. The plots and the density grid use exactly the first
        two features, so two-feature data is expected.
    n_components : int, default 3
        Number of mixture components. It used to be hard-coded to 3; the
        default keeps that behaviour.
    random_state : int or None, default None
        Forwarded to ``GaussianMixture``; pass an int for repeatable results.

    Notes
    -----
    For each covariance type ('full', 'tied', 'diag', 'spherical') the cluster
    assignment and a contour plot of the negative log-likelihood of the fitted
    mixture are drawn, and the running list of silhouette scores is printed.
    A fit that yields a single label gets ``nan`` as its score, because the
    silhouette is undefined there. Finally a bar chart of the scores and the
    best score/parameters are produced. Nothing is returned.
    """
    points = np.asarray(example_data)
    covariance_types = ['full', 'tied', 'diag', 'spherical']

    best_silhouette_score = -1
    best_params = {}
    silhouette_scores = []

    # The evaluation grid for the contour plot depends only on the data.
    x, y = np.meshgrid(np.linspace(points[:, 0].min(), points[:, 0].max(), 100),
                       np.linspace(points[:, 1].min(), points[:, 1].max(), 100))
    xy = np.column_stack([x.ravel(), y.ravel()])

    for covariance_type in covariance_types:
        # Perform clustering
        gmm = GaussianMixture(n_components=n_components, covariance_type=covariance_type,
                              random_state=random_state)
        cluster_labels = gmm.fit_predict(points)

        # Silhouette score (undefined when only one label is produced)
        if len(np.unique(cluster_labels)) > 1:
            silhouette_avg = silhouette_score(points, cluster_labels)
        else:
            silhouette_avg = float('nan')
        silhouette_scores.append(silhouette_avg)
        print("Silhouette_Score",silhouette_scores)

        # nan never compares greater, so an undefined score cannot become the best
        if silhouette_avg > best_silhouette_score:
            best_silhouette_score = silhouette_avg
            best_params = {'Covariance Type': covariance_type}

        plt.figure(figsize=(20, 15))

        # Scatter plot for clusters
        plt.subplot(1, 2, 1)
        plt.scatter(points[:, 0], points[:, 1], c=cluster_labels, cmap='viridis', marker='o', edgecolors='k')
        plt.title(f'GMM Clustering - Covariance Type: {covariance_type}')
        plt.xlabel('Feature 1')
        plt.ylabel('Feature 2')

        # Contour plot of the negative log-likelihood of the fitted mixture
        plt.subplot(1, 2, 2)
        z = -gmm.score_samples(xy)
        z = z.reshape(x.shape)
        plt.contour(x, y, z, norm=LogNorm(vmin=1.0, vmax=1000.0), levels=np.logspace(0, 3, 10))
        plt.scatter(points[:, 0], points[:, 1], c=cluster_labels, cmap='viridis', marker='o', edgecolors='k')
        plt.title('Gaussian Mixture Components')
        plt.xlabel('Feature 1')
        plt.ylabel('Feature 2')

        plt.show()

    # Plot silhouette score versus covariance type
    plt.figure(figsize=(10, 6))
    plt.bar(range(len(silhouette_scores)), silhouette_scores, tick_label=covariance_types)
    plt.title('Silhouette Score versus Covariance Type')
    plt.xlabel('Covariance Type')
    plt.ylabel('Silhouette Score')
    plt.grid(True)
    plt.show()

    # Print the best silhouette score and its corresponding parameters
    print("Best Silhouette Score:", best_silhouette_score)
    print("Best Parameters:", best_params)




## iris data set 
The iris data set is a test data set that is part of the Sklearn module 
which contains 150 records each with 4 features. All the features are represented by real numbers 

The data represents three classes 


In [71]:
from sklearn.datasets import load_iris
# Load the iris data set object returned by scikit-learn.
iris_data = load_iris()
# Bare expression: looks up the class labels of samples 10, 25 and 50; the result is not stored.
iris_data.target[[10, 25, 50]]
# -> array([0, 0, 1])
# Bare expression: the class names as a list; the result is not stored.
list(iris_data.target_names)
# Bare list literal with no effect; it appears to be the pasted output of the line above.
['setosa', 'versicolor', 'virginica']

# Build a DataFrame from the feature matrix only (target labels are not included) and print it
iris_df = pd.DataFrame(data=iris_data.data, columns=iris_data.feature_names)
print("Irissss",iris_df)

Irissss      sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                  5.1               3.5                1.4               0.2
1                  4.9               3.0                1.4               0.2
2                  4.7               3.2                1.3               0.2
3                  4.6               3.1                1.5               0.2
4                  5.0               3.6                1.4               0.2
..                 ...               ...                ...               ...
145                6.7               3.0                5.2               2.3
146                6.3               2.5                5.0               1.9
147                6.5               3.0                5.2               2.0
148                6.2               3.4                5.4               2.3
149                5.9               3.0                5.1               1.8

[150 rows x 4 columns]


* Repeat all the above clustering approaches and steps on the above data 
* Normalize the data then repeat all the above steps 
* Compare between the different clustering approaches 

In [98]:
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt

def kmeans_clustering_Iris(example_data, k_values):
    """Run scikit-learn K-means for several K on multi-feature data.

    Parameters
    ----------
    example_data : pandas.DataFrame, shape (n_samples, n_features)
        Data to cluster; columns are used as axis labels.
    k_values : iterable of int
        Values of K to try. Any iterable is accepted, including one-shot ones.

    Raises
    ------
    ValueError
        If ``k_values`` is empty.

    Notes
    -----
    For every K a grid of pairwise feature scatter plots coloured by cluster
    is drawn (diagonal cells are switched off). Afterwards SSE (``inertia_``)
    versus K and silhouette score versus K are plotted, and the K with the
    highest silhouette score is printed. Nothing is returned.
    """
    # Materialise once: k_values is iterated, indexed and plotted below.
    k_values = list(k_values)
    if not k_values:
        raise ValueError("k_values must contain at least one value of K")

    def plot_versus_k(values, title, ylabel):
        """Draw one metric as a line plot over the tried K values."""
        plt.figure(figsize=(10, 6))
        plt.plot(k_values, values, marker='o', linestyle='-')
        plt.title(title)
        plt.xlabel('Number of Clusters (K)')
        plt.ylabel(ylabel)
        plt.grid(True)
        plt.tight_layout()
        plt.show()

    num_features = example_data.shape[1]
    sse_values = []
    silhouette_scores = []

    for k in k_values:
        # Perform K-means clustering
        kmeans = KMeans(n_clusters=k)
        cluster_labels = kmeans.fit_predict(example_data)

        # Sum of Squared Errors (SSE) and silhouette score of this clustering
        sse_values.append(kmeans.inertia_)
        silhouette_scores.append(silhouette_score(example_data, cluster_labels))

        # Pairwise feature plots. plt.subplots creates its own figure, so no
        # separate plt.figure() call is made (it only left an empty figure behind).
        fig, axs = plt.subplots(num_features, num_features, figsize=(30, 25), squeeze=False)
        print(f'K-means Clustering with K={k}')
        for i in range(num_features):
            for j in range(num_features):
                if i != j:
                    axs[i, j].scatter(example_data.iloc[:, i], example_data.iloc[:, j], c=cluster_labels, cmap='rainbow', marker='o', edgecolors='k')
                    axs[i, j].set_xlabel(example_data.columns[i])
                    axs[i, j].set_ylabel(example_data.columns[j])
                else:
                    axs[i, j].axis('off')  # Turn off plot for same feature pairs

        plt.tight_layout()
        plt.show()

    # K with the maximum silhouette score
    best_k = k_values[int(np.argmax(silhouette_scores))]

    plot_versus_k(sse_values, 'Sum of Squared Errors (SSE) versus Number of Clusters (K)', 'SSE')
    print("Best K using silhouette score:", best_k)
    plot_versus_k(silhouette_scores, 'Silhouette Score versus Number of Clusters (K)', 'Silhouette Score')

    



In [100]:
#Hierorichal_Irias

import numpy as np
import matplotlib.pyplot as plt  # Importing plt from matplotlib.pyplot
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster
import seaborn as sns

def Hierarchal_clustering_Iris(example_data):
    """Try several hierarchical-clustering settings on multi-feature data.

    Every combination of metric (euclidean, cityblock, cosine), linkage
    (average, single) and distance threshold (None, 2, 5) is evaluated:

    * threshold ``None``: ``AgglomerativeClustering`` with 2 clusters;
    * numeric threshold: the SciPy linkage tree is cut at that distance with
      ``fcluster``.

    For each trial the dendrogram and a grid of pairwise feature scatter
    plots are drawn, and the silhouette score is printed when it is defined.
    Finally the best score and its parameters are printed. Nothing is
    returned.

    Parameters
    ----------
    example_data : pandas.DataFrame, shape (n_samples, n_features)
        Data to cluster; columns are used as axis labels.

    Notes
    -----
    * ``silhouette_score`` is called with its default metric for every trial,
      whatever metric was used to cluster.
    * The silhouette score is skipped unless 2..n_samples-1 distinct labels
      are found, the range for which it is defined.
    """
    affinities = ['euclidean', 'manhattan', 'cosine']
    linkages = ['average', 'single']
    distance_thresholds = [None, 2, 5]  # Adjust threshold values as needed

    n_samples, num_features = example_data.shape

    best_silhouette_score = -1  # lowest possible silhouette value
    best_params = None

    for affinity in affinities:
        if affinity == 'manhattan':
            affinity = 'cityblock'  # SciPy's name for the Manhattan metric
        for linkage_type in linkages:
            # The linkage tree depends only on metric and method: build it once per pair.
            Z = linkage(example_data, metric=affinity, method=linkage_type)
            for distance_threshold in distance_thresholds:
                if distance_threshold is not None:
                    cluster_labels = fcluster(Z, t=distance_threshold, criterion='distance')
                    dendrogram_title = f"Dendrogram ({affinity}, {linkage_type}, Threshold={distance_threshold})"
                else:
                    # Newer scikit-learn names this keyword `metric`; older
                    # releases only know `affinity`. Try the new name first.
                    try:
                        clustering = AgglomerativeClustering(n_clusters=2, metric=affinity, linkage=linkage_type)
                    except TypeError:
                        clustering = AgglomerativeClustering(n_clusters=2, affinity=affinity, linkage=linkage_type)
                    cluster_labels = clustering.fit_predict(example_data)
                    dendrogram_title = f"Dendrogram ({affinity}, {linkage_type}, n_clusters=2)"

                # Plot dendrogram
                plt.figure(figsize=(20, 18))
                plt.title(dendrogram_title)
                dendrogram(Z,leaf_rotation=90., leaf_font_size=8.)
                plt.xlabel('Data points')
                plt.ylabel('Distance')
                plt.show()

                # Pairwise feature plots. plt.subplots creates its own figure, so no
                # separate plt.figure() call is made (it only left an empty figure behind).
                fig, axs = plt.subplots(num_features, num_features, figsize=(30, 25), squeeze=False)
                print(f"Clusters - Affinity: {affinity}, Linkage: {linkage_type}, Distance Threshold: {distance_threshold}")
                for i in range(num_features):
                    for j in range(num_features):
                        if i != j:
                            axs[i, j].scatter(example_data.iloc[:, i], example_data.iloc[:, j], c=cluster_labels, cmap='rainbow', marker='o', edgecolors='k')
                            axs[i, j].set_xlabel(example_data.columns[i])
                            axs[i, j].set_ylabel(example_data.columns[j])
                        else:
                            axs[i, j].axis('off')  # Turn off plot for same feature pairs

                plt.tight_layout()
                plt.show()

                # Silhouette score, only where it is defined
                n_labels = len(np.unique(cluster_labels))
                if 1 < n_labels < n_samples:
                    silhouette_avg = silhouette_score(example_data, cluster_labels)
                    print(f"Silhouette Score: {silhouette_avg}")
                    if silhouette_avg > best_silhouette_score:
                        best_silhouette_score = silhouette_avg
                        best_params = {'Affinity': affinity, 'Linkage': linkage_type, 'Distance Threshold': distance_threshold}

    # Print the best silhouette score and its corresponding parameters
    print("Best Silhouette Score:", best_silhouette_score)
    print("Best Parameters:", best_params)


In [None]:
# The silhouette score of K-means is higher than that of hierarchical clustering, which means that
# K-means (with K=2) clusters the iris dataset better.

In [102]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
import numpy as np
import matplotlib.pyplot as plt

def DBSCan_iris(example_data):
    """Grid-search DBSCAN over `eps` and `min_samples` and report the best silhouette score.

    For every parameter combination that produces more than one distinct
    label, the silhouette score is computed and recorded, and a grid of
    pairwise feature scatter plots coloured by label is shown. Afterwards the
    recorded scores are plotted against the tested combinations and the best
    score with its parameters is printed.

    Parameters
    ----------
    example_data : pandas.DataFrame
        Feature table; it is read through `.iloc`, `.columns` and `.shape`.

    Returns
    -------
    None
        Results are only plotted and printed.

    Notes
    -----
    The labels returned by DBSCAN are passed to `silhouette_score` unchanged,
    so points labelled as noise (-1) are scored as if they were one cluster.
    """
    # Parameter grid: 10 eps values between 0.1 and 3, min_samples from 5 to 25.
    eps_values = np.linspace(0.1, 3, num=10)
    min_samples_values = range(5, 26)

    # Best result seen so far (-1 is the lowest possible silhouette score).
    best_silhouette_score = -1
    best_params = {}

    # Score and parameters of every combination that could be evaluated.
    silhouette_scores = []
    parameters = []

    num_features = example_data.shape[1]

    for eps in eps_values:
        for min_samples in min_samples_values:
            cluster_labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(example_data)

            # The silhouette score needs at least two distinct labels.
            if len(np.unique(cluster_labels)) <= 1:
                continue

            silhouette_avg = silhouette_score(example_data, cluster_labels)
            silhouette_scores.append(silhouette_avg)
            parameters.append({'EPS': eps, 'Min Samples': min_samples})

            if silhouette_avg > best_silhouette_score:
                best_silhouette_score = silhouette_avg
                best_params = {'EPS': eps, 'Min Samples': min_samples}

            # Pairwise scatter grid for this combination. Only one figure is
            # created per combination; squeeze=False keeps `axs` 2-D even when
            # the data has a single column.
            print(f'DBSCAN Clustering - EPS: {eps}, Min Samples: {min_samples}')
            fig, axs = plt.subplots(num_features, num_features, figsize=(15, 15), squeeze=False)
            for i in range(num_features):
                for j in range(num_features):
                    if i != j:
                        axs[i, j].scatter(example_data.iloc[:, i], example_data.iloc[:, j], c=cluster_labels, cmap='rainbow', marker='o', edgecolors='k')
                        axs[i, j].set_xlabel(example_data.columns[i])
                        axs[i, j].set_ylabel(example_data.columns[j])
                    else:
                        axs[i, j].axis('off')  # Turn off plot for same feature pairs

            plt.tight_layout()
            plt.show()

    # Plot silhouette score versus parameter variation
    plt.figure(figsize=(10, 6))
    plt.plot(range(len(silhouette_scores)), silhouette_scores, marker='o', linestyle='-')
    plt.title('Silhouette Score versus Parameter Variation')
    plt.xlabel('Parameter Combination')
    plt.ylabel('Silhouette Score')
    plt.xticks(range(len(silhouette_scores)), parameters, rotation=90)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Print the best silhouette score and its corresponding parameters
    print("Best Silhouette Score:", best_silhouette_score)
    print("Best Parameters:", best_params)


In [None]:
# K-means achieves a higher silhouette score than both hierarchical clustering and DBSCAN, so K-means (with K=2)
# clusters the iris dataset best. The silhouette scores of DBSCAN and hierarchical clustering are very close to each other.

In [104]:
from sklearn.mixture import GaussianMixture
import numpy as np
import matplotlib.pyplot as plt

def GMMClustering_Irissssss(df, n_components=3, random_state=None):
    """Compare Gaussian-mixture covariance types by silhouette score and plot each result.

    For each covariance type a `GaussianMixture` is fitted on `df`, its
    silhouette score is recorded, and for every pair of features a scatter
    plot (coloured by the labels of that fit) is shown next to a contour plot
    of a mixture fitted on just that feature pair with the same covariance
    type. Finally a bar chart of the scores is shown and the best score and
    covariance type are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; it is read through `.iloc` and `.shape`.
    n_components : int, default 3
        Number of mixture components used for every fit.
    random_state : int or None, default None
        Forwarded to `GaussianMixture`; pass an integer for repeatable results.

    Returns
    -------
    None
        Results are only plotted and printed.
    """
    covariance_types = ['full', 'tied', 'diag', 'spherical']

    # Best result seen so far (-1 is the lowest possible silhouette score).
    best_silhouette_score = -1
    best_params = {}

    # Score and parameters of every covariance type that could be evaluated.
    silhouette_scores = []
    parameters = []

    n_features = df.shape[1]

    for covariance_type in covariance_types:
        gmm = GaussianMixture(n_components=n_components, covariance_type=covariance_type, random_state=random_state)
        cluster_labels = gmm.fit_predict(df)

        # The silhouette score needs at least two distinct labels; skip a
        # degenerate fit instead of letting silhouette_score raise.
        if len(np.unique(cluster_labels)) <= 1:
            print(f'GMM Clustering - Covariance Type: {covariance_type} produced a single cluster; skipped')
            continue

        silhouette_avg = silhouette_score(df, cluster_labels)
        silhouette_scores.append(silhouette_avg)
        parameters.append({'Covariance Type': covariance_type})

        if silhouette_avg > best_silhouette_score:
            best_silhouette_score = silhouette_avg
            best_params = {'Covariance Type': covariance_type}

        # The plots below describe the covariance type announced here, so they
        # use this iteration's labels and covariance type, not the best so far.
        print(f'GMM Clustering - Covariance Type: {covariance_type}')
        for i in range(n_features):
            for j in range(i + 1, n_features):
                plt.figure(figsize=(10, 6))

                # Scatter plot
                plt.subplot(1, 2, 1)
                plt.scatter(df.iloc[:, i], df.iloc[:, j], c=cluster_labels, cmap='viridis', s=50, alpha=0.5)
                plt.xlabel(f'Feature {i}')
                plt.ylabel(f'Feature {j}')
                plt.title('Scatter Plot')

                # Two-feature mixture used only to draw the contours. It is
                # fitted on a plain array so that scoring the plain-array grid
                # below matches the training input type.
                gmm_pair = GaussianMixture(n_components=n_components, covariance_type=covariance_type, random_state=random_state)
                gmm_pair.fit(df.iloc[:, [i, j]].to_numpy())

                # Contour plot of the per-point log-likelihood on a 100x100
                # grid that extends one unit beyond the data range.
                plt.subplot(1, 2, 2)
                x_min, x_max = df.iloc[:, i].min() - 1, df.iloc[:, i].max() + 1
                y_min, y_max = df.iloc[:, j].min() - 1, df.iloc[:, j].max() + 1
                xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                                     np.linspace(y_min, y_max, 100))
                Z = gmm_pair.score_samples(np.column_stack([xx.ravel(), yy.ravel()]))
                Z = Z.reshape(xx.shape)
                plt.contourf(xx, yy, Z, cmap='viridis', levels=20, alpha=0.5)
                plt.xlabel(f'Feature {i}')
                plt.ylabel(f'Feature {j}')
                plt.title('Contour Plot')

                plt.tight_layout()
                plt.show()

    # Plot silhouette score versus covariance type
    plt.figure(figsize=(10, 6))
    plt.bar(range(len(silhouette_scores)), silhouette_scores, tick_label=[param['Covariance Type'] for param in parameters])
    plt.title('Silhouette Score versus Covariance Type')
    plt.xlabel('Covariance Type')
    plt.ylabel('Silhouette Score')
    plt.grid(True)
    plt.show()

    # Print the best silhouette score and its corresponding parameters
    print("Best Silhouette Score:", best_silhouette_score)
    print("Best Parameters:", best_params)

In [None]:
# A high silhouette score generally indicates good clustering: it implies dense, well-separated clusters with minimal overlap.
# The silhouette score of GMM is much lower than that of the other clustering techniques on this data, so K-means remains the best.

In [30]:
# Normalize the Iris dataset

In [106]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Fit a MinMaxScaler on the iris features; the fitted scaler stays available
# as `min_max_scaler`.
min_max_scaler = MinMaxScaler().fit(iris_df)

# Build the normalized DataFrame from the scaled values, keeping the original
# column names.
normalized_data = pd.DataFrame(
    min_max_scaler.transform(iris_df),
    columns=iris_df.columns,
)

In [None]:
# Most of the silhouette scores are very close to each other; most of them are about 0.63.

## Customer dataset
Repeat all the above on the customer data set 

In [88]:
# Load the customer records and use the 'ID' column as the row index.
Customer_df = pd.read_csv('F:/Machine_Learning/Customer_data.csv').set_index('ID')

In [None]:
# On the raw customer data, hierarchical clustering is the best technique with a silhouette score of 0.75; the other scores are much smaller, and DBSCAN could not work without normalization.

In [93]:
#Normalize_Customer_df
from sklearn.preprocessing import MinMaxScaler

# Work on a copy so that Customer_df itself is left untouched.
Normalize_Customer_df = Customer_df.copy(deep=True)

# Names of the float64 / int64 columns; only these are rescaled.
numerical_columns = Normalize_Customer_df.select_dtypes(
    include=['float64', 'int64']
).columns

# Fit the Min-Max scaler on the numerical columns, then overwrite those
# columns with their scaled values.
scaler = MinMaxScaler().fit(Normalize_Customer_df[numerical_columns])
Normalize_Customer_df[numerical_columns] = scaler.transform(
    Normalize_Customer_df[numerical_columns]
)


In [None]:
# From the previous results, the best silhouette score comes from DBSCAN (0.5209); the other scores are much smaller, which means that DBSCAN is the best clustering technique here.