# FINAL VERSION

### Imports from Libraries

In [None]:
# Global
import os 
import numpy as np
import pandas as pd
from math import sqrt

In [None]:
# Cluster Algorithm
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Performance Measures
from sklearn.metrics import adjusted_rand_score as ari
from sklearn.metrics import normalized_mutual_info_score as nmi

In [None]:
# Improve Display
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore') 

### Loading the dataset

In [None]:
# Load the Aggregation data set: tab-separated text file without a header row
dataset = pd.read_csv('../datasets/Aggregation.txt', sep='\t', header=None)
# Name the three columns; 'A' and 'B' are used as features and 'target' as the reference labelling below
dataset.columns=['A', 'B', 'target']

In [None]:
# Keep only the two feature columns; the reference labels stay in dataset['target']
X = dataset[['A', 'B']]

### Principal Component Analysis

In [None]:
from sklearn.decomposition import PCA

In [None]:
# X has two columns and n_components=2, so the projection keeps both dimensions
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X)

In [None]:
# Wrap the projected coordinates in a DataFrame, reusing the column names 'A' and 'B'
X = pd.DataFrame(data=principalComponents, columns=['A', 'B'])

In [None]:
# Rebuild dataset from the projected coordinates plus the original 'target' column
dataset = pd.concat([X, dataset[['target']]], axis=1)

### Simple Clustering Method

In [None]:
# Number of clusters requested from each of the simple clustering methods below
k = 7

In [None]:
# K-Means (random initialisation): one fit, plotted with ARI / NMI in the title
model = KMeans(n_clusters=k, init="random", max_iter=100, random_state=None, n_init=1).fit(X).labels_
ari_score = round(ari(dataset.target, model), 2)
nmi_score = round(nmi(dataset.target, model), 2)
fig = px.scatter(
    dataset, x='A', y='B', color=model,
    title='K-Means' + ' - ARI: ' + str(ari_score) + ' - NMI: ' + str(nmi_score),
)
fig.show()

In [None]:
# K-Means++ initialisation: one fit, plotted with ARI / NMI in the title
model = KMeans(n_clusters=k, init="k-means++", max_iter=100, random_state=None, n_init=1).fit(X).labels_
ari_score = round(ari(dataset.target, model), 2)
nmi_score = round(nmi(dataset.target, model), 2)
fig = px.scatter(
    dataset, x='A', y='B', color=model,
    title='K-Means++' + ' - ARI: ' + str(ari_score) + ' - NMI: ' + str(nmi_score),
)
fig.show()

In [None]:
# Agglomerative clustering, Ward linkage: one fit, plotted with ARI / NMI in the title
model = AgglomerativeClustering(n_clusters=k, linkage='ward').fit(X).labels_
ari_score = round(ari(dataset.target, model), 2)
nmi_score = round(nmi(dataset.target, model), 2)
fig = px.scatter(
    dataset, x='A', y='B', color=model,
    title='CAH - Ward' + ' - ARI: ' + str(ari_score) + ' - NMI: ' + str(nmi_score),
)
fig.show()

In [None]:
# Agglomerative clustering, complete linkage: one fit, plotted with ARI / NMI in the title
model = AgglomerativeClustering(n_clusters=k, linkage='complete').fit(X).labels_
ari_score = round(ari(dataset.target, model), 2)
nmi_score = round(nmi(dataset.target, model), 2)
fig = px.scatter(
    dataset, x='A', y='B', color=model,
    title='CAH - Complete' + ' - ARI: ' + str(ari_score) + ' - NMI: ' + str(nmi_score),
)
fig.show()

In [None]:
# Agglomerative clustering, average linkage: one fit, plotted with ARI / NMI in the title
model = AgglomerativeClustering(n_clusters=k, linkage='average').fit(X).labels_
ari_score = round(ari(dataset.target, model), 2)
nmi_score = round(nmi(dataset.target, model), 2)
fig = px.scatter(
    dataset, x='A', y='B', color=model,
    title='CAH - Average' + ' - ARI: ' + str(ari_score) + ' - NMI: ' + str(nmi_score),
)
fig.show()

In [None]:
# Agglomerative clustering, single linkage: one fit, plotted with ARI / NMI in the title
model = AgglomerativeClustering(n_clusters=k, linkage='single').fit(X).labels_
ari_score = round(ari(dataset.target, model), 2)
nmi_score = round(nmi(dataset.target, model), 2)
fig = px.scatter(
    dataset, x='A', y='B', color=model,
    title='CAH - Single' + ' - ARI: ' + str(ari_score) + ' - NMI: ' + str(nmi_score),
)
fig.show()

In [None]:
# Gaussian mixture, full covariance: one fit, plotted with ARI / NMI in the title
model = GaussianMixture(n_components=k, covariance_type='full').fit(X).predict(X)
ari_score = round(ari(dataset.target, model), 2)
nmi_score = round(nmi(dataset.target, model), 2)
fig = px.scatter(
    dataset, x='A', y='B', color=model,
    title='GMM - Full' + ' - ARI: ' + str(ari_score) + ' - NMI: ' + str(nmi_score),
)
fig.show()

In [None]:
# Gaussian mixture, tied covariance: one fit, plotted with ARI / NMI in the title
model = GaussianMixture(n_components=k, covariance_type='tied').fit(X).predict(X)
ari_score = round(ari(dataset.target, model), 2)
nmi_score = round(nmi(dataset.target, model), 2)
fig = px.scatter(
    dataset, x='A', y='B', color=model,
    title='GMM - Tied' + ' - ARI: ' + str(ari_score) + ' - NMI: ' + str(nmi_score),
)
fig.show()

In [None]:
# Gaussian mixture, diagonal covariance: one fit, plotted with ARI / NMI in the title
model = GaussianMixture(n_components=k, covariance_type='diag').fit(X).predict(X)
ari_score = round(ari(dataset.target, model), 2)
nmi_score = round(nmi(dataset.target, model), 2)
fig = px.scatter(
    dataset, x='A', y='B', color=model,
    title='GMM - Diag' + ' - ARI: ' + str(ari_score) + ' - NMI: ' + str(nmi_score),
)
fig.show()

In [None]:
# Gaussian mixture, spherical covariance: one fit, plotted with ARI / NMI in the title
model = GaussianMixture(n_components=k, covariance_type='spherical').fit(X).predict(X)
ari_score = round(ari(dataset.target, model), 2)
nmi_score = round(nmi(dataset.target, model), 2)
fig = px.scatter(
    dataset, x='A', y='B', color=model,
    title='GMM - Spherical' + ' - ARI: ' + str(ari_score) + ' - NMI: ' + str(nmi_score),
)
fig.show()

### Performance of the Simple Clustering Methods

In [None]:
# K-Means (random initialisation): mean ARI / NMI over 100 fits
ariTab, nmiTab = [], []
for i in tqdm(range(100)):
    estimator = KMeans(n_clusters=k, init="random", max_iter=100, random_state=None, n_init=1)
    model = estimator.fit(X).labels_
    ariTab.append(round(ari(dataset.target, model), 2))
    nmiTab.append(round(nmi(dataset.target, model), 2))
print('K-Means')
for label, scores in (('ARI: ', ariTab), ('NMI: ', nmiTab)):
    print(label, np.mean(scores).round(2))

In [None]:
# K-Means++ initialisation: mean ARI / NMI over 100 fits
ariTab, nmiTab = [], []
for i in tqdm(range(100)):
    estimator = KMeans(n_clusters=k, init="k-means++", max_iter=100, random_state=None, n_init=1)
    model = estimator.fit(X).labels_
    ariTab.append(round(ari(dataset.target, model), 2))
    nmiTab.append(round(nmi(dataset.target, model), 2))
print('K-Means++')
for label, scores in (('ARI: ', ariTab), ('NMI: ', nmiTab)):
    print(label, np.mean(scores).round(2))

In [None]:
# Agglomerative clustering, Ward linkage: mean ARI / NMI over 100 fits
ariTab, nmiTab = [], []
for i in tqdm(range(100)):
    estimator = AgglomerativeClustering(n_clusters=k, linkage='ward')
    model = estimator.fit(X).labels_
    ariTab.append(round(ari(dataset.target, model), 2))
    nmiTab.append(round(nmi(dataset.target, model), 2))
print('CAH - Ward Linkage')
for label, scores in (('ARI: ', ariTab), ('NMI: ', nmiTab)):
    print(label, np.mean(scores).round(2))

In [None]:
# Agglomerative clustering, complete linkage: mean ARI / NMI over 100 fits
ariTab, nmiTab = [], []
for i in tqdm(range(100)):
    estimator = AgglomerativeClustering(n_clusters=k, linkage='complete')
    model = estimator.fit(X).labels_
    ariTab.append(round(ari(dataset.target, model), 2))
    nmiTab.append(round(nmi(dataset.target, model), 2))
print('CAH - Complete Linkage')
for label, scores in (('ARI: ', ariTab), ('NMI: ', nmiTab)):
    print(label, np.mean(scores).round(2))

In [None]:
# Agglomerative clustering, average linkage: mean ARI / NMI over 100 fits
ariTab, nmiTab = [], []
for i in tqdm(range(100)):
    estimator = AgglomerativeClustering(n_clusters=k, linkage='average')
    model = estimator.fit(X).labels_
    ariTab.append(round(ari(dataset.target, model), 2))
    nmiTab.append(round(nmi(dataset.target, model), 2))
print('CAH - Average Linkage')
for label, scores in (('ARI: ', ariTab), ('NMI: ', nmiTab)):
    print(label, np.mean(scores).round(2))

In [None]:
# Agglomerative clustering, single linkage: mean ARI / NMI over 100 fits
ariTab, nmiTab = [], []
for i in tqdm(range(100)):
    estimator = AgglomerativeClustering(n_clusters=k, linkage='single')
    model = estimator.fit(X).labels_
    ariTab.append(round(ari(dataset.target, model), 2))
    nmiTab.append(round(nmi(dataset.target, model), 2))

print('CAH - Single Linkage')
for label, scores in (('ARI: ', ariTab), ('NMI: ', nmiTab)):
    print(label, np.mean(scores).round(2))

In [None]:
# Gaussian mixture, full covariance: mean ARI / NMI over 100 fits
ariTab, nmiTab = [], []
for i in tqdm(range(100)):
    mixture = GaussianMixture(n_components=k, covariance_type='full')
    model = mixture.fit(X).predict(X)
    ariTab.append(round(ari(dataset.target, model), 2))
    nmiTab.append(round(nmi(dataset.target, model), 2))
print('GMM - Full')
for label, scores in (('ARI: ', ariTab), ('NMI: ', nmiTab)):
    print(label, np.mean(scores).round(2))

In [None]:
# Gaussian mixture, tied covariance: mean ARI / NMI over 100 fits
ariTab, nmiTab = [], []
for i in tqdm(range(100)):
    mixture = GaussianMixture(n_components=k, covariance_type='tied')
    model = mixture.fit(X).predict(X)
    ariTab.append(round(ari(dataset.target, model), 2))
    nmiTab.append(round(nmi(dataset.target, model), 2))
print('GMM - Tied')
for label, scores in (('ARI: ', ariTab), ('NMI: ', nmiTab)):
    print(label, np.mean(scores).round(2))

In [None]:
# Gaussian mixture, diagonal covariance: mean ARI / NMI over 100 fits
ariTab, nmiTab = [], []
for i in tqdm(range(100)):
    mixture = GaussianMixture(n_components=k, covariance_type='diag')
    model = mixture.fit(X).predict(X)
    ariTab.append(round(ari(dataset.target, model), 2))
    nmiTab.append(round(nmi(dataset.target, model), 2))
print('GMM - Diag')
for label, scores in (('ARI: ', ariTab), ('NMI: ', nmiTab)):
    print(label, np.mean(scores).round(2))

In [None]:
# Gaussian mixture, spherical covariance: mean ARI / NMI over 100 fits
ariTab, nmiTab = [], []
for i in tqdm(range(100)):
    mixture = GaussianMixture(n_components=k, covariance_type='spherical')
    model = mixture.fit(X).predict(X)
    ariTab.append(round(ari(dataset.target, model), 2))
    nmiTab.append(round(nmi(dataset.target, model), 2))
print('GMM - Spherical')
for label, scores in (('ARI: ', ariTab), ('NMI: ', nmiTab)):
    print(label, np.mean(scores).round(2))

### Ensemble Member Generation

In [None]:
# Shared ensemble state: every partition generated in "mix" mode, and how many came from each algorithm
member_generation = list()
algorithms = dict.fromkeys(
    ['kmeans', 'kmeans++', 'cah_ward', 'cah_complete', 'cah_average', 'cah_single', 'gmm_full', 'gmm_tied', 'gmm_diag', 'gmm_spherical'],
    0,
)

In [None]:
def reset():
    '''
    Reinitialise the shared ensemble state

    Rebinds the global member_generation to an empty list and the global
    algorithms to a dict holding a zero counter for each algorithm name
    '''
    global member_generation, algorithms

    algorithm_names = ['kmeans', 'kmeans++', 'cah_ward', 'cah_complete', 'cah_average', 'cah_single', 'gmm_full', 'gmm_tied', 'gmm_diag', 'gmm_spherical']
    member_generation = list()
    algorithms = dict.fromkeys(algorithm_names, 0)

In [None]:
def member_generation_function(X, algorithm='kmeans', nb_partition=5, mix=True):
    '''
    Generate nb_partition base clusterings of X with one algorithm

    Every partition is fitted with its own number of clusters, drawn with
    np.random.randint(2, int(sqrt(number of rows of X))).

    X : dataset (one row per individual)
    algorithm : 'kmeans', 'kmeans++', 'cah_ward', 'cah_complete', 'cah_average',
                'cah_single', 'gmm_full', 'gmm_tied', 'gmm_diag' or 'gmm_spherical'
    nb_partition : number of partitions to generate
    mix : if True, the partitions are appended to the global member_generation list,
          the global algorithms counter of this algorithm is incremented once per
          partition, and nothing is returned (used to merge several algorithms into
          one ensemble); if False, the list of label vectors is returned and the
          globals are left untouched

    Raises ValueError if algorithm is not one of the supported names
    '''
    global member_generation
    global algorithms

    # Each builder returns a function mapping a cluster count to a label vector for X.
    def _kmeans(init):
        return lambda k: KMeans(n_clusters=k, init=init, max_iter=100, random_state=None, n_init=1).fit(X).labels_

    def _cah(linkage):
        return lambda k: AgglomerativeClustering(n_clusters=k, linkage=linkage).fit(X).labels_

    def _gmm(covariance_type):
        return lambda k: GaussianMixture(n_components=k, covariance_type=covariance_type).fit(X).predict(X)

    fitters = {
        'kmeans': _kmeans("random"),
        'kmeans++': _kmeans("k-means++"),
        'cah_ward': _cah('ward'),
        'cah_complete': _cah('complete'),
        'cah_average': _cah('average'),
        'cah_single': _cah('single'),
        'gmm_full': _gmm('full'),
        'gmm_tied': _gmm('tied'),
        'gmm_diag': _gmm('diag'),
        'gmm_spherical': _gmm('spherical'),
    }
    # Fail early with a clear message instead of hitting an unbound `model` later on.
    if algorithm not in fitters:
        raise ValueError('Unknown algorithm ' + repr(algorithm) + '; expected one of ' + ', '.join(fitters))
    fit_labels = fitters[algorithm]

    # randint needs an integer bound; truncate the square root explicitly, once.
    k_upper = int(sqrt(X.shape[0]))

    partition = []
    for _ in range(nb_partition):
        k = np.random.randint(2, k_upper)
        if mix:
            algorithms[algorithm] += 1
        partition.append(fit_labels(k))

    if mix:
        member_generation.extend(partition)
    else:
        return partition

### Consensus Function

In [None]:
def one_hot_encoding(member_generation):
    '''
    Build the co-association matrix of an ensemble from one-hot encodings

    Each column of member_generation is one partition. Its cluster memberships
    are one-hot encoded (individuals x clusters); multiplying that encoding by
    its transpose gives a 0/1 matrix telling which pairs share a cluster. The
    result is the average of these matrices over all partitions.

    member_generation : matrix with all partitions, one row per individual and
                        one column per partition; label values may be arbitrary
                        (they do not have to be 0..nb_cluster-1)

    Returns the (individuals x individuals) co-association matrix, rounded to 2 decimals
    Raises ValueError if member_generation contains no partition
    '''
    member_generation = np.asarray(member_generation)
    nb_individu, nb_partitions = member_generation.shape
    if nb_partitions == 0:
        raise ValueError('member_generation must contain at least one partition')

    rows = np.arange(nb_individu)
    co_occurence = np.zeros((nb_individu, nb_individu))
    for partition in member_generation.T:
        # return_inverse gives, for every individual, the position of its label among
        # the sorted unique labels: a valid column index whatever the label values are.
        clusters, cluster_idx = np.unique(partition, return_inverse=True)
        one_hot = np.zeros((nb_individu, len(clusters)))
        one_hot[rows, cluster_idx.reshape(-1)] = 1
        co_occurence += one_hot @ one_hot.T

    return (co_occurence / nb_partitions).round(2)

In [None]:
def consensus_function(member_generation):
    '''
    Compute the co-association matrix of a set of partitions

    member_generation : matrix with all partitions, one row per individual and
                        one column per partition

    Returns a square matrix whose entry (i, j) is the fraction of partitions in
    which individuals i and j received the same label
    '''
    nb_individu, nb_partitions = member_generation.shape
    co_occurence = np.zeros((nb_individu, nb_individu))
    for idx, reference in enumerate(member_generation):
        # Broadcasting compares the labels of every individual with those of `reference`.
        same_label = member_generation == reference
        co_occurence[idx] = same_label.sum(axis=1) / nb_partitions
    return co_occurence


In [None]:
def final_clustering_result(co_occurence, n_clusters):
    '''
    Cut the co-association matrix into the final partition

    Spectral Clustering is fitted on co_occurence used directly as a
    precomputed affinity, with a fixed random_state of 0.

    co_occurence : co-association matrix
    n_clusters : number of clusters to search

    Returns the labels found by the spectral clustering
    '''
    spectral = SpectralClustering(affinity="precomputed", n_clusters=n_clusters, random_state=0)
    return spectral.fit(co_occurence).labels_

### Evaluation

In [None]:
def accuracy(label_real, label_pred):
    '''
    Score a predicted labelling against the real one using ARI and NMI

    label_real: the real label
    label_pred: the predicted label

    Returns the tuple (ARI, NMI), each rounded to 2 decimals
    '''
    return tuple(round(metric(label_real, label_pred), 2) for metric in (ari, nmi))
    

### Clustering Ensemble

In [None]:
def clustering_ensemble(X, algorithm='kmeans', k=7, nb_partition=5, mix=True, show_plot=True):
    '''
    Implementation of the Clustering Ensemble method

    Generates nb_partition partitions of X, builds their co-association matrix,
    cuts it into k clusters and scores the result against dataset.target.

    X : dataset; its rows must line up with the module-level `dataset`, which
        provides the 'A' / 'B' plot coordinates and the 'target' reference labels
    algorithm : the name of the algorithm to use
    k : number of clusters of the final consensus partition
    nb_partition : number of partitions to generate
    mix : if True, the new partitions are added to the global member_generation
          list and the consensus is computed on everything accumulated there since
          the last reset(); if False, only the partitions of this call are used
    show_plot : if True (default), display the scatter plot of the consensus partition

    Returns the tuple (ARI, NMI) of the consensus partition, rounded to 2 decimals
    '''
    # Clustering Ensemble Algorithm
    if mix:
        member_generation_function(X, algorithm, nb_partition, mix)
        partitions = member_generation
    else:
        partitions = member_generation_function(X, algorithm, nb_partition, mix)

    # Partitions are stored one per row; the consensus expects one per column.
    consensus = consensus_function(np.transpose(partitions))
    label_pred = final_clustering_result(consensus, k)

    # Performance of the system, computed once and shared by the title and the return value
    ari_score, nmi_score = accuracy(dataset.target, label_pred)

    # Plot the result
    if show_plot:
        title = 'Algorithme : ' + algorithm.upper() + ' - Nb Cluster : ' + str(k) + ' - Nb Partition : ' + str(nb_partition) + ' - ARI: ' + str(ari_score) + ' - NMI: ' + str(nmi_score)
        fig = px.scatter(dataset, x="A", y="B", color=label_pred, title=title)
        fig.show()

    return ari_score, nmi_score

### Launch the application

In [None]:
# Start from an empty ensemble and zeroed per-algorithm counters, then display the counters
reset()
print(algorithms)

In [None]:
# Single ensemble run: 55 K-Means partitions, consensus cut into 7 clusters.
# NOTE: despite its name, `labels` receives the (ARI, NMI) tuple returned by clustering_ensemble.
labels = clustering_ensemble(X, algorithm='kmeans', k=7, nb_partition=55, mix=False)

### Performance of the Ensemble Clustering Method

In [None]:
# k: number of clusters of the consensus partition; p: number of partitions generated per ensemble
k = 7
p = 55
# Lines 23-24 of the clustering_ensemble cell (the px.scatter / fig.show() calls) are commented out for these runs so that the scatter plot is not displayed

In [None]:
# Ensemble of K-Means (random initialisation) partitions: mean ARI / NMI over 100 ensemble runs
ariTab, nmiTab = [], []
for i in tqdm(range(100)):
    a, b = clustering_ensemble(X, algorithm='kmeans', k=k, nb_partition=p, mix=False)
    ariTab.append(a)
    nmiTab.append(b)
print('K-Means')
for label, scores in (('ARI: ', ariTab), ('NMI: ', nmiTab)):
    print(label, np.mean(scores).round(2))

In [None]:
# Ensemble of K-Means++ partitions: mean ARI / NMI over 100 ensemble runs
ariTab, nmiTab = [], []
for i in tqdm(range(100)):
    a, b = clustering_ensemble(X, algorithm='kmeans++', k=k, nb_partition=p, mix=False)
    ariTab.append(a)
    nmiTab.append(b)
print('K-Means++')
for label, scores in (('ARI: ', ariTab), ('NMI: ', nmiTab)):
    print(label, np.mean(scores).round(2))

In [None]:
# Ensemble of Ward-linkage partitions: mean ARI / NMI over 100 ensemble runs
ariTab, nmiTab = [], []
for i in tqdm(range(100)):
    a, b = clustering_ensemble(X, algorithm='cah_ward', k=k, nb_partition=p, mix=False)
    ariTab.append(a)
    nmiTab.append(b)
print('CAH - Ward Linkage')
for label, scores in (('ARI: ', ariTab), ('NMI: ', nmiTab)):
    print(label, np.mean(scores).round(2))

In [None]:
# Ensemble of complete-linkage partitions: mean ARI / NMI over 100 ensemble runs
ariTab, nmiTab = [], []
for i in tqdm(range(100)):
    a, b = clustering_ensemble(X, algorithm='cah_complete', k=k, nb_partition=p, mix=False)
    ariTab.append(a)
    nmiTab.append(b)
print('CAH - Complete Linkage')
for label, scores in (('ARI: ', ariTab), ('NMI: ', nmiTab)):
    print(label, np.mean(scores).round(2))

In [None]:
# Ensemble of average-linkage partitions: mean ARI / NMI over 100 ensemble runs
ariTab, nmiTab = [], []
for i in tqdm(range(100)):
    a, b = clustering_ensemble(X, algorithm='cah_average', k=k, nb_partition=p, mix=False)
    ariTab.append(a)
    nmiTab.append(b)
print('CAH - Average Linkage')
for label, scores in (('ARI: ', ariTab), ('NMI: ', nmiTab)):
    print(label, np.mean(scores).round(2))

In [None]:
# Ensemble of single-linkage partitions: mean ARI / NMI over 100 ensemble runs
ariTab, nmiTab = [], []
for i in tqdm(range(100)):
    a, b = clustering_ensemble(X, algorithm='cah_single', k=k, nb_partition=p, mix=False)
    ariTab.append(a)
    nmiTab.append(b)
print('CAH - Single Linkage')
for label, scores in (('ARI: ', ariTab), ('NMI: ', nmiTab)):
    print(label, np.mean(scores).round(2))

In [None]:
# Ensemble of full-covariance Gaussian mixture partitions: mean ARI / NMI over 100 ensemble runs
ariTab, nmiTab = [], []
for i in tqdm(range(100)):
    a, b = clustering_ensemble(X, algorithm='gmm_full', k=k, nb_partition=p, mix=False)
    ariTab.append(a)
    nmiTab.append(b)
print('GMM - Full')
for label, scores in (('ARI: ', ariTab), ('NMI: ', nmiTab)):
    print(label, np.mean(scores).round(2))

In [None]:
# Ensemble of tied-covariance Gaussian mixture partitions: mean ARI / NMI over 100 ensemble runs
ariTab, nmiTab = [], []
for i in tqdm(range(100)):
    a, b = clustering_ensemble(X, algorithm='gmm_tied', k=k, nb_partition=p, mix=False)
    ariTab.append(a)
    nmiTab.append(b)
print('GMM - Tied')
for label, scores in (('ARI: ', ariTab), ('NMI: ', nmiTab)):
    print(label, np.mean(scores).round(2))

In [None]:
# Ensemble of diagonal-covariance Gaussian mixture partitions: mean ARI / NMI over 100 ensemble runs
ariTab, nmiTab = [], []
for i in tqdm(range(100)):
    a, b = clustering_ensemble(X, algorithm='gmm_diag', k=k, nb_partition=p, mix=False)
    ariTab.append(a)
    nmiTab.append(b)
print('GMM - Diag')
for label, scores in (('ARI: ', ariTab), ('NMI: ', nmiTab)):
    print(label, np.mean(scores).round(2))

In [None]:
# Ensemble of spherical-covariance Gaussian mixture partitions: mean ARI / NMI over 100 ensemble runs
ariTab, nmiTab = [], []
for i in tqdm(range(100)):
    a, b = clustering_ensemble(X, algorithm='gmm_spherical', k=k, nb_partition=p, mix=False)
    ariTab.append(a)
    nmiTab.append(b)
print('GMM - Spherical')
for label, scores in (('ARI: ', ariTab), ('NMI: ', nmiTab)):
    print(label, np.mean(scores).round(2))

In [None]:
# k: number of clusters of the consensus partition; p: number of partitions generated per algorithm in the mixed ensembles below
k = 7
p = 55

In [None]:
# Mix of K-algorithm
# Each run builds one mixed ensemble (p K-Means + p K-Means++ partitions) and scores its consensus.
ariTab = []
nmiTab = []
for i in tqdm(range(100)):
    reset()
    # Only accumulate the first algorithm's partitions: a consensus computed on the
    # partial ensemble would be thrown away as soon as the next call overwrites a, b.
    member_generation_function(X, 'kmeans', p, True)
    # The last call adds its own partitions and clusters the complete ensemble.
    a, b = clustering_ensemble(X,'kmeans++',k, p, True)
    ariTab.append(a)
    nmiTab.append(b)

print(np.mean(ariTab))
print(np.mean(nmiTab))

In [None]:
# Mix of CAH method
# Each run builds one mixed ensemble (p partitions per linkage) and scores its consensus.
ariTab = []
nmiTab = []
for i in tqdm(range(100)):
    reset()
    # Only accumulate partitions for all but the last linkage: a consensus computed on
    # a partial ensemble would be thrown away as soon as the next call overwrites a, b.
    for name in ('cah_ward', 'cah_complete', 'cah_average'):
        member_generation_function(X, name, p, True)
    # The last call adds its own partitions and clusters the complete ensemble.
    a, b = clustering_ensemble(X,'cah_single',k, p, True)
    ariTab.append(a)
    nmiTab.append(b)

print(np.mean(ariTab))
print(np.mean(nmiTab))

In [None]:
# Mix of GMM method
# Each run builds one mixed ensemble (p partitions per covariance type) and scores its consensus.
ariTab = []
nmiTab = []
for i in tqdm(range(100)):
    reset()
    # Only accumulate partitions for all but the last covariance type: a consensus computed
    # on a partial ensemble would be thrown away as soon as the next call overwrites a, b.
    for name in ('gmm_full', 'gmm_tied', 'gmm_diag'):
        member_generation_function(X, name, p, True)
    # The last call adds its own partitions and clusters the complete ensemble.
    a, b = clustering_ensemble(X,'gmm_spherical',k, p, True)
    ariTab.append(a)
    nmiTab.append(b)

print(np.mean(ariTab))
print(np.mean(nmiTab))

In [None]:
# Mix of all the clustering algorithm
# Each run builds one mixed ensemble (p partitions per algorithm) and scores its consensus.
ariTab = []
nmiTab = []
for i in tqdm(range(100)):
    reset()
    # Only accumulate partitions for all but the last algorithm: a consensus computed on
    # a partial ensemble would be thrown away as soon as the next call overwrites a, b.
    for name in ('kmeans', 'kmeans++', 'cah_ward', 'cah_complete', 'cah_average', 'cah_single', 'gmm_full', 'gmm_tied', 'gmm_diag'):
        member_generation_function(X, name, p, True)
    # The last call adds its own partitions and clusters the complete ensemble.
    a, b = clustering_ensemble(X,'gmm_spherical',k, p, True)
    ariTab.append(a)
    nmiTab.append(b)

print(np.mean(ariTab))
print(np.mean(nmiTab))