In [1]:
# import libraries
import pandas as pd
import numpy as np 
from SOM_plus_clustering.modules.som import SOM
from SOM_plus_clustering.modules.som import kmeans
from sklearn import preprocessing
from sklearn.metrics import silhouette_score
import multiprocessing

In [2]:
# Load the headerless iris CSV and keep only columns 0-3; column 4 is
# discarded before clustering.
df = pd.read_csv('iris_data.csv', header=None).drop(columns=4)
df.head()

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [3]:
def get_ss(X: np.ndarray, y: np.ndarray) -> "float | None":
    """
    Compute the silhouette score of a labeling, tolerating degenerate labelings.

    Args:
        X (np.ndarray): Values of input variables of data.
        y (np.ndarray): Predicted cluster label for each row of X.

    Returns:
        float | None: The silhouette score, or None when ``silhouette_score``
        rejects the input with a ValueError (for example when every sample
        received the same label, so the score is undefined).
    """
    try:
        return silhouette_score(X, y)
    except ValueError:
        # Only the "score is undefined for this labeling" case is treated as
        # a missing value. Anything else (KeyboardInterrupt, NameError,
        # genuine bugs in the caller) must propagate instead of being
        # silently recorded as None.
        return None

In [4]:
def test_clustering_method(X: np.ndarray, total_trial: int, som_max_iteration:int, som_lr: float, som_nr:int, epoch:int, kmeans_total_cluster:int, som_m:int, som_n:int, path:str) -> pd.DataFrame:
    """
    Collects silhouette score of kmeans and SOM method with different initiator method.

    Args:
        X (np.ndarray): Values of test dataset.
        total_trial (int): Number of iteration for each clustering.
        som_max_iteration (int): Maximum iteration for self organizing matrix.
        som_lr (float): Value of learning rate (alpha) of Self Organzing Matrix.
        som_nr (int): Value of radius (gamma) of Self Organizing Matrix.
        epoch (int): Number of training iteration.
        kmeans_total_cluster (int): Total number of center that initiated in kmeans.
        som_m (int): Height of matrix in Self Organizing Matrix.
        som_n (int): Width of matrix in Self Organizing Matrix.

    Returns:
        pd.DataFrame: Table of silhouette score for each method.
    """
    list_data = list()
    for i in range(total_trial):
        model1 = kmeans(n_clusters=kmeans_total_cluster, method="random")
        model1.fit(X, epochs=epoch)
        
        model2 = kmeans(n_clusters=kmeans_total_cluster, method="kde")
        model2.fit(X, epochs=epoch)
        
        model3 = kmeans(n_clusters=kmeans_total_cluster, method="kmeans++")
        model3.fit(X, epochs=epoch)
        
        model4 = SOM(m = som_m, n = som_n, dim = X.shape[1], initiate_method = "random", max_iter = som_max_iteration, learning_rate = som_lr, neighbour_rad = som_nr)
        model4.fit(X, epoch=epoch)
        
        model5 = SOM(m = som_m, n = som_n, dim = X.shape[1], initiate_method = "kmeans", max_iter = som_max_iteration, learning_rate = som_lr, neighbour_rad = som_nr)
        model5.fit(X,epoch=epoch)
        
        model6 = SOM(m = som_m, n = som_n, dim = X.shape[1], initiate_method = "kde_kmeans", max_iter = som_max_iteration, learning_rate = som_lr, neighbour_rad = som_nr)
        model6.fit(X, epoch=epoch)
        
        model7 = SOM(m = som_m, n = som_n, dim = X.shape[1], initiate_method = "kmeans++", max_iter = som_max_iteration, learning_rate = som_lr, neighbour_rad = som_nr)
        model7.fit(X, epoch=epoch)
        
        model8 = SOM(m = som_m, n = som_n, dim = X.shape[1], initiate_method = "SOM++", max_iter = som_max_iteration, learning_rate = som_lr, neighbour_rad = som_nr)
        model8.fit(X, epoch=epoch)
        
        model9 = SOM(m = som_m, n = som_n, dim = X.shape[1], initiate_method = "kde", max_iter = som_max_iteration, learning_rate = som_lr, neighbour_rad = som_nr)
        model9.fit(X, epoch=epoch)
        """
        model1 = kmeans(n_clusters=kmeans_total_cluster, method="random")
        model2 = kmeans(n_clusters=kmeans_total_cluster, method="kde")
        model3 = kmeans(n_clusters=kmeans_total_cluster, method="kmeans++")
        model4 = SOM(m = som_m, n = som_n, dim = X.shape[1], initiate_method = "random", max_iter = som_max_iteration, learning_rate = som_lr, neighbour_rad = som_nr)
        model5 = SOM(m = som_m, n = som_n, dim = X.shape[1], initiate_method = "kmeans", max_iter = som_max_iteration, learning_rate = som_lr, neighbour_rad = som_nr)
        model6 = SOM(m = som_m, n = som_n, dim = X.shape[1], initiate_method = "kde_kmeans", max_iter = som_max_iteration, learning_rate = som_lr, neighbour_rad = som_nr)
        model7 = SOM(m = som_m, n = som_n, dim = X.shape[1], initiate_method = "kmeans++", max_iter = som_max_iteration, learning_rate = som_lr, neighbour_rad = som_nr)
        model8 = SOM(m = som_m, n = som_n, dim = X.shape[1], initiate_method = "SOM++", max_iter = som_max_iteration, learning_rate = som_lr, neighbour_rad = som_nr)
        model9 = SOM(m = som_m, n = som_n, dim = X.shape[1], initiate_method = "kde", max_iter = som_max_iteration, learning_rate = som_lr, neighbour_rad = som_nr)
        print("trial number", i)
        
        # multi processing part 1 - predict the model
        p1 = multiprocessing.Process(target=model1.fit(X, epochs=epoch))
        p2 = multiprocessing.Process(target=model2.fit(X, epochs=epoch))
        p3 = multiprocessing.Process(target=model3.fit(X, epochs=epoch))
        p4 = multiprocessing.Process(target=model4.fit(X, epoch=epoch))
        p5 = multiprocessing.Process(target=model5.fit(X, epoch=epoch))
        p6 = multiprocessing.Process(target=model6.fit(X, epoch=epoch))
        p7 = multiprocessing.Process(target=model7.fit(X, epoch=epoch))
        p8 = multiprocessing.Process(target=model8.fit(X, epoch=epoch))
        p9 = multiprocessing.Process(target=model9.fit(X, epoch=epoch))
        
        p1.start()
        p2.start()
        p3.start()
        p4.start()
        p5.start()
        p6.start()
        p7.start()
        p8.start()
        p9.start()
        
        p1.join()
        p2.join()
        p3.join()
        p4.join()
        p5.join()
        p6.join()
        p7.join()
        p8.join()
        p9.join()
        
        print("finnised trained the models")
        """
        # get the value of silhouette score
        ss1 = get_ss(X,model1.predict(X))
        ss2 = get_ss(X,model2.predict(X))
        ss3 = get_ss(X,model3.predict(X))
        ss4 = get_ss(X,model4.predict(X))
        ss5 = get_ss(X,model5.predict(X))
        ss6 = get_ss(X,model6.predict(X))
        ss7 = get_ss(X,model7.predict(X))
        ss8 = get_ss(X,model8.predict(X))
        ss9 = get_ss(X,model9.predict(X))
        
        print("finised evaluated the models")
        
        list_shs = [ss1, ss2, ss3, ss4, ss5, ss6, ss7, ss8, ss9]
        list_data.append(list_shs)
        print("saving data")
        data_table = pd.DataFrame(list_data, columns=["random kmeans", "kde kmeans", "kmeans++", "random SOM", "kmeans SOM", "kde kmeans SOM", "kmeans++ SOM", "SOM++", "kde SOM"])
        data_table.to_csv(path, index=False)
    return
        

In [5]:
# Take the raw feature matrix from the DataFrame and rescale it.
# NOTE: sklearn's preprocessing.normalize, with its defaults, scales each
# row (sample) to unit L2 norm; it does not standardise the columns.
test_values = preprocessing.normalize(df.values)

# Run the benchmark; the silhouette scores are written to the CSV at `path`.
test_clustering_method(
    X=test_values,
    total_trial=1,
    som_max_iteration=500,
    som_lr=0.5,
    som_nr=4,
    epoch=2,
    kmeans_total_cluster=8,
    som_m=4,
    som_n=2,
    path="Datas/silhouette_score_data_dummy.csv",
)

finised evaluated the models
saving data
