In [None]:
# Created by: Adam Fabo
# Date: 22.5.2022
# Created at HMU Crete
# Class: Neural Networks
# This file contains a script that finds the optimal number of clusters for the given data and plots the results (Chapter 8 in the documentation)
 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import neurolab as nl
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import preprocessing
from yellowbrick.cluster import KElbowVisualizer
from sklearn.metrics import silhouette_samples, silhouette_score




In [None]:
# Read the banknote data set. The file has no header row, so the column
# names are attached right after parsing.
#
# Column meaning:
#   Variance - variance of Wavelet Transformed image (continuous)
#   Skewness - skewness of Wavelet Transformed image (continuous)
#   Curtosis - curtosis of Wavelet Transformed image (continuous)
#   Entropy  - entropy of image (continuous)
#   Class    - class (integer)
data = pd.read_csv(
    'data_banknote_auth_trimmed.txt',
    sep=",",
    header=None,
).set_axis(["Variance", "Skewness", "Curtosis", "Entropy", "Class"], axis=1)

# Preview the first rows as the cell output
data.head()

In [None]:
# Remove the "Class" label column so that only the feature columns remain
# for clustering.
# errors="ignore" keeps this cell idempotent: `data` is rebound to the result,
# so running the cell a second time would otherwise raise KeyError because
# "Class" has already been dropped.
data = data.drop(columns=["Class"], errors="ignore")

# Preview the remaining columns as the cell output
data.head()

In [None]:
# Min-max scale the features; feature_range=(0, 1) is MinMaxScaler's default,
# written out here so the target range is visible to the reader.
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
data_scaled = min_max_scaler.fit_transform(data)

In [None]:
# Elbow method: let yellowbrick refit KMeans over the k range given below and
# plot the resulting score curve to pick the number of clusters.

# A fixed random_state makes the centroid initialisation - and therefore the
# curve and the detected elbow - reproducible across kernel restarts. The seed
# matches the one used for the per-cluster silhouette plots in this notebook.
model = KMeans(init='k-means++', random_state=10)

# locate_elbow=True marks the elbow found by the knee point detection algorithm.
# timings=False hides the fit-time curve.
# NOTE(review): per the yellowbrick docs a 2-tuple k is treated as a half-open
# range, i.e. k = 1..10 here - confirm against the installed version.
visualizer = KElbowVisualizer(model, k=(1, 11), timings=False, locate_elbow=True)
visualizer.fit(data_scaled)

# Enlarge every text artist on the axes so the plot stays readable in a report
for label in visualizer.ax.texts:
    label.set_size(17)

# To write the figure to disk instead of only displaying it, use:
#     visualizer.show(outpath="kelbow_visualised.png");
visualizer.show();


In [None]:
# Silhouette analysis: mean silhouette coefficient for k = 2..10 clusters.
# The range starts at 2 because the silhouette score needs at least two labels.

# NOTE: the list keeps its original name `sse_` for compatibility, but it holds
# [k, mean silhouette score] pairs - not sums of squared errors.
sse_ = []
for k in range(2, 11):
    # Fixed seed (same value as the per-cluster silhouette plots) so the curve
    # is reproducible after Restart & Run All.
    kmeans = KMeans(n_clusters=k, random_state=10).fit(data_scaled)
    sse_.append([k, silhouette_score(data_scaled, kmeans.labels_)])

# Build the frame once instead of once per plotted axis
silhouette_df = pd.DataFrame(sse_, columns=["k", "silhouette"])

plt.plot(silhouette_df["k"], silhouette_df["silhouette"])
plt.title('Silhouette analysis')
plt.xlabel('Num of clusters')
plt.ylabel('Silhouette coefficient')
# To save the figure, call plt.savefig('Silhouette_visualised.png') before plt.show()
plt.show()

In [None]:
# Build a neurolab competitive layer and train it with the
# winner-takes-all rule on the scaled data.
# nl.tool.minmax(data_scaled) is passed as the layer's input-range description;
# the second argument (2) is presumably the number of output neurons, i.e.
# clusters - verify against the neurolab docs.
net = nl.net.newc(
    nl.tool.minmax(data_scaled),
    2,
)

# `error` receives whatever train_wta returns; it is plotted in the next cell.
error = nl.train.train_wta(
    net,
    data_scaled,
    epochs=200,
    show=200,
)

In [None]:
# Plot the error values returned by train_wta for the competitive network.
# Axis labels are set so the figure can be read on its own.
# NOTE(review): assumes `error` holds one value per training epoch - confirm
# against the neurolab train_wta documentation.
plt.title("classification problem")
plt.plot(error)
plt.xlabel("Epoch")
plt.ylabel("Training error")
plt.show()


In [None]:


# Per-sample silhouette plots for KMeans with 2..10 clusters.
# One figure is drawn per cluster count and saved as
# "silhouette_plot_<n>_clusters.png".

range_n_clusters = list(range(2, 11))

# Global matplotlib text sizes used by the figures below
plt.rcParams.update({
    'axes.titlesize': 14,    # fontsize of the axes title
    'axes.labelsize': 14,    # fontsize of the x and y labels
    'xtick.labelsize': 12,   # fontsize of the x tick labels
    'ytick.labelsize': 12,   # fontsize of the y tick labels
    'font.size': 12,         # default text size
})


for n_clusters in range_n_clusters:
    # A single axes per figure, 8 x 3 inches
    fig, ax1 = plt.subplots(figsize=(8, 3))

    # Silhouette coefficients can range over [-1, 1]; only [-0.1, 0.7] is
    # shown on the x axis here.
    ax1.set_xlim(-0.1, 0.7)
    # The extra (n_clusters + 1) * 10 rows leave blank space between the
    # silhouette bands of individual clusters, to demarcate them clearly.
    ax1.set_ylim(0, len(data_scaled) + (n_clusters + 1) * 10)

    # Cluster with a fixed random seed of 10 for reproducibility
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(data_scaled)

    # Average silhouette over all samples: a summary of the density and
    # separation of the formed clusters
    silhouette_avg = silhouette_score(data_scaled, cluster_labels)
    print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg)

    # Silhouette value of every individual sample
    sample_silhouette_values = silhouette_samples(data_scaled, cluster_labels)

    # Stack one band per cluster, bottom to top, starting 10 rows above zero
    y_lower = 10
    for i in range(n_clusters):
        # Sorted silhouette values of the samples assigned to cluster i
        ith_cluster_silhouette_values = np.sort(
            sample_silhouette_values[cluster_labels == i]
        )

        size_cluster_i = len(ith_cluster_silhouette_values)
        y_upper = y_lower + size_cluster_i

        # Colormap position is the fraction i / n_clusters (a float, also for i == 0)
        color = cm.nipy_spectral(i / n_clusters)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Write the cluster number at the vertical middle of its band
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # The next band starts 10 rows above this one
        y_lower = y_upper + 10

    ax1.set(
        title=f"The silhouette plot for the {n_clusters} clusters.",
        xlabel="The silhouette coefficient values",
        ylabel="Cluster label",
    )

    # Dashed red vertical line at the average silhouette score
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # no y ticks; the bands are labelled by the text above
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6])
    fig.tight_layout()
    plt.savefig(f"silhouette_plot_{n_clusters}_clusters.png")

plt.show()