# K-Means clustering result analysis

We will start our exploration of the dataset with one of the most classical clustering algorithms: K-Means.


In [None]:
# Load IPython's autoreload extension and switch it to mode 2, so that edits to
# the imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from collections import defaultdict, Counter
from keywords import kw_keyword_tfidf
from sklearn.externals import joblib
from preprocessing import pp_action
from visualization import vis_data
from clustering import clu_kmeans
from utilities import evaluation
from utilities import constants
import plotly.offline as ply
import pandas as pd
import numpy as np
import random
import json
import os

In [None]:
def _load_json(path):
    """Parse the JSON file at *path* and close the file handle afterwards."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Experiment configuration, read from the current working directory.
config = _load_json('config.json')
# Mapping indexed by sample uuid; later cells look up uuids_family[uuid] to get
# the label of each sample.
uuids_family = _load_json(os.path.join(constants.dir_d, constants.json_labels))
# Vocabulary dictionary; later cells only rely on its keys.
words = _load_json(os.path.join(constants.dir_d, constants.json_words))
# Set up plotly for inline rendering inside the notebook.
ply.init_notebook_mode(connected=True)

## Data selection

Select a subset of the original dataset. Then the selected subset will be split into a training and a testing set.

In [None]:
# Build the samples table from the configuration. Later cells read its
# 'selected', 'fam_num' and 'family' columns and use its index as sample uuids.
samples_data = pp_action.pre_process(config)
# Presumably splits the selection into train/test sets and shows a summary
# (per the section text above) - confirm against pp_action.
pp_action.split_show_data(samples_data)

In [None]:
# Boolean mask of the rows flagged as part of the selected subset.
selected_mask = samples_data['selected'] == 1

# Identifiers (the table index) and numeric family labels of the selected
# samples, kept as two parallel lists in table order.
uuids = samples_data.index[selected_mask].tolist()
numerical_labels = samples_data.fam_num[selected_mask].tolist()

## Clustering

Now that we have our data subset, we can start with K-Means.

In [None]:
# Load the pre-computed reduced data matrix from a plain-text file. Later cells
# assume one row per selected sample, in the same order as `uuids`.
# NOTE(review): the file name suggests PCA with 1024 components over 28582
# samples - confirm it matches the current selection.
data_matrix = np.loadtxt('data/matrix/pca_1024_28582.txt')

In [None]:
# Parameters handed to the project's K-Means wrapper.
kmeans_params = {'num_clusters': 130}

# Cluster the reduced data matrix. The wrapper returns five values; the
# following cells use the per-sample cluster labels plus the `data` and
# `metric` objects that are forwarded to the evaluation step.
clustering_labels, model, modifier, data, metric = clu_kmeans.cluster(
    data_matrix, numerical_labels, config, kmeans_params
)

In [None]:
# Score the clustering against the ground-truth numeric family labels, reusing
# the `data` and `metric` objects returned by the clustering step.
# NOTE(review): the meaning of the trailing positional False is defined in
# utilities.evaluation - confirm before changing it.
results = evaluation.evaluate_clustering(numerical_labels, clustering_labels, data, metric, False)

## Cluster Analysis

To better understand the result of the clustering algorithm we would like to see the features characterizing the computed clusters. 

Since the dataset dimensionality was reduced with PCA before clustering, we need to reverse this step to understand the characteristics of the obtained clusters.

To achieve this we will compute each centroid as the average of the data points in its cluster and then multiply the centroids by the transposed PCA projection matrix (the model's `components_` attribute).

We will start by creating an inverted index of the clustering.

In [None]:
# Load the fitted dimensionality-reduction model; its `components_` attribute
# is used below to map centroids back to the original feature space.
# NOTE: joblib.load unpickles the file, so only load model files from a
# trusted source.
dr_model = joblib.load(os.path.join(constants.dir_d, constants.dir_mod, 'pca_1024_28582.pkl'))

In [None]:
# The three per-sample sequences are walked in lockstep below, so fail early
# with a clear message if they do not describe the same samples.
if not (len(uuids) == len(clustering_labels) == len(data_matrix)):
    raise ValueError(
        'uuids, clustering_labels and data_matrix must have the same length'
    )

# Inverted index: cluster label -> uuids of the samples assigned to it.
inverted_clustering = defaultdict(list)
for uuid, label in zip(uuids, clustering_labels):
    inverted_clustering[label].append(uuid)

# Reduced-space data indexed by sample uuid (kept for interactive inspection).
reduced_df = pd.DataFrame(data_matrix, index=uuids)

# Cluster labels in ascending order; this fixes the row order of the centroid
# matrices built below.
cluster_ids = sorted(set(clustering_labels))
labels_arr = np.asarray(clustering_labels)

# Centroid of each cluster = mean of the reduced vectors of its members.
centroids = {
    label: data_matrix[labels_arr == label].mean(axis=0)
    for label in cluster_ids
}
centroid_matrix = np.array([centroids[label] for label in cluster_ids])

# Project the centroids back to the original feature space by multiplying with
# the model's component matrix.
# NOTE(review): the model's mean is not added back, so if dr_model is a
# centred PCA these values are offsets from the dataset mean - confirm that
# this is intended.
centroids_orig_fts = np.dot(centroid_matrix, dr_model.components_)
centroids_orig_fts.shape

In [None]:
# Rebind `words` to a mapping position -> word, with the vocabulary keys taken
# in sorted order.
# NOTE: this overwrites the loaded vocabulary, so the cell is not idempotent;
# reload `words` before running it a second time.
words = dict(enumerate(sorted(words)))

In [None]:
# Feature names used to label the centroid values; computed once because the
# list is the same for every centroid.
feature_names = sorted(words.values())

# The rows of centroids_orig_fts were built by walking the cluster labels in
# ascending order, so pair each row with its real label rather than a
# hand-kept counter (which started at -1 and mislabelled every centroid).
cluster_ids = sorted(set(clustering_labels))

for label, centroid in zip(cluster_ids, centroids_orig_fts):
    # Rank features by absolute value: large magnitudes of either sign mark
    # the features that characterise the cluster the most.
    cent_series = pd.Series(np.abs(centroid), index=feature_names)

    print('Centroid {}:'.format(label))
    print(cent_series.nlargest(10))
    print()

In [None]:
# One family counter per cluster label, created up front in ascending label
# order.
cluster_ids = sorted(set(clustering_labels.flatten()))
clust_compositions = {cluster_id: Counter() for cluster_id in cluster_ids}

# Tally, for every sample, the family it belongs to inside its cluster.
for position, uuid in enumerate(uuids):
    assigned_cluster = clustering_labels[position]
    family_name = uuids_family[uuid]
    clust_compositions[assigned_cluster][family_name] += 1

# Report each cluster's family counts, most frequent family first.
for clu in sorted(clust_compositions):
    composition = clust_compositions[clu]
    print('Cluster {}:'.format(clu))
    print(composition.most_common())
    print()


In [None]:
# Family of every selected sample, in table order (parallel to the uuid list
# built from the same selection).
selected_rows = samples_data['selected'] == 1
families = samples_data.family[selected_rows].tolist()

In [None]:
# Plot the stored 2-D embedding with one family label per sample.
# NOTE(review): presumably a t-SNE projection coloured by label (per the file
# name) - confirm against vis_data.plot_data.
vis_data.plot_data('data/matrix/tsne_2_28582.txt', families)

In [None]:
# Plot the same stored 2-D embedding, this time labelled with the cluster
# assigned to each sample, for visual comparison with the plot by family.
vis_data.plot_data('data/matrix/tsne_2_28582.txt', clustering_labels)