# K-Means clustering result analysis

We will start our exploration of the dataset with one of the most classical clustering algorithms: K-Means.

In [None]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before each cell executes and edits to the project's helper
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from clustering import clu_kmeans, clu_kmeans_minibatch
from visualization import vis_data, vis_cluster
from collections import defaultdict, Counter
from keywords import kw_keyword_tfidf
from sklearn.metrics import f1_score
from sklearn.externals import joblib
from preprocessing import pp_action
from helpers import loader_tfidf
from utilities import constants
import plotly.graph_objs as go
import plotly.offline as ply
import pandas as pd
import numpy as np
import random
import json
import os

In [None]:
# Load the run configuration, the labels file (later indexed by sample uuid)
# and the words file. Each file is opened in a context manager so its handle
# is closed as soon as the JSON content has been parsed, instead of being
# left open for the lifetime of the kernel.
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

with open(os.path.join(constants.dir_d, constants.json_labels), 'r') as labels_file:
    uuids_family = json.load(labels_file)

with open(os.path.join(constants.dir_d, constants.json_words), 'r') as words_file:
    words = json.load(words_file)

# Initialise plotly's notebook mode so figures can be rendered inline.
ply.init_notebook_mode(connected=True)

## Data selection

Select a subset of the original dataset. Then the selected subset will be split into a training and a testing set.


In [None]:
# Build the samples table from the configuration. Later cells read its index
# and its 'selected' and 'fam_num' columns.
samples_data = pp_action.pre_process(config)
# NOTE(review): presumably performs/displays the train/test split described
# in the section text — confirm whether it modifies samples_data in place.
pp_action.split_show_data(samples_data)

In [None]:
# Keep only the rows flagged as selected; the same boolean mask is used to
# extract both the sample identifiers (the index) and their numeric labels.
is_selected = samples_data['selected'] == 1
uuids = samples_data.index[is_selected].tolist()
labels_num = samples_data.fam_num[is_selected].tolist()

## Clustering

Now that we have our data subset, we can start with K-Means.

In [None]:
# Run the mini-batch K-Means helper on the selected samples. `clustering` is
# indexed per sample in the same order as `uuids` by the cells below.
# NOTE(review): the literal 10 is presumably the number of clusters — confirm
# against clu_kmeans_minibatch.cluster.
clustering, clu_model = clu_kmeans_minibatch.cluster(config, 10, uuids, labels_num)

## Cluster Analysis

To better understand the result of the clustering algorithm, we would like to see the features characterizing the computed clusters. We can therefore aggregate the vectors composing each cluster into a single cumulative vector and retrieve the features with the highest weight in the cluster-vector.

In [None]:
# Extract keywords for the clustering stored in the given JSON file.
# NOTE(review): presumably writes the keywords file under data/d_keywords/
# that the following cell prints — confirm against kw_keyword_tfidf.
kw_keyword_tfidf.extract_keywords(config, 'data/d_clusterings/clustering_kmeans_euclidean_minibatch_1209.json')

In [None]:
# Dump the whole keywords file to the cell output.
keywords_path = 'data/d_keywords/clustering_kmeans_euclidean_minibatch_1209_keywords_tfidf'
with open(keywords_path, 'r') as keywords_file:
    keywords_text = keywords_file.read()
    print(keywords_text)

Let's also inspect the composition (based on our AV labels) of each cluster discovered by K-Means.

In [None]:
# Tally, for every cluster label, how many samples of each family it holds.
cluster_labels = sorted(set(clustering.flatten()))
clust_compositions = {label: Counter() for label in cluster_labels}

# `clustering` is positional: entry k is the cluster of uuids[k].
for position, sample_uuid in enumerate(uuids):
    assigned_cluster = clustering[position]
    clust_compositions[assigned_cluster][uuids_family[sample_uuid]] += 1

# Report each cluster in ascending label order, most frequent family first.
for cluster_label in sorted(clust_compositions):
    print('Cluster {}:'.format(cluster_label))
    print(clust_compositions[cluster_label].most_common())
    print()

## Cluster Visualization

We can also generate a visual output from our clustering. 

We can compare the classification provided by the AV data with the result of our clustering, plotted on top of the same dimensionality-reduced data points.

Here, the color of each point reflects the cluster to which it was assigned by the algorithm.

In [None]:
# Plot the points stored in the t-SNE matrix file, passing the cluster
# assignment so each point can be colored by its cluster.
vis_data.plot_data('data/d_matrices/tsne_2_1209.txt', clustering)

We can repeat the same comparison with a 3-dimensional representation of the dataset. Since in this case t-SNE generated a representation that is quite difficult to explore visually, we will use PCA to reduce the dimensions of our vectors.

In [None]:
# Plot the points stored in the 3-component PCA matrix file, passing the
# cluster assignment so each point can be colored by its cluster.
vis_data.plot_data('data/d_matrices/pca_3_1209.txt', clustering)

## With PCA reduced vectors

It is also interesting to see how the clustering results change if we use the PCA-reduced vectors instead of the full word-weight vectors.

In [None]:
# Load the previously fitted PCA model and the matching reduced matrix from
# disk; both files must already exist (i.e. PCA was computed and saved in an
# earlier run).
# NOTE: joblib.load unpickles the file, so only load model files you trust.
dr_model = joblib.load(os.path.join(constants.dir_d, constants.dir_mod, 'pca_128_1209.pkl')) 
# One row of `reduced` per selected sample, in the same order as `uuids`
# (it is paired with `uuids` as the DataFrame index further down).
reduced = np.loadtxt('data/d_matrices/pca_128_1209.txt')

In [None]:
# Re-run K-Means, this time on the PCA-reduced matrix file. This rebinds
# `clustering` and `clu_model`, replacing the mini-batch results from above.
# NOTE(review): the literal 10 is presumably the number of clusters and
# sparse=False presumably marks the input as a dense matrix — confirm against
# clu_kmeans.cluster.
clustering, clu_model = clu_kmeans.cluster(config, 'data/d_matrices/pca_128_1209.txt', 10, uuids, labels_num, sparse=False)

In [None]:
# Group the sample uuids by the cluster they were assigned to.
inverted_clustering = defaultdict(list)
for position, sample_uuid in enumerate(uuids):
    inverted_clustering[clustering[position]].append(sample_uuid)

reduced_df = pd.DataFrame(reduced, index=uuids)
n_reduced_dims = len(reduced[0])
centroids = {label: np.zeros(n_reduced_dims) for label in sorted(set(clustering))}

# Accumulate the reduced vectors of each cluster, row by row; rows are in the
# same order as `clustering`, so the running position selects the cluster.
for position, (_, row) in enumerate(reduced_df.iterrows()):
    centroids[clustering[position]] += row.values

# Turn every sum into a mean and stack the means in ascending label order.
centroid_rows = []
for label in sorted(centroids):
    centroids[label] /= len(inverted_clustering[label])
    centroid_rows.append(centroids[label])

centroid_matrix = np.array(centroid_rows)
# Multiply by the model's components to express each centroid in terms of the
# original features (only the components are applied here).
centroids_orig_fts = np.dot(centroid_matrix, dr_model.components_)
centroids_orig_fts.shape

In [None]:
# NOTE: this rebinds `words` from the loaded mapping to {position: word}.
# Re-running this cell without reloading `words` first would replace the
# words with their integer positions, so reload before re-executing.
words = dict(zip(range(len(words)), sorted(words.keys())))

# Feature names labelling the centroid weights; the same for every centroid,
# so they are computed once outside the loop.
feature_names = sorted(words.values())

# The rows of centroids_orig_fts were stacked in ascending cluster-label
# order, so each row is paired with its actual cluster label. A hand-kept
# counter starting at -1 would report every centroid one label too low.
for label, centroid in zip(sorted(centroids.keys()), centroids_orig_fts):
    # Rank features by absolute weight, regardless of sign.
    cent_series = pd.Series(np.abs(centroid), index=feature_names)

    print('Centroid {}:'.format(label))
    print(cent_series.nlargest(10))
    print()


In [None]:
# Family composition of each cluster for the PCA-based clustering.
clust_compositions = {}
for label in sorted(set(clustering.flatten())):
    clust_compositions[label] = Counter()

# Entry k of `clustering` is the cluster assigned to uuids[k].
for idx, sample_uuid in enumerate(uuids):
    family = uuids_family[sample_uuid]
    clust_compositions[clustering[idx]][family] += 1

# Print clusters in ascending label order, most common family first.
for label in sorted(clust_compositions):
    composition = clust_compositions[label]
    print('Cluster {}:'.format(label))
    print(composition.most_common())
    print()


In [None]:
# Same t-SNE matrix file as before, now passing the cluster assignment
# obtained from the PCA-reduced vectors.
vis_data.plot_data('data/d_matrices/tsne_2_1209.txt', clustering)

In [None]:
# Same 3-component PCA matrix file as before, now passing the cluster
# assignment obtained from the PCA-reduced vectors.
vis_data.plot_data('data/d_matrices/pca_3_1209.txt', clustering)