# Clustering using features extracted from Random Forest Classifiers

In this notebook we are going to examine the possibility of selecting features for clustering based on the results of Random Forest Classifiers.

Given the relatively good performance of RFCs on a supervised classification task, we want to explore the features selected by such classifiers in the context of unsupervised learning.

In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to the imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from visualization import vis_data, vis_cluster
from collections import defaultdict, Counter
from dimensionality_reduction import dr_pca
from classification import cla_rand_forest
from sklearn.externals import joblib
from preprocessing import pp_action
from clustering import clu_hdbscan
from helpers import loader_tfidf
from utilities import evaluation
from utilities import constants
import plotly.graph_objs as go
import plotly.offline as ply
from pprint import pprint
import pandas as pd
import numpy as np
import hdbscan
import json
import os

In [None]:
# Load the run configuration plus the label and word JSON files.
# Each file is opened in a `with` block so its handle is closed as soon as
# the JSON has been parsed, instead of being leaked for the kernel lifetime.
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

with open(os.path.join(constants.dir_d, constants.json_labels), 'r') as labels_file:
    uuids_family = json.load(labels_file)

with open(os.path.join(constants.dir_d, constants.json_words), 'r') as words_file:
    words = json.load(words_file)

# Initialise plotly for inline notebook rendering.
ply.init_notebook_mode(connected=True)

## Data selection

Select a subset of the original dataset. Then the selected subset will be split into a training and a testing set.

In [None]:
# Build the sample table from the configuration. Later cells read its index
# together with the 'selected', 'train' and 'test' columns and `fam_num`.
samples_data = pp_action.pre_process(config)
# NOTE(review): presumably performs/display the train-test split described
# in the markdown above — confirm against pp_action.split_show_data.
pp_action.split_show_data(samples_data)

In [None]:
# Rows flagged as part of the selected subset; the same mask drives both the
# identifier list and the numeric family labels so they stay aligned.
selected_mask = samples_data['selected'] == 1
uuids = samples_data.index[selected_mask].tolist()
labels_num = samples_data.fam_num[selected_mask].tolist()

## Random Forest Classification

Let's use random forests to classify the data points.

In [None]:
# Boolean masks for the two partitions flagged in the sample table.
train_mask = samples_data['train'] == 1
test_mask = samples_data['test'] == 1

# x_* hold the sample identifiers (table index), y_* the numeric family labels.
x_train = samples_data.index[train_mask].tolist()
x_test = samples_data.index[test_mask].tolist()
y_train = samples_data.fam_num[train_mask].tolist()
y_test = samples_data.fam_num[test_mask].tolist()

In [None]:
# Load the TF-IDF representation of each partition with identical loader
# options (non-dense, ordered); the training set is loaded first, then the test set.
train, test = (
    loader_tfidf.load_tfidf(config, split_uuids, dense=False, ordered=True)
    for split_uuids in (x_train, x_test)
)

In [None]:
# Run the project's random forest routine on the train/test data. It returns
# the labels it computed and the fitted forest; `randf.feature_importances_`
# is read by the following cells to rank features.
classification_labels, randf = cla_rand_forest.classify(config, train, test, x_test, y_train, y_test)

In [None]:
# Group feature indices by their importance value, so features that share
# exactly the same importance end up in the same bucket.
importance = defaultdict(list)
for feat_idx, weight in enumerate(randf.feature_importances_):
    importance[weight].append(feat_idx)

In [None]:
# How many features received a non-zero importance from the forest.
print(np.count_nonzero(np.asarray(randf.feature_importances_)))

In [None]:
# Keep at most `max_feats` feature indices, visiting importance values from
# highest to lowest. Within a bucket of tied importances the lowest indices
# are taken first, and the last bucket is truncated to fit the budget.
max_feats = 512
selected_feats = []
for score in sorted(importance, reverse=True):
    remaining = max_feats - len(selected_feats)
    if remaining <= 0:
        break
    selected_feats.extend(sorted(importance[score])[:remaining])

n_feats = len(selected_feats)
    

In [None]:
# Sanity check: number of feature indices kept (never more than max_feats).
len(selected_feats)

In [None]:
# Print the selected feature indices, in the order they were picked
# (descending importance, ascending index within ties).
pprint(selected_feats)

## Clustering

Let's now see what happens when we use those specific features for clustering.

In [None]:
# The train/test matrices are not used again below; drop the references
# before loading the dense matrix for the whole selection.
del train, test

In [None]:
# Load every selected sample as a dense, ordered TF-IDF matrix.
data = loader_tfidf.load_tfidf(config, uuids, dense=True, ordered=True)

In [None]:
# Restrict the matrix to the chosen feature columns, in ascending index order.
feature_columns = sorted(selected_feats)
data_sel = np.take(data, feature_columns, axis=1)

In [None]:
# Shape after column selection; the size along axis 1 should equal
# len(selected_feats).
data_sel.shape

In [None]:
# Distance metric, shared with the evaluation cell below.
m = 'euclidean'

# HDBSCAN settings gathered in one place so they are easy to tweak.
hdbscan_params = {
    'min_cluster_size': 40,
    'min_samples': None,
    'metric': m,
    'core_dist_n_jobs': config['core_num'],
}

hdbs = hdbscan.HDBSCAN(**hdbscan_params)
hdbs.fit(data_sel)

# One cluster label per row of data_sel.
computed_labels = hdbs.labels_

In [None]:
# Evaluate the HDBSCAN result against the numeric family labels.
# The cluster assignment is stored in `computed_labels` by the clustering
# cell; the previously referenced `clustering_labels` is never defined in
# this notebook and raised a NameError on a fresh kernel.
evaluation.evaluate_clustering(labels_num, computed_labels, data=data_sel, metric=m)