# Clustering using features extracted from Random Forest Classifiers

In this notebook we are going to examine the possibility of selecting features for clustering based on the results of Random Forest Classifiers.

Given the relatively good performance of RFCs on the supervised classification task, we want to explore whether the features such classifiers deem important are also useful in the context of unsupervised learning.

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from visualization import vis_data, vis_cluster
from sklearn.metrics import pairwise_distances
from collections import defaultdict, Counter
from dimensionality_reduction import dr_pca
from classification import cla_rand_forest
from sklearn.linear_model import Lasso
from sklearn.metrics import f1_score
from sklearn.externals import joblib
from preprocessing import pp_action
from clustering import clu_hdbscan
from helpers import loader_tfidf
from utilities import evaluation
from utilities import constants
import plotly.graph_objs as go
import plotly.offline as ply
from sklearn.svm import SVC
from pprint import pprint
import pandas as pd
import numpy as np
import hdbscan
import json
import os

In [None]:
# Load the run configuration and the label / vocabulary lookups. Every file is
# opened in a `with` block so its handle is closed as soon as it has been parsed
# (a bare `json.load(open(...))` leaves the handle open until garbage collection).
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

with open(os.path.join(constants.dir_d, constants.json_labels), 'r') as labels_file:
    uuids_family = json.load(labels_file)

with open(os.path.join(constants.dir_d, constants.json_words), 'r') as words_file:
    words = json.load(words_file)

# Reverse lookup of `words`: maps each value of `words` back to its key.
inv_words = {num : word for word, num in words.items()}

ply.init_notebook_mode(connected=True)

# Upper bound on the number of features kept by the feature-selection step.
max_feats = 1024

## Data selection

Select a subset of the original dataset. The selected subset is then split into a training set and a development (dev) set.

In [None]:
# Build the sample table from the configuration. The cells below filter it
# through its 'selected', 'train' and 'dev' flags and read its `fam_num` column.
# NOTE(review): split_show_data presumably assigns the train/dev flags and
# displays the resulting split — confirm in pp_action.
samples_data = pp_action.pre_process(config)
pp_action.split_show_data(samples_data)

In [None]:
# Every sample flagged as selected: index values and numeric family labels.
uuids = samples_data.index[samples_data['selected'].eq(1)].tolist()
labels_num = samples_data.loc[samples_data['selected'].eq(1), 'fam_num'].tolist()

# Training split: index values and numeric family labels.
x_train = samples_data.index[samples_data['train'].eq(1)].tolist()
y_train = samples_data.loc[samples_data['train'].eq(1), 'fam_num'].tolist()

# Development split: index values and numeric family labels.
x_dev = samples_data.index[samples_data['dev'].eq(1)].tolist()
y_dev = samples_data.loc[samples_data['dev'].eq(1), 'fam_num'].tolist()

## Dataset loading

Let's load the data into memory.

In [None]:
# TF-IDF vectors for the training samples.
# NOTE(review): dense=True / ordered=True presumably yield a dense matrix whose
# rows follow the order of `x_train` — confirm in loader_tfidf.
train = loader_tfidf.load_tfidf(config, x_train, dense=True, ordered=True)

In [None]:
# TF-IDF vectors for the dev samples.
# NOTE(review): rows are presumably in the order of `x_dev` — confirm in loader_tfidf.
dev = loader_tfidf.load_tfidf(config, x_dev, dense=True, ordered=True)

In [None]:
# TF-IDF vectors for every selected sample (train and dev together); this is the
# matrix that gets clustered.
# NOTE(review): rows are presumably in the order of `uuids`, and therefore aligned
# with `labels_num` — confirm in loader_tfidf.
data = loader_tfidf.load_tfidf(config, uuids, dense=True, ordered=True)

## Extracting features

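This function isolates the `max_feats` most important features as identified by the list of feature weights. Features are returned from the highest weight to the lowest; when several features share the same weight, the ones with the lowest index are taken first.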

In [None]:
def get_important_feats(feats_weights, limit=None):
    """Return the indices of the highest-weighted features.

    Features are ranked by weight in descending order; features that share the
    same weight are ranked by ascending index. Weights are compared by their
    signed value (no absolute value is taken). If fewer than `limit` features
    have a non-zero weight, zero-weighted features are still used to fill up
    the selection.

    Two diagnostics are printed: the number of non-zero weights in the input
    and the number of features that were selected.

    Parameters
    ----------
    feats_weights : 1-D array-like of numbers
        One weight per feature; the position in the sequence is the feature
        index.
    limit : int, optional
        Maximum number of feature indices to return. When omitted, the
        notebook-level `max_feats` setting is used.

    Returns
    -------
    list of int
        At most `limit` feature indices, most important first.

    Raises
    ------
    ValueError
        If `limit` is negative or `feats_weights` is not one-dimensional.
    """
    if limit is None:
        # Backward-compatible default: fall back to the notebook-wide setting.
        limit = max_feats
    if limit < 0:
        raise ValueError('limit must be non-negative, got {}'.format(limit))

    weights = np.asarray(feats_weights, dtype=float)
    if weights.ndim != 1:
        raise ValueError('feats_weights must be one-dimensional')

    print(np.count_nonzero(weights))

    # A stable sort on the negated weights orders features by descending weight
    # while keeping equal-weight features in ascending index order.
    ranking = np.argsort(-weights, kind='stable')
    selected_feats = ranking[:limit].tolist()

    print(len(selected_feats))
    return selected_feats

## SVM Classification

Here we define a helper function that trains a linear SVM classifier on one dataset and prints its micro-averaged F1 score on another.

In [None]:
def eval_svm(t, y_t, d, y_d):
    """Fit a linear SVM on a training set and print its micro-F1 on another set.

    Parameters
    ----------
    t : feature matrix used for training.
    y_t : labels of the rows of `t`.
    d : feature matrix to predict on.
    y_d : true labels of the rows of `d`.

    Nothing is returned; the micro-averaged F1 score is printed.
    """
    classifier = SVC(kernel='linear', random_state=42).fit(t, y_t)
    predicted = classifier.predict(d)
    micro_f1 = f1_score(y_d, predicted, average='micro')
    print(micro_f1)

## Clustering

Let's now define a function to evaluate the performance of HDBSCAN clustering on the specified dataset.

In [None]:
def eval_hdbscan(d, true_labels=None, min_cluster_size=40):
    """Cluster `d` with HDBSCAN on cosine distances and evaluate the outcome.

    The pairwise cosine distance matrix of the rows of `d` is computed first
    and handed to HDBSCAN as a precomputed metric. The resulting cluster
    labels are then passed, together with the ground-truth labels, to
    `evaluation.evaluate_clustering`.

    Parameters
    ----------
    d : feature matrix, one row per sample.
    true_labels : sequence, optional
        Ground-truth label of each row of `d`. When omitted, the notebook-level
        `labels_num` list is used, which is only meaningful when `d` holds the
        full selected dataset in the same row order.
    min_cluster_size : int, optional
        Forwarded to `hdbscan.HDBSCAN`; defaults to 40.

    Nothing is returned; results are reported by `evaluate_clustering`.
    """
    if true_labels is None:
        # Backward-compatible default: labels of the full selected dataset.
        true_labels = labels_num

    m = 'cosine'
    distance = pairwise_distances(d, metric=m)

    hdbs = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,
                           min_samples=None,
                           metric='precomputed',
                           core_dist_n_jobs=config['core_num'])
    hdbs.fit(distance)
    clustering_labels = hdbs.labels_

    evaluation.evaluate_clustering(true_labels, clustering_labels, data=d, metric=m)

## Initial situation

Let's see the performance of our algorithms with the raw dataset

In [None]:
# Baseline: linear SVM trained on the full-feature training set, scored on dev.
eval_svm(train, y_train, dev, y_dev)

In [None]:
# Baseline: HDBSCAN clustering of all selected samples using every feature.
eval_hdbscan(data)

## Random Forest Classification

Let's use random forests to classify the data points

In [None]:
# Run the project's random-forest helper. `randf` is the object whose
# `feature_importances_` attribute is read in the next cell.
# NOTE(review): presumably `randf` is the fitted forest and
# `classification_labels` its predictions for the dev set — confirm in cla_rand_forest.
classification_labels, randf = cla_rand_forest.classify(config, train, dev, x_dev, y_train, y_dev)

In [None]:
# Indices of the features the random forest considers most important.
selected_feats_r = get_important_feats(randf.feature_importances_)

## LASSO method

As an alternative to using random forests we may try the LASSO method: a LASSO model is fitted on the training set and its coefficients are used as feature weights.

In [None]:
# Lasso model with coefficients constrained to be non-negative (positive=True),
# random coordinate selection, and a fixed seed for reproducibility.
las = Lasso(random_state=42, max_iter=2000, selection='random', positive=True, tol=0.001)

In [None]:
# NOTE(review): Lasso is a regression model and `y_train` holds family numbers,
# so this fits the numeric label values themselves — confirm this is intended.
las.fit(train, y_train)

In [None]:
# Indices of the features with the largest Lasso coefficients.
selected_feats_l = get_important_feats(las.coef_)


## New dataset

Now we can build a new dataset with reduced vectors using the features we just identified

In [None]:
# First experiment: use the features selected through the random forest.
selected_feats = selected_feats_r

In [None]:
# Keep only the selected feature columns (axis=1) of the full dataset, in the
# order given by `selected_feats`.
data_sel = np.take(data, selected_feats, axis=1)

In [None]:
# Same column selection applied to the training set.
train_sel = np.take(train, selected_feats, axis=1)

In [None]:
# Same column selection applied to the dev set.
dev_sel = np.take(dev, selected_feats, axis=1)

In [None]:
# Sanity check: show the shape of each reduced matrix, one per line.
print(data_sel.shape, train_sel.shape, dev_sel.shape, sep='\n')

In [None]:
# Linear SVM on the features selected through the random forest.
eval_svm(train_sel, y_train, dev_sel, y_dev)

In [None]:
# HDBSCAN clustering on the features selected through the random forest.
eval_hdbscan(data_sel)

In [None]:
# Second experiment: switch to the features selected through Lasso. This rebinds
# `selected_feats`, so the cells below must be run after this one.
selected_feats = selected_feats_l

In [None]:
# Keep only the selected feature columns (axis=1) of the full dataset, taken in
# ascending feature-index order because of the sorted() call.
data_sel = np.take(data, sorted(selected_feats), axis=1)

In [None]:
# Same column selection (ascending index order) applied to the training set.
train_sel = np.take(train, sorted(selected_feats), axis=1)

In [None]:
# Same column selection (ascending index order) applied to the dev set.
dev_sel = np.take(dev, sorted(selected_feats), axis=1)

In [None]:
# Sanity check: show the shape of each reduced matrix, one per line.
print(data_sel.shape, train_sel.shape, dev_sel.shape, sep='\n')

In [None]:
# Linear SVM on the features selected through Lasso.
eval_svm(train_sel, y_train, dev_sel, y_dev)

In [None]:
# HDBSCAN clustering on the features selected through Lasso.
eval_hdbscan(data_sel)