## `FeatureAgglomeration` applies Hierarchical clustering to group together features that behave similarly.
* Feature scaling

    * Note that if features have very different scaling or statistical properties, `cluster.FeatureAgglomeration` may not be able to capture the links between related features. 
    * Using a `preprocessing.StandardScaler` can be useful in these settings.

In [74]:
import shutil
import tempfile

import numpy as np
import matplotlib.pyplot as plt
from scipy import linalg, ndimage
from joblib import Memory

from sklearn.feature_extraction.image import grid_to_graph
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.cluster import FeatureAgglomeration
from sklearn.linear_model import BayesianRidge
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

Set the parameters

In [2]:
n_samples = 200
size = 40  # image size
roi_size = 15
snr = 5.0
np.random.seed(0)

Generate the coefs

In [115]:
coef = np.zeros((size, size))
coef[0:roi_size, 0:roi_size] = -1.0
coef[-roi_size:, -roi_size:] = 1.0

plt.matshow(coef, cmap=plt.cm.RdBu_r);
plt.title('The True weights\n');

<img src='./plots/true-coefs.png'>

Generate an Image

In [23]:
X = np.random.randn(n_samples, size**2)


for x in X:  # smooth data
    # take each row [1600,] points and smooth
    x[:] = ndimage.gaussian_filter(x.reshape(size, size), sigma=1.0).ravel()

# standardize the image
X -= X.mean(axis=0)
X /= X.std(axis=0)

print('Shape of X :',X.shape)
print('Shape of coef :',coef.shape)

# matrix multiplication X(200, 1600) * Coef(1600,) -> (200, )
y = np.dot(X, coef.ravel())

print('Shape of y :',y.shape)

Shape of X : (200, 1600)
Shape of coef : (40, 40)
Shape of y : (200,)


add noise

In [30]:
y += np.random.randn(y.shape[0])

#### Agglomerate features.
* Recursively merges pair of clusters of features.
<br>

* n_clusters   [ default=2 ]
    * The number of clusters to find. It must be None if distance_threshold is not None.


<br>

* metric   [ default=None ]
    * Metric used to compute the linkage.
    * Can be `“euclidean”`, `“l1”`, `“l2”`, `“manhattan”`, `“cosine”`, or `“precomputed”`. 
    * If set to None then `“euclidean”` is used. 
    * If linkage is `“ward”`, only `“euclidean”` is accepted. 
    * If `“precomputed”`, a distance matrix is needed as input for the fit method.

<br>

* memory
    * object with the joblib.Memory interface, [ default=None ]
    * Used to cache the output of the computation of the tree. 
    * By default, no caching is done. 
    * If a string is given, it is the path to the caching directory.

<br>

* connectivity [ default=None ]
    * Connectivity matrix. Defines for each feature the neighboring features following a given structure of the data. 
    * This can be a connectivity matrix itself or a callable that transforms the data into a connectivity matrix, such as derived from kneighbors_graph.
    * Default is None, i.e, the hierarchical clustering algorithm is unstructured.

<br>

* linkage `“ward”, “complete”, “average”, “single” , default=”ward”`
    * Which linkage criterion to use. 
    * The linkage criterion determines which distance to use between sets of features. 
    * The algorithm will merge the pairs of cluster that minimize this criterion.

<br>

* `“ward”` minimizes the variance of the clusters being merged.

* `“complete”` or maximum linkage uses the maximum distances between all features of the two sets.

* `“average”` uses the average of the distances of each feature of the two sets.

* `“single”` uses the minimum of the distances between all features of the two sets.

### Compute the coefs of a Bayesian Ridge with GridSearch

In [34]:
bays = BayesianRidge()

# cross validation
cv =KFold(n_splits=2)

# to cache the output
cache = tempfile.mkdtemp()
mem = Memory(location=cache, verbose=1)

#### Ward agglomeration followed by BayesianRidge



In [41]:
# Defines for each feature the neighboring features following a given structure of the data.
connectivity = grid_to_graph(n_x=size, n_y=size)
print('Shape of connectivity graph',connectivity.todense().shape)

Shape of connectivity graph (1600, 1600)


In [43]:
# ward linkage
ward = FeatureAgglomeration(n_clusters=10, memory=mem, linkage='ward', metric='euclidean')

In [48]:
# pipeline
pipe = Pipeline(steps=[
    ('ward', ward),
    ('ridge', bays)
])

# grid search
gscv = GridSearchCV(estimator=pipe, param_grid={'ward__n_clusters':[10,20,30]}, cv=cv)

# fit 
gscv.fit(X, y)

________________________________________________________________________________
[Memory] Calling sklearn.cluster._agglomerative.ward_tree...
ward_tree(array([[ 0.817359, ..., -1.140375],
       ...,
       [ 1.027432, ...,  0.558715]]), connectivity=None, n_clusters=None, return_distance=False)
________________________________________________________ward_tree - 0.4s, 0.0min
________________________________________________________________________________
[Memory] Calling sklearn.cluster._agglomerative.ward_tree...
ward_tree(array([[-1.141501, ..., -1.883984],
       ...,
       [-0.128765, ...,  1.212187]]), connectivity=None, n_clusters=None, return_distance=False)
________________________________________________________ward_tree - 0.3s, 0.0min
________________________________________________________________________________
[Memory] Calling sklearn.cluster._agglomerative.ward_tree...
ward_tree(array([[-1.141501, ..., -1.140375],
       ...,
       [-0.128765, ...,  0.558715]]), connec

In [51]:
# This will give us the best pipeline 
# From the pipe we need coef of bayes Model
gscv.best_estimator_

In [62]:
# target the bayes model from the pipe and extract coefs
coef_agglomeration = gscv.best_estimator_[1].coef_
print('Shape of coefs', coef_agglomeration.shape) # feature agglomeration with n_cluster = 30 result in 30 feats

Shape of coefs (30,)


In [64]:
# Now Inverse the transformation and return a vector of size n_features
coef_agglomeration = gscv.best_estimator_[0].inverse_transform(coef_agglomeration)
coef_agglomeration.shape

(1600,)

In [66]:
# Now resize the coef 
coef_agglomeration = np.reshape(coef_agglomeration, newshape=coef.shape)
print('Shape of coef agglomeration :', coef_agglomeration.shape)

Shape of coef agglomeration : (40, 40)


In [114]:
plt.matshow(coef_agglomeration, cmap=plt.cm.RdBu_r);
plt.title('Feature agglomeration\n');

<img src='./plots/feature-agglomeration.png'>

## Feature selection using f_regression

In [81]:
f_stats = SelectPercentile(score_func=f_regression, percentile=10)
bays = BayesianRidge()

cv = KFold(2)

In [84]:
# Define the pipeline
pipe = Pipeline(steps=[
    ('f_regression', f_stats),
    ('ridge', bays)
])

# grid search for best params
gscv = GridSearchCV(estimator=pipe, param_grid={'f_regression__percentile':[5,10,15,20]}, cv=cv)

# fit
gscv.fit(X, y)

In [96]:
# extact the coef
coef_feature_selection = gscv.best_estimator_[1].coef_
print('Shape of coefs', coef_feature_selection.shape)

Shape of coefs (240,)


In [97]:
print('Shape of X', X.shape)

Shape of X (200, 1600)


In [98]:
# Now Inverse the transformation and return a vector of size n_features
# Reshape your data either using array.reshape(-1, 1) if your data has a single feature 
# or 
# array.reshape(1, -1) if it contains a single sample.
coef_feature_selection = gscv.best_estimator_[0].inverse_transform(np.reshape(coef_feature_selection,(1, -1)))

print('Shape of coefs_feature_selection', coef_feature_selection.shape)

Shape of coefs_feature_selection (1, 1600)


In [99]:
# Now lets reshape the coef to match true coef
coef_feature_selection = np.reshape(coef_feature_selection, newshape=coef.shape)

In [113]:
# plot
plt.matshow(coef_feature_selection, cmap=plt.cm.RdBu_r);
plt.title('Feature Selection\n');

<img src='./plots/feature-selection.png'>

## Visualize the results

In [112]:
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(15,5))

ax[0].matshow(coef, cmap=plt.cm.RdBu_r)
ax[0].set(title='True Coefs')

ax[1].matshow(coef_agglomeration, cmap=plt.cm.RdBu_r)
ax[1].set(title='Coefs after Feature Agglomeration')

ax[2].matshow(coef_feature_selection, cmap=plt.cm.RdBu_r)
ax[2].set(title='Coefs after Feature Selection')

plt.show()

<img src='./plots/Compare-results-coefs-before-and-after-feature-agglomeration-and-feature-selection.png'>