## Import libraries and enable inline plotting:

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA

## Generate data for 3 clusters embedded in 20-dimensional space:

In [None]:
# Seed NumPy's global RNG so later code that draws from it is repeatable;
# make_classification itself is seeded separately through random_state below.
np.random.seed(0)

# A single class split into three clusters yields 3 clusters overall.
# The returned labels are not used, so they are bound to a throwaway name.
d20, _ = make_classification(
    n_samples=10000,
    n_features=20,
    n_informative=2,
    n_redundant=2,
    n_repeated=0,
    n_classes=1,
    n_clusters_per_class=3,
    hypercube=True,
    random_state=2,
)

In [None]:
# Sanity check: should display (10000, 20), i.e. n_samples rows by n_features columns.
d20.shape

## Plot some 2D projections of 20D data:

In [None]:
# Show three side-by-side scatter plots, each pairing two consecutive raw
# dimensions of d20: (0, 1), (2, 3) and (4, 5).
f, ax = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
for panel, x_dim in zip(ax, range(0, 6, 2)):
    y_dim = x_dim + 1
    panel.set_title('20D data')
    # Labels are derived from the same indices used for slicing, so they
    # cannot drift out of sync with the plotted columns.
    panel.set_xlabel('Dimension {}'.format(x_dim))
    panel.set_ylabel('Dimension {}'.format(y_dim))
    panel.scatter(d20[:, x_dim], d20[:, y_dim], s=2)  # one small dot per sample

## Use PCA to reduce to the top 2 principal components (the directions of greatest variance):

In [None]:
# Fit a 2-component PCA on the 20D samples and project them in one step.
pca = PCA(n_components=2)
d2 = pca.fit_transform(d20)

# Single square figure showing every sample in the reduced 2D space.
f, ax = plt.subplots(figsize=(10, 10))
ax.set(
    title='PCA dimension reduction from 20D to 2D',
    xlabel='Principal Component 1',
    ylabel='Principal Component 2',
)
# Column 0 is the first principal component, column 1 the second.
ax.scatter(d2[:, 0], d2[:, 1], s=2)