In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import sklearn.decomposition
import sklearn.preprocessing
from scipy.io import loadmat
%matplotlib inline

Let's load our data

In [None]:
# Read the MATLAB file and wrap its 'X' entry in a DataFrame with two named
# feature columns, then preview the first rows.
data = loadmat('data/ex7data2.mat')
X = pd.DataFrame(data['X'], columns=['X1','X2'])
X.head()

First step of our k-means clustering algorithm:
  for each sample, find the centroid it is closest to

In [None]:
def norm(x):
    """Return the square root of the sum of the squared entries of x."""
    squares = np.power(x, 2)
    return np.sqrt(squares.sum())

def find_closest_centroid(X, mu):
    """Return, for every sample in X, the index of the nearest centroid in mu.

    Parameters
    ----------
    X : DataFrame or 2-D array-like
        One sample per row.
    mu : 2-D array-like
        One centroid per row, with the same number of columns as X.

    Returns
    -------
    list of int
        For each row of X, in order, the row index into ``mu`` of the
        centroid at the smallest Euclidean distance (first one wins on ties).
    """
    samples = np.asarray(X, dtype=float)
    # Use the centroids that were passed in, not a notebook-level global.
    centroids = np.asarray(mu, dtype=float)
    # Broadcast to (n_samples, n_centroids, n_features) differences and take
    # the Euclidean length along the feature axis.
    distances = np.linalg.norm(samples[:, None, :] - centroids[None, :, :], axis=2)
    return distances.argmin(axis=1).tolist()

# Hard-coded starting centroids: three rows (k = 3), two coordinates each.
initial_centroids = np.array([[3, 3], [6, 2], [8, 5]])

In [None]:
# Label every sample with its nearest initial centroid. Only the two feature
# columns are passed in, so re-running this cell does not feed the previously
# added 'centroid' column back into the distance computation.
X['centroid'] = find_closest_centroid(X[['X1', 'X2']], initial_centroids)
X.head()

Step 2: recompute our centroids as the mean of the samples assigned to each one

In [None]:
def compute_centroid(X):
    """Average every column of X within each 'centroid' group.

    X must be a DataFrame with a 'centroid' column. The result is a plain
    array holding one row of column means per distinct label.
    """
    per_cluster = X.groupby('centroid')
    cluster_means = per_cluster.mean()
    return cluster_means.values

In [None]:
# Centroids recomputed from the labels currently stored in X['centroid'].
compute_centroid(X)

In [None]:
# Render data/bird_small.png inline.
from IPython.display import Image
Image(filename='data/bird_small.png')

In [None]:
# Load the MATLAB file and pull out its 'A' entry (indexed below as a 3-D array).
image_data = loadmat('data/bird_small.mat')
A = image_data['A']

In [None]:
# Normalize value ranges by dividing by 255 (assumes 8-bit pixel values -- TODO confirm).
# The result is derived from image_data rather than from A itself, so
# re-running this cell does not divide the array a second time.
A = image_data['A'] / 255.

In [None]:
# Collapse the first two axes of A so that every row of X holds the values
# along A's last axis for one (row, column) position.
X = A.reshape(A.shape[0] * A.shape[1], A.shape[2])
X.shape

In [None]:
# Show the normalized image array.
plt.imshow(A)

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Cluster the rows of X into 16 groups and keep the label of each row.
# random_state pins the centroid initialisation so Restart & Run All gives the
# same clusters every time; n_init is spelled out because its default value
# differs between scikit-learn releases.
clf = KMeans(n_clusters=16, n_init=10, random_state=0)
X_centroid = clf.fit_predict(X)

In [None]:
# Compare shapes: the label vector next to the data it was predicted from.
X_centroid.shape, X.shape

In [None]:
# Inspect the cluster centres stored on the fitted KMeans object.
clf.cluster_centers_

In [None]:
# Replace every row by the centre of the cluster it was assigned to.
# Indexing the centre array with the whole label vector performs the lookup
# in one vectorised step instead of a Python-level loop over every row.
X_compressed = clf.cluster_centers_[X_centroid]
X_compressed.shape

In [None]:
# Reshape the compressed rows back to A's shape and display the result.
plt.imshow(X_compressed.reshape(A.shape))

**Principal Component Analysis**

In [None]:
# Load the PCA example data; from here on X is the raw array stored under 'X'.
data = loadmat('data/ex7data1.mat')
X = data['X']

# Scatter the first column against the second.
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(X[:, 0], X[:, 1])

In [None]:
from sklearn.preprocessing import scale

In [None]:
# Scale X with sklearn and wrap the result in np.matrix: the cells below rely
# on `*` between X_norm and its transpose being a matrix product.
X_norm = np.matrix(sklearn.preprocessing.scale(X))

In [None]:
# (X^T X) / m, where m is the number of rows of X_norm.
X_cov = (X_norm.T * X_norm) / X_norm.shape[0]

In [None]:
# Singular value decomposition of the covariance matrix.
# NOTE(review): np.linalg.svd documents its third return value as Vh (V
# transposed), so the name V here may mislead -- confirm before using it.
U, S, V = np.linalg.svd(X_cov)
U,S,V

In [None]:
def reduce(U, X, k):
    """Project the rows of X onto the first k columns of U.

    Parameters
    ----------
    U : 2-D np.matrix or ndarray
        Its leading columns are used as projection directions.
    X : 2-D np.matrix or ndarray
        One sample per row, with as many columns as U has rows.
    k : int
        Number of leading columns of U to keep.

    Returns
    -------
    The matrix product ``U[:, :k].T . X.T``: one row per kept column of U and
    one column per sample. An np.matrix is returned for np.matrix inputs.
    """
    U_reduced = U[:, :k]
    # np.dot is a matrix product for np.matrix and ndarray alike, whereas `*`
    # multiplies element-wise (with broadcasting) on plain arrays.
    return np.dot(U_reduced.T, X.T)

def restore(U, Z, k):
    """Map reduced data Z back into the original column space using U.

    Parameters
    ----------
    U : 2-D np.matrix or ndarray
        The same matrix that was used to produce Z.
    Z : 2-D np.matrix or ndarray
        Reduced data with k rows and one column per sample.
    k : int
        Number of leading columns of U that were kept.

    Returns
    -------
    The matrix product ``Z.T . U[:, :k].T``: one row per sample and one column
    per row of U. An np.matrix is returned for np.matrix inputs.
    """
    U_reduced = U[:, :k]
    # np.dot is a matrix product for np.matrix and ndarray alike, whereas `*`
    # multiplies element-wise (with broadcasting) on plain arrays.
    return np.dot(Z.T, U_reduced.T)

In [None]:
# Reduce then restore expands to (U_reduced.T * X.T).T * U_reduced.T;
# this cell shows only the first half: the data projected onto one column of U.

reduce(U, X_norm,1)

In [None]:
# Project onto one column of U and map back, then preview the first 10 rows.
X_recovered = restore(U, reduce(U,X_norm,1), 1)
X_recovered[:10]

In [None]:
# Overlay the recovered points (green) on the scaled data (default colour).
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(X_recovered[:, 0], X_recovered[:, 1], c='g')
ax.scatter(X_norm[:,0], X_norm[:,1])

In [None]:
# The recovered points on their own.
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(X_recovered[:, 0], X_recovered[:, 1])

**Sklearn Solution**

In [None]:
# Fit a one-component sklearn PCA on the same scaled data used above.
pca = sklearn.decomposition.PCA(n_components=1)
pca.fit(X_norm)

In [None]:
# Inspect the component vector(s) stored on the fitted PCA object.
pca.components_

In [None]:
# First 10 rows of the data after the PCA transform.
pca.transform(X_norm)[:10]

In [None]:
# First 10 rows after transforming and mapping back with inverse_transform.
pca.inverse_transform(pca.transform(X_norm))[:10]

In [None]:
# Same overlay as for the manual version: round-tripped points (green) on top
# of the scaled data (default colour).
fig, ax = plt.subplots(figsize=(12,8))
restored = pca.inverse_transform(pca.transform(X_norm))
ax.scatter(restored[:,0], restored[:,1], c='g')
ax.scatter(X_norm[:,0], X_norm[:,1])

**PCA and Eigenfaces**

In [None]:
# Load the faces file and wrap its 'X' entry in a DataFrame.
faces = loadmat('data/ex7faces.mat')
X = pd.DataFrame(faces['X'])
X.shape

In [None]:
# Let's check out what a face looks like: take row 3 and view it as a 32x32 image.
face = np.reshape(X.values[3,:], (32, 32))
plt.imshow(face)

In [None]:
# Always normalize our X for PCA, then show the mean and standard deviation of
# the first 10 columns.
# NOTE(review): X is rebound here and indexed as an array afterwards
# (X[3,:] below), so cells that use X.values presumably cannot be re-run
# without reloading the faces -- confirm.
X = sklearn.preprocessing.scale(X)
X.mean(axis=0)[:10], X.std(axis=0)[:10]

In [None]:
# Fit a 100-component PCA on the scaled faces (rebinds the name `pca`).
pca = sklearn.decomposition.PCA(n_components=100)
pca.fit(X)

In [None]:
# Sum of the explained-variance ratios over the 100 kept components.
pca.explained_variance_ratio_.sum()

In [None]:
# Let's visualize our eigenface: the first row of pca.components_ viewed as a
# 32x32 image.
plt.imshow(np.reshape(pca.components_[0,:], (32,32)))

In [None]:
# Let's try reconstructing the face at row 3 (the same row previewed earlier)
# from its 100-component projection and see how the compression turned out.
# X was scaled above, so the displayed values are in scaled units.
face_compressed = pca.transform(X[3,:].reshape(1,-1))
face_restored = pca.inverse_transform(face_compressed)
plt.imshow(face_restored.reshape((32,32)))