In [3]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_lfw_people
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

## Face Recognition (from Python Data Science Handbook by Jake VanderPlas)

As an example of support vector machines in action, let's take a look 
at the facial recognition problem.  We will use the Labeled Faces in 
the Wild dataset, which consists of several thousand collated photos 
of various public figures.  A fetcher for the dataset is built into 
Scikit-Learn:

In [4]:
# Load the Labeled Faces in the Wild dataset, keeping only people who have
# at least 70 images.
# NOTE(review): the recorded run of this cell failed with
# "ValueError: min_faces_per_person=70 is too restrictive". scikit-learn
# raises that when no images pass the filter, which presumably means the
# local LFW cache is missing or incomplete -- confirm by clearing the cached
# copy and re-running so the data is downloaded again.
faces_raw_data = fetch_lfw_people(min_faces_per_person=70)

# Summarise what was loaded: the class names, the image array shape, and the
# shape of a single image.
print("Target Variable -- ", faces_raw_data.target_names)
print("Shape -- ", faces_raw_data.images.shape)
print("Shape of one image ", faces_raw_data.images[0].shape)

ValueError: min_faces_per_person=70 is too restrictive

In [None]:
# Plot a face to see what we are working with

In [None]:
# Display the first image in the dataset, labelled with the person's name.
fig, ax = plt.subplots(1)
ax.imshow(faces_raw_data.images[0], cmap='bone')
# target[0] indexes into target_names, giving the name for image 0.
xlabel = faces_raw_data.target_names[faces_raw_data.target[0]]
# Hide the pixel-coordinate ticks and set the label in one call; the trailing
# semicolon keeps the call's return value out of the cell output.
ax.set(xticks=[], yticks=[], xlabel=xlabel);

Each image is 62×47 pixels, or nearly 3,000 pixels in total.
We could proceed by simply using each pixel value as a feature, but often it is more effective to use some sort of preprocessor to extract more meaningful features; here we will use a principal component analysis (we will learn about PCA later) to extract 150 fundamental components to feed into our support vector machine classifier.
We can do this most straightforwardly by packaging the preprocessor and the classifier into a single pipeline:

In [None]:
# Preprocessing step: PCA keeping 150 whitened components, with
# random_state fixed at 42.
pca = PCA(
    n_components=150,
    whiten=True,
    random_state=42,
)
# Classifier step: RBF-kernel SVC with class_weight set to 'balanced'.
svc = SVC(class_weight='balanced', kernel='rbf')
# Chain both steps so they are fitted and applied as a single estimator.
model = make_pipeline(pca, svc)

In [None]:
# Split the flattened images and their labels into training and test sets.
# random_state=42 fixes the shuffle; the split proportions are left at the
# library defaults.
X_train, X_test, y_train, y_test = train_test_split(
    faces_raw_data.data,
    faces_raw_data.target,
    random_state=42,
)
# Report the resulting array shapes.
print("Training Shapes: ", X_train.shape, y_train.shape)
print("Testing Shapes: ", X_test.shape, y_test.shape)

In [None]:
# Grid search over the SVC's C and gamma hyperparameters. The 'svc__' prefix
# routes each value to the pipeline step named 'svc'.
parameters = {
    'svc__C': [1, 5, 10, 50],
    'svc__gamma': [0.0001, 0.0005, 0.001, 0.005],
}

# The cross-validation strategy is left at the library default; verbose=5
# logs progress for each fit.
grid_1 = GridSearchCV(model, param_grid=parameters, verbose=5)
grid_1.fit(X_train, y_train)

# Report the best cross-validated score and the configuration that produced it.
print("Best Score is ", grid_1.best_score_)
print("Best estimator is ", grid_1.best_estimator_)
print("Best params is ", grid_1.best_params_)
