In [200]:
import sklearn
from sklearn import datasets
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np, matplotlib.pyplot as plt
%matplotlib inline

In [201]:
data = datasets.fetch_olivetti_faces()

print(data["DESCR"])

.. _olivetti_faces_dataset:

The Olivetti faces dataset
--------------------------

`This dataset contains a set of face images`_ taken between April 1992 and 
April 1994 at AT&T Laboratories Cambridge. The
:func:`sklearn.datasets.fetch_olivetti_faces` function is the data
fetching / caching function that downloads the data
archive from AT&T.

.. _This dataset contains a set of face images: http://www.cl.cam.ac.uk/research/dtg/attarchive/facedatabase.html

As described on the original website:

    There are ten different images of each of 40 distinct subjects. For some
    subjects, the images were taken at different times, varying the lighting,
    facial expressions (open / closed eyes, smiling / not smiling) and facial
    details (glasses / no glasses). All the images were taken against a dark
    homogeneous background with the subjects in an upright, frontal position 
    (with tolerance for some side movement).

**Data Set Characteristics:**

    Classes                        

In [202]:
face_data = pd.DataFrame(data["data"])
face_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,0.309917,0.367769,0.417355,0.442149,0.528926,0.607438,0.657025,0.677686,0.690083,0.68595,...,0.665289,0.669421,0.652893,0.661157,0.475207,0.132231,0.14876,0.152893,0.161157,0.157025
1,0.454545,0.471074,0.512397,0.557851,0.595041,0.640496,0.681818,0.702479,0.710744,0.702479,...,0.136364,0.157025,0.136364,0.14876,0.152893,0.152893,0.152893,0.152893,0.152893,0.152893
2,0.318182,0.400826,0.491736,0.528926,0.586777,0.657025,0.681818,0.68595,0.702479,0.698347,...,0.07438,0.132231,0.181818,0.136364,0.128099,0.14876,0.144628,0.140496,0.14876,0.152893
3,0.198347,0.194215,0.194215,0.194215,0.190083,0.190083,0.243802,0.404959,0.483471,0.516529,...,0.652893,0.636364,0.657025,0.68595,0.727273,0.743802,0.764463,0.752066,0.752066,0.739669
4,0.5,0.545455,0.582645,0.623967,0.64876,0.690083,0.694215,0.714876,0.72314,0.731405,...,0.190083,0.161157,0.177686,0.173554,0.177686,0.177686,0.177686,0.177686,0.173554,0.173554


In [203]:
face_data["y"] = data["target"]
face_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4087,4088,4089,4090,4091,4092,4093,4094,4095,y
0,0.309917,0.367769,0.417355,0.442149,0.528926,0.607438,0.657025,0.677686,0.690083,0.68595,...,0.669421,0.652893,0.661157,0.475207,0.132231,0.14876,0.152893,0.161157,0.157025,0
1,0.454545,0.471074,0.512397,0.557851,0.595041,0.640496,0.681818,0.702479,0.710744,0.702479,...,0.157025,0.136364,0.14876,0.152893,0.152893,0.152893,0.152893,0.152893,0.152893,0
2,0.318182,0.400826,0.491736,0.528926,0.586777,0.657025,0.681818,0.68595,0.702479,0.698347,...,0.132231,0.181818,0.136364,0.128099,0.14876,0.144628,0.140496,0.14876,0.152893,0
3,0.198347,0.194215,0.194215,0.194215,0.190083,0.190083,0.243802,0.404959,0.483471,0.516529,...,0.636364,0.657025,0.68595,0.727273,0.743802,0.764463,0.752066,0.752066,0.739669,0
4,0.5,0.545455,0.582645,0.623967,0.64876,0.690083,0.694215,0.714876,0.72314,0.731405,...,0.161157,0.177686,0.173554,0.177686,0.177686,0.177686,0.177686,0.173554,0.173554,0


### Split data in train, test

In [204]:
train, test = train_test_split(face_data, test_size=0.1)
len(train), len(test)

(360, 40)

### Train logistic regression model

In [205]:
from sklearn import linear_model
logreg = linear_model.LogisticRegression(max_iter=10000)

cols = np.arange(0, 4096)
X = train[cols].values
y = train.y.values
X_test = test[cols].values
y_test = test.y.values

logreg.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### Logistic regression accuracy

In [206]:
logreg.score(X_test, y_test)

1.0

### Decision Tree Classifier

In [207]:
from sklearn import tree
dt = tree.DecisionTreeClassifier()

dt.fit(X,y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

### Decision Tree Score

In [208]:
dt.score(X_test, y_test)

0.575

### Support Vector Machine

In [211]:
from sklearn import svm
svm_cl = svm.SVC()
svm_cl.fit(X,y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

### Support Vector Machine Score

In [212]:
svm_cl.score(X_test, y_test)

0.95

### Classifier Review

What are the features?
    
    The features are the 4096 points that make up a picture of a given person.

What was it your classifier was trying to classify? (Hint: You'll need to read the description of the data--this constitutes the classification "task"; i.e., the mapping from features in X to the classification labels in y)

    It was trying to classify a set of points that make up a picture to the person in the picture.
    
What is the baseline for this dataset? (Hint: what is the most common type that could be classified?)

    10% because each class was distributed evenly.   
    
How well did your three classifiers do above the baseline.

    The logistic regression classifier and support vector machine did very well, but the decision tree performed poorly in comparison.
    

Read a little bit about each classifier. Why do you think one classifier did better than another classifier?

    Logistic regression had the highest score, support vector machine had the 2nd highest score and decision
    trees had the worst score.
    
    Decision trees are prone to overfitting and can generate entirely different trees with minor changes in
    the data. If an ensemble of decision trees were used, then perhaps the classification score would improve.
    
    Support vector machine classifier performed well. This may be because svms are effective in high dimensional
    spaces. In the case of train/test data, there were 4096 dimensions, over 10 times the original sample size.

### Open ended exploration

#### Recommendation systems

What are the input features?

    Real-valued feature vectors.
    
What are the class labels?

    Positive class or negative class for binary classification.

How is the data represented?

    In matrices

Do you think any of the three classification approaches you used for the Olivettie Faces above would work for this task?

    Support vector machines might be good for this since in a recommendation engine you might have access to a lot of features of the user (high dimensionsiality.)

#### Link Prediction

What are the input features?

    Countries, their continents and their neighboring countries (Knowledge graphs).
    
What are the class labels?

    Missing graph link or not

How is the data represented?

    In tab separated triples of a knowledge graph.

Do you think any of the three classification approaches you used for the Olivettie Faces above would work for this task?

    Logistic regression.