# This note is consolidating the main components for the model:
# 1- Loading the Images.
# 2- Features Engineering, extracting SURF features.
# 3- The bag of words visual method implemtation.
# 4- Training the model.
# 5- Creating the persistent model.

In [1]:
import mahotas as mh
import numpy as np
from matplotlib import pyplot as plt
from glob import glob
import cv2
from mahotas.features import surf
from sklearn.cluster import KMeans
import scipy
from sklearn import preprocessing, cross_validation, neighbors,datasets, svm
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from scipy import interp
from sklearn.metrics import roc_curve, auc
from sklearn.externals import joblib
from sklearn.preprocessing import StandardScaler



In [15]:
images = glob('Dataset/*.jpg')
descriptors = []
for im in images:
    im = mh.imread(im, as_grey=True)
    im = im.astype(np.uint8)
    descriptors.append(surf.dense(im, spacing=16))

alldescriptors = []
for im in images:
    im = mh.imread(im, as_grey=True)
    im = im.astype(np.uint8)
    alldescriptors.append(surf.dense(im, spacing=16))
# get all descriptors into a single array
concatenated = np.concatenate(alldescriptors)
print('Number of SURF descriptors: {}'.format(len(concatenated)))

#use only every 64th vector
concatenated = concatenated[::61]
k = 256 #The number of cetroids must not be greater than the number of images,
#usually the number of cetroids should be kept in the order 32,64,128,256,512...
km = KMeans(k) #using the k-means to cluster the SURF descriptors
km.fit(concatenated)

joblib.dump(km, 'SURFcluster.pkl')
print "SURFcluster persistent model Generated"
    
sfeatures = []
for d in alldescriptors:
    c = km.predict(d)
    sfeatures.append(np.array([np.sum(c == ci) for ci in range(k)]))
sfeatures = np.array(sfeatures, dtype=float)
features = np.save("Featurset", sfeatures)
print "features are saved into file Featurset"

a = np.zeros(240)
b = np.ones(240)
c = np.concatenate((a,b), axis =0)
labels = np.save("Labels", c)
print "Labels Generated into file Labels"

Number of SURF descriptors: 94080
SURFcluster persistent model Generated
features are saved into file Featurset
Labels Generated into file Labels


In [16]:
#X = np.load("Featurset.npy")
#y = np.load("Labels.npy")

X = sfeatures
y = c

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.3, random_state=0)

#scaling
scaler = StandardScaler()
#scaler = preprocessing.MinMaxScaler()

# Fit only on training data
scaler.fit(X_train)
X_train = scaler.transform(X_train)
# apply same transformation to test data
X_test = scaler.transform(X_test)

from sklearn.svm import SVC
clf = SVC(probability = True)
#clf = MLPClassifier(solver='sgd', alpha=1e-5, hidden_layer_sizes=(10, 5, 20), random_state=2, learning_rate_init=0.05, max_iter=10000)
print clf
clf.fit(X_train, y_train)


Accuracy = clf.score(X_train, y_train)
print "Accuracy in the training data: ", Accuracy*100, "%"

accuracy = clf.score(X_test, y_test)
print "Accuracy in the test data", Accuracy*100, "%"

y_pred = clf.predict(X_train)
print '\nTraining classification report\n', classification_report(y_train, y_pred)
print "\n Confusion matrix of training \n", confusion_matrix(y_train, y_pred)

y_pred = clf.predict(X_test)
print '\nTesting classification report\n', classification_report(y_test, y_pred)
print "\nConfusion matrix of the testing\n", confusion_matrix(y_test, y_pred)


probas = clf.fit(X_train, y_train).predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(y_test, probas[:, 1])
mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)

mean_tpr += interp(mean_fpr, fpr, tpr)
mean_tpr[0] = 0.0
roc_auc = auc(fpr, tpr)
print "\nArea Under the ROC curve: ", roc_auc

meanTP = 0
for t in tpr:
    meanTP += t
print "Mean True Positive rate (testing): ", meanTP/len(tpr)

meanFP = 0
for t in fpr:
    meanFP += t
print "Mean False Positive rate (testing): ", meanFP/len(fpr)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Accuracy in the training data:  99.7023809524 %
Accuracy in the test data 99.7023809524 %

Training classification report
             precision    recall  f1-score   support

        0.0       1.00      0.99      1.00       167
        1.0       0.99      1.00      1.00       169

avg / total       1.00      1.00      1.00       336


 Confusion matrix of training 
[[166   1]
 [  0 169]]

Testing classification report
             precision    recall  f1-score   support

        0.0       0.79      0.81      0.80        73
        1.0       0.80      0.77      0.79        71

avg / total       0.79      0.79      0.79       144


Confusion matrix of the testing
[[59 14]
 [16 55]]

Area Under the ROC curve:  0.87690526722
Mean True Positive rate (testing):  0.7156894

In [18]:
joblib.dump(clf, 'SatisfactionDetector.pkl')
print "SatisfactionDetector persistent model Generated"

SatisfactionDetector persistent model Generated


In [17]:
class_names = ['Satisfied', 'UnSatisfied']

def plot_confusion_matrix(cm, classes,
                          normalize=True,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    thresh = cm.max() / 2.

# Compute confusion matrix
y_pred = clf.predict(X_test)
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix, without normalization')
plt.show()

Normalized confusion matrix
[[ 0.81  0.19]
 [ 0.23  0.77]]
