In [1]:
# import the necessary packages
from sklearn.neighbors import KNeighborsClassifier
from sklearn import cross_validation
from imutils import paths
import numpy as np
import pandas as pd
import argparse
import imutils
import cv2
import os
import math

In [2]:
def image_to_feature_vector(image, size=(64, 64)):
	# resize the image to a fixed size, then flatten the image into
	# a list of raw pixel intensities
	return cv2.resize(image, size).flatten()

In [3]:
def extract_color_histogram(image, bins=(8, 8, 8)):
	# extract a 3D color histogram from the HSV color space using
	# the supplied number of `bins` per channel
	hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
	hist = cv2.calcHist([hsv], [0, 1, 2], None, bins,
		[0, 180, 0, 256, 0, 256])

	# handle normalizing the histogram if we are using OpenCV 2.4.X
	if imutils.is_cv2():
		hist = cv2.normalize(hist)

	# otherwise, perform "in place" normalization in OpenCV 3 (I
	# personally hate the way this is done
	else:
		cv2.normalize(hist, hist)

	# return the flattened histogram as the feature vector
	return hist.flatten()

In [4]:
dataset = "./classes/kNN_data/"
# grab the list of images that we'll be describing
print("[INFO] describing images...")
imagePaths = list(paths.list_images(dataset))
print "[INFO] " + str(len(list(paths.list_images(dataset)))) + " images found!"

[INFO] describing images...
[INFO] 11433 images found!


In [5]:
# initialize the raw pixel intensities matrix, the features matrix,
# and labels list
rawImages = []
features = []
labels = []

# loop over the input images
for (i, imagePath) in enumerate(imagePaths):
	# load the image and extract the class label (assuming that our
	# path as the format: /path/to/dataset/{class}.{image_num}.jpg
	image = cv2.imread(imagePath)
	label = imagePath.split(os.path.sep)[-1].split(".")[0]

	# extract raw pixel intensity "features", followed by a color
	# histogram to characterize the color distribution of the pixels
	# in the image
	pixels = image_to_feature_vector(image)
	hist = extract_color_histogram(image)

	# update the raw images, features, and labels matricies,
	# respectively
	rawImages.append(pixels)
	features.append(hist)
	labels.append(label)

	# show an update every 1,000 images
	if i > 0 and i % 1000 == 0:
		print("[INFO] processed {}/{}".format(i, len(imagePaths)))

[INFO] processed 1000/11433
[INFO] processed 2000/11433
[INFO] processed 3000/11433
[INFO] processed 4000/11433
[INFO] processed 5000/11433
[INFO] processed 6000/11433
[INFO] processed 7000/11433
[INFO] processed 8000/11433
[INFO] processed 9000/11433
[INFO] processed 10000/11433
[INFO] processed 11000/11433


In [6]:
#64*64*3 colors
# rawImages: Images flattened to a single list
# labels: Brands for the given images
# features: Color histogram

X_train, X_val, Y_train, Y_val = cross_validation.train_test_split(rawImages,
                                                                   labels, 
                                                                   test_size=.1, 
                                                                   random_state=1337)
print "Length of Validation Set:" + str(len(Y_val))
print "Length of Train Set:" + str(len(Y_train))
print "Ratio: " + str(len(Y_val) / float(len(Y_train)))

Length of Validation Set:1144
Length of Train Set:10289
Ratio: 0.111186704247


In [7]:
## Train the KNN model
print "Training KNN Model..."
## n_jobs=-1: Use all cores
## n_neighbors = Number of Neighbors
n_neighbors = math.ceil(math.sqrt(len(rawImages)))

model = KNeighborsClassifier(n_neighbors=n_neighbors,
                             n_jobs=-1)
model.fit(X_train, Y_train)

Training KNN Model...


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=107.0, p=2,
           weights='uniform')

In [8]:
## Predict on the validation set using the KNN model.
validations_predictions = model.predict(X_val)
print validations_predictions

  return [func(*args, **kwargs) for func, args, kwargs in self.items]


['adidas' 'nike' 'jordan' ..., 'nike' 'nike' 'newbalance']


In [9]:

results = pd.DataFrame()

results.insert(0,'actual',Y_val)
results.insert(1,'predictions',validations_predictions)

misclassified = results[results['actual'] != results['predictions']].count()[0] / float(len(X_val))
misclassified

0.37412587412587411

In [10]:
## Give me the 5 nearest neighbors of the first two items in the validation set
pd.DataFrame(model.kneighbors(X_val[0:5], n_neighbors=5, return_distance=False))

Unnamed: 0,0,1,2,3,4
0,5581,2958,7384,9459,9131
1,5178,7435,2122,6260,3036
2,6364,7187,7870,8493,1477
3,6209,4916,5313,4962,9248
4,2549,7769,9227,4866,5583


In [11]:
model.score(X_val,Y_val)

0.62587412587412583

In [12]:
model.score(X_train,Y_train)

0.64048984352220817

In [13]:
new_images_test = []
new_labels = []
newDataPath = list(paths.list_images("./classes/kNN_new_data/"))

for (i, newDataPath) in enumerate(newDataPath):    
    image = cv2.imread(newDataPath)
    new_images = image_to_feature_vector(image)
    
    labels = newDataPath.split(os.path.sep)[-1].split(".")[0]
    new_images_test.append(new_images)
    new_labels.append(labels)

In [14]:
new_data_predictions = model.predict(new_images_test)
rslt = new_labels == new_data_predictions

new_data_df = pd.DataFrame()
new_data_df.insert(0,'actual',new_labels)
new_data_df.insert(1,'predictions',new_data_predictions)
new_data_df.insert(2,'results',rslt)

print "Accuracy of the Model:" + str(model.score(new_images_test, new_labels))

new_data_df

Accuracy of the Model:0.4375


Unnamed: 0,actual,predictions,results
0,jordan,nike,False
1,jordan,nike,False
2,jordan,nike,False
3,jordan,jordan,True
4,jordan,nike,False
5,jordan,nike,False
6,jordan,jordan,True
7,jordan,jordan,True
8,newbalance,newbalance,True
9,newbalance,nike,False


In [15]:
prediction_nearest_neighbors = pd.DataFrame(model.kneighbors(new_images_test, n_neighbors=107, return_distance=False))
prediction_nearest_neighbors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,97,98,99,100,101,102,103,104,105,106
0,8126,2383,8554,7226,1062,9625,2561,6839,5943,10240,...,4080,6346,8682,3441,547,1363,86,7764,4616,6011
1,9173,1801,1620,8877,1669,3487,8287,5683,4835,3663,...,1500,8801,2249,4109,3758,551,244,8733,9952,9521
2,8126,9975,9931,6823,5179,4329,1549,7160,3175,4654,...,9226,2066,6237,8549,6679,5838,4842,7834,6954,1481
3,1838,8705,476,4123,1288,6957,8230,7648,7552,1419,...,10251,1751,4343,3113,3923,8449,4247,3032,2356,4397
4,8126,8554,2561,2383,9625,7226,1062,6839,10240,5943,...,10132,1031,8898,7627,6900,1275,9497,4088,7817,3210
5,8126,8554,1062,6839,2561,7226,2383,9625,5364,9975,...,7567,4623,7871,8436,3615,4360,5363,7713,7369,7704
6,8314,3752,4654,9189,3906,9334,7385,9697,1549,3146,...,5150,9710,5368,4188,10038,3877,5783,9962,1074,2106
7,6823,5269,7354,6105,5771,9334,245,8061,5844,3496,...,1126,3175,8459,9725,4580,10176,5439,9962,7853,10285
8,2001,4294,9227,3108,1566,3220,9666,6019,7242,9132,...,6536,2126,2933,5489,105,2956,5124,3640,4020,8398
9,7434,1765,620,5292,9157,5337,5478,499,2869,7466,...,1122,3696,6813,3082,3493,1655,6455,5559,2843,3751


In [23]:
nearest_neighbors = list(model.kneighbors(new_images_test[5], n_neighbors=107, return_distance=False)[0])
NN_images = [imagePaths[x] for x in nearest_neighbors]

processNN = map(lambda x: x.split(os.path.sep)[-1].split(".")[0], NN_images)
[ (i,processNN.count(i)) for i in set(processNN) ]




[('newbalance', 14), ('jordan', 23), ('adidas', 15), ('nike', 55)]