In [1]:
import sklearn
import matplotlib.pyplot as plt
import numpy as np
import cv2
from os import listdir
from os.path import isfile, join
import pandas as pd
from datetime import datetime
from sklearn import ensemble
from sklearn import svm
from sklearn import linear_model
from sklearn.cluster import KMeans
from sklearn import neighbors
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split#Author: David Arredondo

## Part 1: Data Cleaning

In [2]:
#load the images into a numpy array.
#inspired by https://stackoverflow.com/questions/33369832/read-multiple-images-on-a-folder-in-opencv-python
mypath = "C:\\Users\\David\\Documents\\train\\" #change this path to wherever your image data is
# the below is a list of individual file names
justfiles = [ f for f in listdir(mypath) if isfile(join(mypath,f)) ]
#prepare my list of images
images = np.empty(len(justfiles), dtype=object)
#fill the list of images with the dog and cat images
for n in range(0, len(justfiles)):
  images[n] = cv2.imread( join(mypath,justfiles[n]) )

In [3]:
#create an array of images to play around with
tempimg = np.empty(500, dtype=object)
for i in range(500):
    tempimg[i] = images[i]

In [4]:
#load the labels, and prepare an array for a bool of labels, dog == True, cat == False
labels = np.loadtxt("..\\data\\train_label.txt", dtype = 'str')
labelsb = np.empty(len(labels), dtype=bool)

In [5]:
#make labels a better string
for i in range(len(labels)):
    if ('d' in labels[i]):
        labels[i] = 'dog'
    else:
        labels[i] = 'cat'

In [6]:
#fill labelsb
for i in range(len(labels)):
    if ('d' in labels[i]):
        labelsb[i] = True
    else:
        labelsb[i] = False

In [7]:
#make all images gray
g_img = np.empty(len(images), dtype = object)
for i in range(len(images)):
    g_img[i] = cv2.cvtColor(images[i], cv2.COLOR_BGR2GRAY)

In [8]:
#resize all images; make them uniform
gu_img = np.empty(len(g_img), dtype = object)
for i in range(len(images)):
    gu_img[i] = cv2.resize(g_img[i], (128,128))

In [9]:
#train and test sets
trn = gu_img[0:1600]
tst = gu_img[1600:2000]
# tmp train and test labels
ltrn = labelsb[0:1600]
ltst = labelsb[1600:2000]

### ORB Stuff

In [10]:
# Initiate STAR detector for all the images
orbs = np.empty(len(gu_img), dtype = object)
for i in range(len(gu_img)):
    orbs[i] = cv2.ORB_create()

# find the keypoints with ORB
kps = np.empty(len(gu_img), dtype=object)
for i in range(len(orbs)):
    kps[i] = orbs[i].detect(gu_img[i],None)

# compute the descriptors with ORB
descrs = np.empty(len(kps), dtype=object)
for i in range(len(kps)):
    kps[i],descrs[i] = orbs[i].compute(gu_img[i],kps[i])

### Bag of words on orb features

The following section is from my project partner Yang He.

In [11]:
# Deeper Feature Extraction

# Bag of word
# clustering
def bow_cluster(kmeans_obj, descriptor_stack):
    kmeans_ret = kmeans_obj.fit_predict(descriptor_stack)
    return kmeans_ret

# generate vertical stack of descriptors
def bow_vstack(desc_list):
    stack = np.array(desc_list[0])
    for rest in desc_list[1:]:
        stack = np.vstack((stack, rest))
    desc_stack = stack.copy()
    return desc_stack

# generate bag of words frequency matrix (shape: NUM_TRAIN_IMG * N_CLUSTER)
def bow_get_freq_matrix(num_imgs, num_clusters, SIFT_list, kmeans_ret):
    # initialization
    matrix = np.array([np.zeros(num_clusters) for i in range(num_imgs)])
    
    # keep track of index of kmeans_ret
    kmeans_id = 0
    for i in range(num_imgs):
        l = len(SIFT_list[i])
        for j in range(l):
            cluster_id = kmeans_ret[kmeans_id + j]
            matrix[i][cluster_id] += 1
        kmeans_id += l
    
    return matrix

In [12]:
descriptor_stack = bow_vstack(descrs)

In [13]:
start_time = datetime.now()
print("Start Kmeans " + str(start_time))

kmeans_obj = KMeans(n_clusters = 200)
kmeans_ret = bow_cluster(kmeans_obj, descriptor_stack)

end_time = datetime.now()
print("End Kmeans " + str(end_time) + " Time Cost: " + str(end_time - start_time))

Start Kmeans 2018-03-04 22:17:15.067410
End Kmeans 2018-03-04 23:09:20.355724 Time Cost: 0:52:05.288314


In [14]:
vocab_matrix = bow_get_freq_matrix(2000, 200, descrs, kmeans_ret)

In [15]:
from sklearn.preprocessing import StandardScaler

In [16]:
scale = StandardScaler().fit(vocab_matrix)
vocab_matrix_std = scale.transform(vocab_matrix)

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(vocab_matrix_std, labelsb, test_size = 0.2, random_state =42)

### Train Classifiers with BOW ORB features

SVM

In [42]:
start_time = datetime.now()
print("Start training using SVM @ " + str(start_time))

clf = svm.SVC()
clf.fit(X_train, y_train)

end_time = datetime.now()
print("End training SVM @ " + str(end_time) + " Time Cost: " + str(end_time - start_time))

Start training using SVM @ 2018-03-04 23:54:28.295217
End training SVM @ 2018-03-04 23:54:28.922972 Time Cost: 0:00:00.627755


In [43]:
start_time = datetime.now()
print("Start predicting using SVM @ " + str(start_time))
pred = np.asarray(clf.predict(X_test))
accuracy = (pred == y_test).mean()
print("Accuracy: %.2f%%" % (accuracy * 100))
end_time = datetime.now()
print("End training SVM @ " + str(end_time) + " Time Cost: " + str(end_time - start_time))

Start predicting using SVM @ 2018-03-04 23:54:32.436828
Accuracy: 70.00%
End training SVM @ 2018-03-04 23:54:32.568814 Time Cost: 0:00:00.131986


GBC

In [21]:
start_time = datetime.now()
print("Start training using GBC @ " + str(start_time))

clf = ensemble.GradientBoostingClassifier(n_estimators=100, max_depth=11)
clf.fit(X_train, y_train)

end_time = datetime.now()
print("End training @ " + str(end_time) + " Time Cost: " + str(end_time - start_time))

Start training using GBC @ 2018-03-04 23:28:24.892662
End training @ 2018-03-04 23:28:34.672593 Time Cost: 0:00:09.779931


In [22]:
start_time = datetime.now()
print("Start predicting using GBC @ " + str(start_time))

pred = np.asarray(clf.predict(X_test))
accuracy = (pred == y_test).mean()
print("Accuracy: %.2f%%" % (accuracy * 100))

end_time = datetime.now()
print("End prediction @ " + str(end_time) + " Time Cost: " + str(end_time - start_time))

Start predicting using GBC @ 2018-03-04 23:28:45.931953
Accuracy: 66.75%
End prediction @ 2018-03-04 23:28:45.936957 Time Cost: 0:00:00.005004


RF

In [23]:
start_time = datetime.now()
print("Start training using RF @ " + str(start_time))

clf = ensemble.RandomForestClassifier(n_estimators=17)
clf.fit(X_train, y_train)

end_time = datetime.now()
print("End training @ " + str(end_time) + " Time Cost: " + str(end_time - start_time))

Start training using RF @ 2018-03-04 23:29:00.103503
End training @ 2018-03-04 23:29:00.209558 Time Cost: 0:00:00.106055


In [24]:
start_time = datetime.now()
print("Start predicting using RF @ " + str(start_time))

pred = np.asarray(clf.predict(X_test))
accuracy = (pred == y_test).mean()
print("Accuracy: %.2f%%" % (accuracy * 100))

end_time = datetime.now()
print("End prediction @ " + str(end_time) + " Time Cost: " + str(end_time - start_time))

Start predicting using RF @ 2018-03-04 23:29:02.856672
Accuracy: 66.75%
End prediction @ 2018-03-04 23:29:02.861677 Time Cost: 0:00:00.005005


ADABoost

In [25]:
start_time = datetime.now()
print("Start training using ADB @ " + str(start_time))

clf = ensemble.AdaBoostClassifier(n_estimators=5)
clf.fit(X_train, y_train)

end_time = datetime.now()
print("End training @ " + str(end_time) + " Time Cost: " + str(end_time - start_time))

Start training using ADB @ 2018-03-04 23:29:12.783993
End training @ 2018-03-04 23:29:12.839015 Time Cost: 0:00:00.055022


In [26]:
start_time = datetime.now()
print("Start prediction using ADB @ " + str(start_time))

pred = np.asarray(clf.predict(X_test))
accuracy = (pred == y_test).mean()
print("Accuracy: %.2f%%" % (accuracy * 100))

end_time = datetime.now()
print("End prediction @ " + str(end_time) + " Time Cost: " + str(end_time - start_time))

Start prediction using ADB @ 2018-03-04 23:29:16.747786
Accuracy: 68.25%
End prediction @ 2018-03-04 23:29:16.750790 Time Cost: 0:00:00.003004


In [None]:
KNN

In [27]:
start_time = datetime.now()
print("Start training using KNN @ " + str(start_time))

clf = neighbors.KNeighborsClassifier(leaf_size =5)
clf.fit(X_train, y_train)

end_time = datetime.now()
print("End training @ " + str(end_time) + " Time Cost: " + str(end_time - start_time))

Start training using KNN @ 2018-03-04 23:29:24.474372
End training @ 2018-03-04 23:29:24.493385 Time Cost: 0:00:00.019013


In [28]:
start_time = datetime.now()
print("Start prediction using KNN @ " + str(start_time))

pred = np.asarray(clf.predict(X_test))
accuracy = (pred == y_test).mean()
print("Accuracy: %.2f%%" % (accuracy * 100))

end_time = datetime.now()
print("End prediction @ " + str(end_time) + " Time Cost: " + str(end_time - start_time))

Start prediction using KNN @ 2018-03-04 23:29:26.955126
Accuracy: 60.00%
End prediction @ 2018-03-04 23:29:27.710139 Time Cost: 0:00:00.755013


### HOG Stuff

In [29]:
winSize = (128,128)
blockSize = (16,16)
blockStride = (8,8)
cellSize = (8,8)
nbins = 9

hog = cv2.HOGDescriptor(winSize,blockSize,blockStride,cellSize,nbins)

des = np.empty(len(trn), dtype=object)
for i in range(len(des)):
    des[i] = hog.compute(trn[i])

In [30]:
des_tst = np.empty(len(tst), dtype=object)
for i in range(len(des_tst)):
    des_tst[i] = hog.compute(tst[i])

### Clean HOG

In [31]:
#flatten the arrays
f_trn = np.empty(len(trn),dtype=object)
for i in range(len(trn)):
    f_trn[i] = trn[i].flatten()

f_tst = np.empty(len(tst),dtype=object)
for i in range(len(tst)):
    f_tst[i] = tst[i].flatten()
  
f_des = np.empty(len(des),dtype=object)
for i in range(len(f_des)):
    f_des[i] = des[i].flatten()

f_des_tst = np.empty(len(tst), dtype=object)
for i in range(len(f_des_tst)):
    f_des_tst[i] = des_tst[i].flatten()

In [32]:
#reformat the arrays
f_trn = np.vstack(f_trn)
f_tst = np.vstack(f_tst)
f_des = np.vstack(f_des)
f_des_tst = np.vstack(f_des_tst)

### Run Models With HOG

In [33]:
#fit a gradient boosting algorithm
start_time = datetime.now()

gbcmodel = ensemble.GradientBoostingClassifier(n_estimators = 100, max_depth = 15)
gbcmodel.fit(f_des,ltrn)

end_time = datetime.now()
print("complete @ " + str(end_time) + " Time Cost: " + str(end_time - start_time))

complete @ 2018-03-04 23:40:21.698127 Time Cost: 0:08:41.957168


In [34]:
start_time = datetime.now()

acc = 1 - (sum(gbcmodel.predict(f_des_tst) ^ ltst)/600)

end_time = datetime.now()
print("complete @ " + str(end_time) + " Time Cost: " + str(end_time - start_time))
print(acc)

complete @ 2018-03-04 23:42:16.469831 Time Cost: 0:00:00.016009
0.723333333333


In [35]:
#fit adaboost
start_time = datetime.now()

adbmodel = ensemble.AdaBoostClassifier(n_estimators = 100)
adbmodel.fit(f_des,ltrn)

end_time = datetime.now()
print("ada complete @ " + str(end_time) + " Time Cost: " + str(end_time - start_time))

ada complete @ 2018-03-04 23:44:53.614395 Time Cost: 0:02:33.142799


In [36]:
start_time = datetime.now()

acc = 1 - (sum(adbmodel.predict(f_des_tst) ^ ltst)/600)

end_time = datetime.now()
print("ada complete @ " + str(end_time) + " Time Cost: " + str(end_time - start_time))
print(acc)

ada complete @ 2018-03-04 23:48:35.588766 Time Cost: 0:00:00.235223
0.725


In [37]:
#fit random forest
start_time = datetime.now()

rf = ensemble.RandomForestClassifier(n_estimators = 200)
rf.fit(f_des,ltrn)

end_time = datetime.now()
print("rf complete @ " + str(end_time) + " Time Cost: " + str(end_time - start_time))

rf complete @ 2018-03-04 23:49:01.661606 Time Cost: 0:00:22.656907


In [38]:
start_time = datetime.now()

acc = 1 - (sum(rf.predict(f_des_tst) ^ ltst)/600)

end_time = datetime.now()
print("rf complete @ " + str(end_time) + " Time Cost: " + str(end_time - start_time))
print(acc)

rf complete @ 2018-03-04 23:49:05.869132 Time Cost: 0:00:00.050022
0.778333333333


In [39]:
#fit svm
start_time = datetime.now()

svmmodel = svm.SVC(kernel='rbf', degree = 3)
svmmodel.fit(f_des,ltrn)

end_time = datetime.now()
print("svm complete @ " + str(end_time) + " Time Cost: " + str(end_time - start_time))

svm complete @ 2018-03-04 23:49:43.358242 Time Cost: 0:00:19.986420


In [40]:
start_time = datetime.now()
acc = 1 - (sum(svmmodel.predict(f_des_tst) ^ ltst)/600)
end_time = datetime.now()
print("svm complete @ " + str(end_time) + " Time Cost: " + str(end_time - start_time))
print(acc)

svm complete @ 2018-03-04 23:49:52.157633 Time Cost: 0:00:04.430565
0.785


In [44]:
#fit KNN
start_time = datetime.now()

svmmodel = neighbors.KNeighborsClassifier()
svmmodel.fit(f_des,ltrn)

end_time = datetime.now()
print("knn complete @ " + str(end_time) + " Time Cost: " + str(end_time - start_time))

knn complete @ 2018-03-05 00:54:00.030200 Time Cost: 0:00:00.681811


In [45]:
start_time = datetime.now()
acc = 1 - (sum(svmmodel.predict(f_des_tst) ^ ltst)/600)
end_time = datetime.now()
print("knn complete @ " + str(end_time) + " Time Cost: " + str(end_time - start_time))
print(acc)

knn complete @ 2018-03-05 00:54:11.517466 Time Cost: 0:00:07.776830
0.74
