In [1]:
# Imports

from datetime import datetime
# from glob import glob
# from os import listdir
# from os.path import isfile, join
# import re

import cv2
import matplotlib.pyplot as plt
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.cluster import KMeans
from sklearn.ensemble import GradientBoostingClassifier
# NOTE(review): sklearn.externals.joblib was deprecated in scikit-learn 0.21 and
# removed in 0.23; on newer versions this line needs the standalone joblib package.
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Directory Path:
# Training images, training label file, and test images. The image folders
# are expected to contain files named pet1.jpg, pet2.jpg, ... (see get_images).
IMG_PATH = "../data/pets/train/"
LABEL_PATH = "../data/pets/train_label.txt"
TEST_IMG_PATH = "../data/pets/test_practice/"

# Constants
# Number of training images to read; also the row count of the BOW matrix.
NUM_IMG = 2000
# Number of K-means clusters (visual words); also part of the saved model file names.
N_CLUSTER = 20 # K-means 
# Number of test images to read from TEST_IMG_PATH.
NUM_TEST_IMG = 1850

# Controls
# When RUN_TRAIN is True the image-loading cell reads the training set and
# RUN_TEST is not consulted there (if / elif).
RUN_TEST = True
RUN_TRAIN = False

In [3]:
# Image Processing & General Feature Extraction

# Gray scale image
def gray(image):
    """Convert a BGR image to grayscale.

    Parameters
    ----------
    image : array
        Image in BGR channel order (the order ``cv2.imread`` produces).

    Returns
    -------
    The result of ``cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)``.
    """
    # The original computed the conversion into a local and never returned it,
    # so every caller received None.
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# SIFT feature
def get_features(image, SIFT_obj):
    """Detect keypoints and compute their descriptors for one image.

    ``image`` is handed to ``SIFT_obj.detectAndCompute`` unchanged (no
    grayscale conversion is done here) and no mask is applied.

    Returns the ``(keypoints, descriptors)`` pair.
    """
    detected = SIFT_obj.detectAndCompute(image, None)
    kps, descs = detected
    return kps, descs


In [4]:
# Deeper Feature Extraction

# Bag of words (BOW)
# Clustering: fit K-means on the stacked descriptors and label each one
def bow_cluster(kmeans_obj, descriptor_stack):
    """Fit ``kmeans_obj`` on the stacked descriptors.

    Returns whatever ``kmeans_obj.fit_predict(descriptor_stack)`` returns
    (the cluster assignment for each row of the stack).
    """
    return kmeans_obj.fit_predict(descriptor_stack)

# generate vertical stack of descriptors
def bow_vstack(desc_list):
    """Stack per-image descriptor arrays into one 2-D array.

    Parameters
    ----------
    desc_list : sequence
        One descriptor array per image. Entries that are ``None`` or have
        zero rows (images with no descriptors) are skipped.

    Returns
    -------
    numpy.ndarray
        A new array with the rows of all usable entries, in input order.

    Raises
    ------
    ValueError
        If there is nothing to stack (empty list, or every entry skipped).
    """
    usable = [d for d in desc_list if d is not None and len(d) > 0]
    if not usable:
        raise ValueError("bow_vstack: no descriptors to stack")
    # A single vstack call copies each row once; stacking inside a loop
    # re-copies the growing array on every iteration (quadratic).
    return np.vstack(usable)

# Generate the bag-of-words frequency matrix (shape: num_imgs x num_clusters)
def bow_get_freq_matrix(num_imgs, num_clusters, SIFT_list, kmeans_ret):
    """Build the per-image histogram of cluster assignments.

    ``kmeans_ret`` holds one cluster id per descriptor, laid out image after
    image in the order of ``SIFT_list``; the length of ``SIFT_list[i]`` tells
    how many consecutive ids belong to image ``i``.

    Parameters
    ----------
    num_imgs : int
        Number of images (rows of the result); the first ``num_imgs`` entries
        of ``SIFT_list`` are used.
    num_clusters : int
        Number of clusters (columns of the result). Cluster ids are expected
        to lie in ``[0, num_clusters)``.
    SIFT_list : sequence
        Per-image descriptor arrays. ``None`` or empty entries contribute an
        all-zero row.
    kmeans_ret : array-like of int
        Cluster id for every stacked descriptor.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(num_imgs, num_clusters)``; entry ``[i, c]`` is
        the number of descriptors of image ``i`` assigned to cluster ``c``.

    Raises
    ------
    IndexError
        If ``kmeans_ret`` has fewer ids than the descriptors require.
    """
    labels = np.asarray(kmeans_ret)
    matrix = np.zeros((num_imgs, num_clusters))

    # Position in `labels` where the current image's ids start.
    offset = 0
    for i in range(num_imgs):
        desc = SIFT_list[i]
        count = 0 if desc is None else len(desc)
        if count == 0:
            continue
        chunk = labels[offset:offset + count]
        if len(chunk) < count:
            raise IndexError(
                "kmeans_ret is too short for the descriptors of image " + str(i))
        matrix[i] = np.bincount(chunk, minlength=num_clusters)
        offset += count

    return matrix
            

In [5]:
# Read In Image Files
def get_images(path, num_img):
    """Read ``pet1.jpg`` .. ``pet<num_img>.jpg`` from ``path``.

    Parameters
    ----------
    path : str
        Prefix prepended to each file name (a directory path ending in a
        separator, since the name is appended by plain concatenation).
    num_img : int
        Number of images to read.

    Returns
    -------
    list
        The loaded images, in file-number order.

    Raises
    ------
    IOError
        If an image cannot be read.
    """
    imlist = []
    for number in range(1, num_img + 1):
        path_str = path + "pet" + str(number) + ".jpg"
        im = cv2.imread(path_str)
        # cv2.imread signals a missing or undecodable file by returning None
        # rather than raising; fail here, where the offending path is known.
        if im is None:
            raise IOError("Could not read image: " + path_str)
        imlist.append(im)

    return imlist

In [6]:
# Load the image set selected by the control flags and report how long it took.
start_time = datetime.now()
print("Reading Images @ " + str(start_time))

# Training takes priority over testing; with neither flag set nothing is loaded.
if RUN_TRAIN:
    imlist = get_images(IMG_PATH, NUM_IMG) # list of image numpy arrays
elif RUN_TEST:
    imlist = get_images(TEST_IMG_PATH, NUM_TEST_IMG)
else:
    imlist = []

end_time = datetime.now()
print("Complete @ " + str(end_time) + " Time Cost: " + str(end_time - start_time))

Reading Images @ 2018-03-04 23:48:05.060112
Complete @ 2018-03-04 23:48:14.088560 Time Cost: 0:00:09.028448


In [7]:
# Sanity check: number of images actually loaded.
print("Image count: " + "{}".format(len(imlist)))

Image count: 1850


In [8]:
# Read In Labels
def get_labels(path):
    """Load the label file at ``path`` as an array of strings.

    Parameters
    ----------
    path : str
        Text file readable by ``np.loadtxt``.

    Returns
    -------
    numpy.ndarray
        String array with at least one dimension; a file with one label per
        line yields shape ``(n_labels,)``.
    """
    # The original called np.reshape(labels, (-1, 1)) and discarded the result,
    # so the array was never reshaped; that no-op is removed and the shape
    # callers already receive is kept.
    # ndmin=1 keeps a one-line file as shape (1,) instead of a 0-d array.
    return np.loadtxt(path, dtype = 'str', ndmin = 1)

In [9]:
# Labels are only read for a training run; otherwise keep an empty placeholder.
if RUN_TRAIN:
    labellist = get_labels(LABEL_PATH)
    print(labellist.shape)
else:
    labellist = []

In [10]:
## Compute SIFT features
# Compute one SIFT descriptor array per image, in the same order as imlist.
start_time = datetime.now()
print("Generating SIFT features @ " + str(start_time))
# SIFT is exposed as cv2.SIFT_create in OpenCV 4.4+; older builds only provide
# it through the contrib module cv2.xfeatures2d.
if hasattr(cv2, "SIFT_create"):
    SIFT_obj = cv2.SIFT_create()
else:
    SIFT_obj = cv2.xfeatures2d.SIFT_create()
SIFT_list = []
for im in imlist:
    _keypoints, descriptor = get_features(im, SIFT_obj)
    # An image without keypoints yields descriptor None; store an empty
    # (0, descriptor_size) array instead so len() and stacking keep working.
    if descriptor is None:
        descriptor = np.empty((0, SIFT_obj.descriptorSize()), dtype = np.float32)
    SIFT_list.append(descriptor)
end_time = datetime.now()
print("SIFT features complete @ " + str(end_time) + " Time Cost: " + str(end_time - start_time))

Generating SIFT features @ 2018-03-04 23:48:23.424259
SIFT features complete @ 2018-03-04 23:49:09.481780 Time Cost: 0:00:46.057521


In [11]:
# Generate a vertical stack of descriptors to perform clustering
# Training only: merge the per-image descriptor arrays in SIFT_list into the
# single matrix that the clustering cell consumes.
if RUN_TRAIN:
    start_time = datetime.now()
    print("Generating SIFT stacked matrix @ " + str(start_time))
    # Row order follows SIFT_list order.
    descriptor_stack = bow_vstack(SIFT_list)

    end_time = datetime.now()
    print("Matrix complete @ " + str(end_time) + " Time Cost: " + str(end_time - start_time))
    print("Shape of vstack matrix: " + str(descriptor_stack.shape))

In [12]:
# BOW clustering
# Training only: cluster all descriptors into N_CLUSTER visual words and
# persist the fitted model for the test run.
if RUN_TRAIN:
    start_time = datetime.now()
    print("Start Clustering @ " + str(start_time))

    # Fixed seed: K-means initialisation is random, and without a seed the
    # vocabulary (and every feature vector built on it) changes on each run.
    kmeans_obj = KMeans(n_clusters = N_CLUSTER, random_state = 42)
    # kmeans_ret holds one cluster id per row of descriptor_stack.
    kmeans_ret = bow_cluster(kmeans_obj, descriptor_stack)

    kmeans_filename = '../output/models/kmeans' + str(N_CLUSTER) + '.sav'
    print("Saving kmeans model")
    joblib.dump(kmeans_obj, kmeans_filename)
    print("Saving complete")

    end_time = datetime.now()
    # Message previously read "Matrix complete", copied from the stacking cell.
    print("Clustering complete @ " + str(end_time) + " Time Cost: " + str(end_time - start_time))
    print("Shape of kmeans ret: " + str(kmeans_ret.shape))

In [13]:
# Training only: build the per-image BOW frequency matrix, save it as CSV and
# plot the overall frequency of each visual word.
if RUN_TRAIN:
    start_time = datetime.now()
    print("Generating BOW Vocabulary @ " + str(start_time))

    vocab_matrix = bow_get_freq_matrix(NUM_IMG, N_CLUSTER, SIFT_list, kmeans_ret)

    #### IMPORTANT: FOR SVM, STANDARDIZE DATA BEFORE FEEDING INTO SVC()
    # scale = StandardScaler().fit(vocab_matrix)
    # vocab_matrix_std = scale.transform(vocab_matrix)

    column_name_list = ['CLUSTER_ID_' + str(i) for i in range(N_CLUSTER)]
    df = pd.DataFrame(data = vocab_matrix, columns = column_name_list)
    df.index.name = 'IMG_ID'
    # Overwrite the file: appending (mode='a') stacked another full table,
    # header included, onto the CSV every time this cell was re-run.
    df.to_csv('../output/additionals/BOWmatrix-' + str(N_CLUSTER) + '.csv', mode = 'w', index = True, sep = ',')

    end_time = datetime.now()
    print("BOW Vocabulary complete @ " + str(end_time) + " Time Cost: " + str(end_time - start_time))
    print("Plot Histogram of entire dataset (x = vocabulary, y = count)")
    x_scalar = np.arange(N_CLUSTER)
    # Total count of each visual word over all images (counts are never
    # negative, so no abs() is needed).
    y_scalar = vocab_matrix.sum(axis = 0).astype(np.int32)

    fig, ax = plt.subplots()
    ax.bar(x_scalar, y_scalar)
    ax.set_xlabel("Vocabulary Index")
    ax.set_ylabel("Frequency")
    ax.set_title("BOW frequency histogram")
    # Bars are centred on their x value by default, so ticks go at x itself.
    ax.set_xticks(x_scalar)
    # Save before showing: the figure may be cleared once it has been shown.
    fig.savefig("../figs/BOW" + str(N_CLUSTER) + ".png")
    plt.show()
    print("Vocabulary Matrix shape: " + str(vocab_matrix.shape))

In [14]:
# Training only: hold out 20% of the BOW feature rows for validation.
# train_test_split is imported with the other imports at the top of the
# notebook rather than inside this cell.
if RUN_TRAIN:
    train_labels = np.asarray(labellist)
    print(train_labels.shape)

    # Fixed random_state keeps the split identical across runs.
    X_train, X_test, y_train, y_test = train_test_split(
        vocab_matrix, train_labels, test_size = 0.2, random_state = 42)

    print(X_train.shape)
    print(X_test.shape)
    print(y_train.shape)
    print(y_test.shape)

In [15]:
# Training only: fit the gradient boosting classifier on the training split,
# score it on the held-out split, and persist the fitted model.
if RUN_TRAIN:
    start_time = datetime.now()
    print("Start training using GBM @ " + str(start_time))

    # Classifier hyper-parameters.
    params = dict(
        n_estimators = 2000,
        max_depth = 9,
        subsample = 0.5,
        learning_rate = 0.01,
        min_samples_leaf = 1,
        random_state = 3,
    )
    clf = GradientBoostingClassifier(**params)
    clf.fit(X_train, y_train)

    # Accuracy = fraction of held-out samples whose prediction matches the label.
    pred = np.asarray(clf.predict(X_test))
    accuracy = np.mean(pred == y_test)

    gbm_filename = '../output/models/gbmModel' + str(N_CLUSTER) + '.sav'
    joblib.dump(clf, gbm_filename)

    print("Validation Accuracy: %.2f%%" % (accuracy * 100))
    end_time = datetime.now()
    print("End training @ " + str(end_time) + " Time Cost: " + str(end_time - start_time))

In [16]:
# Test
# Predict labels for the test images with the saved K-means and GBM models.
# RUN_TEST is taken from the configuration cell instead of being forced to True
# here. The image-loading cell gives RUN_TRAIN priority, so when RUN_TRAIN is
# set SIFT_list holds training descriptors and must not be written out as
# test predictions.
if RUN_TEST and not RUN_TRAIN:
    kmeans_filename = '../output/models/kmeans' + str(N_CLUSTER) + '.sav'
    gbm_filename = '../output/models/gbmModel' + str(N_CLUSTER) + '.sav'

    # NOTE: joblib.load unpickles arbitrary objects; only load model files
    # produced by a trusted run of this notebook.
    model = joblib.load(kmeans_filename)
    clf = joblib.load(gbm_filename)

    # One BOW histogram row per test image. Images without descriptors keep an
    # all-zero row instead of crashing model.predict.
    test_vocab = np.zeros((len(SIFT_list), N_CLUSTER))
    for i, descriptor in enumerate(SIFT_list):
        if descriptor is None or len(descriptor) == 0:
            continue
        test_ret = model.predict(descriptor)
        test_vocab[i] = np.bincount(test_ret, minlength = N_CLUSTER)

    # Classify all images in one call, then split the result back into the
    # saved format: a list with one length-1 array per image.
    pred_tmp = []
    if len(test_vocab) > 0:
        predictions = clf.predict(test_vocab)
        pred_tmp = [predictions[i:i + 1].copy() for i in range(len(predictions))]

    pred_filename = "../output/predictions/baseline.sav"
    joblib.dump(pred_tmp, pred_filename)
elif RUN_TEST:
    print("Skipping test predictions: RUN_TRAIN is set, so the loaded images are the training set")