## Import Packages

In [2]:
import cv2
import numpy as np
import pandas as pd
import glob
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
import time
import cv2
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import normalize
from sklearn.neural_network import MLPClassifier
import xgboost

## Load Images

In [3]:
# Load images first
numImages = len(glob.glob('./images/*jpg'))
images = [None for i in xrange(numImages)]
for fileName in glob.glob('./images/*jpg'):
    fileNum = int(fileName[9:][:-4])
    images[fileNum-1] = np.array(cv2.imread(fileName, 0))
images = np.array(images)

# Load csv data next
train_data = pd.read_csv('data/train.csv').drop(['species'], axis=1).values
train_labels = pd.read_csv('data/train.csv')['species'].values
train_images = [images[int(data[0]-1)] for data in train_data]
train_ids = [data[0] for data in train_data]
train_data = np.delete(train_data, 0, 1)


test_data = pd.read_csv('data/test.csv').values
test_images = [images[int(data[0]-1)] for data in test_data]
test_ids = [data[0] for data in test_data]
test_data = np.delete(test_data, 0, 1)

del images

## Preprocess Data

In [4]:
le= preprocessing.LabelEncoder()
#encode train labels
le.fit(train_labels)
train_labels_encoded=le.transform(train_labels)

#separate the 3 histograms
train_margin_data=((pd.read_csv('data/train.csv').drop(['species'], axis=1)).loc[:,'margin1':'margin64']).values
train_shape_data=((pd.read_csv('data/train.csv').drop(['species'], axis=1)).loc[:,'shape1':'shape64']).values
train_texture_data=((pd.read_csv('data/train.csv').drop(['species'], axis=1)).loc[:,'texture1':'texture64']).values

test_margin_data=((pd.read_csv('data/test.csv')).loc[:,'margin1':'margin64']).values
test_shape_data=((pd.read_csv('data/test.csv')).loc[:,'shape1':'shape64']).values
test_texture_data=((pd.read_csv('data/test.csv')).loc[:,'texture1':'texture64']).values

#print train_margin_data.head()
#print train_shape_data.head()
#print train_texture_data.head()

## Get HOG descriptors

In [34]:
from skimage import feature
from skimage import exposure
def get_descriptor(images, dense=False):
    sift = cv2.xfeatures2d.SIFT_create()
    des_per_Img = np.array([sift.detectAndCompute(img,None)[1] for img in images])
    return des_per_Img
def get_HOG(images):
    features=[]
    hog_des = np.array([feature.hog(img, visualise=True, orientations=8, pixels_per_cell=(32,32))[0].reshape(1,-1) for img in images])
    return hog_des
def get_clusters(descriptors, vocabSize):
    des_list = np.concatenate(descriptors)

    kmeans = MiniBatchKMeans(vocabSize, batch_size=100)
    kmeans.fit(np.array(des_list))
    
    return kmeans

def get_vocabulary(descriptors, clusters, vocabSize):
    return np.array([normalize(np.histogram(clusters.predict(dscrs), bins=range(vocabSize))[0].reshape(1,-1)).ravel() for dscrs in descriptors])
        

In [35]:
des_start_time =time.time()
des_list_train = get_HOG(train_images)
des_list_test = get_HOG(test_images)
des_end_time =time.time()
print "Descriptors computed in {:2f} seconds".format(des_end_time-des_start_time)

ValueError: could not broadcast input array from shape (12096) into shape (1)

In [24]:

clustering_start_time=time.time()
clusters = get_clusters(des_list_train,150)
clustering_end_time=time.time()
print "Clustering completed in {:2f} seconds".format(clustering_end_time-clustering_start_time)

Clustering completed in 38.982271 seconds


In [33]:
print des_list_train[0].shape

(12096, 1)


In [25]:
vocab_train = get_vocabulary(des_list_train,clusters,150)
vocab_test = get_vocabulary(des_list_test,clusters,150)

print vocab_train.shape

(990, 149)


## Train weak learners from extracted data

In [26]:
mlp_train_margin = MLPClassifier(learning_rate='constant', max_iter=5000,hidden_layer_sizes=(80,))
mlp_train_margin.fit(train_margin_data, train_labels_encoded)
mlp_train_margin_pred = mlp_train_margin.predict_proba(train_margin_data)

mlp_train_texture = MLPClassifier(learning_rate='constant', max_iter=5000,hidden_layer_sizes=(80,))
mlp_train_texture.fit(train_texture_data, train_labels_encoded)
mlp_train_texture_pred = mlp_train_texture.predict_proba(train_texture_data)

mlp_train_shape = MLPClassifier(learning_rate='constant', max_iter=5000,hidden_layer_sizes=(80,))
mlp_train_shape.fit(train_shape_data, train_labels_encoded)
mlp_train_shape_pred = mlp_train_shape.predict_proba(train_shape_data)

mlp_train_sift_bof = MLPClassifier(learning_rate='constant', max_iter=5000,hidden_layer_sizes=(80,))
mlp_train_sift_bof.fit(vocab_train, train_labels_encoded)
mlp_train_sift_bof_pred = mlp_train_sift_bof.predict_proba(vocab_train)

second_level_input = np.array(np.append(mlp_train_margin_pred,mlp_train_texture_pred,axis=1))
second_level_input = np.array(np.append(second_level_input,mlp_train_shape_pred,axis=1))
second_level_input = np.array(np.append(second_level_input,mlp_train_sift_bof_pred,axis=1))

In [27]:
mlc_model = MLPClassifier(learning_rate='constant', max_iter=8000,hidden_layer_sizes=(400,))
mlc_model.fit(second_level_input, train_labels_encoded)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(400,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=8000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

## Prepare and Export Output File for Kaggle Submission

In [30]:
out_file = generateDeepSubmission(test_ids,test_margin_data,test_texture_data,test_shape_data,vocab_test,mlp_train_margin, mlp_train_texture,mlp_train_shape,mlp_train_sift_bof,mlc_model,99)



(594, 396)


In [29]:
def generateDeepSubmission(ids,test_l1,test_l2,test_l3,test_l4
                           ,model_b1,model_b2,model_b3,model_b4,model_top,num_classes):
    num_test = len(test_l1)
    block1_pred = model_b1.predict_proba(test_l1)
    block2_pred = model_b2.predict_proba(test_l2)
    block3_pred = model_b3.predict_proba(test_l3)
    block4_pred = model_b4.predict_proba(test_l4)
    final_input = np.array(np.append(block1_pred,block2_pred,axis=1))
    final_input = np.array(np.append(final_input,block3_pred,axis=1))
    final_input = np.array(np.append(final_input,block4_pred,axis=1))
    print final_input.shape
    final_pred = model_top.predict(final_input)
    final_confidence =  model_top.predict_proba(final_input)
    output = np.zeros((num_test, num_classes+1))
    
    for i in xrange(num_test):
        p = final_pred[i]
        c = final_confidence[i][p]
        #prob = c
        prob = min(max(10e-15, c), 1-10e-15)
        logLoss = np.log(prob)
        output[i][p+1] = -logLoss / num_test
        output[i][0] = ids[i]
    return output
    #for i in xrange()
    #return final_pred

In [165]:
# #predict using KNN
# preds_knn=knn_model.predict(test_data)
# #predict using SVM
# preds_svm=svm_model.decision_function(test_data)
# print np.array(preds_svm).shape

def generateSubmission(ids, test, model, num_classes):
    num_test = len(test)
    predictions = model.predict(test)
    confidence = model.predict_proba(test)
    output = np.zeros((num_test, num_classes+1))
    
    for i in xrange(num_test):
        p = predictions[i]
        c = confidence[i][p]
        #prob = c
        prob = min(max(10e-15, c), 1-10e-15)
        logLoss = np.log(prob)
        output[i][p+1] = -logLoss / num_test
        output[i][0] = ids[i]
    return output

In [31]:
# out_file = generateSubmission(test_ids, test_data,svm_model,99)
headerRow=np.array(['id'] + le.inverse_transform(range(99)).tolist())
df = pd.DataFrame(data=out_file, columns = headerRow)
df['id'] = df['id'].astype(np.int)
df=df.set_index('id')
#print df.head()
# np.set_printoptions(threshold=np.inf)
# print out_file
df.to_csv('output/16_11_18_001.csv')

  if diff:
