# project

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from imutils import paths
import argparse
import imutils
import cv2
import os


# Setup
- 1. Select user
- 2. Confirm main_data_folder holds the labeled excel sheet
- 3. Confirm that root_folder contains a data folder which holds at least a train and validate folder
- 4. Select if train PCA
- 5. Select if train KNN

In [2]:
def directories_set(user):
    if user == 'Aaron':
        # Aaron
        data_name = 'data_5000'
        main_data_folder = '../SML_Project_Data'
        root_folder= main_data_folder + '/' + data_name
    elif user == 'Qiang':
        # Qiang
        data_name = 'data'
        main_data_folder = 'D:/academic/DS 5220 Supervised Machine Learning/project/data'
        root_folder='C:/Users/mjfun/Downloads/data'
    elif user == 'Aishwara':
        main_data_folder = 'C:/Users/Aishwarya/Desktop/NEU/SML/project'
        root_folder = main_data_folder + '/data'
        data_name = 'data'
    else:
        print('Unkown Users')
        main_data_folder = None
        root_folder = None
        user = None
        data_name = None
        
    return(main_data_folder, root_folder, user, data_name)

main_data_folder, root_folder, user, data_name = directories_set('Qiang')

train_pca = True
train_knn = False

In [3]:
df = pd.read_csv(main_data_folder + '/trainLabels.csv',index_col=False)
df.head()

Unnamed: 0,image,level
0,10_left,0
1,10_right,0
2,13_left,0
3,13_right,0
4,15_left,1


In [4]:
df.groupby('level').count()

Unnamed: 0_level_0,image
level,Unnamed: 1_level_1
0,25810
1,2443
2,5292
3,873
4,708


In [5]:
def image_to_feature_vector(image, size=(500, 500)):
    # resize the image to a fixed size, then flatten the image into
    # a list of raw pixel intensities
    return cv2.resize(image, size).flatten()

In [6]:
def extract_color_histogram(image, bins=(8, 8, 8)):
    # extract a 3D color histogram from the HSV color space using
    # the supplied number of `bins` per channel
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    hist = cv2.calcHist([hsv], [0, 1, 2], None, bins,[0, 180, 0, 256, 0, 256])
    # handle normalizing the histogram if we are using OpenCV 2.4.X
    if imutils.is_cv2():
        hist = cv2.normalize(hist)
        # otherwise, perform "in place" normalization in OpenCV 3 (I# personally hate the way this is done
    else:
        cv2.normalize(hist, hist)
        # return the flattened histogram as the feature vector
    return hist.flatten()

In [7]:
# grab the list of images that we'll be describing
def create_list(root_folder, verbose = False, histogram = False):
    if verbose:
        print("[INFO] describing images...")
    # initialize the raw pixel intensities matrix, the features matrix, and labels list
    rawImages = []
    features = []
    labels = []
    for subfolder in next(os.walk(root_folder))[1]:
        if verbose:
            print(subfolder)
        folder_path=root_folder+'/'+subfolder
        if verbose:
            print(folder_path)
        imagePaths = list(paths.list_images(folder_path))
        label=subfolder.split('_')[1]
        # loop over the input images
        for (i, imagePath) in enumerate(imagePaths):
            # load the image
            if verbose:
                print(i,' read image: ',imagePath)
            image = cv2.imread(imagePath)
            #label = df[df['image']==imagePaths[i].split('\\')[1].split('.')[0]]['level'].values[0]
            if verbose:
                print(i,'label is: ',label)
            # extract raw pixel intensity "features", followed by a color histogram to characterize the color distribution of the pixels in the image
            if verbose:
                print('start processing pixels.')
            pixels = image_to_feature_vector(image)
            if verbose:
                print('start processing histogram')
            if histogram:
                hist = extract_color_histogram(image)
            # update the raw images, features, and labels matricies,respectively
            if verbose:
                print('append list.')
            rawImages.append(pixels)
            
            if histogram:
                features.append(hist)
            labels.append(label)
        if histogram == False:
            feature = None
    return rawImages,features,labels

In [8]:
#train data
trainRI,trainFeat,trainLabels=create_list(root_folder + '/train')

In [9]:
#validatioin data
testRI,testFeat,testLabels=create_list(root_folder + '/val')

In [13]:
# show some information on the memory consumed by the raw images
# matrix and features matrix
arr_trainRI = np.array(trainRI)
#arr_trainFeat = np.array(trainFeat)
arr_trainLabels = np.array(trainLabels)
print("[INFO] pixels matrix: {:.2f}MB".format(arr_trainRI.nbytes / (1024 * 1000.0)))
print("[INFO] pixels matrix: {:.2f}GB".format(arr_trainRI.nbytes / (1024 * 1000.0 * 1000)))
#print("[INFO] features matrix: {:.2f}MB".format(arr_trainFeat.nbytes / (1024 * 1000.0)))


[INFO] pixels matrix: 18310.55MB
[INFO] pixels matrix: 18.31GB


In [14]:
arr_testRI = np.array(testRI)
#arr_testFeat = np.array(testFeat)
arr_testLabels = np.array(testLabels)
print("[INFO] pixels matrix: {:.2f}MB".format(arr_testRI.nbytes / (1024 * 1000.0)))
#print("[INFO] features matrix: {:.2f}MB".format(arr_testFeat.nbytes / (1024 * 1000.0)))

[INFO] pixels matrix: 520.02MB


In [15]:
pd.DataFrame(arr_trainLabels,columns=['label']).groupby(pd.DataFrame(arr_trainLabels,columns=['label'])['label']).count()

Unnamed: 0_level_0,label
label,Unnamed: 1_level_1
0,5000
1,5000
2,5000
3,5000
4,5000


In [16]:
pd.DataFrame(testLabels,columns=['label']).groupby(pd.DataFrame(testLabels,columns=['label'])['label']).count()

Unnamed: 0_level_0,label
label,Unnamed: 1_level_1
0,142
1,142
2,142
3,142
4,142


In [17]:
# partition the data into training and testing splits, using 75%
# of the data for training and the remaining 25% for testing
#(trainRI, testRI, trainRL, testRL) = train_test_split(rawImages, labels, test_size=0.25, random_state=42)
#(trainFeat, testFeat, trainLabels, testLabels) = train_test_split(features, labels, test_size=0.25, random_state=42)
from datetime import datetime

# PCA

In [18]:
from sklearn.decomposition import PCA
from sklearn import model_selection
import pickle
filename = 'pca_model_' + data_name + '.sav'

In [None]:
# Training PCA model, takes around 30 minutes
if train_pca:
    start_time = datetime.now()
    print("[INFO] Start getting PCA", start_time)
    pca_train = PCA(n_components=500)
    pca_train.fit(trainRI)

    pickle.dump(pca_train, open(filename, 'wb'))

    print("[INFO] PCA model is made after", datetime.now() - start_time)

[INFO] Start getting PCA 2019-11-23 13:44:10.574785


In [None]:
# load the model from disk
pca_train = pickle.load(open(filename, 'rb'))
print(pca_train)

pca_train_result = pca_train.transform(X=trainRI)
pca_test_result = pca_train.transform(X=testRI)

# Logistic

In [None]:
from sklearn.linear_model import LogisticRegression

def calculate_accuracy(actual, predicted):
    correct = 0
    incorrect = 0
    for item in range(0, len(predicted)):
        if actual[item] == predicted[item]:
            correct = correct + 1
        else:
            incorrect = incorrect + 1
    print('Accuracy is', correct / (correct + incorrect) )

In [None]:
start_time = datetime.now()
print("[INFO] evaluating model accuracy after", datetime.now() - start_time)
model_logistic = LogisticRegression(n_jobs=8, verbose = 1, random_state=0, solver ='sag')# solver = 'lbfgs'
print("[INFO] compiling fit")
model_logistic.fit(pca_train_result, trainLabels)
print("[INFO] calculating accuracy")
predicted = model_logistic.predict(pca_train_result)

print('Training Results:')
calculate_accuracy(trainLabels, predicted)

predicted = model_logistic.predict(pca_test_result)
print('Test Results:')
calculate_accuracy(testLabels, predicted)

# KNN

In [16]:
#Training on non-pca
if train_knn:
    # train and evaluate a k-NN classifer on the raw pixel intensities
    start_time = datetime.now()
    print("[INFO] evaluating raw pixel accuracy at", start_time)
    model_raw = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
    print("[INFO] compiling fit")
    model_raw.fit(trainRI, trainLabels)
    print("[INFO] calculating accuracy")
    acc_raw = model_raw.score(testRI, testLabels)
    end_time = datetime.now()
    print("[INFO] raw pixel accuracy: {:.2f}%".format(acc_raw * 100))
    print("[INFO] scipt completing at", end_time, "script took", end_time - start_time)


[INFO] evaluating raw pixel accuracy at
[INFO] compiling fit
[INFO] calculating accuracy
[INFO] raw pixel accuracy: 22.96%
[INFO] scipt completing at 2019-11-17 15:48:55.251440 script took 0:01:59.372027


In [18]:
if train_knn:
    # train and evaluate a k-NN classifer on the histogram
    # representations
    print("[INFO] evaluating histogram accuracy...")
    model_histogram = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
    model_histogram.fit(trainFeat, trainLabels)
    acc_histogram = model_histogram.score(testFeat, testLabels)
    print("[INFO] histogram accuracy: {:.2f}%".format(acc_histogram * 100))

[INFO] evaluating histogram accuracy...
[INFO] histogram accuracy: 22.25%


# Random Forest

In [306]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [307]:
start_time = datetime.now()
rfc=RandomForestClassifier(n_estimators=50)
rfc.fit(pca_train_result, trainLabels)

print('Random Forest Train Accuracy')
y_pred=rfc.predict(pca_train_result)
calculate_error(actual=trainLabels, predicted=y_pred)

2830 0
Accuracy is 1.0


In [308]:
y_pred=rfc.predict(pca_test_result)
print('Random Forest Test Accuracy')
calculate_error(actual=testLabels, predicted=y_pred)

[INFO] scipt completing at 2019-11-22 23:17:52.991492 script took in total 0:00:09.633491
145 565
Accuracy is 0.20422535211267606
