In [1]:
# Used for loading PNG images
from PIL import Image
import numpy as np
import pandas as pd
import os

In [2]:
def load_data(name):
    """Load one dataset split: its label CSV plus its folder of PNG images.

    Reads ``<name>.csv`` and every ``*.png`` file in the directory ``<name>``
    whose file stem appears in the first CSV column. Each image is converted
    to 3-channel RGB, the images are stacked into one array, and every colour
    channel is standardised and then rescaled to the range [0, 255].

    Args:
        name: Path prefix of the split. ``name + '.csv'`` must be a CSV with
            an ``image_id`` column first and, optionally, the label second;
            ``name`` itself must be a directory containing the PNG files.

    Returns:
        A tuple ``(filenames, images, labels)``:
            filenames: list of image file stems, in ``os.listdir`` order.
            images: float array of shape ``(n, height, width, 3)``.
            labels: array of length ``n``; all zeros when the CSV has no
                label column.

    Raises:
        ValueError: if the directory holds no PNG listed in the CSV, or if
            the images do not all share one size.
    """
    df = pd.read_csv(name + '.csv')
    df_sorted = df.sort_values(by='image_id')

    # Map image id -> label. A CSV with only the id column (no label column)
    # gets the placeholder label 0 for every image.
    image_ids = df_sorted.iloc[:, 0]
    if df_sorted.shape[1] > 1:
        labelDict = dict(zip(image_ids, df_sorted.iloc[:, 1]))
    else:
        labelDict = dict.fromkeys(image_ids, 0)

    images = []
    labels = []
    filenames = []
    # Keep os.listdir order as-is: printPrediction re-lists the folder and
    # relies on seeing the files in the same order.
    for filename in os.listdir(name):
        if not filename.endswith('.png'):
            continue
        image_id = filename[:-4]
        if image_id not in labelDict:
            # Skip before decoding: unlabelled files are never used.
            continue
        # The context manager closes the file handle; convert('RGB') gives
        # every image three channels whatever its stored mode.
        with Image.open(os.path.join(name, filename)) as image:
            image_array = np.array(image.convert('RGB'))
        filenames.append(image_id)
        images.append(image_array)
        labels.append(labelDict[image_id])

    if not images:
        raise ValueError(
            "No labelled PNG images found in '{}' for '{}.csv'".format(name, name)
        )

    images = np.stack(images)
    labels = np.array(labels)

    # NOTE: all statistics below come from this split alone, so each split is
    # scaled independently of the others.
    channel_axes = (0, 1, 2)

    # Per-channel standardisation; a constant channel has std 0, so divide by
    # 1 instead to avoid producing NaN.
    means = np.mean(images, axis=channel_axes, keepdims=True)
    stds = np.std(images, axis=channel_axes, keepdims=True)
    stds = np.where(stds == 0, 1.0, stds)
    images = (images - means) / stds

    # Per-channel min-max rescale to [0, 255], with the same zero guard.
    mins = np.min(images, axis=channel_axes, keepdims=True)
    maxs = np.max(images, axis=channel_axes, keepdims=True)
    ranges = maxs - mins
    ranges = np.where(ranges == 0, 1.0, ranges)
    images = (images - mins) / ranges * 255.0

    return filenames, images, labels

In [3]:
# Load the three dataset splits (file stems, image arrays, labels).
test_filenames, test_images, test_labels = load_data('test')
train_filenames, train_images, train_labels = load_data('train')
validation_filenames, validation_images, validation_labels = load_data('validation')

# Flatten every image into a single feature row. The row count comes from the
# data and -1 lets NumPy infer the feature width, so the cell keeps working if
# the number of images in a split changes.
train_images = train_images.reshape((len(train_images), -1))
validation_images = validation_images.reshape((len(validation_images), -1))
test_images = test_images.reshape((len(test_images), -1))

print(np.shape(train_images))
print(np.shape(train_labels))
print(np.shape(test_images))
print(np.shape(test_labels))
print(np.shape(validation_images))
print(np.shape(validation_labels))

# Collects one [model name, score] row per model that is run.
Table = []

(10500, 19200)
(10500,)
(4500, 19200)
(4500,)
(3000, 19200)
(3000,)


In [4]:
def printPrediction(prediction, name):
    """Write test-set predictions to a submission CSV.

    The i-th entry of ``prediction`` is paired with the i-th ``*.png`` file
    returned by ``os.listdir('test')``. The rows of ``sample_submission.csv``
    are then filled in by looking up each ``image_id``, so the output keeps
    the sample file's row order.

    Args:
        prediction: Sequence of predicted labels, one per PNG file in the
            ``test`` folder, in ``os.listdir('test')`` order.
        name: Path of the CSV file to write.

    Raises:
        ValueError: if the number of predictions differs from the number of
            PNG files in ``test``.
        KeyError: if ``sample_submission.csv`` lists an ``image_id`` that has
            no matching PNG file.
    """
    folder_path = 'test'
    files = [
        filename[:-4]
        for filename in os.listdir(folder_path)
        if filename.endswith('.png')
    ]

    # A length mismatch means predictions and files are out of step; fail
    # loudly rather than writing shifted labels.
    if len(files) != len(prediction):
        raise ValueError(
            "Got {} predictions for {} PNG files in '{}'".format(
                len(prediction), len(files), folder_path
            )
        )
    predicted_by_id = dict(zip(files, prediction))

    df = pd.read_csv('sample_submission.csv')
    missing = ~df['image_id'].isin(files)
    if missing.any():
        raise KeyError(
            "No prediction for image_id(s): {}".format(
                list(df.loc[missing, 'image_id'].head(5))
            )
        )

    # Look labels up by id instead of by row position, so the result does not
    # depend on the frame's index or on its length matching the file count.
    df['label'] = df['image_id'].map(predicted_by_id)
    df.to_csv(name, index=False)

In [5]:
%%time
from sklearn.naive_bayes import MultinomialNB

def runNaiveBayes():
    """Fit a multinomial naive Bayes model and export its test predictions.

    Works on the notebook-level arrays: fits on ``train_images`` /
    ``train_labels``, scores on ``validation_images`` / ``validation_labels``,
    appends ``['Naive Bayes', score]`` (score as a string) to ``Table``, and
    writes the predictions for ``test_images`` to ``naive-bayes.csv`` via
    ``printPrediction``.
    """
    # instantiate & fit
    mnb = MultinomialNB().fit(train_images, train_labels)

    # The score is computed on the validation split, so label it as such.
    score = str(mnb.score(validation_images, validation_labels))
    print("score on validation: " + score)
    Table.append(['Naive Bayes', score])

    prediction = mnb.predict(test_images)
    printPrediction(prediction, 'naive-bayes.csv')

CPU times: user 1.05 s, sys: 89 ms, total: 1.14 s
Wall time: 280 ms


In [6]:
%%time

# import the library
from sklearn.neighbors import KNeighborsClassifier

def runKNeighborsClassifier():
    """Fit a k-nearest-neighbours classifier and export its test predictions.

    Works on the notebook-level arrays: fits on ``train_images`` /
    ``train_labels``, scores on ``validation_images`` / ``validation_labels``,
    appends ``['KNN', score]`` (score as a string) to ``Table``, and writes
    the predictions for ``test_images`` to ``k-neighbors.csv`` via
    ``printPrediction``.
    """
    # instantiate & fit; n_jobs=-1 uses all available CPU cores
    knn = KNeighborsClassifier(algorithm = 'auto', n_jobs=-1)
    knn.fit(train_images, train_labels)

    # The score is computed on the validation split, so label it as such.
    score = str(knn.score(validation_images, validation_labels))
    print("score on validation: " + score)
    Table.append(['KNN', score])

    prediction = knn.predict(test_images)
    printPrediction(prediction, 'k-neighbors.csv')

CPU times: user 44.5 ms, sys: 4.03 ms, total: 48.5 ms
Wall time: 47.9 ms


In [7]:
%%time

# import the library
from sklearn.tree import DecisionTreeClassifier

def runDecisionTreeClassifier():
    """Fit a decision tree classifier and export its test predictions.

    Works on the notebook-level arrays: fits on ``train_images`` /
    ``train_labels``, scores on ``validation_images`` / ``validation_labels``,
    appends ``['Decision Tree', score]`` (score as a string) to ``Table``, and
    writes the predictions for ``test_images`` to ``decision-tree.csv`` via
    ``printPrediction``.
    """
    # instantiate & fit
    clf = DecisionTreeClassifier(min_samples_split=10,max_depth=None)
    clf.fit(train_images, train_labels)

    # The score is computed on the validation split, so label it as such.
    score = str(clf.score(validation_images, validation_labels))
    print("score on validation: " + score)
    Table.append(['Decision Tree', score])

    prediction = clf.predict(test_images)
    printPrediction(prediction, "decision-tree.csv")

CPU times: user 5.16 ms, sys: 0 ns, total: 5.16 ms
Wall time: 4.54 ms


In [8]:
# Train the three models in turn. Each call prints the model's score on the
# validation split, appends a row to Table and writes a CSV of predictions
# for the test split.
runNaiveBayes()
runKNeighborsClassifier()
runDecisionTreeClassifier()

score on test: 0.418
score on test: 0.42333333333333334
score on test: 0.41533333333333333


In [9]:
# Show the collected [model name, score] rows, one per model run above.
print(Table)

[['Naive Bayes', '0.418'], ['KNN', '0.42333333333333334'], ['Decision Tree', '0.41533333333333333']]
