In [1]:
import cv2
import os
import json
import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.linear_model import SGDClassifier

In [2]:
def read_file(file_name):
    """Parse a comma-separated id/label listing into a dict of columns.

    The first line must be a header, either ``id,label`` or just ``id``.
    Every following non-blank line describes one sample.

    Parameters
    ----------
    file_name : str or path-like
        Path of the text file to read.

    Returns
    -------
    dict
        ``{"id": [...], "label": [...]}`` with all values kept as strings.
        ``"label"`` is an empty list when the file only has an ``id`` column.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If the file is empty, the header is not recognised, or a row of an
        ``id,label`` file has no label field.
    """
    with open(file_name, "r") as f:
        # Blank lines (e.g. trailing newlines at the end of the file) carry no
        # sample, so drop them instead of turning them into empty ids.
        lines = [line.strip() for line in f if line.strip()]

    if not lines:
        raise ValueError("File doesn't have the required format")

    header = lines[0].split(',')
    data_set = {
        "id": [],
        "label": []
    }

    if header == ["id", "label"]:
        for row in lines[1:]:
            fields = row.split(",")
            if len(fields) < 2:
                raise ValueError("Row {!r} has no label field".format(row))
            data_set["id"].append(fields[0])
            data_set["label"].append(fields[1])
    elif header == ["id"]:
        data_set["id"] = lines[1:]
    else:
        raise ValueError("File doesn't have the required format")

    # Errors are raised rather than returned: a returned exception object would
    # only fail later, when the caller indexes it like a dict.
    return data_set

In [3]:
def read_images(folder_path, filenames):
    """Load the listed image files from ``folder_path`` into one array.

    Each entry of ``filenames`` is joined onto ``folder_path`` and decoded with
    ``plt.imread``. The decoded images are collected in input order and passed
    to ``np.array``. The elapsed wall-clock time is printed before returning.
    """
    started_at = time.time()
    loaded = [plt.imread(os.path.join(folder_path, name)) for name in filenames]
    elapsed = time.time() - started_at
    print("Time for reading images in {}:".format(folder_path), elapsed)
    return np.array(loaded)

In [4]:
# Parse the split listings (same order as before: test, train, validation).
test_data = read_file(file_name="test.txt")
train_data = read_file(file_name="train.txt")
validation_data = read_file(file_name="validation.txt")

# Decode the images for each split; the train and validation files are both
# read from the "train+validation" folder.
train_images = read_images(folder_path="train+validation", filenames=train_data["id"])
test_images = read_images(folder_path="test", filenames=test_data["id"])
validation_images = read_images(folder_path="train+validation", filenames=validation_data["id"])

Time for reading images in train+validation: 434.09565472602844
Time for reading images in test: 115.03572487831116
Time for reading images in train+validation: 55.07323956489563


In [5]:
def normalize_images(images, axis=(1,2)):
    """Standardise images to zero mean and unit standard deviation.

    The mean and standard deviation are computed over ``axis`` with
    ``keepdims=True`` and broadcast back over ``images``.

    Parameters
    ----------
    images : array-like
        Image stack to standardise.
    axis : int or tuple of int, optional
        Axes the statistics are reduced over. Defaults to ``(1, 2)``.
        NOTE(review): this presumably means one mean/std per image in an
        (N, H, W) stack -- confirm against the caller's array layout.

    Returns
    -------
    numpy.ndarray
        ``(images - mean) / std``. Where the standard deviation is zero
        (constant input along ``axis``) the result is 0 instead of NaN.
    """
    mean = np.mean(images, axis=axis, keepdims=True)
    centered = images - mean
    standard = np.sqrt((centered ** 2).mean(axis=axis, keepdims=True))
    # A constant image has zero spread; dividing by it would produce NaN/inf.
    # Its centred values are already all zero, so dividing by 1 keeps them 0.
    safe_standard = np.where(standard == 0, 1, standard)
    return centered / safe_standard

def normalize_images_old(images):
    """Min-max rescale each image to the range [0, 1] and flatten it.

    Every image is passed through ``cv2.normalize`` with ``NORM_MINMAX``
    (alpha=0, beta=1, 32-bit float output) and then flattened to 1-D.
    The flattened images are returned as a single array, in input order.
    """
    flattened_rows = []
    for image in images:
        rescaled = cv2.normalize(image, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)
        flattened_rows.append(rescaled.flatten())
    return np.array(flattened_rows)


def normalize_images_new(images):
    """Scale pixel values by dividing ``images`` by 255.

    NOTE(review): presumably the inputs are 8-bit intensities in 0..255, which
    would map to 0..1 -- confirm against the image loader.
    """
    divisor = 255
    scaled = images / divisor
    return scaled

    
def normalize_flatten_images(images):
    """Normalise and flatten ``images`` by delegating to ``normalize_images_old``."""
    flattened = normalize_images_old(images)
    return flattened


# Turn every split into a 2-D feature matrix (one flattened, normalised image
# per row) using the same preprocessing for train, validation and test.
(
    train_images_normalized,
    validation_images_normalized,
    test_images_normalized,
) = (
    normalize_flatten_images(images=train_images),
    normalize_flatten_images(images=validation_images),
    normalize_flatten_images(images=test_images),
)

print("done")

done


In [None]:
def classifier(X_train, y_train, estimator=None):
    """Fit a classifier on the training data and print how long it took.

    Parameters
    ----------
    X_train : array-like
        Training features, passed unchanged to ``estimator.fit``.
    y_train : array-like
        Training labels, passed unchanged to ``estimator.fit``.
    estimator : object with a ``fit(X, y)`` method, optional
        Model to train. When omitted, a ``StandardScaler`` followed by an
        RBF-kernel ``svm.SVC`` is built.

    Returns
    -------
    object
        The fitted estimator.
    """
    start = time.time()
    if estimator is None:
        # Every candidate model used to be commented out here, so ``clf.fit``
        # raised NameError on a fresh kernel. This restores the configuration
        # the author annotated with the highest score (0.55).
        estimator = make_pipeline(StandardScaler(), svm.SVC(kernel='rbf'))
    # Other configurations the author tried, with the score noted next to each:
    #   make_pipeline(Normalizer(), svm.SVC(kernel='rbf'))                                  0.53
    #   make_pipeline(StandardScaler(), SGDClassifier(random_state=42, max_iter=1000,
    #                                                 tol=1e-3))                            0.44
    #   make_pipeline(StandardScaler(with_mean=False),
    #                 svm.LinearSVC(random_state=0, tol=1e-5, max_iter=10000))              0.47
    #   svm.SVC(kernel='rbf')                                                               0.54

    estimator.fit(X_train, y_train)
    print("Classification time: {}".format(time.time() - start))
    return estimator

# Feature matrices for each split and the label lists for the labelled ones.
X_train, X_validate, X_test = (
    train_images_normalized,
    validation_images_normalized,
    test_images_normalized,
)
y_train, y_validate = train_data["label"], validation_data["label"]

# Fit on the training split only.
clf_train = classifier(X_train, y_train)

# Report per-class metrics on the validation split.
predicted_validate = clf_train.predict(X_validate)
print(metrics.classification_report(y_true=y_validate, y_pred=predicted_validate))

# Predict labels for the unlabelled test split.
predicted_test = clf_train.predict(X_test)
print(predicted_test)

Classification time: 46.66421461105347
              precision    recall  f1-score   support

           0       0.78      0.60      0.68       216
           1       0.42      0.36      0.39       201
           2       0.63      0.80      0.70       142
           3       0.47      0.60      0.53       150
           4       0.64      0.71      0.67       143
           5       0.39      0.39      0.39       145
           6       0.50      0.45      0.47       176

    accuracy                           0.55      1173
   macro avg       0.55      0.56      0.55      1173
weighted avg       0.55      0.55      0.55      1173



In [7]:
# Store the test predictions next to the test ids, then show the first 20
# labels and the total number of labels.
test_data["label"] = [label for label in predicted_test]
print(test_data["label"][:20])
print(len(test_data["label"]))

['0', '6', '2', '3', '1', '5', '0', '1', '5', '5', '5', '4', '2', '1', '4', '2', '0', '5', '4', '6']
2819


In [8]:
# The labels are already stored on ``test_data`` by the preceding cell, so this
# cell no longer repeats that assignment. It checks instead that there is
# exactly one prediction per test id, because pairing ids and labels with
# ``zip`` would silently drop rows if the two lists differed in length.
assert len(test_data["label"]) == len(test_data["id"]), (
    "expected one prediction per test id, got {} labels for {} ids".format(
        len(test_data["label"]), len(test_data["id"])
    )
)
print(test_data["label"][:20])
print(len(test_data["label"]))

['0', '6', '2', '3', '1', '5', '0', '1', '5', '5', '5', '4', '2', '1', '4', '2', '0', '5', '4', '6']
2819


In [9]:
# Write the submission file: a header line followed by one "id,label" row per
# test image. Rows are newline-separated with no newline after the last one.
with open('test_submission.txt', 'w') as f:
    f.write('id,label\n')
    f.write('\n'.join(
        "{},{}".format(id_image, label)
        for id_image, label in zip(test_data["id"], test_data["label"])
    ))

In [10]:
# Number of training images loaded (length of the first axis of the array).
print(train_images.shape[0])

8000
