diff --git a/classification/.gitignore b/classification/.gitignore
new file mode 100644
index 0000000..0d20b64
--- /dev/null
+++ b/classification/.gitignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/classification/README.md b/classification/README.md
new file mode 100644
index 0000000..8d3861c
--- /dev/null
+++ b/classification/README.md
@@ -0,0 +1,12 @@
+## Classification of images into entry/other
+
+### Proposed Technique
+* Convert all images into black & white (grayscale).
+* Downsize all images to (150, 250)
+* Define a simple CNN classifier and train it on the given data
+* Batch-normalization is used to handle the variance in the given data, while automatic class-weights are used to balance the error function (as the class distribution is biased)
+* To account for the low amount of data given, a small learning rate is used (to avoid overfitting)
+
+### Running it
+* Run `python trainClassifier.py <imageDir> <labelsCsv>` from the current directory to train an end-to-end model.
+* For example, run `python trainClassifier.py images/freecen/ data/gold/combined_classifications_20180227.csv`
diff --git a/classification/model.py b/classification/model.py
new file mode 100644
index 0000000..fe1c3a2
--- /dev/null
+++ b/classification/model.py
@@ -0,0 +1,40 @@
+import keras
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Flatten, Activation
+from keras.layers import Conv2D, MaxPooling2D, BatchNormalization
+
+
+def getSimpleCNN(input_shape, num_classes):
+    """Build and compile a small CNN image classifier.
+
+    Args:
+        input_shape: shape of one input image, e.g. (150, 250, 1).
+        num_classes: number of output classes (softmax units).
+
+    Returns:
+        A compiled keras Sequential model.
+    """
+    model = Sequential()
+    # Conv -> BatchNorm -> ReLU: batch-normalization before the activation
+    # to handle the variance in the (small) dataset.
+    model.add(Conv2D(16, kernel_size=(3, 3), input_shape=input_shape))
+    model.add(BatchNormalization())
+    model.add(Activation('relu'))
+    model.add(Conv2D(32, (3, 3)))
+    model.add(BatchNormalization())
+    model.add(Activation('relu'))
+    model.add(MaxPooling2D(pool_size=(2, 2)))
+    model.add(Dropout(0.25))
+    model.add(Flatten())
+    model.add(Dense(64))
+    model.add(BatchNormalization())
+    model.add(Activation('relu'))
+    model.add(Dropout(0.5))
+    model.add(Dense(num_classes, activation='softmax'))
+
+    # Small learning rate to reduce overfitting on the limited data.
+    model.compile(loss=keras.losses.categorical_crossentropy,
+                  optimizer=keras.optimizers.Adadelta(lr=0.1),
+                  metrics=['accuracy'])
+
+    return model
diff --git a/classification/readData.py b/classification/readData.py
new file mode 100644
index 0000000..73f21ed
--- /dev/null
+++ b/classification/readData.py
@@ -0,0 +1,47 @@
+import os
+import csv
+
+import numpy as np
+from PIL import Image
+from tqdm import tqdm
+
+# Target (height, width) for the downsized grayscale images.
+IMG_SIZE = (150, 250)
+
+
+def getData(imageDirPrefix, filePath):
+    """Read the label CSV and load the referenced images.
+
+    Each CSV row is (relative image path, class name). Images are
+    converted to grayscale and resized to IMG_SIZE.
+
+    Returns:
+        X: array of shape (n, 150, 250, 1) of grayscale pixel values
+        Y: int array of class indices
+        mappingDict: dict mapping class name -> class index
+    """
+    X = []
+    Y = []
+    with open(filePath, 'r') as f:
+        reader = csv.reader(f)
+        for line in tqdm(reader):
+            imgPath = line[0]
+            imgClass = line[1]
+            # Read image as grayscale; PIL's resize takes (width, height),
+            # so swap the (height, width) IMG_SIZE pair.
+            img = Image.open(os.path.join(imageDirPrefix, imgPath)).convert('L')
+            img = img.resize((IMG_SIZE[1], IMG_SIZE[0]))
+            X.append(np.asarray(img))
+            Y.append(imgClass)
+    X = np.array(X)
+    # Add a trailing channel axis for the CNN input.
+    X = X.reshape(X.shape + (1,))
+    # Map class names to indices in sorted order for determinism.
+    mappingDict = dict([(y, x) for x, y in enumerate(sorted(set(Y)))])
+    Y = np.array([mappingDict[x] for x in Y])
+    return X, Y, mappingDict
+
+
+if __name__ == "__main__":
+    import sys
+    X, Y, mapping = getData(sys.argv[1], sys.argv[2])
+    print(X.shape, Y.shape)
diff --git a/classification/trainClassifier.py b/classification/trainClassifier.py
new file mode 100644
index 0000000..2e0c31b
--- /dev/null
+++ b/classification/trainClassifier.py
@@ -0,0 +1,27 @@
+import keras
+import numpy as np
+
+import readData
+import model
+
+
+if __name__ == "__main__":
+    import sys
+    # Load data: X images, Y integer labels, mapping of name -> index.
+    X, Y, mapping = readData.getData(sys.argv[1], sys.argv[2])
+    num_classes = len(mapping)
+    input_shape = X.shape[1:]
+    # Load a simple CNN for the classification task.
+    cnn = model.getSimpleCNN(input_shape, num_classes)
+    # Balanced class weights, inversely proportional to class frequency
+    # (Keras 2 requires a dict; class_weight='auto' is not accepted).
+    counts = np.bincount(Y)
+    class_weight = {i: float(len(Y)) / (num_classes * c)
+                    for i, c in enumerate(counts)}
+    Y = keras.utils.to_categorical(Y, num_classes)
+    batch_size = 8
+    epochs = 20
+    # Train our model on the available data.
+    cnn.fit(X, Y,
+            batch_size=batch_size, epochs=epochs,
+            validation_split=0.2, class_weight=class_weight)