In [1]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import numpy as np 
import pickle
from matplotlib import pyplot as plt
import keras
from keras.preprocessing.image import ImageDataGenerator
from keras.utils.np_utils import to_categorical
from keras import models, layers
from keras.preprocessing.image import ImageDataGenerator
import os

from google.colab import drive
drive.mount('/content/drive')

Using TensorFlow backend.


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 1. Data loading & preprocessing

### Set seed for reproducibility

In [0]:
# One seed for the whole notebook: NumPy's global RNG here, and passed explicitly to the
# train/validation split and to the augmentation generator in later cells.
SEED = 123
np.random.seed(seed=SEED)

### Load data

In [0]:
# Location of input data. This is a relative path: it only resolves under the Drive mount
# at /content/drive when the notebook's working directory is /content.
DIR = "drive/My Drive/Colab Notebooks/input/"

In [0]:
# Load the raw image arrays and the label table from DIR.
#
# encoding='latin1' only matters when unpickling object arrays, so these .npy files are
# presumably pickled object arrays. NumPy >= 1.16.3 refuses to unpickle unless
# allow_pickle=True is passed explicitly.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
X_train = np.load(os.path.join(DIR, "train_images.npy"),
                  encoding='latin1',
                  allow_pickle=True)
train_labels = np.genfromtxt(os.path.join(DIR, "train_labels.csv"),
                             names=True,
                             delimiter=',',
                             dtype=[('Id', 'i8'), ('Category', 'S15')])
X_test = np.load(os.path.join(DIR, "test_images.npy"),
                 encoding='latin1',
                 allow_pickle=True)

# Keep only element 1 of every record and stack the results into one array.
# NOTE(review): element 0 is presumably an image id and element 1 the pixel data -- confirm.
X_train = np.array([record[1] for record in X_train])
X_test = np.array([record[1] for record in X_test])

### One-hot encode image labels

In [0]:
# Step 1: map each category string to an integer class id.
label_encoder = preprocessing.LabelEncoder()
class_ids = label_encoder.fit_transform(train_labels[:]['Category'])

# Step 2: expand the class ids into the one-hot matrix Keras trains against.
n_classes = np.unique(class_ids).size
y_train = to_categorical(class_ids, n_classes)

### Normalize data

In [0]:
# Rescale every sample (row) to unit norm; sklearn's normalize() defaults to norm='l2', axis=1.
# This is a per-image rescaling only -- it does not convert anything to greyscale.
X_train = preprocessing.normalize(X_train)
X_test = preprocessing.normalize(X_test)

### Standardize data (see TODO)

In [0]:
# TODO: is this really needed? (All features are on the same scale.)
# Learn per-feature mean and variance from the training images, then apply the same
# statistics to the training and the test images.
# NOTE(review): X_train still contains the rows that are split off as validation data
# in the next section, so the scaler also sees the validation images -- confirm this is intended.
scaler = preprocessing.StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Split into training and validation sets

In [0]:
# Hold out a validation set (train_test_split's default test fraction), stratified by class.
#
# y_train is one-hot encoded at this point, so the stratification key is the integer class
# id of each row (argmax over the one-hot axis). A plain 1-D label vector is what `stratify`
# is designed for; dividing the one-hot matrix by its length added nothing.
# Note: the exact rows drawn for a given SEED may differ from a split keyed on the matrix.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    random_state=SEED,
    stratify=np.argmax(y_train, axis=1),
)

In [0]:
# Keras convolution layers take 4D input whose last dimension is the number of channels,
# so every image set is reshaped to (n_samples, 100, 100, 1).
def _as_image_batch(samples):
    """Reshape `samples` to (n_samples, 100, 100, 1), keeping the first dimension."""
    return samples.reshape(samples.shape[0], 100, 100, 1)


X_train, X_valid, X_test = (_as_image_batch(arr) for arr in (X_train, X_valid, X_test))

### Augment data through transformations

In [0]:
# Training batch size: used by the augmentation generator (datagen.flow), by the
# auto-encoder fit, and to derive the CNN's steps_per_epoch.
# TODO PLAY AROUND WITH THIS
BATCH_SIZE = 128

In [0]:
# Reuse a previously pickled augmentation generator if one exists; otherwise build it,
# fit it and cache it for the next run.
# NOTE: pickle.load can execute arbitrary code -- only keep trusted files at this path.
path_to_datagen = os.path.join(DIR, "datagen")

try:
    with open(path_to_datagen, "rb") as f:
        # The unpickled object must be bound to `datagen`: the next cell calls datagen.flow().
        datagen = pickle.load(f)

except FileNotFoundError:
    # Create datagen.
    # zca_whitening=True was tried and left out: EXTREMELY SLOW dimensionality reduction.
    datagen = ImageDataGenerator(
        rotation_range=30,     # random rotation of up to 30 degrees in either direction
        horizontal_flip=True,  # random left-right mirroring
    )

    # TODO: should featurewise_center be set to True? If yes, should the data still be
    # normalized before running this?
    # fit() only computes statistics used by featurewise_center /
    # featurewise_std_normalization / zca_whitening; with none of them enabled it has no
    # effect on the generated batches.
    datagen.fit(X_train, seed=SEED)

    # Save datagen to file.
    with open(path_to_datagen, "wb") as f:
        pickle.dump(datagen, f)

In [0]:
# Iterator over randomly augmented (images, labels) batches; consumed by the CNN training call.
flow_options = {"batch_size": BATCH_SIZE, "seed": SEED}
X_train_transformed = datagen.flow(X_train, y_train, **flow_options)

## 2. Model

### Convolutional auto-encoder for noise reduction

In [0]:
# Convolutional auto-encoder (CAE): load the cached model if present, otherwise build and
# train it on the training images.
path_to_cae = os.path.join(DIR, "cae.h5")
if os.path.isfile(path_to_cae):
    # load_model takes the file path (`cae` itself does not exist yet on this branch).
    cae = models.load_model(path_to_cae)

else:

    input_img = layers.Input(shape=(100, 100, 1))

    # --- Encoder: 100x100 -> 50x50 -> 25x25 -> 13x13 ('same' pooling rounds 25/2 up) ---
    encoded = layers.Conv2D(16,
                            (3, 3),
                            padding="same",
                            activation="relu")(input_img)

    encoded = layers.MaxPooling2D((2, 2), padding='same')(encoded)

    encoded = layers.Conv2D(8,
                            (3, 3),
                            padding="same",
                            activation="relu")(encoded)

    encoded = layers.MaxPooling2D((2, 2), padding='same')(encoded)

    encoded = layers.Conv2D(8,
                            (3, 3),
                            padding="same",
                            activation="relu")(encoded)

    encoded = layers.MaxPooling2D((2, 2), padding='same')(encoded)

    # --- Decoder: 13x13 -> 26x26 -> 52x52 -> 104x104 ---
    # Only the first decoder layer reads `encoded`; every later stage must consume the
    # previous decoder output, otherwise the earlier decoder layers are dropped from the graph.
    decoded = layers.Conv2D(8,
                            (3, 3),
                            activation='relu',
                            padding='same')(encoded)

    decoded = layers.UpSampling2D((2, 2))(decoded)

    decoded = layers.Conv2D(8,
                            (3, 3),
                            activation='relu',
                            padding='same')(decoded)

    decoded = layers.UpSampling2D((2, 2))(decoded)

    decoded = layers.Conv2D(16,
                            (3, 3),
                            activation='relu',
                            padding='same')(decoded)

    decoded = layers.UpSampling2D((2, 2))(decoded)

    # Three 2x upsamplings of 13x13 give 104x104; trim 2 pixels per side so the
    # reconstruction has the same 100x100 shape as the input it is compared against.
    decoded = layers.Cropping2D(cropping=((2, 2), (2, 2)))(decoded)

    decoded = layers.Conv2D(1,
                            (3, 3),
                            activation='sigmoid',
                            padding='same')(decoded)

    cae = models.Model(input_img, decoded)
    # NOTE(review): a sigmoid output with binary cross-entropy assumes targets in [0, 1],
    # but X_train was standardized earlier and therefore contains negative values --
    # confirm the loss/activation choice.
    cae.compile(optimizer='adadelta',
                loss='binary_crossentropy')

    cae.fit(X_train,
            X_train,  # X_train is intentionally passed twice: an auto-encoder's target is its own input
            epochs=50,
            batch_size=BATCH_SIZE,
            validation_data=(X_valid, X_valid),
            )

In [0]:
# Write the auto-encoder to the same path the previous cell tests with os.path.isfile.
cae.save(path_to_cae)

### CNN

In [0]:
# CNN classifier: train from scratch unless a saved model already exists at path_to_cnn.
path_to_cnn = os.path.join(DIR, "cnn.h5")

if not os.path.isfile(path_to_cnn):

    # TODO: filter counts, kernel sizes, input shape and the dense width below are all
    # candidates to play around with.
    cnn = models.Sequential([
        # Block 1: two 5x5 convolutions, 2x2 max-pooling, then dropout.
        layers.Conv2D(filters=32,
                      kernel_size=(5, 5),
                      activation="relu",
                      input_shape=(100, 100, 1),
                      padding="same"),
        layers.Conv2D(filters=64,
                      kernel_size=(5, 5),
                      activation="relu",
                      padding="same"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Dropout(0.25),  # randomly drop 25% of the features

        # Block 2: two 3x3 convolutions, 2x2 max-pooling, then dropout.
        layers.Conv2D(filters=32,
                      kernel_size=(3, 3),
                      activation="relu",
                      padding="same"),
        layers.Conv2D(filters=64,
                      kernel_size=(3, 3),
                      activation="relu",
                      padding="same"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Dropout(0.25),  # randomly drop 25% of the features

        # Classification head.
        layers.Flatten(),
        layers.Dense(256, activation="relu"),
        layers.Dropout(0.5),  # randomly drop 50% of the features
        layers.Dense(n_classes, activation="softmax"),
    ])

    cnn.compile(optimizer=keras.optimizers.Adadelta(),
                loss="categorical_crossentropy",
                metrics=["accuracy"])

    # Fit on the augmented batches; each epoch runs the number of full batches in X_train.
    # `losses` receives the return value of fit_generator.
    losses = cnn.fit_generator(X_train_transformed,
                               epochs=150,
                               steps_per_epoch=X_train.shape[0] // BATCH_SIZE,
                               verbose=2,
                               validation_data=(X_valid, y_valid))

else:
    cnn = models.load_model(path_to_cnn)



In [0]:
# Save to path_to_cnn (".../cnn.h5"), the exact file the CNN cell checks before training.
# Saving under a different name ("CNN") meant the cached model was never found and the
# network was retrained on every run.
cnn.save(path_to_cnn)