## Imports

In [None]:
"""ECEN 689 Challenge 7 - TAMU DGCI

https://www.kaggle.com/c/tamu-dgci

This kernel already has the data needed in it.
"""
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import random as rn
from skimage.transform import resize
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import cv2
import colorsys

# Seeding
# Fix the seeds of the RNGs this notebook relies on (NumPy, Python's
# `random` module imported as `rn`, and TensorFlow) so that runs are more
# repeatable.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes - confirm.
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(102)
rn.seed(123)
tf.set_random_seed(142)


# Keras is imported only after the seeds above have been set.
# NOTE(review): presumably intentional for reproducibility - confirm before
# moving these imports to the top of the cell.
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D, AveragePooling2D

print('Imports complete')

## Load Data

In [None]:
# Input data files are available in the "../input/" directory.
IN_DIR = os.path.join('..', 'input')
IMAGES_DIR = os.path.join(IN_DIR, 'archive')
# print(os.listdir(IN_DIR))

# By loading the first file, we can see that the raw images are (300, 400, 3):
# im = plt.imread(os.path.join(IMAGES_DIR, os.listdir(IMAGES_DIR)[0]))
# print(im.shape)
# Every image is resized to the shape below before being fed to the network.
IMG_ROWS = 224
IMG_COLS = 224
IMG_DEPTH = 3
IMG_SHAPE = (IMG_ROWS, IMG_COLS, IMG_DEPTH)

# Only the channels_last layout (rows, cols, channels) is implemented, so
# fail fast when the Keras backend is configured differently. Nothing after
# the raise can run, hence data_format is simply set unconditionally below.
if keras.backend.image_data_format() == 'channels_first':
    raise UserWarning('I did not bother to implement the channels_first shape.')

data_format = 'channels_last'

# Read the label table (train.csv) and the submission template (sample.csv);
# in both, the first CSV column becomes the index and is later used as the
# image id when building file names.
train_csv_path = os.path.join(IN_DIR, "train.csv")
test_csv_path = os.path.join(IN_DIR, "sample.csv")
train_df = pd.read_csv(train_csv_path, index_col=0)
test_df = pd.read_csv(test_csv_path, index_col=0)

# Report how many training targets are negative.
negative_mask = train_df['DGCI'].values < 0
print('Out of {} training samples, {} have a DGCI < 0.'.format(len(train_df),
                                                               np.count_nonzero(negative_mask)))

# Targets are used unscaled. (A MinMaxScaler to [0, 1] was tried earlier:
#   mms = MinMaxScaler(feature_range=(0, 1))
#   mms.fit(train_df['DGCI'].values.reshape(-1, 1))
#   y_train = mms.transform(train_df['DGCI'].values.reshape(-1, 1)).ravel()
# )
y_train = np.ravel(train_df['DGCI'].values)

# Directory listing of the image folder.
all_files = os.listdir(IMAGES_DIR)

# Pre-allocate one (rows, cols, depth) slot per image for both splits.
x_train = np.zeros(shape=(len(train_df),) + IMG_SHAPE)
x_test = np.zeros(shape=(len(test_df),) + IMG_SHAPE)
id_test = []

# Function for loading and reshaping an image.
def load_reshape(im_id):
    """Load image ``<im_id>.jpg`` from IMAGES_DIR and resize it to IMG_SHAPE.

    Args:
        im_id: Image identifier; ``str(im_id) + '.jpg'`` is the file name
            looked up inside IMAGES_DIR.

    Returns:
        Array of shape IMG_SHAPE: the pixel data divided by 255.0, in RGB
        channel order, resized with reflect-mode padding.

    Raises:
        FileNotFoundError: If OpenCV could not read the file.
    """
    im_path = os.path.join(IMAGES_DIR, str(im_id) + '.jpg')
    raw_im = cv2.imread(im_path)
    # cv2.imread signals failure by returning None instead of raising; catch
    # it here rather than letting `None / 255.0` fail with an opaque TypeError.
    if raw_im is None:
        raise FileNotFoundError('Could not read image: {}'.format(im_path))

    # OpenCV decodes colour images as BGR, whereas the plt.imshow calls in
    # this notebook interpret the last axis as RGB. Convert once here so the
    # plotted images show their true colours.
    rgb_im = cv2.cvtColor(raw_im, cv2.COLOR_BGR2RGB)

    return resize(rgb_im / 255.0, IMG_SHAPE, mode='reflect')

# Loop over training data. Images are stored in train_df row order so that
# x_train[i] corresponds to y_train[i] and sample_weights[i].
for x_idx, row in enumerate(train_df.itertuples()):
    x_train[x_idx, :, :, :] = load_reshape(row.Index)

# Create weights. <10% of samples have a DGCI < 0, so
# we'll double the weight of those samples.
sample_weights = np.ones(train_df.shape[0])
sample_weights[train_df['DGCI'].values < 0] = 2

# DGCI actually goes from 0 to 1
# https://pdfs.semanticscholar.org/1f0d/0add993944b5230ff3548c6cb7b2e9954535.pdf
# https://scholarworks.uark.edu/cgi/viewcontent.cgi?referer=https://www.google.com/&httpsredir=1&article=2455&context=etd
# AFTER LOOKING AT THE IMAGES, IT'S A BAD IDEA TO RUN THE CODE BELOW.
# print('IMPORTANT NOTE:')
# print('Negative DGCI values have been multiplied by -1. DGCI should be on [0,1] interval.')
# train_df[train_df['DGCI'] < 0] *= -1

# Loop over testing data. Images are stored in test_df row order so that
# x_test[i] corresponds to test_df.index[i].
for x_idx, row in enumerate(test_df.itertuples()):
    x_test[x_idx, :, :, :] = load_reshape(row.Index)

print('All images loaded.')

## Examine Training Data

In [None]:
# Quick sanity report: array shapes and the pixel value range of x_train.
for arr in (x_train, x_test):
    print(arr.shape)
print(x_train.max())
print(x_train.min())

# Make sure things are working: show four randomly drawn training images in
# a 2x2 grid, each titled with its image id and DGCI label.
for panel in range(1, 5):
    rand_ind = np.random.randint(low=0, high=x_train.shape[0])
    ax = plt.subplot(2, 2, panel)
    ax.imshow(x_train[rand_ind].reshape(IMG_SHAPE))
    image_id = train_df.index[rand_ind]
    dgci = train_df.iloc[rand_ind]['DGCI']
    ax.set_title('Image {}. DGCI: {:.2f}'.format(image_id, dgci))

plt.tight_layout()

# Summary statistics of the label column and of the y_train target vector.
print('DGCI stats:')
print(train_df['DGCI'].describe())

print('Scaled DGCI stats:')
print(pd.Series(y_train).describe())
#plt.imshow(np.reshape(x_train[5, :, :, :], IMG_SHAPE))
#plt.imshow(np.reshape(x_test[5, :, :, :], IMG_SHAPE))

## Look at Negative DGCI in the Training Data

In [None]:
# Positional indices of the training rows whose DGCI label is negative.
ind = np.arange(len(train_df))
is_negative = train_df['DGCI'].values < 0
neg_indices = ind[is_negative]

# Show four randomly chosen negative-DGCI images in a 2x2 grid.
for panel in range(1, 5):
    rand_ind = np.random.choice(neg_indices)
    ax = plt.subplot(2, 2, panel)
    ax.imshow(x_train[rand_ind].reshape(IMG_SHAPE))
    image_id = train_df.index[rand_ind]
    dgci = train_df.iloc[rand_ind]['DGCI']
    ax.set_title('Image {}. DGCI: {:.2f}'.format(image_id, dgci))

plt.tight_layout()

## Look at Testing Data

In [None]:
# Show four randomly chosen test images in a 2x2 grid, titled with their id.
for k in range(4):
    rand_ind = np.random.randint(low=0, high=x_test.shape[0])
    ax = plt.subplot(2, 2, k+1)
    ax.imshow(np.reshape(x_test[rand_ind, :, :, :], IMG_SHAPE))
    # rand_ind is a position in the *test* set, so the id must be looked up
    # in test_df; using train_df here labels the image with an unrelated id
    # (or raises IndexError when the test set is larger than the train set).
    ax.set_title('Image {}'.format(test_df.index[rand_ind]))

plt.tight_layout()

In [None]:
# # Scale test data.
# x_scaler = StandardScaler()
# x_norm_train = x_scaler.fit_transform(x_train)
# x_norm_test = x_scaler.transform(x_test)

## Setup CNN

In [None]:
# Based on my interpretation of: https://arxiv.org/pdf/1409.1556.pdf
# To start, we ran with configuration A
# TODO: try configuration C (adding 1x1 convolution)
WEIGHT_DECAY = 5e-4  # L2 penalty applied to every conv / hidden dense kernel.


def _conv3x3(filters, **extra):
    """Return a 3x3, stride-1, 'same'-padded ReLU Conv2D with L2 weight decay.

    Args:
        filters: Number of output feature maps.
        **extra: Additional Conv2D keyword arguments (e.g. ``input_shape``
            for the first layer of the model).
    """
    return Conv2D(filters, kernel_size=3, strides=1, activation='relu', padding='same',
                  kernel_regularizer=keras.regularizers.l2(WEIGHT_DECAY),
                  data_format=data_format, **extra)


def _pool2x2():
    """Return a 2x2 average-pooling layer with stride 2."""
    return AveragePooling2D(pool_size=2, strides=2)


model = Sequential()
# Convolutional feature extractor. Average pooling is used between the
# stages; there is no pooling after the final pair of 512-filter layers.
model.add(_conv3x3(64, input_shape=IMG_SHAPE))
model.add(_pool2x2())
model.add(_conv3x3(128))
model.add(_pool2x2())
model.add(_conv3x3(256))
model.add(_conv3x3(256))
model.add(_pool2x2())
model.add(_conv3x3(512))
model.add(_conv3x3(512))
model.add(_pool2x2())
model.add(_conv3x3(512))
model.add(_conv3x3(512))
model.add(Flatten())
# This is where we deviate from the paper - they used two Dense @ 4096 into a 1000-way softmax.
# Since we're doing more of a regression task, we'll use smaller layers, and of course end
# with a single node.
model.add(Dense(1024, activation='relu', kernel_regularizer=keras.regularizers.l2(WEIGHT_DECAY)))
model.add(Dropout(0.5))
model.add(Dense(1024, activation='relu', kernel_regularizer=keras.regularizers.l2(WEIGHT_DECAY)))
model.add(Dropout(0.5))
model.add(Dense(1))


# optimizer = tf.train.RMSPropOptimizer(0.001)
optimizer = keras.optimizers.SGD(0.01, momentum=0.9)
# optimizer = keras.optimizers.Adadelta()
# Both the plain and the sample-weighted MSE are reported during training.
model.compile(loss=keras.losses.mean_squared_error, optimizer=optimizer, metrics=['mse'], weighted_metrics=['mse'])

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
model.summary()

## Train CNN

In [None]:
# Use ImageDataGenerator to increase the effective size of our dataset:
# random flips, shifts of up to half the image, rotations and shears, with
# pixels that fall outside the frame filled by wrapping the image around.
# Was tracking some metrics here, but was using bad data...
dg = keras.preprocessing.image.ImageDataGenerator(horizontal_flip=True, vertical_flip=True, width_shift_range=0.5, height_shift_range=0.5, fill_mode='wrap',
                                                 rotation_range=180, shear_range=180, data_format=data_format)
# NOTE(review): per the Keras docs fit() is only needed for feature-wise
# normalisation / ZCA whitening, none of which is enabled above - this call
# can probably be dropped; confirm before removing.
dg.fit(x_train)

# Initial training
print('Training with rate at 0.01')
# early_stop is defined but deliberately not passed to fit_generator below
# (see the commented-out `callbacks` argument).
early_stop = keras.callbacks.EarlyStopping(monitor='weighted_mean_squared_error', patience=10)
batch_size = 32
# steps_per_epoch must be a whole number of batches; round up so the final
# partial batch is included and each epoch covers the full training set.
steps_per_epoch = int(np.ceil(x_train.shape[0] / batch_size))
model.fit_generator(dg.flow(x_train, y_train, batch_size=batch_size, sample_weight=sample_weights),
                    epochs=68, verbose=2, steps_per_epoch=steps_per_epoch) #, callbacks=[early_stop])

# # Don't really see improvement for the reduced learning rates.
# # Recompile with smaller learning rate.
# print('\nTraining with rate at 0.001')
# optimizer = keras.optimizers.SGD(0.001, momentum=0.9)
# model.compile(loss=keras.losses.mean_squared_error, optimizer=optimizer, metrics=['mse'])
# model.fit(x_train, y_train, epochs=200, verbose=2, batch_size=128, sample_weight=sample_weights, callbacks=[early_stop])

# # Recompile with smaller learning rate.
# print('\nTraining with rate at 0.0001')
# optimizer = keras.optimizers.SGD(0.0001, momentum=0.9)
# model.compile(loss=keras.losses.mean_squared_error, optimizer=optimizer, metrics=['mse'])
# model.fit(x_train, y_train, epochs=200, verbose=2, batch_size=128, sample_weight=sample_weights, callbacks=[early_stop])

# swap to average pooling:
# mean_squared_error: 0.0406 - weighted_mean_squared_error: 0.0682

## Make Predictions, Write to File

In [None]:
# Predict a DGCI value for every test image and store the flattened
# predictions in the submission frame.
# (With target scaling enabled this would instead be:
#  y_test = mms.inverse_transform(model.predict(x_test)) )
y_test = model.predict(x_test)
test_df['DGCI'] = np.ravel(y_test)

# Preview the submission, then write it out.
print(test_df.head())
print(test_df.describe())
test_df.to_csv('output.csv')

# Summarise the predictions on the training images as well.
yt = model.predict(x_train)
train_predictions = pd.Series(np.ravel(yt))
print(train_predictions.describe())

## Plot Predictions

In [None]:
# Show four randomly chosen test images in a 2x2 grid, each titled with its
# image id and the DGCI value stored in test_df.
for panel in range(1, 5):
    rand_ind = np.random.randint(low=0, high=x_test.shape[0])
    ax = plt.subplot(2, 2, panel)
    ax.imshow(x_test[rand_ind].reshape(IMG_SHAPE))
    image_id = test_df.index[rand_ind]
    predicted_dgci = test_df.iloc[rand_ind]['DGCI']
    ax.set_title('Image {}. DGCI: {:.2f}'.format(image_id, predicted_dgci))

plt.tight_layout()