In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import tensorflow as tf
import numpy as np
import cv2
import os
import pandas as pd
from random import shuffle
from tqdm import tqdm
import time
import pickle

# TF learn imports
import tflearn
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.normalization import batch_normalization
from tflearn.layers.estimator import regression


# Define some constants
# Directory listed and read by create_train_data; images there are matched
# by "<id>.tif" file names.
TRAIN_DIR = "./data/train"
# NOTE(review): the doubled slash looks like a typo, and TEST_DIR is not
# referenced anywhere in the visible notebook -- confirm before relying on it.
TEST_DIR = "./data//test"
# NOTE(review): IMG_WIDTH is not referenced in the visible code; the image size
# actually fed to the network is CROP_SIZE (48), not 50 -- likely stale.
IMG_WIDTH = 50 # all images will be made square. Downsampled from 96
LR = 1e-3 # learning rate passed to the Adam optimizer in the model cell

# Side length assumed for the raw images; normalize_img uses it to compute the
# offsets of the central crop.
ORIGINAL_SIZE = 96
CROP_SIZE = 48 # final size after cropping

# Image metadata; the code below reads its "id" and "label" columns.
dm = pd.read_csv("./data/train_labels.csv")

# ========= Some helper functions =========
def normalize_img(img, crop_size=None):
    """Center-crops an image and scales its values by 1/255.

    PARAMS
    ------
    img: array of shape: (height, width, channels) with at least 3 channels,
        e.g. the result of `cv2.imread`. Only the first 3 channels are kept.
        Values in 0..255 end up between 0 and 1.
    crop_size: side length of the square central crop. Defaults to the
        module-level CROP_SIZE.

    RETURNS
    -------
    img_n: float64 array of shape: (crop_size, crop_size, 3), the central crop
        of `img` divided by 255

    RAISES
    ------
    ValueError: if `img` is None (`cv2.imread` returns None when a file cannot
        be read), is not a 3-D array with at least 3 channels, or is smaller
        than the requested crop.
    """
    if crop_size is None:
        crop_size = CROP_SIZE
    if img is None:
        raise ValueError("img is None -- the image file could not be read")

    img = np.asarray(img)
    if img.ndim != 3 or img.shape[2] < 3:
        raise ValueError(
            "expected an array of shape (height, width, >=3), got %s" % (img.shape,)
        )

    height, width = img.shape[:2]
    if height < crop_size or width < crop_size:
        raise ValueError(
            "image of size %dx%d is smaller than the %dpx crop"
            % (height, width, crop_size)
        )

    # Derive the offsets from the actual image size so the crop stays centred
    # for any input; for 96x96 images this gives the same 24..72 window as the
    # previous ORIGINAL_SIZE-based computation.
    top = (height - crop_size) // 2
    left = (width - crop_size) // 2
    crop = img[top:top + crop_size, left:left + crop_size, :3]

    # astype() makes a float64 copy, so the caller's array is never modified.
    return crop.astype(np.float64) / 255


def create_train_data(d_meta, class_size=25000, save_fn="", random_state=None):
    """Builds a class-balanced, fold-annotated training DataFrame.

    Randomly samples `class_size` rows with label 0 and `class_size` rows with
    label 1 from `d_meta` (so the dataset has 2*class_size rows), loads and
    normalizes the matching "<id>.tif" images found in TRAIN_DIR, and assigns
    every row a k-fold index. 5 folds is hardcoded.

    PARAMS
    ------
    d_meta: DataFrame with an "id" column (file name without extension) and a
        "label" column (0 or 1)
    class_size: number of images to sample per label
    save_fn: currently unused; kept so existing calls keep working
    random_state: optional seed forwarded to `DataFrame.sample` to make the
        selection reproducible. The default (None) samples differently on
        every call, as before.

    RETURNS
    -------
    df: DataFrame with columns "filename", "data" (the array returned by
        normalize_img), "label" and "k-index" (0..4), sorted by label
    """
    n_folds = 5

    # Randomly select rows from the metadata set, class_size per label
    sampled = pd.concat([
        d_meta.loc[d_meta["label"] == 0].sample(n=class_size, random_state=random_state),
        d_meta.loc[d_meta["label"] == 1].sample(n=class_size, random_state=random_state),
    ])

    # file name -> label. A dict gives O(1) membership tests inside the loop
    # over the whole directory and takes the label from d_meta itself rather
    # than from a module-level frame.
    label_by_fn = {
        img_id + ".tif": label
        for img_id, label in zip(sampled["id"], sampled["label"])
    }

    training_data = []
    for fn in tqdm(os.listdir(TRAIN_DIR)):
        if fn not in label_by_fn:
            continue
        img = normalize_img(cv2.imread(os.path.join(TRAIN_DIR, fn)))
        training_data.append([fn, img, label_by_fn[fn]])

    if len(training_data) != len(label_by_fn):
        import warnings
        warnings.warn(
            "only %d of the %d selected images were found in %s"
            % (len(training_data), len(label_by_fn), TRAIN_DIR)
        )

    df = pd.DataFrame(data=training_data, columns=["filename", "data", "label"])
    # Stable sort keeps the directory order within each label.
    df = df.sort_values(by="label", kind="mergesort")
    # Cycling 0..4 over the label-sorted rows spreads both classes evenly
    # across the folds and works for any number of rows.
    df["k-index"] = np.arange(len(df)) % n_folds

    return df


Instructions for updating:
Colocations handled automatically by placer.


## Create and Save Dataset

Randomly selects a subset of the original ~220k images, and preps them (normalization, cropping, splitting into folds) for input into the CNN. Saves the result, so this need only be run once. 

* For n=50k, the resulting df will need to be saved into multiple `pickle` files, because a single `pickle` file would be too big.
* n=50k is actually overkill, because performance is only a little lower using n=20k. Nevertheless, let's pretend that we need n=50k, just so that this becomes an interesting engineering problem of dealing with data batching. 

In [None]:
# Create dataset
# Random sample of n condition0 + n condition1
# (class_size images per label, i.e. 2 * 25000 = 50000 rows in total).
# This walks the whole training directory and reads every selected image.
df = create_train_data(dm, class_size=25000)

# check: number of rows per (label, fold) pair, displayed as the cell output
df.groupby(["label", "k-index"]).size()

In [None]:
# Write one pickle per fold: the full 50k-image frame is too large to be
# stored as a single file.
t0 = time.time()
for k in range(5):
    save_fn = "data-50k-48px-fold" + str(k) + ".pkl"
    # rows that belong to fold k
    d_t = df[df["k-index"] == k]
    d_t.to_pickle(save_fn)

print("Done in %.2fs" % (time.time() - t0))

## Load dataset

Load the dataset created previously. If saved as multiple `.pkl` files, merge them into 1 dataframe.

In [2]:
# load and merge the 5 separate data sets
# NOTE: unpickling can execute arbitrary code -- only load .pkl files that
# were produced by this notebook.
df_ls = []
for k in range(5):
    fn = "data-50k-48px-fold"+str(k)+".pkl"
    df_ls.append(pd.read_pickle(fn))
df = pd.concat(df_ls, axis=0)

# Sanity check: all five folds are present, then show the number of rows per
# (label, fold) pair as the cell output.
assert set(df["k-index"]) == set(range(5)), "expected folds 0..4 in the loaded data"
df.groupby(["label", "k-index"]).size()

In [3]:
# prep data for input into CNN
def one_hot_labels(labels):
    """One-hot encodes binary labels: 0 -> [1, 0], 1 -> [0, 1].

    Returns a list with exactly one row per input label. Raises ValueError on
    any other label value; silently skipping it would leave the targets
    shorter than, and misaligned with, the image array.
    """
    encoding = {0: [1, 0], 1: [0, 1]}
    encoded = []
    for lab in labels:
        if lab not in encoding:
            raise ValueError("unexpected label %r (expected 0 or 1)" % (lab,))
        encoded.append(list(encoding[lab]))
    return encoded


def features_and_targets(frame):
    """Returns (stacked image array, one-hot target list) for the rows of `frame`."""
    return np.array(list(frame["data"])), one_hot_labels(frame["label"])


# Folds 1-4 are used for training; fold 0 is held out and passed to
# model.fit as the validation set.
d_train = df.loc[df["k-index"].isin([1, 2, 3, 4])]
d_test = df.loc[df["k-index"]==0]

X, Y = features_and_targets(d_train)
x_test, y_test = features_and_targets(d_test)

In [4]:
# Define model and fit

tf.reset_default_graph()

# Take the input shape from the prepared data (48x48x3 for the saved dataset)
# instead of hard-coding it, so the network always matches the images.
convnet = input_data(shape=[None] + list(X.shape[1:]), name='input')

# Twelve conv -> max-pool blocks, alternating 32 and 64 filters, each with a
# 2x2 kernel and a 2x2 pooling window.
# NOTE(review): assuming tflearn's max_pool_2d defaults (stride equal to the
# kernel size, 'same' padding), a 48px input shrinks 48 -> 24 -> 12 -> 6 -> 3
# -> 2 -> 1, so blocks 7-12 would operate on 1x1 feature maps -- confirm, and
# consider using fewer blocks.
CONV_FILTERS = [32, 64] * 6
for n_filters in CONV_FILTERS:
    convnet = conv_2d(convnet, n_filters, 2, activation='relu')
    convnet = max_pool_2d(convnet, 2)

convnet = fully_connected(convnet, 512, activation='relu')
convnet = batch_normalization(convnet)

# Two-way softmax matching the one-hot targets ([1, 0] / [0, 1]).
convnet = fully_connected(convnet, 2, activation='softmax')
convnet = regression(convnet, 
                     optimizer='adam', 
                     learning_rate=LR, 
                     loss='categorical_crossentropy', 
                     name='targets')

# TensorBoard logs are written to the ./summaries directory
model = tflearn.DNN(convnet, tensorboard_dir="summaries")

t0 = time.time()
# Note: run_id is for tensorboard later
# NOTE(review): the logged run ends at acc ~0.92 but val_acc ~0.77 with a much
# higher val_loss, which suggests overfitting; `dropout` is imported but unused.
model.fit({'input': X}, {'targets': Y}, 
          n_epoch=50, 
          validation_set=({'input': x_test}, {'targets': y_test}), 
          snapshot_step=500, 
          show_metric=True, 
          run_id="cnn-L12-run4")

print("Done in %.2fs" % (time.time() - t0))

Training Step: 31249  | total loss: 0.18484 | time: 101.815s
| Adam | epoch: 050 | loss: 0.18484 - acc: 0.9180 -- iter: 39936/40000
Training Step: 31250  | total loss: 0.17571 | time: 107.471s
| Adam | epoch: 050 | loss: 0.17571 - acc: 0.9231 | val_loss: 0.76522 - val_acc: 0.7666 -- iter: 40000/40000
--
Done in 5878.27s


97.96666666666667