In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import tensorflow as tf
import numpy as np
import cv2
import os
import pandas as pd
from random import shuffle
from tqdm import tqdm
import time

# TF learn imports
import tflearn
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.estimator import regression


# Define some constants
# Directory listed by create_train_data; the cached train_data.npy that a
# later cell loads is also read from here.
TRAIN_DIR = "./data/train"
# NOTE(review): not referenced by any visible cell. The doubled slash looks
# like a typo for "./data/test" -- presumably harmless, confirm before use.
TEST_DIR = "./data//test"
IMG_WIDTH = 64 # all images will be made square
LR = 1e-3 #learning rate

# Disabled: model-name template, not used by any visible cell.
#MODEL_NAME = "cats-v-dogs-{}.model".format("conv6")
#print("MODEL_NAME = %s" % MODEL_NAME)

# Label table, read once here. create_train_data matches each image file
# against its "id" column and takes the value of its "label" column.
dm = pd.read_csv("./data/train_labels.csv")
# Number of labelled records (220025 in the recorded output).
print(len(dm))

# ========= Some helper functions =========
def normalize_img(img):
    """Scale 0..255 pixel intensities into the range [0, 1].

    Works for any number of channels (grayscale, BGR, BGRA, ...) because the
    division is applied to the whole array at once rather than to three
    hard-coded channel slices.

    PARAMS
    ------
    img: array-like of shape (rows, cols) or (rows, cols, channels) holding
        intensities in 0..255, e.g. the result of ``cv2.imread``

    RETURNS
    -------
    img_n: float64 array with the same shape as ``img``, each value divided
        by 255

    RAISES
    ------
    ValueError: if ``img`` is None, which is how ``cv2.imread`` reports a
        file it could not decode
    """
    if img is None:
        raise ValueError("normalize_img got None; the image could not be read")

    # True division promotes integer input to float; the final cast keeps the
    # float64 result type callers have always received.
    scaled = np.asarray(img) / 255
    return scaled.astype(np.float64, copy=False)


def create_train_data(save_fn=""):
    """Load every image in TRAIN_DIR into a labelled DataFrame.

    Each readable file is normalized with ``normalize_img``, resized to
    IMG_WIDTH x IMG_WIDTH and paired with the label whose ``id`` in the
    module-level ``dm`` table equals the file name up to its first dot.

    NOTE(review): images are kept as float64, i.e. 64*64*3*8 bytes (about
    96 KiB) each assuming 3-channel input; for the ~220k files of the
    recorded run that is roughly 20 GB in memory -- confirm this is
    acceptable or down-cast before storing.

    PARAMS
    ------
    save_fn: path to pickle the resulting DataFrame to; an empty string
        (the default) skips saving

    RETURNS
    -------
    df: DataFrame with columns "filename", "data" (the image array) and
        "label", one row per readable image

    RAISES
    ------
    KeyError: if a readable image has no matching id in ``dm``
    """
    # Build the id -> label mapping once instead of scanning the whole table
    # for every file. Keeping the first duplicate matches the previous
    # ``.values[0]`` lookup.
    label_by_id = dm.drop_duplicates(subset="id").set_index("id")["label"].to_dict()

    training_data = []
    skipped = []
    for fn in tqdm(os.listdir(TRAIN_DIR)):
        raw = cv2.imread(os.path.join(TRAIN_DIR, fn))
        if raw is None:
            # Not a decodable image (for instance the cached .npy file that
            # lives in the same directory); skip it rather than abort.
            skipped.append(fn)
            continue

        img_id = fn.split(".")[0]
        if img_id not in label_by_id:
            raise KeyError("no label found for image id %r" % img_id)

        img = cv2.resize(normalize_img(raw), (IMG_WIDTH, IMG_WIDTH))
        training_data.append([fn, img, label_by_id[img_id]])

    if skipped:
        print("Skipped %d unreadable file(s) in %s" % (len(skipped), TRAIN_DIR))

    df = pd.DataFrame(data=training_data, columns=["filename", "data", "label"])
    if save_fn:
        df.to_pickle(save_fn)
    return df


Instructions for updating:
Colocations handled automatically by placer.
220025


In [None]:
# Pre-process and save training data
# The full pass took about 1h27m in the recorded run, so reuse the pickle
# from an earlier run when it is already on disk.
TRAIN_CACHE = "all-data-df.pkl"
if os.path.exists(TRAIN_CACHE):
    # Unpickling can execute arbitrary code: only load a cache file that this
    # notebook wrote itself.
    train_data = pd.read_pickle(TRAIN_CACHE)
else:
    train_data = create_train_data(TRAIN_CACHE)


100%|██████████| 220025/220025 [1:26:49<00:00, 42.23it/s]


In [None]:
# `training_data` is a local of create_train_data and does not exist at
# notebook level; the value that function produced is bound to `train_data`.
df = pd.DataFrame(data=train_data, columns=["filename", "data", "label"])

In [None]:
# Rows built by create_train_data are [filename, image, label] triples, so
# three column names are required (two names raised a shape error).
df = pd.DataFrame(data=train_data, columns=["filename", "data", "label"])

# Each label is a single cell taken from train_labels.csv, so comparing the
# column against a two-element one-hot list ([1, 0] / [0, 1]) cannot match
# row by row.
# NOTE(review): assumes the CSV encodes the two classes as 0 and 1 -- confirm.
d0 = df.loc[df["label"] == 0]
d1 = df.loc[df["label"] == 1]

# Hold out the first 300 class-0 rows.
d0_test = d0.head(300)

# NOTE(review): the original loop here was left unfinished
# (`for nm in list(df[""])`, no body, empty column key) and did not parse.
# Presumed intent: collect the file names that are NOT in the held-out set.
# Verify before relying on this list.
held_out_names = set(d0_test["filename"])
train_names_ls = [nm for nm in df["filename"] if nm not in held_out_names]

In [None]:
#train_data = create_train_data()
# Otherwise, if already created:
# Rows pair an image with a label, so the file holds an object array, which
# NumPy only loads with allow_pickle=True. Unpickling can execute arbitrary
# code: only load a file produced by this project.
train_data = np.load(TRAIN_DIR+"/train_data.npy", allow_pickle=True)
print("Num. of records = ", len(train_data))
# Each row is of length 2; row[0] = image data, row[1] = one-hot encoded label

# Number of trailing records held out for validation.
N_HOLDOUT = 500


def _images_and_labels(rows):
    """Split [image, label] rows into an image batch and a label array.

    The channel axis is inferred (-1) instead of being fixed to 1, so
    grayscale rows still give (n, IMG_WIDTH, IMG_WIDTH, 1) while colour rows
    keep their channels rather than being silently re-cut into extra samples.
    """
    images = np.array([row[0] for row in rows])
    images = images.reshape(len(rows), IMG_WIDTH, IMG_WIDTH, -1)
    labels = np.array([row[1] for row in rows])
    return images, labels


# Grab training and testing subsets
train = train_data[:-N_HOLDOUT]
test = train_data[-N_HOLDOUT:]

# Image batches and their labels
X, Y = _images_and_labels(train)
test_x, test_y = _images_and_labels(test)

In [None]:
# Define model and fit

tf.reset_default_graph()

# Take the input shape from the batch that is actually fed to fit(), so the
# channel count can never disagree with the data (it was hard-coded to 3
# while X was reshaped to a single channel).
convnet = input_data(shape=[None] + list(X.shape[1:]), name='input')

# Ten identical conv(2x2, relu) + max-pool(2) stages, alternating 32 and 64
# filters; same layers in the same order as the previous copy-pasted version.
# NOTE(review): presumably each pool halves the spatial size, in which case a
# 64-pixel input is down to 1x1 after six stages and the remaining four add
# no spatial information -- confirm this depth is intended.
for n_filters in [32, 64] * 5:
    convnet = conv_2d(convnet, n_filters, 2, activation='relu')
    convnet = max_pool_2d(convnet, 2)

convnet = fully_connected(convnet, 2048, activation='relu')
convnet = dropout(convnet, 0.5)

convnet = fully_connected(convnet, 2, activation='softmax')
convnet = regression(convnet,
                     optimizer='adam',
                     learning_rate=LR,
                     loss='categorical_crossentropy',
                     name='targets')

# TensorBoard logs are written to the local "summaries" directory
model = tflearn.DNN(convnet, tensorboard_dir="summaries")

t0 = time.time()
# Note: run_id is for tensorboard later
model.fit({'input': X}, {'targets': Y},
          n_epoch=5,
          validation_set=({'input': test_x}, {'targets': test_y}),
          snapshot_step=500,
          show_metric=True,
          run_id="prod-run")

print("Done in %.2fs" % (time.time() - t0))