In [None]:
import cv2                 
import numpy as np         
import os 
from random import shuffle

In [None]:
# Machine-specific data locations. Each collaborator keeps their own paths
# here and points TRAIN_DIR / TEST_DIR at whichever set applies.

# CTE: These are my filepaths, here for convenience.
# Every CTE folder sits under the same Data root, so it is spelled out once.
_cte_data_root = 'D:\\Work\\Kaggle\\Cancer Screening\\Cancer-Screening-NN\\Data\\'
cte_train_dir = _cte_data_root + 'train\\'
cte_test_dir = _cte_data_root + 'test\\'
cte_additional_t1_dir = _cte_data_root + 'additional_Type_1_v2\\'
cte_additional_t2_dir = _cte_data_root + 'additional_Type_2_v2\\'
cte_additional_t3_dir = _cte_data_root + 'additional_Type_3_v2\\'

# HJH: filepaths on the second machine.
hjh_train_dir = '/Users/hayleeheilman/Data/Train'
hjh_test_dir = '/Users/hayleeheilman/Data/test'

# Active settings read by the cells below.
TRAIN_DIR = hjh_train_dir  # put the path to the training data here.
TEST_DIR = hjh_test_dir  # put the path to the test data here.
IMG_SIZE = 50  # images are resized to IMG_SIZE x IMG_SIZE; we can change this to whatever
STORAGE_DIR = _cte_data_root + 'processed\\'

In [None]:
# Progress bar function, since some of these processes take a while
def show_progress(current_index, total_len):
    """Print a coarse text progress indicator for a loop.

    Call once per iteration. An ``'N%...'`` marker (no trailing newline) is
    printed each time the completed share of the loop reaches a new multiple
    of 10%, and ``'100%'`` plus a newline is printed on the last iteration.

    Args:
        current_index: zero-based index of the item being processed.
        total_len: total number of items the loop will process.
    """
    if total_len <= 0:
        # Nothing to report; also keeps the divisions below from failing.
        return
    if current_index == total_len - 1:
        print('100%')
        return
    # Compare completed deciles before and after this item instead of testing
    # ``percent % 10 == 0``: with fewer than 100 items the integer percentage
    # can step straight over a multiple of 10 and the marker would be lost.
    # For 100 or more items this prints at exactly the same iterations.
    deciles_done = (current_index + 1) * 10 // total_len
    deciles_before = current_index * 10 // total_len
    if deciles_before < deciles_done < 10:
        print('{}%...'.format(deciles_done * 10), end='')

In [None]:
def label_img(file):
    """Translate a class folder name into its one-hot label.

    The label layout is [type_1, type_2, type_3]; an ``additional_Type_N``
    folder maps to the same label as ``Type_N``.

    Args:
        file: folder name to look up.

    Returns:
        A new three-element list, or None when the name is not one of the
        known class folders.
    """
    one_hot_by_folder = {
        'Type_1': (1, 0, 0),
        'Type_2': (0, 1, 0),
        'Type_3': (0, 0, 1),
        'additional_Type_1': (1, 0, 0),
        'additional_Type_2': (0, 1, 0),
        'additional_Type_3': (0, 0, 1),
    }
    encoding = one_hot_by_folder.get(file)
    if encoding is None:
        return None
    # Hand back a fresh list so callers can never share label state.
    return list(encoding)

In [None]:
def create_train_data():
    """Build, shuffle, cache and return the labelled training set.

    Every recognised class folder directly under ``TRAIN_DIR`` is scanned.
    Each image in it is read as grayscale, resized to
    ``IMG_SIZE x IMG_SIZE`` and paired with the folder's one-hot label from
    ``label_img``. Images that cannot be read or resized are reported and
    skipped. The shuffled result is written to ``train_data.npy`` in the
    current working directory.

    Returns:
        list of ``[image, label]`` pairs, both NumPy arrays, in shuffled
        order.
    """
    training_data = []
    for file in os.listdir(TRAIN_DIR):
        file_path = os.path.join(TRAIN_DIR, file)
        # Stray files next to the class folders (e.g. .DS_Store) cannot be
        # listed as directories; ignore them instead of crashing.
        if not os.path.isdir(file_path):
            continue
        data_label = label_img(file)
        if data_label is None:
            # An unknown folder would otherwise store None as the label.
            print('Skipping unrecognised folder {}'.format(file))
            continue
        print('Starting on file {}'.format(file))
        image_names = os.listdir(file_path)  # list the folder only once
        file_len = len(image_names)
        for i, img_name in enumerate(image_names):
            show_progress(i, file_len)  # Print progress
            path = os.path.join(file_path, img_name)
            try:
                raw = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
                if raw is None:
                    # imread reports an unreadable file by returning None.
                    raise IOError('image could not be read')
                pixels = cv2.resize(raw, (IMG_SIZE, IMG_SIZE))
            except Exception:
                # Best effort: report and move on. Unlike a bare ``except``
                # this still lets KeyboardInterrupt stop a long run.
                print('\nError resizing image {} in {}'.format(img_name, file_path))
                continue
            training_data.append([np.array(pixels), np.array(data_label)])
    shuffle(training_data)
    # Image and label have different shapes, so pack the pairs into an
    # explicit (n, 2) object array; handing the ragged list straight to
    # np.save is rejected by recent NumPy releases.
    packed = np.empty((len(training_data), 2), dtype=object)
    for row, (pixels, label) in enumerate(training_data):
        packed[row, 0] = pixels
        packed[row, 1] = label
    np.save('train_data.npy', packed)
    return training_data

In [None]:
def process_test_data():
    """Build, cache and return the unlabelled test set.

    Every entry in ``TEST_DIR`` is read as a grayscale image, resized to
    ``IMG_SIZE x IMG_SIZE`` and paired with its file name. Entries that
    cannot be read or resized are reported and skipped, matching
    ``create_train_data``. The result is written to ``test_data.npy`` in the
    current working directory.

    Returns:
        list of ``[image, file_name]`` pairs in directory-listing order.
    """
    testing_data = []
    image_names = os.listdir(TEST_DIR)  # list the folder only once
    file_len = len(image_names)
    for i, img_num in enumerate(image_names):
        show_progress(i, file_len)  # Print progress
        path = os.path.join(TEST_DIR, img_num)
        try:
            raw = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
            if raw is None:
                # imread reports an unreadable file by returning None.
                raise IOError('image could not be read')
            pixels = cv2.resize(raw, (IMG_SIZE, IMG_SIZE))
        except Exception:
            # One bad file should not throw away the whole run.
            print('\nError resizing image {} in {}'.format(img_num, TEST_DIR))
            continue
        testing_data.append([np.array(pixels), img_num])

    # Each pair mixes an image array with a string, so pack explicitly into
    # an (n, 2) object array; recent NumPy releases refuse to build one
    # implicitly from a ragged list.
    packed = np.empty((len(testing_data), 2), dtype=object)
    for row, (pixels, name) in enumerate(testing_data):
        packed[row, 0] = pixels
        packed[row, 1] = name
    np.save('test_data.npy', packed)
    return testing_data

In [None]:
# Preprocessing is slow, so reuse the cached .npy files when they exist and
# only rebuild them when they are missing. This keeps the notebook runnable
# from a fresh kernel without un-commenting anything.
#
# The caches hold object arrays (image paired with label or file name), which
# np.load only accepts with allow_pickle=True. Unpickling can execute code,
# so only load files that this notebook produced itself.
if os.path.exists('train_data.npy'):
    train_data = np.load('train_data.npy', allow_pickle=True)
else:
    train_data = create_train_data()

# The prediction cell at the end of the notebook reads test_data.npy.
if not os.path.exists('test_data.npy'):
    process_test_data()

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.constraints import maxnorm
from keras.optimizers import Adam, Adadelta
import pandas as pd

In [None]:
num_classes = 3

# Keep the last 1500 samples aside for validation; everything before them is
# used for fitting.
train = train_data[:-1500]
test = train_data[-1500:]


def _to_model_arrays(samples):
    """Split [image, label] pairs into network-ready arrays.

    Returns the images as a float32 array of shape
    (n, IMG_SIZE, IMG_SIZE, 1) divided by 255.0, together with the labels
    stacked into a single array.
    """
    pixels = np.array([sample[0] for sample in samples])
    pixels = pixels.reshape(-1, IMG_SIZE, IMG_SIZE, 1).astype('float32') / 255.0
    labels = np.array([sample[1] for sample in samples])
    return pixels, labels


x_train, y_train = _to_model_arrays(train)
x_test, y_test = _to_model_arrays(test)

In [None]:
# Small CNN: two conv/conv/pool stages (32 then 64 filters, 3x3 kernels),
# followed by a dropout-regularised dense head ending in a softmax over
# num_classes.
model = Sequential([
    # Stage 1: input is one grayscale channel of IMG_SIZE x IMG_SIZE.
    Conv2D(32, (3, 3), activation='relu', input_shape=(IMG_SIZE, IMG_SIZE, 1)),
    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    # Stage 2.
    Conv2D(64, (3, 3), activation='relu'),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    # Classifier head.
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])

# Labels are one-hot vectors, hence categorical cross-entropy.
optimizer = Adam(lr=1e-3, decay=1e-6)
model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Training settings gathered in one place; the held-out split is evaluated
# after every epoch.
fit_options = {
    'batch_size': 10,
    'epochs': 20,
    'verbose': 1,
    'validation_data': (x_test, y_test),
}
model.fit(x_train, y_train, **fit_options)

In [None]:
# Load the cached [image, file_name] pairs. They are stored as an object
# array, which np.load only accepts with allow_pickle=True. Unpickling can
# execute code, so only load a file that this notebook produced itself.
test_data = np.load('test_data.npy', allow_pickle=True)

# Prepare the pixels exactly as the training images were prepared (float32,
# scaled by 1/255). Feeding raw 0-255 values to a network trained on 0-1
# inputs would put the predictions on a different input scale.
data = np.array([i[0] for i in test_data]).reshape(-1,IMG_SIZE,IMG_SIZE,1)
data = data.astype('float32')
data = data/255.0

test_num = [str(i[1]) for i in test_data]
pred = model.predict(data)

# One row per test image: the three class probabilities plus the file name.
df = pd.DataFrame(pred, columns=['Type_1', 'Type_2', 'Type_3'])
df['image_name'] = test_num
df.to_csv('submission.csv', index=False)