In [14]:
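# Changes (AEM, 4/3/17):
# - Updated make_submit() to serialize the trained xgboost model.
# - make_submit() now checks for an existing serialized model before attempting to train.
# NOTE: the serialization logic in make_submit() is currently disabled, so the
# model is retrained on every call; xgboost has since been replaced by lightgbm.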
import numpy as np
import dicom
import glob
from matplotlib import pyplot as plt
import os
import cv2
import mxnet as mx
import pandas as pd
from sklearn import cross_validation
import datetime as dt
# import cloudpickle as pickle

# Working directory that holds the model/, stage1/ and data/ folders which the
# relative paths in this notebook refer to. Set the DSB_WORKDIR environment
# variable to override it; the default keeps the original hard-coded location.
os.chdir(os.environ.get('DSB_WORKDIR', 'f:/dsb/inception_bn'))
# Parent directory of the (symlink-resolved) working directory. Note this is
# derived from the cwd, not from the location of the notebook file itself.
filePath = os.path.dirname(os.path.realpath(os.getcwd()))
# replace xgboost with lightgbm
from lightgbm.sklearn import LGBMRegressor

#Some helper methods

def getFiles(aDir):
    """Return the names of all entries directly inside ``aDir`` as a list.

    Names come back in whatever order ``os.listdir`` yields them; no
    filtering or sorting is applied.
    """
    return [entry for entry in os.listdir(aDir)]

def FileExists(aFile, aDir):
    """Report whether ``aDir`` contains an entry named exactly ``aFile``.

    Each name returned by ``getFiles(aDir)`` is compared to ``aFile`` with
    plain equality; returns True on the first match, False otherwise.
    """
    return any(entry == aFile for entry in getFiles(aDir))

def get_extractor():
    """Build a feature extractor from the saved 'model/inception-bn' checkpoint.

    Loads epoch 126 of the checkpoint on CPU, selects the internal symbol
    named "flatten_output", and wraps it in a new FeedForward model that
    reuses the loaded argument and auxiliary parameters.

    Returns:
        The mx.model.FeedForward instance built on the "flatten_output" symbol.
    """
    pretrained = mx.model.FeedForward.load('model/inception-bn', 126, ctx=mx.cpu(), numpy_batch_size=1)
    flatten_symbol = pretrained.symbol.get_internals()["flatten_output"]

    # allow_extra_params lets the full checkpoint's parameters be passed even
    # though the truncated symbol does not use all of them.
    extractor_kwargs = dict(
        ctx=mx.cpu(),
        symbol=flatten_symbol,
        numpy_batch_size=64,
        arg_params=pretrained.arg_params,
        aux_params=pretrained.aux_params,
        allow_extra_params=True,
    )
    return mx.model.FeedForward(**extractor_kwargs)


def get_3d_data(path):
    """Read every DICOM file in ``path`` and stack the pixel arrays.

    The slices are ordered by their integer ``InstanceNumber`` and then
    stacked along a new leading axis.
    """
    scans = []
    for fname in os.listdir(path):
        scans.append(dicom.read_file(path + '/' + fname))

    ordered = sorted(scans, key=lambda scan: int(scan.InstanceNumber))
    return np.stack([scan.pixel_array for scan in ordered])


def get_data_id(path):
    """Convert one scan directory into a batch of 3-channel 224x224 images.

    The volume returned by ``get_3d_data(path)`` is walked in steps of three
    slices. Each slice is rescaled so its maximum maps to 255, histogram
    equalized, cropped by ``dx`` pixels on every side and resized to 224x224;
    three consecutive slices form the channels of one batch entry.

    Args:
        path: Directory containing the DICOM files of a single scan.

    Returns:
        A numpy array built from the list of (3, 224, 224) uint8 groups, one
        per group of three slices.
    """
    sample_image = get_3d_data(path)
    # -2000 is presumably a padding/out-of-scan marker -- TODO confirm; it is
    # zeroed before intensity scaling.
    sample_image[sample_image == -2000] = 0

    dx = 40   # border (in pixels) cropped from every side
    ds = 512  # assumes slices are 512x512 -- TODO confirm against the data

    batch = []
    # NOTE(review): with a stop of shape[0] - 3, a final complete triple that
    # starts at index shape[0] - 3 is skipped whenever the slice count is a
    # multiple of 3. Left unchanged so feature row counts stay the same.
    for i in range(0, sample_image.shape[0] - 3, 3):
        channels = []
        for j in range(3):
            img = sample_image[i + j]
            peak = np.amax(img)
            if peak != 0:
                img = 255.0 / peak * img
            else:
                # An all-zero slice would otherwise divide by zero and feed
                # NaNs into the uint8 cast; treat it as a black image instead.
                img = np.zeros(img.shape, dtype=np.float64)
            img = cv2.equalizeHist(img.astype(np.uint8))
            img = img[dx: ds - dx, dx: ds - dx]
            img = cv2.resize(img, (224, 224))
            channels.append(img)

        batch.append(np.array(channels))

    return np.array(batch)


def calc_features(overwrite=True):
    """Extract and save network features for every scan directory in stage1/.

    For each directory matched by 'stage1/*', the preprocessed batch from
    ``get_data_id`` is run through the extractor from ``get_extractor`` and
    the result is written with ``np.save(folder, feats)``, i.e. to
    '<folder>.npy' next to the scan directory.

    Args:
        overwrite: When True (the default, matching the original behavior)
            features are recomputed for every scan. When False, scans whose
            '<folder>.npy' file already exists are skipped.
    """
    net = get_extractor()
    for folder in glob.glob('stage1/*'):
        # The saved '<folder>.npy' files live inside stage1/ too, so on a
        # re-run the glob also returns them; only directories hold scans.
        if not os.path.isdir(folder):
            continue
        if not overwrite and os.path.exists(folder + '.npy'):
            continue

        batch = get_data_id(folder)
        feats = net.predict(batch)
        print(feats.shape)
        np.save(folder, feats)


def train_xgboost():
    """Train a LightGBM regressor on the saved per-scan features.

    Despite the name, the model is an ``LGBMRegressor`` (xgboost was
    replaced by lightgbm). Labels are read from 'data/stage1_labels.csv';
    each sample's feature vector is the mean over axis 0 of the array stored
    in 'stage1/<id>.npy'. The data is split 80/20 (stratified on the label,
    random_state=42) and the held-out part is used as the evaluation set for
    early stopping.

    Returns:
        The fitted LGBMRegressor.
    """
    df = pd.read_csv('data/stage1_labels.csv')
    print(df.head())

    x = np.array([np.mean(np.load('stage1/%s.npy' % str(patient_id)), axis=0)
                  for patient_id in df['id'].tolist()])
    # Series.as_matrix() was deprecated and later removed from pandas;
    # .values returns the same ndarray on old and new versions alike.
    y = df['cancer'].values

    trn_x, val_x, trn_y, val_y = cross_validation.train_test_split(x, y, random_state=42, stratify=y,
                                                                   test_size=0.20)

    # replaced xgb with lightgbm
    clf = LGBMRegressor(max_depth=50,
                        num_leaves=21,
                        n_estimators=5000,
                        min_child_weight=1,
                        learning_rate=0.001,
                        nthread=24,
                        subsample=0.80,
                        colsample_bytree=0.80,
                        seed=42)

    # Stop once the validation l2 has not improved for 300 rounds.
    clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], verbose=True, eval_metric='l2', early_stopping_rounds=300)

    return clf


def make_submit():
    """Train the model and write predictions for the sample submission.

    Reads the ids from 'data/stage1_sample_submission.csv', builds one
    feature vector per id as the axis-0 mean of 'stage1/<id>.npy', fills the
    'cancer' column with the model's predictions, saves the frame to
    'subm1.csv' and prints its first rows.
    """
    # The model is retrained on every call. Saving/loading a pickled model
    # ('xgModel.pk1') is not done here -- presumably disabled after the
    # pickling attempt failed; verify before re-enabling.
    model = train_xgboost()

    submission = pd.read_csv('data/stage1_sample_submission.csv')

    per_scan_means = []
    for scan_id in submission['id'].tolist():
        feats = np.load('stage1/%s.npy' % str(scan_id))
        per_scan_means.append(np.mean(feats, axis=0))
    x = np.array(per_scan_means)

    submission['cancer'] = model.predict(x)
    submission.to_csv('subm1.csv', index=False)
    print(submission.head())

In [15]:
# Display the current working directory to confirm where relative paths resolve.
os.getcwd()

'f:\\dsb\\inception_bn'

In [12]:
# Full pipeline: extract and save features for every scan, then train the
# model and write the submission file, printing a timestamp around each stage.
if __name__ == '__main__':
    print(dt.datetime.now())
    calc_features()
    # 28h17m
    print(dt.datetime.now())
    make_submit()
    print(dt.datetime.now())

2017-04-04 22:20:24.149945
(64, 1024)
(88, 1024)
(77, 1024)
(57, 1024)
(48, 1024)
(56, 1024)
(40, 1024)
(44, 1024)
(44, 1024)
(63, 1024)
(72, 1024)
(76, 1024)
(52, 1024)
(80, 1024)
(58, 1024)
(61, 1024)
(57, 1024)
(52, 1024)
(35, 1024)
(68, 1024)
(60, 1024)
(49, 1024)
(48, 1024)
(55, 1024)
(49, 1024)
(44, 1024)
(53, 1024)
(74, 1024)
(48, 1024)
(48, 1024)
(50, 1024)
(53, 1024)
(46, 1024)
(62, 1024)
(51, 1024)
(43, 1024)
(70, 1024)
(44, 1024)
(44, 1024)
(69, 1024)
(44, 1024)
(58, 1024)
(56, 1024)
(104, 1024)
(47, 1024)
(43, 1024)
(61, 1024)
(44, 1024)
(107, 1024)
(37, 1024)
(59, 1024)
(55, 1024)
(59, 1024)
(52, 1024)
(94, 1024)
(132, 1024)
(32, 1024)
(47, 1024)
(59, 1024)
(58, 1024)
(39, 1024)
(42, 1024)
(44, 1024)
(36, 1024)
(67, 1024)
(65, 1024)
(93, 1024)
(40, 1024)
(54, 1024)
(81, 1024)
(45, 1024)
(59, 1024)
(73, 1024)
(48, 1024)
(144, 1024)
(60, 1024)
(41, 1024)
(58, 1024)
(56, 1024)
(37, 1024)
(50, 1024)
(50, 1024)
(86, 1024)
(42, 1024)
(46, 1024)
(70, 1024)
(45, 1024)
(40, 1024)
(

AttributeError: module 'cloudpickle' has no attribute 'HIGHEST_PROTOCOL'

In [16]:
    # Run only the train-and-submit step (it reads the stage1/*.npy feature
    # files already on disk), printing a timestamp before and after.
    print(dt.datetime.now())
    make_submit()
    print(dt.datetime.now())

2017-04-06 08:12:57.311924
                                 id  cancer
0  0015ceb851d7251b8f399e39779d1e7d       1
1  0030a160d58723ff36d73f41b170ec21       0
2  003f41c78e6acfa92430a057ac0b306e       0
3  006b96310a37b36cccb2ab48d10b49a3       1
4  008464bb8521d09a42985dd8add3d0d2       1
[1]	valid_0's l2: 0.439004
Train until valid scores didn't improve in 300 rounds.
[2]	valid_0's l2: 0.439006
[3]	valid_0's l2: 0.438999
[4]	valid_0's l2: 0.438982
[5]	valid_0's l2: 0.438963
[6]	valid_0's l2: 0.438948
[7]	valid_0's l2: 0.438926
[8]	valid_0's l2: 0.438906
[9]	valid_0's l2: 0.438903
[10]	valid_0's l2: 0.438877
[11]	valid_0's l2: 0.438866
[12]	valid_0's l2: 0.438865
[13]	valid_0's l2: 0.438861
[14]	valid_0's l2: 0.438866
[15]	valid_0's l2: 0.438834
[16]	valid_0's l2: 0.438838
[17]	valid_0's l2: 0.438797
[18]	valid_0's l2: 0.438785
[19]	valid_0's l2: 0.438784
[20]	valid_0's l2: 0.438754
[21]	valid_0's l2: 0.438737
[22]	valid_0's l2: 0.438733
[23]	valid_0's l2: 0.438717
[24]	valid_0's l2: 