In [13]:
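#Changes AEM:
#Updated make_submit() method to serialize the trained model (originally xgboost, now LightGBM)
#Method was changed to check for an existing serialized model before attempting to train
#NOTE: that serialize/load logic is currently commented out in make_submit(), so the model is retrained on every run
#4/3/17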
import numpy as np
import dicom
import glob
from matplotlib import pyplot as plt
import os
import cv2
import mxnet as mx
import pandas as pd
from sklearn import cross_validation
from sklearn.decomposition import PCA as sklearnPCA
import datetime as dt
import cloudpickle as pickle

# Switch to the project directory; the functions below use relative paths
# ('model/...', 'stage1/...', 'data/...') - presumably relative to this directory.
os.chdir('f:/dsb/resnet50')
# NOTE: dirname() of the resolved current working directory is its *parent*
# directory, not the working directory itself.
filePath = os.path.dirname(os.path.realpath(os.getcwd()))
# LightGBM's scikit-learn wrapper is used in place of xgboost
from lightgbm.sklearn import LGBMRegressor

#Some helper methods

def getFiles(aDir):
    """Return a list with the name of every entry directly inside ``aDir``.

    Names are bare entry names (no directory prefix), in the order reported
    by ``os.listdir``.
    """
    return [entry for entry in os.listdir(aDir)]

def FileExists(aFile, aDir):
    """Tell whether ``aDir`` directly contains an entry named exactly ``aFile``.

    The check is an exact string comparison against the names returned by
    ``getFiles``; nested directories are not searched.

    Returns:
        bool: True when a matching entry name is found, False otherwise.
    """
    return any(entry == aFile for entry in getFiles(aDir))

def get_extractor():
    """Build a CPU feature extractor from the saved 'model/resnet-50' checkpoint.

    Epoch 0 of the checkpoint is loaded, the internal symbol named
    "flatten0_output" is picked out, and a new FeedForward model is created
    around that symbol, reusing the loaded argument and auxiliary parameters.

    Returns:
        mx.model.FeedForward: model whose output is "flatten0_output".
    """
    pretrained = mx.model.FeedForward.load('model/resnet-50', 0, ctx=mx.cpu(), numpy_batch_size=1)
    internals = pretrained.symbol.get_internals()
    flatten_output = internals["flatten0_output"]

    # The full checkpoint's parameters are handed over unchanged;
    # allow_extra_params presumably tolerates those the truncated symbol
    # does not use - TODO confirm against the MXNet docs.
    return mx.model.FeedForward(
        ctx=mx.cpu(),
        symbol=flatten_output,
        numpy_batch_size=64,
        arg_params=pretrained.arg_params,
        aux_params=pretrained.aux_params,
        allow_extra_params=True,
    )


def get_3d_data(path):
    """Read every DICOM file in ``path`` and stack the slices into one array.

    The slices are ordered by their integer ``InstanceNumber`` before their
    ``pixel_array`` data is stacked along a new leading axis.

    Args:
        path: directory whose entries are all read with ``dicom.read_file``.

    Returns:
        np.ndarray: the stacked pixel arrays, one slice per leading index.
    """
    scans = []
    for file_name in os.listdir(path):
        scans.append(dicom.read_file(path + '/' + file_name))

    ordered = sorted(scans, key=lambda scan: int(scan.InstanceNumber))
    return np.stack([scan.pixel_array for scan in ordered])


def get_data_id(path):
    """Turn one scan directory into a batch of 3-channel 224x224 images.

    The slices loaded by ``get_3d_data`` are walked in consecutive groups of
    three. Each slice is rescaled so that its maximum maps to 255, histogram
    equalised, cropped to ``[dx:ds - dx]`` on both image axes and resized to
    224x224. The three processed slices of a group are stacked into one
    (3, 224, 224) array.

    Args:
        path: directory holding the DICOM files of a single scan.

    Returns:
        np.ndarray: array built from the list of (3, 224, 224) groups.
    """
    sample_image = get_3d_data(path)
    # presumably -2000 marks padding outside the scanned area - TODO confirm
    sample_image[sample_image == -2000] = 0

    dx = 40   # pixels cropped from each border
    ds = 512  # assumed slice width/height used for the crop window
    batch = []

    # NOTE(review): with this stop value the final triplet is skipped whenever
    # the slice count is an exact multiple of 3 (e.g. 6 slices -> one group).
    # Left unchanged so previously saved feature files stay comparable.
    for i in range(0, sample_image.shape[0] - 3, 3):
        channels = []
        for j in range(3):
            img = sample_image[i + j]
            peak = np.amax(img)
            if peak == 0:
                # An all-zero slice used to evaluate 255.0 / 0, producing
                # inf/NaN values and an undefined cast to uint8.
                img = np.zeros(img.shape, dtype=np.uint8)
            else:
                img = (255.0 / peak * img).astype(np.uint8)
            img = cv2.equalizeHist(img)
            img = img[dx: ds - dx, dx: ds - dx]
            img = cv2.resize(img, (224, 224))
            channels.append(img)

        batch.append(np.array(channels))

    return np.array(batch)


def calc_features():
    """Extract and save features for every scan directory under 'stage1/'.

    For each directory the image batch from ``get_data_id`` is run through the
    extractor returned by ``get_extractor``; the resulting array's shape is
    printed and the array is stored with ``np.save(folder, feats)``, which
    writes '<folder>.npy' alongside the scan directory.
    """
    net = get_extractor()
    for folder in glob.glob('stage1/*'):
        # The saved '<id>.npy' files live in 'stage1/' too, so on a re-run the
        # glob returns them as well; only directories contain DICOM slices.
        if not os.path.isdir(folder):
            continue
        batch = get_data_id(folder)
        feats = net.predict(batch)
        print(feats.shape)
        np.save(folder, feats)

def train_xgboost(verbose=True):
    """Fit a LightGBM regressor on per-patient, PCA-reduced features.

    Despite the name (kept so existing callers keep working), the model is an
    ``LGBMRegressor``. For every id in 'data/stage1_labels.csv' the saved
    feature array 'stage1/<id>.npy' is transposed and reduced with a
    1-component PCA into one 2048-value row. 20% of the rows, stratified on
    the 'cancer' label, are held out and used as the early-stopping
    evaluation set.

    Args:
        verbose: forwarded to ``clf.fit`` to control evaluation logging
            (previously ignored; the default keeps logging on).

    Returns:
        LGBMRegressor: the fitted model.
    """
    df = pd.read_csv('data/stage1_labels.csv')

    x = np.zeros((df.shape[0], 2048))
    for i, patient_id in enumerate(df['id'].tolist()):
        feat = np.load('stage1/%s.npy' % str(patient_id))
        # PCA over the transposed array collapses the per-group axis to a
        # single component, leaving one value per feature column.
        feat_pca = sklearnPCA(n_components=1).fit_transform(feat.transpose())
        x[i, :] = feat_pca.squeeze()
    print(x.shape)
    # .values replaces the deprecated Series.as_matrix()
    y = df['cancer'].values

    trn_x, val_x, trn_y, val_y = cross_validation.train_test_split(x, y, random_state=42, stratify=y, test_size=0.20)

    # LightGBM replaces the earlier xgboost model
    clf = LGBMRegressor(max_depth=50,
                        num_leaves=21,
                        n_estimators=5000,
                        min_child_weight=1,
                        learning_rate=0.001,
                        nthread=24,
                        subsample=0.80,
                        colsample_bytree=0.80,
                        seed=42)

    clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], verbose=verbose, eval_metric='l2', early_stopping_rounds=300)

    return clf


def make_submit():
    """Train the model, predict every id in the sample submission, write a CSV.

    The ids come from 'data/stage1_sample_submission.csv'. For each id the
    saved feature array 'stage1/<id>.npy' is transposed and reduced with a
    1-component PCA into one 2048-value row. The predictions are stored in the
    'cancer' column and the frame is written to 'resnet50_pca1_lgbm.csv'.
    """
    # Saving/loading a pickled model ('resnet50_pca_lgbm.pk1' under filePath)
    # is currently disabled, so a fresh model is trained on every call.
    clf = train_xgboost()

    df = pd.read_csv('data/stage1_sample_submission.csv')
    patient_ids = df['id'].tolist()

    x = np.zeros((df.shape[0], 2048))
    for row, patient_id in enumerate(patient_ids):
        feat = np.load('stage1/%s.npy' % str(patient_id))
        reducer = sklearnPCA(n_components=1)
        x[row, :] = reducer.fit_transform(feat.transpose()).squeeze()
    print(x.shape)

    df['cancer'] = clf.predict(x)
    df.to_csv('resnet50_pca1_lgbm.csv', index=False)
    print(df.head())

In [14]:
os.getcwd()

'f:\\dsb\\resnet50'

In [15]:
if __name__ == '__main__':
    # Feature extraction is disabled here; make_submit() loads
    # 'stage1/<id>.npy' files, so calc_features() must have produced them
    # in an earlier run.
    # print(dt.datetime.now())
    # calc_features()
    print(dt.datetime.now())  # timestamp before training and prediction
    make_submit()
    print(dt.datetime.now())  # timestamp after, to show elapsed wall time

2017-04-05 00:22:31.184553
(1397, 2048)
[1]	valid_0's l2: 0.439031
Train until valid scores didn't improve in 300 rounds.
[2]	valid_0's l2: 0.43905
[3]	valid_0's l2: 0.439043
[4]	valid_0's l2: 0.439039
[5]	valid_0's l2: 0.439044
[6]	valid_0's l2: 0.43906
[7]	valid_0's l2: 0.439053
[8]	valid_0's l2: 0.439035
[9]	valid_0's l2: 0.439001
[10]	valid_0's l2: 0.439023
[11]	valid_0's l2: 0.439003
[12]	valid_0's l2: 0.439006
[13]	valid_0's l2: 0.438999
[14]	valid_0's l2: 0.43901
[15]	valid_0's l2: 0.439006
[16]	valid_0's l2: 0.438999
[17]	valid_0's l2: 0.438996
[18]	valid_0's l2: 0.43901
[19]	valid_0's l2: 0.438988
[20]	valid_0's l2: 0.438981
[21]	valid_0's l2: 0.438984
[22]	valid_0's l2: 0.438974
[23]	valid_0's l2: 0.438971
[24]	valid_0's l2: 0.438967
[25]	valid_0's l2: 0.438951
[26]	valid_0's l2: 0.438922
[27]	valid_0's l2: 0.438923
[28]	valid_0's l2: 0.438936
[29]	valid_0's l2: 0.438923
[30]	valid_0's l2: 0.438927
[31]	valid_0's l2: 0.438897
[32]	valid_0's l2: 0.438882
[33]	valid_0's l2: 0.4