In [1]:
import numpy as np
import dicom
import glob
from matplotlib import pyplot as plt
import os
import cv2
import mxnet as mx
import pandas as pd
from sklearn import cross_validation
import datetime as dt

# replace xgboost with lightgbm
from lightgbm.sklearn import LGBMRegressor
from lightgbm.sklearn import LGBMClassifier

def get_extractor():
    """Build the CPU feature extractor from the saved ResNet-50 checkpoint.

    The checkpoint with prefix ``model/resnet-50`` (epoch 0) is loaded, and a
    second ``FeedForward`` model is created whose output symbol is the
    internal node named ``flatten0_output``. The loaded argument and
    auxiliary parameters are re-used; ``allow_extra_params=True`` lets the
    parameters that the truncated symbol does not use be passed along.

    Returns:
        mx.model.FeedForward: model whose symbol is ``flatten0_output``,
        configured with a numpy batch size of 64.
    """
    pretrained = mx.model.FeedForward.load('model/resnet-50', 0, ctx=mx.cpu(), numpy_batch_size=1)

    # Pick the intermediate output that will serve as the feature vector.
    internals = pretrained.symbol.get_internals()
    flatten_output = internals["flatten0_output"]

    return mx.model.FeedForward(ctx=mx.cpu(),
                                symbol=flatten_output,
                                numpy_batch_size=64,
                                arg_params=pretrained.arg_params,
                                aux_params=pretrained.aux_params,
                                allow_extra_params=True)


def get_3d_data(path):
    """Load every file of a scan directory into one stacked pixel array.

    Args:
        path: directory whose entries are all read with ``dicom.read_file``.

    Returns:
        numpy.ndarray: the ``pixel_array`` of every file, stacked along a new
        first axis and ordered by ascending integer ``InstanceNumber``.

    Raises:
        ValueError: if the directory has no entries (``np.stack`` would raise
            the same exception type with a less helpful message).
    """
    # os.path.join handles trailing separators and platform differences.
    slices = [dicom.read_file(os.path.join(path, name)) for name in os.listdir(path)]
    if not slices:
        raise ValueError('no DICOM slices found in %r' % (path,))
    slices.sort(key=lambda s: int(s.InstanceNumber))
    return np.stack([s.pixel_array for s in slices])


def get_data_id(path):
    """Turn one scan directory into a batch of 3-channel 224x224 images.

    The volume returned by ``get_3d_data(path)`` is walked in non-overlapping
    groups of three consecutive slices. Each slice is rescaled so that its
    maximum maps to 255, histogram-equalised, cropped by 40 pixels on every
    side of a 512-pixel frame and resized to 224x224. The three processed
    slices of a group are stacked as the three channels of one image.

    Args:
        path: scan directory, passed straight to ``get_3d_data``.

    Returns:
        numpy.ndarray: array of shape ``(n_slices // 3, 3, 224, 224)``.
        Trailing slices that do not fill a complete group are ignored; with
        fewer than three slices the result is an empty array.
    """
    volume = get_3d_data(path)
    # presumably -2000 is the padding value outside the scanned area; verify
    volume[volume == -2000] = 0

    margin = 40   # pixels trimmed from each border
    frame = 512   # assumes 512x512 slices -- TODO confirm for every scan
    group = 3     # consecutive slices combined into one 3-channel image

    batch = []
    # The stop value is n - group + 1 so the last complete group is kept.
    # The earlier bound (n - 3) silently dropped it whenever the number of
    # slices was a multiple of three.
    for start in range(0, volume.shape[0] - group + 1, group):
        channels = []
        for offset in range(group):
            img = volume[start + offset]
            peak = np.amax(img)
            if peak > 0:
                # Clip so negative inputs cannot wrap around in the uint8 cast.
                img = np.clip(255.0 / peak * img, 0, 255)
            else:
                # Blank slice: avoid dividing by zero (inf/nan before the cast).
                img = np.zeros(img.shape)
            img = cv2.equalizeHist(img.astype(np.uint8))
            img = img[margin: frame - margin, margin: frame - margin]
            img = cv2.resize(img, (224, 224))
            channels.append(img)

        batch.append(np.array(channels))

    return np.array(batch)


def calc_features(skip_existing=False):
    """Extract and save features for every scan directory under ``stage1/``.

    For each directory ``stage1/<name>`` the batch built by ``get_data_id`` is
    run through the extractor from ``get_extractor`` and the result is saved
    with ``np.save`` as ``stage1/<name>.npy``. The shape of every feature
    array is printed.

    Args:
        skip_existing: when True, a directory whose ``.npy`` file already
            exists is not recomputed, so an interrupted run can be resumed.
            Defaults to False, which recomputes everything as before.
    """
    net = get_extractor()
    for folder in glob.glob('stage1/*'):
        # The .npy outputs are written next to the scan folders, so the glob
        # also matches them on any later run; only directories are scans.
        if not os.path.isdir(folder):
            continue
        if skip_existing and os.path.exists(folder + '.npy'):
            continue
        batch = get_data_id(folder)
        feats = net.predict(batch)
        print(feats.shape)
        # np.save appends the '.npy' extension to the folder path.
        np.save(folder, feats)


def train_xgboost():
    """Fit a LightGBM regressor on the per-scan mean feature vectors.

    The function name is kept from the earlier xgboost version; the model is
    now ``LGBMRegressor``.

    Labels are read from ``data/stage2_labels.csv``. For every id the array in
    ``stage1/<id>.npy`` is loaded and averaged over axis 0 to give one feature
    vector per scan. A stratified 20% hold-out split is used as the evaluation
    set for early stopping (300 rounds, ``l2`` metric).

    Returns:
        LGBMRegressor: the fitted model.
    """
    df = pd.read_csv('data/stage2_labels.csv')
    print(df.head())

    # scan_id instead of id: avoids shadowing the builtin.
    x = np.array([np.mean(np.load('stage1/%s.npy' % str(scan_id)), axis=0)
                  for scan_id in df['id'].tolist()])
    # Series.as_matrix() was deprecated and later removed from pandas;
    # .values returns the same ndarray on every version.
    y = df['cancer'].values

    trn_x, val_x, trn_y, val_y = cross_validation.train_test_split(x, y, random_state=42, stratify=y,
                                                                   test_size=0.20)

    # replaced xgb with lightgbm
    clf = LGBMRegressor(max_depth=50,
                        num_leaves=21,
                        n_estimators=5000,
                        min_child_weight=1,
                        learning_rate=0.001,
                        nthread=24,
                        subsample=0.80,
                        colsample_bytree=0.80,
                        seed=42)

    clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], verbose=True, eval_metric='l2', early_stopping_rounds=300)

    return clf


def make_submit():
    """Train the model and write predictions for the sample-submission ids.

    The model comes from ``train_xgboost``. For every id listed in
    ``data/stage2_sample_submission.csv`` the array ``stage1/<id>.npy`` is
    loaded and averaged over axis 0; the model's predictions for those
    vectors replace the ``cancer`` column. The frame is written to
    ``stage2_subm_resnet50.csv`` without the index and its head is printed.
    """
    model = train_xgboost()

    submission = pd.read_csv('data/stage2_sample_submission.csv')

    # One mean feature vector per scan, in submission order.
    mean_feats = []
    for scan_id in submission['id'].tolist():
        scan_feats = np.load('stage1/%s.npy' % str(scan_id))
        mean_feats.append(np.mean(scan_feats, axis=0))
    x = np.array(mean_feats)

    submission['cancer'] = model.predict(x)
    submission.to_csv('stage2_subm_resnet50.csv', index=False)
    print(submission.head())



In [2]:
import os

# Directory that contains the relative 'model/', 'stage1/' and 'data/' paths
# used by the functions defined in the first cell. Set the DSB_WORKDIR
# environment variable to run elsewhere; the default is the original
# hard-coded location.
WORK_DIR = os.environ.get('DSB_WORKDIR', 'f:/dsb/resnet50')
os.chdir(WORK_DIR)

In [None]:
# Time the full feature-extraction pass: print a timestamp before and after.
print(dt.datetime.now())
calc_features()
print(dt.datetime.now())
# Recorded run: finished 2017-04-10 01:32 AM, i.e. about 27 hours for 506 items.

2017-04-08 22:29:06.797114


In [4]:
# Train the model and write the submission file, printing a timestamp
# before and after so the elapsed time can be read from the output.
print(dt.datetime.now())
make_submit()
print(dt.datetime.now())

2017-04-10 23:24:56.070845
                                 id  cancer
0  0015ceb851d7251b8f399e39779d1e7d       1
1  0030a160d58723ff36d73f41b170ec21       0
2  003f41c78e6acfa92430a057ac0b306e       0
3  006b96310a37b36cccb2ab48d10b49a3       1
4  008464bb8521d09a42985dd8add3d0d2       1
[1]	valid_0's l2: 0.44041
Train until valid scores didn't improve in 300 rounds.
[2]	valid_0's l2: 0.440413
[3]	valid_0's l2: 0.440382
[4]	valid_0's l2: 0.44039
[5]	valid_0's l2: 0.440402
[6]	valid_0's l2: 0.440392
[7]	valid_0's l2: 0.440378
[8]	valid_0's l2: 0.440373
[9]	valid_0's l2: 0.440372
[10]	valid_0's l2: 0.440359
[11]	valid_0's l2: 0.440353
[12]	valid_0's l2: 0.440372
[13]	valid_0's l2: 0.440359
[14]	valid_0's l2: 0.440345
[15]	valid_0's l2: 0.440351
[16]	valid_0's l2: 0.440358
[17]	valid_0's l2: 0.440352
[18]	valid_0's l2: 0.440359
[19]	valid_0's l2: 0.44036
[20]	valid_0's l2: 0.440333
[21]	valid_0's l2: 0.440345
[22]	valid_0's l2: 0.440347
[23]	valid_0's l2: 0.440319
[24]	valid_0's l2: 0.4