In [1]:
#Changes FFC:
# Try a real PCA60 method
#4/5/17
import numpy as np
import dicom
import glob
from matplotlib import pyplot as plt
import os
import cv2
import mxnet as mx
import pandas as pd
from sklearn import cross_validation
from sklearn.decomposition import PCA as sklearnPCA
import datetime as dt
import cloudpickle as pickle

# Hard-coded working directory: every relative path used below
# ('model/...', 'stage1/...', 'data/...') is resolved against it.
os.chdir('f:/dsb/resnet50')
# Parent directory of the resolved current working directory
# (dirname of the cwd), i.e. one level above the directory set by chdir.
filePath = os.path.dirname(os.path.realpath(os.getcwd()))
# replace xgboost with lightgbm
from lightgbm.sklearn import LGBMRegressor

#Some helper methods

def getFiles(aDir):
    """Return the names of all entries in ``aDir`` as a new list.

    Entries are returned exactly as ``os.listdir`` reports them (files and
    sub-directories alike, names only, in the order the OS yields them).
    """
    return [entry for entry in os.listdir(aDir)]

def FileExists(aFile, aDir):
    """Tell whether ``aDir`` contains an entry whose name equals ``aFile``.

    The comparison is an exact match against the names returned by
    ``getFiles(aDir)``; returns ``True`` on the first match, else ``False``.
    """
    return any(entry == aFile for entry in getFiles(aDir))

def get_extractor():
    """Build a CPU feature extractor from the 'model/resnet-50' checkpoint.

    The checkpoint (epoch 0) is loaded, the internal symbol named
    "flatten0_output" is selected, and a new FeedForward model ending at that
    symbol is created with the loaded argument and auxiliary parameters.
    ``allow_extra_params=True`` is passed because the loaded parameter
    dictionaries also hold entries for layers that are not part of the
    truncated symbol.

    Returns
    -------
    mx.model.FeedForward
        Model whose ``predict`` output is the "flatten0_output" activations,
        using a numpy batch size of 64.
    """
    pretrained = mx.model.FeedForward.load('model/resnet-50', 0, ctx=mx.cpu(), numpy_batch_size=1)
    internals = pretrained.symbol.get_internals()
    return mx.model.FeedForward(
        ctx=mx.cpu(),
        symbol=internals["flatten0_output"],
        numpy_batch_size=64,
        arg_params=pretrained.arg_params,
        aux_params=pretrained.aux_params,
        allow_extra_params=True,
    )


def get_3d_data(path):
    """Load every DICOM file in ``path`` and stack the pixel data into one array.

    Files are read with ``dicom.read_file``, ordered by their integer
    ``InstanceNumber``, and their ``pixel_array`` values are stacked along a
    new leading axis.
    """
    scans = []
    for fname in os.listdir(path):
        scans.append(dicom.read_file(path + '/' + fname))
    ordered = sorted(scans, key=lambda scan: int(scan.InstanceNumber))
    return np.stack([scan.pixel_array for scan in ordered])


def get_data_id(path):
    """Convert one scan folder into a batch of 3-channel 224x224 images.

    The volume returned by ``get_3d_data(path)`` is walked in steps of three
    slices; each group of three consecutive slices becomes one 3-channel
    image. Every slice is scaled so that its maximum maps to 255, converted
    to uint8, histogram-equalized, cropped by ``dx`` pixels on every side of a
    ``ds`` x ``ds`` window, and resized to 224x224.

    Parameters
    ----------
    path : str
        Folder containing the DICOM files of a single scan.

    Returns
    -------
    numpy.ndarray
        Array of shape (groups, 3, 224, 224) when at least one group is
        produced; an empty array otherwise.
    """
    sample_image = get_3d_data(path)
    # Values equal to -2000 are zeroed (presumably the scanner's out-of-field
    # padding value -- confirm against the data set).
    sample_image[sample_image == -2000] = 0

    dx = 40   # border cropped from each side
    ds = 512  # assumed slice width/height in pixels -- TODO confirm for all scans

    batch = []
    # NOTE(review): the upper bound `shape[0] - 3` skips the final complete
    # triplet when the slice count is an exact multiple of 3 (e.g. 6 slices
    # yield only one group). Left unchanged so features stay reproducible with
    # already-cached .npy files; `shape[0] - 2` would include it.
    for i in range(0, sample_image.shape[0] - 3, 3):
        tmp = []
        for j in range(3):
            img = sample_image[i + j]
            peak = np.amax(img)
            # Guard against division by zero: a slice whose maximum is 0 would
            # otherwise turn into inf/NaN before the uint8 cast.
            if peak != 0:
                img = 255.0 / peak * img
            img = cv2.equalizeHist(img.astype(np.uint8))
            img = img[dx: ds - dx, dx: ds - dx]
            img = cv2.resize(img, (224, 224))
            tmp.append(img)

        batch.append(np.array(tmp))

    return np.array(batch)


def calc_features():
    """Extract features for every scan folder under 'stage1/' and save them.

    For each sub-directory of 'stage1', the slices are preprocessed with
    ``get_data_id``, passed through the extractor from ``get_extractor`` and
    the resulting feature array is written with ``np.save`` to a file named
    after the folder ('stage1/<id>.npy').
    """
    net = get_extractor()
    for folder in glob.glob('stage1/*'):
        # np.save below writes 'stage1/<id>.npy' next to the scan folders, so
        # on a re-run the glob also returns those files. Only directories are
        # scans; trying to list a .npy file as a folder would raise.
        if not os.path.isdir(folder):
            continue
        batch = get_data_id(folder)
        feats = net.predict(batch)
        print(feats.shape)
        # np.save appends the '.npy' extension to the folder path.
        np.save(folder, feats)





In [5]:
def train_xgboost():
    """Train a LightGBM regressor on 60 PCA components of the scan features.

    Steps:
      1. Read the labels from 'data/stage1_labels.csv'.
      2. For every id, load 'stage1/<id>.npy' and average it over axis 0 to
         get one feature vector per scan.
      3. Make a stratified 80/20 train/validation split (random_state=42).
      4. Fit a 60-component PCA on the training split only and project both
         splits with it.
      5. Fit the regressor with early stopping on the validation split.

    The function keeps its historical name although the model is LightGBM.

    Returns
    -------
    tuple
        ``(clf, pca)``: the fitted ``LGBMRegressor`` and the PCA fitted on the
        training split, i.e. the exact projection the regressor was trained
        on, so callers can transform new data consistently.
    """
    df = pd.read_csv('data/stage1_labels.csv')
    print(df.head())

    x = np.array([np.mean(np.load('stage1/%s.npy' % str(scan_id)), axis=0)
                  for scan_id in df['id'].tolist()])
    # `.values` replaces the deprecated `Series.as_matrix()`.
    y = df['cancer'].values

    trn_x, val_x, trn_y, val_y = cross_validation.train_test_split(x, y, random_state=42, stratify=y,
                                                                   test_size=0.20)

    # replaced xgb with lightgbm
    clf = LGBMRegressor(max_depth=5,
                        # num_leaves=21,
                        n_estimators=2500,
                        min_child_weight=96,
                        learning_rate=0.03757,
                        nthread=8,
                        subsample=0.85,
                        colsample_bytree=0.90,
                        seed=42)
    pca = sklearnPCA(n_components=60)

    # Fit the PCA on the training split only, then use that decomposition to
    # project both the training and the validation split.
    print(dt.datetime.now())
    pca.fit(trn_x)
    trn_x = pca.transform(trn_x)
    val_x = pca.transform(val_x)
    print(val_x.shape)
    print(dt.datetime.now())

    clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], verbose=True, eval_metric='l2', early_stopping_rounds=300)

    # The PCA is deliberately NOT refitted on the full data set here: the
    # regressor was trained in the component space of the training-split PCA,
    # and a refit would hand callers a different projection than the one the
    # model expects.
    return clf, pca
    
def make_submit():
    """Train the model and write predictions to 'resnet50_pca60_logloss.csv'.

    The model is trained on every call via ``train_xgboost`` (no cached model
    is loaded). For each id in 'data/stage1_sample_submission.csv' the file
    'stage1/<id>.npy' is loaded and averaged over axis 0, the vectors are
    projected with the returned PCA, and the regressor's predictions are
    stored in the 'cancer' column before the frame is written without index.
    """
    clf, pca = train_xgboost()

    submission = pd.read_csv('data/stage1_sample_submission.csv')

    per_scan = []
    for scan_id in submission['id'].tolist():
        per_scan.append(np.mean(np.load('stage1/%s.npy' % str(scan_id)), axis=0))
    x = pca.transform(np.array(per_scan))
    print(x.shape)

    submission['cancer'] = clf.predict(x)
    submission.to_csv('resnet50_pca60_logloss.csv', index=False)
    print(submission.head())

In [6]:
os.getcwd()

'f:\\dsb\\resnet50'

In [7]:
if __name__ == '__main__':
    # Feature extraction is disabled in this run; make_submit() reads the
    # 'stage1/<id>.npy' files, so calc_features() must have produced them
    # beforehand.
    # print(dt.datetime.now())
    # calc_features()
    # Timestamps printed before and after bracket the train-and-submit run.
    print(dt.datetime.now())
    make_submit()
    print(dt.datetime.now())

2017-04-05 23:44:49.692312
                                 id  cancer
0  0015ceb851d7251b8f399e39779d1e7d       1
1  0030a160d58723ff36d73f41b170ec21       0
2  003f41c78e6acfa92430a057ac0b306e       0
3  006b96310a37b36cccb2ab48d10b49a3       1
4  008464bb8521d09a42985dd8add3d0d2       1
2017-04-05 23:47:01.992231
(280, 60)
2017-04-05 23:47:41.640263
[1]	valid_0's l2: 0.439123
Train until valid scores didn't improve in 300 rounds.
[2]	valid_0's l2: 0.439192
[3]	valid_0's l2: 0.438796
[4]	valid_0's l2: 0.438638
[5]	valid_0's l2: 0.439083
[6]	valid_0's l2: 0.438707
[7]	valid_0's l2: 0.438502
[8]	valid_0's l2: 0.438792
[9]	valid_0's l2: 0.438723
[10]	valid_0's l2: 0.438865
[11]	valid_0's l2: 0.438475
[12]	valid_0's l2: 0.438873
[13]	valid_0's l2: 0.438771
[14]	valid_0's l2: 0.438499
[15]	valid_0's l2: 0.438569
[16]	valid_0's l2: 0.438115
[17]	valid_0's l2: 0.438269
[18]	valid_0's l2: 0.437994
[19]	valid_0's l2: 0.438054
[20]	valid_0's l2: 0.43804
[21]	valid_0's l2: 0.437667
[22]	valid_0