# [Mxnet Xgboost Baseline](https://www.kaggle.com/drn01z3/data-science-bowl-2017/mxnet-xgboost-baseline-lb-0-57)

# Downloading Instructions
1. **pydicom** (dicom): run `pip install pydicom` in the Anaconda command prompt ([reference](http://pydicom.readthedocs.io/en/latest/getting_started.html))
2. **opencv** (cv2): run `conda install -c https://conda.binstar.org/menpo opencv3` in the Anaconda command prompt ([reference](https://rivercitylabs.org/up-and-running-with-opencv3-and-python-3-anaconda-edition/))
3. **mxnet**: see the following setup guides and references:
    - http://mxnet.io/get_started/setup.html#common-installation-problems
    - http://mxnet.io/architecture/index.html
    - http://mxnet.io/get_started/windows_setup.html#next-steps
    - http://mxnet.io/get_started/setup.html#requirements-for-using-gpus
    - https://no2147483647.wordpress.com/2015/12/07/deep-learning-for-hackers-with-mxnet-1/

# Dependency Descriptions
1. **numpy**: an extension to the Python programming language, adding support for large, multi-dimensional arrays and matrices, along with a large library of high-level mathematical functions to operate on these arrays
2. **pydicom** (dicom): pydicom is a pure python package for working with DICOM files such as medical images, reports, and radiotherapy objects
3. **glob**: a module that finds all the pathnames matching a specified pattern according to the rules used by the Unix shell (results returned in arbitrary order)
4. **matplotlib.pyplot**: a Python 2D plotting library which produces publication quality figures in a variety of hardcopy formats and interactive environments across platforms
5. **os**: a module that provides a portable way of using operating system dependent functionality
6. **OpenCV** (cv2): a library of programming functions mainly aimed at real-time computer vision
7. **mxnet** (mxnet): a flexible and efficient library for deep learning
8. **pandas**: provides high-performance, easy-to-use data structures and data analysis tools for the Python programming language
9. **scikit-learn** (sklearn) *[cross_validation](http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection)*: simple and efficient tools for data mining and data analysis 
10. **xgboost**: a library designed and optimized for boosted tree algorithms

In [5]:
import numpy as np
import dicom
import glob
from matplotlib import pyplot as plt
import os
import cv2
import mxnet as mx
import pandas as pd
from sklearn import cross_validation
import xgboost as xgb

ImportError: No module named 'mxnet'

In [None]:
def get_extractor():
    """Build a CPU feature extractor from the saved ``model/resnet-50`` checkpoint.

    The checkpoint is loaded with index 0, its ``flatten0_output`` internal
    symbol is selected, and a new ``FeedForward`` model is created around that
    symbol using the loaded argument and auxiliary parameters.

    Returns:
        The ``mx.model.FeedForward`` instance wrapping the truncated symbol.
    """
    pretrained = mx.model.FeedForward.load(
        'model/resnet-50', 0, ctx=mx.cpu(), numpy_batch_size=1)

    # Cut the graph at the flatten layer so predictions are its activations.
    internals = pretrained.symbol.get_internals()
    flatten_symbol = internals["flatten0_output"]

    # allow_extra_params=True: presumably needed because the loaded parameters
    # include layers past the cut point -- TODO confirm.
    return mx.model.FeedForward(
        ctx=mx.cpu(),
        symbol=flatten_symbol,
        numpy_batch_size=64,
        arg_params=pretrained.arg_params,
        aux_params=pretrained.aux_params,
        allow_extra_params=True,
    )


def get_3d_data(path):
    """Load every file in ``path`` as a DICOM slice and stack the pixel arrays.

    Args:
        path: Directory whose entries are all read with ``dicom.read_file``.

    Returns:
        ``np.stack`` of each slice's ``pixel_array``, ordered by the integer
        value of the slice's ``InstanceNumber``.
    """
    scans = []
    for filename in os.listdir(path):
        scans.append(dicom.read_file(path + '/' + filename))

    # Directory listing order is arbitrary; order slices by InstanceNumber.
    ordered = sorted(scans, key=lambda scan: int(scan.InstanceNumber))
    pixel_planes = [scan.pixel_array for scan in ordered]
    return np.stack(pixel_planes)


def get_data_id(path):
    """Turn one scan folder into a batch of 3-channel 224x224 images.

    The volume returned by ``get_3d_data(path)`` is walked in non-overlapping
    groups of three consecutive slices; each group becomes one 3-channel
    image. Every slice is rescaled so its maximum maps to 255, cast to
    ``uint8``, histogram-equalised, cropped by ``dx`` pixels on each side of
    a ``ds`` x ``ds`` frame, and resized to 224x224.

    Args:
        path: Directory passed straight to ``get_3d_data``.

    Returns:
        ``np.ndarray`` built from the list of groups, one entry per complete
        triple of slices (each entry holds three 224x224 planes). The array
        is empty when the volume has fewer than three slices.
    """
    sample_image = get_3d_data(path)
    # Values equal to -2000 are zeroed (presumably the scanner's out-of-field
    # padding value -- TODO confirm).
    sample_image[sample_image == -2000] = 0

    dx = 40    # border cropped from every side
    ds = 512   # assumed slice edge length used for the crop window
    group = 3  # slices combined into one multi-channel image

    batch = []
    # Upper bound is n - group + 1 so the last complete triple is included;
    # the previous bound (n - 3) dropped it whenever n was a multiple of 3.
    for start in range(0, sample_image.shape[0] - group + 1, group):
        channels = []
        for img in sample_image[start:start + group]:
            peak = np.amax(img)
            if peak > 0:
                scaled = (255.0 / peak * img).astype(np.uint8)
            else:
                # Blank slice: skip the 255 / 0 division that would yield
                # inf/nan and an undefined uint8 cast.
                scaled = np.zeros(img.shape, dtype=np.uint8)
            scaled = cv2.equalizeHist(scaled)
            scaled = scaled[dx: ds - dx, dx: ds - dx]
            scaled = cv2.resize(scaled, (224, 224))
            channels.append(scaled)

        batch.append(np.array(channels))

    return np.array(batch)


def calc_features():
    """Extract and cache features for every scan folder under ``stage1/``.

    For each directory matched by ``stage1/*``, the image batch from
    ``get_data_id`` is run through the extractor from ``get_extractor`` and
    the result is written with ``np.save(folder, feats)``, which stores it as
    ``<folder>.npy`` alongside the folder. The feature shape is printed for
    each folder.
    """
    net = get_extractor()
    for folder in glob.glob('stage1/*'):
        # The pattern also matches the .npy files written by an earlier run;
        # those are not scan folders and would make os.listdir fail.
        if not os.path.isdir(folder):
            continue
        batch = get_data_id(folder)
        feats = net.predict(batch)
        print(feats.shape)
        np.save(folder, feats)


def train_xgboost():
    """Train an XGBoost regressor on the cached per-patient features.

    Reads ``data/stage1_labels.csv``, loads ``stage1/<id>.npy`` for every row,
    averages each feature array over axis 0 to get one vector per id, and
    fits the model on a stratified 80/20 train/validation split with early
    stopping on validation log-loss.

    Returns:
        The fitted ``xgb.XGBRegressor``.
    """
    df = pd.read_csv('data/stage1_labels.csv')
    print(df.head())

    # One feature vector per id: mean over the first axis of the saved array.
    x = np.array([np.mean(np.load('stage1/%s.npy' % str(patient_id)), axis=0)
                  for patient_id in df['id'].tolist()])
    # .values instead of .as_matrix(): the latter was removed in pandas 1.0.
    y = df['cancer'].values

    trn_x, val_x, trn_y, val_y = cross_validation.train_test_split(x, y, random_state=42, stratify=y,
                                                                   test_size=0.20)

    # NOTE(review): a regressor is evaluated with 'logloss' here; confirm the
    # objective keeps predictions inside [0, 1] as that metric expects.
    clf = xgb.XGBRegressor(max_depth=10,
                           n_estimators=1500,
                           min_child_weight=9,
                           learning_rate=0.05,
                           nthread=8,
                           subsample=0.80,
                           colsample_bytree=0.80,
                           seed=4242)

    clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], verbose=True, eval_metric='logloss', early_stopping_rounds=50)
    return clf


def make_submit():
    """Train the model and write predictions for the sample submission ids.

    Reads ``data/stage1_sample_submission.csv``, loads ``stage1/<id>.npy`` for
    each row and averages it over axis 0, predicts with the model returned by
    ``train_xgboost``, stores the predictions in the ``cancer`` column, and
    writes the frame to ``subm1.csv`` without the index. The first rows of the
    result are printed.
    """
    model = train_xgboost()

    submission = pd.read_csv('data/stage1_sample_submission.csv')

    # Build one averaged feature vector per submission id.
    averaged = []
    for patient_id in submission['id'].tolist():
        per_group_features = np.load('stage1/%s.npy' % str(patient_id))
        averaged.append(np.mean(per_group_features, axis=0))
    features = np.array(averaged)

    submission['cancer'] = model.predict(features)
    submission.to_csv('subm1.csv', index=False)
    print(submission.head())


# Script entry point. Order matters: calc_features() writes the
# stage1/<id>.npy files that make_submit() (via train_xgboost) loads.
if __name__ == '__main__':
    calc_features()
    make_submit()
