In [1]:
from utils import *
from path import *
%matplotlib inline
from utils.imports import *

Using TensorFlow backend.


In [2]:
# usage: python classify_nodes.py nodes.npy 

import numpy as np
import pickle
import scipy as sp

from sklearn import cross_validation
from sklearn.cross_validation import StratifiedKFold as KFold
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier as RF
import xgboost as xgb

In [3]:
def get_dirfiles(dir):
    file_list = []
    subset_path = os.listdir(dir)
    for _ in range(len(subset_path)):
        if subset_path[_] != '.DS_Store':
            file_list.append(dir + subset_path[_])
    return file_list

def getRegionFromMap(slice_npy):
    #thr = np.where(slice_npy > np.mean(slice_npy),0.,1.0)
    label_image = measure.label(slice_npy)
    labels = label_image.astype(int)
    regions = measure.regionprops(labels)
    return regions

def getRegionMetricRow(fname,model):
    # fname, numpy array of dimension [#slices, 1, 512, 512] containing the images
    
    seg = np.load(fname)
    nslices = seg.shape[0]

    pred = np.expand_dims(seg,0)
    pred = np.expand_dims(pred,0)  
    cls = model.predict(pred)[0][1]
    
    #metrics
    totalArea = 0.
    avgArea = 0.
    maxArea = 0.
    avgEcc = 0.
    avgEquivlentDiameter = 0.
    stdEquivlentDiameter = 0.
    weightedX = 0.
    weightedY = 0.
    numNodes = 0.
    numNodesperSlice = 0.
    # crude hueristic to filter some bad segmentaitons
    # do not allow any nodes to be larger than 10% of the pixels to eliminate background regions
    maxAllowedArea = 0.10 * 512 * 512 
       
    
    areas = []
    eqDiameters = []
    seg = np.expand_dims(seg,1)
    for slicen in range(nslices):
        regions = getRegionFromMap(seg[slicen,0,:,:])
        for region in regions:
            if region.area > maxAllowedArea:
                continue
            totalArea += region.area
            areas.append(region.area)
            avgEcc += region.eccentricity
            avgEquivlentDiameter += region.equivalent_diameter
            eqDiameters.append(region.equivalent_diameter)
            weightedX += region.centroid[0]*region.area
            weightedY += region.centroid[1]*region.area
            numNodes += 1
    if totalArea == 0 or numNodes == 0:
        os.remove(fname)
    else:
        weightedX = weightedX / totalArea 
        weightedY = weightedY / totalArea
        avgArea = totalArea / numNodes
        avgEcc = avgEcc / numNodes
        avgEquivlentDiameter = avgEquivlentDiameter / numNodes
        stdEquivlentDiameter = np.std(eqDiameters)

        maxArea = max(areas)


        numNodesperSlice = numNodes*1. / nslices
    return np.array([avgArea,maxArea,avgEcc,avgEquivlentDiameter,\
                         stdEquivlentDiameter, weightedX, weightedY, numNodes, numNodesperSlice, cls])

def createFeatureDataset(nodfiles_true,nodfiles_false,model):
    numfeatures = 10
    feature_array = np.zeros((len(nodfiles_true)+len(nodfiles_false),numfeatures))
    truth_metric = np.zeros((len(nodfiles_true)+len(nodfiles_false)))
    
    for i,nodfile in enumerate(tqdm(nodfiles_true)):
        patID = nodfile.split("_")[3][-10:]
        truth_metric[i] = 1
        feature_array[i] = getRegionMetricRow(nodfiles_true[i],model)
    for i,nodfile in enumerate(tqdm(nodfiles_false)):
        patID = nodfile.split("_")[3][-10:]
        truth_metric[len(nodfiles_true)+i] = 0
        feature_array[len(nodfiles_true)+i] = getRegionMetricRow(nodfiles_false[i],model)   
    np.save("dataY.npy", truth_metric)
    np.save("dataX.npy", feature_array)


def logloss(act, pred):
    epsilon = 1e-15
    pred = sp.maximum(epsilon, pred)
    pred = sp.minimum(1-epsilon, pred)
    ll = sum(act*sp.log(pred) + sp.subtract(1,act)*sp.log(sp.subtract(1,pred)))
    ll = ll * -1.0/len(act)
    return ll



    
def findpara():
    X = np.load("dataX.npy")
    Y = np.load("dataY.npy")
    # try XGBoost
    print ("XGBoost")
    kf = KFold(Y, n_folds=3)
    y_pred = Y * 0

    xgb_model = xgb.XGBClassifier()
    parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
                      'objective':['binary:logistic'],
                      'learning_rate': [0.1,0.5,0.001,0.005,0.0001], #so called `eta` value
                      'max_depth': [2,4,6,8,12,14],
                      'min_child_weight': [3,4,5,6,7],
                      'silent': [1],
                      'subsample': [0.8,0.9,1],
                      'n_estimators': [2000], #number of trees, change it to 1000 for better results
                      'seed': [1337]}


    clf = GridSearchCV(xgb_model, parameters, n_jobs=5, 
                           cv=StratifiedKFold(y_train, n_folds=5, shuffle=True), 
                           scoring='roc_auc',
                           verbose=1, refit=True)

    clf.fit(X_train, y_train)


    best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
    print('Raw AUC score:', score)
    for param_name in sorted(best_parameters.keys()):
        print("%s: %r" % (param_name, best_parameters[param_name]))
    return

def classifyData():
    X = np.load("dataX.npy")
    Y = np.load("dataY.npy")
    # try XGBoost
    print ("XGBoost")
    kf = KFold(Y, n_folds=3)
    y_pred = Y * 0
    for train, test in kf:
        X_train, X_test, y_train, y_test = X[train,:], X[test,:], Y[train], Y[test]
        clf = xgb.XGBClassifier(nthread=4, #when use hyperthread, xgboost may become slower
                      objective='binary:logistic',)
        clf.fit(X_train, y_train)
        y_pred[test] = clf.predict(X_test)
         

    print classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"])
    print("logloss",logloss(Y, y_pred))   
    return

def classifymodel():
    X = np.load("dataX.npy")
    Y = np.load("dataY.npy")

    clf = xgb.XGBClassifier(nthread=4, objective='binary:logistic',)
    clf.fit(X, Y)
        

    joblib.dump(clf,'classifymodel.pkl')
    clf=joblib.load('classifymodel.pkl')
    
    return




# 开始

In [4]:
file_list_true=glob.glob(PATH['cls_train_cube_30_true']+"*.npy")[0:42000]
file_list_false=glob.glob(PATH['cls_train_cube_30_false']+"*.npy")[0:42000]

In [5]:
model_paths = PATH['model_paths']
model_final = PATH['model_final']
model_cube_30 = load_model(model_paths + 'Fenge_32_32_32_0703.h5')

In [6]:
#model_cube_30 = []
createFeatureDataset(file_list_true,file_list_false,model_cube_30) #第一遍可能会报错，不用管再跑一遍即可

100%|██████████| 42000/42000 [14:53<00:00, 47.01it/s]  
100%|██████████| 42000/42000 [11:33<00:00, 60.59it/s]


In [7]:
classifymodel()

In [8]:
classifyData()

XGBoost
             precision    recall  f1-score   support

  No Cancer       0.99      0.99      0.99     42000
     Cancer       0.99      0.99      0.99     42000

avg / total       0.99      0.99      0.99     84000

('logloss', 0.21833700661662117)
