In [1]:
from utils.imports import *

Using TensorFlow backend.


In [2]:
def getRegionFromMap(slice_npy):
    #thr = np.where(slice_npy > np.mean(slice_npy),0.,1.0)
    label_image = measure.label(slice_npy)
    labels = label_image.astype(int)
    regions = measure.regionprops(labels)
    return regions

def getRegionMetricRow(fname):
    # fname, numpy array of dimension [#slices, 1, 512, 512] containing the images
    
    seg = np.load(fname)
    #seg = cv2.imread(fname,cv2.IMREAD_GRAYSCALE).astype('float32')
    seg = np.expand_dims(seg,1)
    
    nslices = seg.shape[0]
    
    #metrics
    totalArea = 0.
    avgArea = 0.
    maxArea = 0.
    avgEcc = 0.
    avgEquivlentDiameter = 0.
    stdEquivlentDiameter = 0.
    weightedX = 0.
    weightedY = 0.
    numNodes = 0.
    numNodesperSlice = 0.
    # crude hueristic to filter some bad segmentaitons
    # do not allow any nodes to be larger than 10% of the pixels to eliminate background regions
    maxAllowedArea = 0.10 * 512 * 512 
    
    areas = [0]
    eqDiameters = []

    for slicen in range(nslices):
        regions = getRegionFromMap(seg[slicen,0,:,:])
        for region in regions:
            if region.area > maxAllowedArea:
                continue
            totalArea += region.area
            areas.append(region.area)
            avgEcc += region.eccentricity
            avgEquivlentDiameter += region.equivalent_diameter
            eqDiameters.append(region.equivalent_diameter)
            weightedX += region.centroid[0]*region.area
            weightedY += region.centroid[1]*region.area
            numNodes += 1
    if totalArea == 0 or numNodes == 0:
        weightedX = 0
        weightedY = 0
        avgArea = 0
        avgEcc = 0
        avgEquivlentDiameter = 0
        stdEquivlentDiameter = np.std(eqDiameters)       

    else:                
        weightedX = weightedX / totalArea  
        weightedY = weightedY / totalArea 
        avgArea = totalArea / numNodes
        avgEcc = avgEcc / numNodes
        avgEquivlentDiameter = avgEquivlentDiameter / numNodes
        stdEquivlentDiameter = np.std(eqDiameters)
    
    maxArea = max(areas)
    
    
    numNodesperSlice = numNodes*1. / nslices
    
    result = np.array([avgArea,maxArea,avgEcc,avgEquivlentDiameter,\
                     stdEquivlentDiameter, weightedX, weightedY, numNodes, numNodesperSlice])
    result = np.nan_to_num(result)
    return 

def createFeatureDataset(nodfiles_true,nodfiles_false):
    numfeatures = 9
    feature_array = np.zeros((len(nodfiles_true)+len(nodfiles_false),numfeatures))
    truth_metric = np.zeros((len(nodfiles_true)+len(nodfiles_false)))
    
    for i,nodfile in enumerate(tqdm(nodfiles_true)):
        patID = nodfile.split("_")[2]
        truth_metric[i] = 1
        feature_array[i] = getRegionMetricRow(nodfiles_true[i])

    for i,nodfile in enumerate(tqdm(nodfiles_false)):
        patID = nodfile.split("_")[2]
        truth_metric[len(nodfiles_true)+i] = 0
        feature_array[len(nodfiles_true)+i] = getRegionMetricRow(nodfiles_false[i])
    
    np.save("dataY.npy", truth_metric)
    np.save("dataX.npy", feature_array)


def logloss(act, pred):
    epsilon = 1e-15
    pred = sp.maximum(epsilon, pred)
    pred = sp.minimum(1-epsilon, pred)
    ll = sum(act*sp.log(pred) + sp.subtract(1,act)*sp.log(sp.subtract(1,pred)))
    ll = ll * -1.0/len(act)
    return ll


def classifyData():
    X = np.load("dataX.npy")
    Y = np.load("dataY.npy")
    
    
    kf = KFold(Y, n_folds=3)
    y_pred = Y * 0
    for train, test in kf:
        X_train, X_test, y_train, y_test = X[train,:], X[test,:], Y[train], Y[test]
        clf = RF(n_estimators=100, n_jobs=3)
        clf.fit(X_train, y_train)
        y_pred[test] = clf.predict(X_test)
    print classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"])
    print("logloss",logloss(Y, y_pred))

    # All Cancer
    print "Predicting all positive"
    y_pred = np.ones(Y.shape)
    print classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"])
    print("logloss",logloss(Y, y_pred))

    # No Cancer
    print "Predicting all negative"
    y_pred = Y*0
    print classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"])
    print("logloss",logloss(Y, y_pred))

    # try XGBoost
    print ("XGBoost")
    kf = KFold(Y, n_folds=3)
    y_pred = Y * 0
    for train, test in kf:
        X_train, X_test, y_train, y_test = X[train,:], X[test,:], Y[train], Y[test]
        clf = xgb.XGBClassifier(objective="binary:logistic")
        clf.fit(X_train, y_train)
        y_pred[test] = clf.predict(X_test)
    print classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"])
    print("logloss",logloss(Y, y_pred))

# 带mask的npy！！

In [3]:
true_path = PATH['cls_train_20_true']
false_path = PATH['cls_train_20_false']

file_list_true=glob.glob(true_path + "*_3d_20_6_i.npy")
file_list_false=glob.glob(false_path + "*_3d_20_6_i.npy")

In [4]:
print len(file_list_true)
print len(file_list_false)

10953
12532


In [5]:
createFeatureDataset(file_list_true,file_list_false)

100%|██████████| 10953/10953 [00:26<00:00, 415.65it/s]
100%|██████████| 12532/12532 [00:43<00:00, 291.30it/s]


In [6]:
classifyData()

ValueError: Cannot have number of folds n_folds=3 greater than the number of samples: 1.

In [11]:
    X = np.load("dataX.npy")
    Y = np.load("dataY.npy")
    