In [2]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Define helper functions that describe the extracted features
def describeData(X,y):
    """Print a short summary of a feature set and its binary labels.

    X: sequence of per-image feature arrays; only ``len(X)`` and
       ``X[0].shape`` are read.
    y: label array compared element-wise against 0 (benign) and 1 (malignant).

    Prints the image count, the two class counts, the mean of ``y`` as a
    percentage, the shape of the first entry of ``X``, and a trailing blank
    line. Returns None.
    """
    # Each value is evaluated inside its own print so the lines appear one by one.
    print(f'Total number of images: {len(X)}')
    print(f'Number of Benign Images: {np.sum(y == 0)}')
    print(f'Number of Malignant Images: {np.sum(y == 1)}')
    print(f'Percentage of positive images: {100 * np.mean(y):.2f}%')
    print(f'Image shape (Samples, Rows, Columns, Features): {X[0].shape}')
    print()

def feature_dimension(train_or_test, model, factor):
    """Load one saved feature set from disk and print its summary.

    Parameters
    ----------
    train_or_test : str
        Name of the split sub-directory (the notebook passes ``'train'``).
    model : str
        Feature-extractor name; selects the ``features_<model>`` directory.
    factor : int or str
        Magnification factor; converted with ``str`` when building the path.

    Reads ``X.npy`` and ``y.npy`` from
    ``../data/features_<model>/<train_or_test>/<factor>/``, prints a heading
    and hands both arrays to ``describeData``. Returns None.
    """
    # Build the directory once instead of repeating the concatenation per file.
    feature_dir = '../data/features_' + model + '/' + train_or_test + '/' + str(factor) + '/'
    X = np.load(feature_dir + 'X.npy')
    y = np.load(feature_dir + 'y.npy')
    print('Description of ' + str(factor) + 'x images and features extracted by ' + model)
    describeData(X, y)

In [4]:
# Check feature dimension extracted by Xception
# Same four magnifications, in the same order, for the Xception features.
for magnification in (40, 100, 200, 400):
    feature_dimension('train', 'xception', magnification)

Discription of 40x images and features extracted by xception
Total number of images: 1367
Number of Benign Images: 428
Number of Malignant Images: 939
Percentage of positive images: 68.69%
Image shape (Samples, Rows, Columns, Features): (1, 10, 10, 2048)

Discription of 100x images and features extracted by xception
Total number of images: 1449
Number of Benign Images: 439
Number of Malignant Images: 1010
Percentage of positive images: 69.70%
Image shape (Samples, Rows, Columns, Features): (1, 10, 10, 2048)

Discription of 200x images and features extracted by xception
Total number of images: 1383
Number of Benign Images: 412
Number of Malignant Images: 971
Percentage of positive images: 70.21%
Image shape (Samples, Rows, Columns, Features): (1, 10, 10, 2048)

Discription of 400x images and features extracted by xception
Total number of images: 1274
Number of Benign Images: 410
Number of Malignant Images: 864
Percentage of positive images: 67.82%
Image shape (Samples, Rows, Columns, Features): (1, 10, 10, 2048)

In [5]:
# Check feature dimension extracted by VGG16
# Same four magnifications, in the same order, for the VGG16 features.
for magnification in (40, 100, 200, 400):
    feature_dimension('train', 'vgg16', magnification)

Discription of 40x images and features extracted by vgg16
Total number of images: 1367
Number of Benign Images: 428
Number of Malignant Images: 939
Percentage of positive images: 68.69%
Image shape (Samples, Rows, Columns, Features): (1, 7, 7, 512)

Discription of 100x images and features extracted by vgg16
Total number of images: 1449
Number of Benign Images: 439
Number of Malignant Images: 1010
Percentage of positive images: 69.70%
Image shape (Samples, Rows, Columns, Features): (1, 7, 7, 512)

Discription of 200x images and features extracted by vgg16
Total number of images: 1383
Number of Benign Images: 412
Number of Malignant Images: 971
Percentage of positive images: 70.21%
Image shape (Samples, Rows, Columns, Features): (1, 7, 7, 512)

Discription of 400x images and features extracted by vgg16
Total number of images: 1274
Number of Benign Images: 410
Number of Malignant Images: 864
Percentage of positive images: 67.82%
Image shape (Samples, Rows, Columns, Features): (1, 7, 7, 512)

In [6]:
# Check feature dimension extracted by VGG19
# Same four magnifications, in the same order, for the VGG19 features.
for magnification in (40, 100, 200, 400):
    feature_dimension('train', 'vgg19', magnification)

Discription of 40x images and features extracted by vgg19
Total number of images: 1367
Number of Benign Images: 428
Number of Malignant Images: 939
Percentage of positive images: 68.69%
Image shape (Samples, Rows, Columns, Features): (1, 14, 14, 512)

Discription of 100x images and features extracted by vgg19
Total number of images: 1449
Number of Benign Images: 439
Number of Malignant Images: 1010
Percentage of positive images: 69.70%
Image shape (Samples, Rows, Columns, Features): (1, 14, 14, 512)

Discription of 200x images and features extracted by vgg19
Total number of images: 1383
Number of Benign Images: 412
Number of Malignant Images: 971
Percentage of positive images: 70.21%
Image shape (Samples, Rows, Columns, Features): (1, 14, 14, 512)

Discription of 400x images and features extracted by vgg19
Total number of images: 1274
Number of Benign Images: 410
Number of Malignant Images: 864
Percentage of positive images: 67.82%
Image shape (Samples, Rows, Columns, Features): (1, 14, 14, 512)

In [17]:
# Use logistic regression as the classifier
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Display names handed to classification_report as target_names.
labels = ["Benign", "Malignant"]

def runLogisticRegression(a,b,c,d):
    """Fit a logistic-regression classifier and print how it scores on a second data set.

    a, b: feature matrix and labels the classifier is fitted on.
    c, d: feature matrix and true labels the predictions are scored against.

    Prints the classification report (class names come from the module-level
    ``labels`` list), then the F1 score, then a blank line. Returns None.
    """
    # fit() returns the estimator itself, so construction and fitting can be chained.
    classifier = LogisticRegression(solver='liblinear').fit(a, b)
    predicted = classifier.predict(c)

    report = classification_report(d, predicted, target_names=labels)
    print(report)

    score = f1_score(d, predicted)
    print("F1 Score: {}".format(score))
    print('')
    

def _load_flat_features(model, split, magnification_factor):
    """Load one split's X.npy / y.npy and flatten every sample to a 1-D vector."""
    split_dir = '../data/features_' + model + '/' + split + '/' + str(magnification_factor) + '/'
    X = np.load(split_dir + 'X.npy')
    y = np.load(split_dir + 'y.npy')
    # Collapse all trailing axes, whatever the rank of the saved array is
    # (the previous code multiplied shape[1]..shape[4] and so required 5-D input).
    n_features = int(np.prod(X.shape[1:]))
    return X.reshape(X.shape[0], n_features), y


def Model_Performance(model, magnification_factor):
    """Evaluate logistic regression on the saved features of one extractor and magnification.

    Parameters
    ----------
    model : str
        Feature-extractor name; selects the ``../data/features_<model>`` directory.
    magnification_factor : str or int
        Magnification sub-directory name; converted with ``str`` so ints are
        accepted as well, as in ``feature_dimension``.

    Loads the ``Train`` and ``Test`` splits, flattens each sample to one
    dimension for compatibility with standard classifiers, prints a heading
    and delegates fitting and reporting to ``runLogisticRegression``.
    Returns None.
    """
    X_trainFlat, y_train = _load_flat_features(model, 'Train', magnification_factor)
    X_testFlat, y_test = _load_flat_features(model, 'Test', magnification_factor)

    print('Performance of pre-trained CNN model ' + model + ' for images at ' + str(magnification_factor) + 'x:')
    runLogisticRegression(X_trainFlat, y_train, X_testFlat, y_test)

magnification_factors = ['40', '100', '200', '400']
# For every magnification, evaluate the three extractors in a fixed order.
for factor in magnification_factors:
    for extractor in ('xception', 'vgg16', 'vgg19'):
        Model_Performance(extractor, factor)

Performance of pre-trained CNN model xception for images at 40x:
              precision    recall  f1-score   support

      Benign       0.65      0.65      0.65       197
   Malignant       0.84      0.84      0.84       431

   micro avg       0.78      0.78      0.78       628
   macro avg       0.74      0.75      0.74       628
weighted avg       0.78      0.78      0.78       628

F1 Score: 0.8381839348079162

Performance of pre-trained CNN model vgg16 for images at 40x:
              precision    recall  f1-score   support

      Benign       0.57      0.56      0.56       197
   Malignant       0.80      0.81      0.80       431

   micro avg       0.73      0.73      0.73       628
   macro avg       0.68      0.68      0.68       628
weighted avg       0.73      0.73      0.73       628

F1 Score: 0.8023121387283237

Performance of pre-trained CNN model vgg19 for images at 40x:
              precision    recall  f1-score   support

      Benign       0.54      0.47      0.5

In [None]:
# Compare Performance of Different Classification Algorithms
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Candidate classifiers as (abbreviation, estimator) pairs, each built with default settings.
classifiers = [
    ('LR', LogisticRegression()),
    ('RF', RandomForestClassifier()),
    ('KNN', KNeighborsClassifier()),
    ('SVM', SVC()),
    ('LSVM', LinearSVC()),
    ('GNB', GaussianNB()),
    ('DTC', DecisionTreeClassifier()),
]
# Currently left out of the comparison:
#   ('GBC', GradientBoostingClassifier()), ('LDA', LinearDiscriminantAnalysis())

def defineModels():
    """Print a legend mapping each classifier abbreviation to its full name.

    The legend is framed by one blank line before and one after
    (e.g. ``LR = LogisticRegression``). Returns None.
    """
    legend = (
        '',
        'LR = LogisticRegression',
        'RF = RandomForestClassifier',
        'KNN = KNeighborsClassifier',
        'SVM = Support Vector Machine SVC',
        'LSVM = LinearSVC',
        'GNB = GaussianNB',
        'DTC = DecisionTreeClassifier',
        # 'GBC = GradientBoostingClassifier' and 'LDA = LinearDiscriminantAnalysis'
        # are not listed while those models stay disabled.
        '',
    )
    for line in legend:
        print(line)
defineModels()
    
def compareABunchOfDifferentModelsAccuracy(a,b,c,d):
    """
    Cross-validate every estimator in the module-level ``classifiers`` list and plot the scores.

    For each (name, estimator) pair, 10-fold cross-validated accuracy is
    computed on (c, d); the mean and standard deviation are printed, and a box
    plot of the per-fold scores is shown at the end.

    a, b: training features / labels. Accepted for backward compatibility but
          not used: cross_val_score fits fresh clones of the estimator on each
          fold, so the earlier ``model.fit(a, b)`` call never influenced the
          reported scores and only cost training time.
    c, d: features / labels the cross-validation is run on.

    NOTE(review): the scores therefore describe cross-validation on (c, d)
    only -- confirm that this, rather than train-then-test accuracy, is the
    comparison that is wanted.

    http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html#sklearn.metrics.accuracy_score
    http://scikit-learn.org/stable/modules/model_evaluation.html#accuracy-score
    """
    print('')
    print('Compare Multiple Classifiers:')
    print('')
    print('K-Fold Cross-Validation Accuracy:')
    print('')
    resultsAccuracy = []
    names = []
    # The splitter is identical for every classifier, so build it once.
    kfold = model_selection.KFold(n_splits=10)
    for name, model in classifiers:
        accuracy_results = model_selection.cross_val_score(model, c, d, cv=kfold, scoring='accuracy')
        resultsAccuracy.append(accuracy_results)
        names.append(name)
        accuracyMessage = "%s: %f (%f)" % (name, accuracy_results.mean(), accuracy_results.std())
        print(accuracyMessage)
    # boxplot algorithm comparison
    fig = plt.figure()
    fig.suptitle('Algorithm Comparison: Accuracy')
    ax = fig.add_subplot(111)
    plt.boxplot(resultsAccuracy)
    ax.set_xticklabels(names)
    ax.set_ylabel('Cross-Validation: Accuracy Score')
    plt.show()
# The flattened arrays built inside Model_Performance are locals of that function,
# so they have to be rebuilt at notebook scope before the comparison can run
# (otherwise this cell raises NameError on a fresh kernel).
# NOTE(review): the notebook does not say which extractor / magnification the
# comparison should use; xception at 40x is assumed here -- adjust as needed.
comparison_dir = '../data/features_xception/'
X_train = np.load(comparison_dir + 'Train/40/X.npy')
X_trainFlat = X_train.reshape(X_train.shape[0], -1)  # one flat feature vector per image
Y_train = np.load(comparison_dir + 'Train/40/y.npy')
X_test = np.load(comparison_dir + 'Test/40/X.npy')
X_testFlat = X_test.reshape(X_test.shape[0], -1)
Y_test = np.load(comparison_dir + 'Test/40/y.npy')
compareABunchOfDifferentModelsAccuracy(X_trainFlat, Y_train, X_testFlat, Y_test)

In [None]:

# NOTE(review): scratch cell. X, x, GlobalAveragePooling2D and Dense are neither
# defined nor imported anywhere in the visible notebook, so this raises NameError
# as written. The layer names look like Keras layers -- TODO confirm and import them.
# NOTE(review): the loop index i is never used, and each iteration applies a new set
# of layers to the x left over from the previous iteration; confirm whether a loop
# is intended here at all.
for i in range(len(X)):
    x=GlobalAveragePooling2D()(x)
    x=Dense(1024,activation='relu')(x) #we add dense layers so that the model can learn more complex functions and classify for better results.
    x=Dense(1024,activation='relu')(x) #dense layer 2
    x=Dense(512,activation='relu')(x) #dense layer 3
    preds=Dense(2,activation='softmax')(x) #final layer with softmax activation

def classification_training(train_or_test, feature_extractor, factor):
    X = np.load('../data/features_' + model + '/'+ train_or_test + '/' + str(factor)+'/X.npy')
    y = np.load('../data/features_' + model + '/'+ train_or_test + '/' + str(factor)+'/y.npy')