# Breast cancer

## Import dependencies

In [55]:
import datetime
import glob
import os
import time
from collections import deque
from sklearn.utils import shuffle

from IPython.core.display_functions import display
from progressbar import ProgressBar

import librosa
import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the imported libraries - confirm that is intended.
import warnings

warnings.filterwarnings('ignore')

# Seed passed as `random_state` to the classifiers, the shuffle and the train/test split below
matrikelNumber = 11776836
# When True, the CSV files are read from the Kaggle input directory instead of ./data
isKaggle = False

## Classifiers

### KNN

In [56]:
def kNearestNeighbours(datasetName, kNeighbours, XTrain, XTest, yTrain, yTest):
    """Train and evaluate one kd-tree kNN classifier per neighbour count.

    Each classifier is fitted on the training split, used to predict the test
    split, and its timings and scores are printed as a side effect.

    Args:
        datasetName: Label printed before each run and stored in the result.
        kNeighbours: Iterable of values passed as `n_neighbors`.
        XTrain: Training features.
        XTest: Test features.
        yTrain: Training targets (Series, array or list); flattened to 1-D.
        yTest: Test targets the predictions are scored against.

    Returns:
        A list with one dict per value in `kNeighbours` (empty if none were
        given). 'accuracyScore' and 'f1Score' hold the raw scores, while
        'trainTime' and 'predictionTime' hold strings formatted as seconds.
    """
    # np.ravel works for Series, arrays and lists alike, whereas
    # Series.ravel() is deprecated since pandas 2.2.
    trainTargets = np.ravel(yTrain)
    results = []

    for k in kNeighbours:
        print(datasetName)
        print('kNN with', k, 'neighbours')
        knnClassifier = KNeighborsClassifier(n_neighbors=k, algorithm='kd_tree')

        # Train the classifier; perf_counter is monotonic and high-resolution,
        # which matters for runs that only take a few milliseconds.
        startTime = time.perf_counter()
        knnClassifier.fit(XTrain, trainTargets)
        trainTime = time.perf_counter() - startTime

        # Predict
        startTime = time.perf_counter()
        predicted = knnClassifier.predict(XTest)
        predictionTime = time.perf_counter() - startTime

        # Effectiveness measurement (F1 is averaged weighted by class support)
        accuracyScore = accuracy_score(yTest, predicted)
        f1Score = f1_score(yTest, predicted, average='weighted')

        formattedTrainTime = f"{trainTime:.3f}s"
        formattedPredictionTime = f"{predictionTime:.3f}s"
        formattedAccuracyScore = f"{accuracyScore * 100:.3f}%"
        formattedF1Score = f"{f1Score * 100:.3f}%"

        print('Training time:', formattedTrainTime)
        print('Testing time:', formattedPredictionTime)
        print()

        print('Accuracy:', formattedAccuracyScore)
        print('F1 score:', formattedF1Score)
        print('------------------------------------')

        results.append({
            'datasetName': datasetName,
            'algorithmName': 'kNN with "' + str(k) + '" neighbours',
            'neighbours': k,
            'accuracyScore': accuracyScore,
            'f1Score': f1Score,
            'trainTime': formattedTrainTime,
            'predictionTime': formattedPredictionTime,
            'yTestPredicted': predicted,
            'XTrain': XTrain,
            'XTest': XTest,
            'yTrain': yTrain,
            'yTest': yTest,
            'classifier': knnClassifier,
        })

    return results

### Decision Tree

In [57]:
def decisionTree(datasetName, maxFeatureValues, XTrain, XTest, yTrain, yTest):
    """Train and evaluate one decision tree per `max_features` setting.

    Each tree is seeded with the module-level `matrikelNumber`, fitted on the
    training split and used to predict the test split. Timings and scores are
    printed as a side effect.

    Args:
        datasetName: Label printed before each run and stored in the result.
        maxFeatureValues: Iterable of values passed as `max_features`.
        XTrain: Training features.
        XTest: Test features.
        yTrain: Training targets (Series, array or list); flattened to 1-D.
        yTest: Test targets the predictions are scored against.

    Returns:
        A list with one dict per value in `maxFeatureValues` (empty if none
        were given). 'accuracyScore' and 'f1Score' hold the raw scores, while
        'trainTime' and 'predictionTime' hold strings formatted as seconds.
    """
    # np.ravel works for Series, arrays and lists alike, whereas
    # Series.ravel() is deprecated since pandas 2.2.
    trainTargets = np.ravel(yTrain)
    results = []

    for maxFeatures in maxFeatureValues:
        print(datasetName)
        print('Decision Tree with max features', maxFeatures)

        decisionTreeClassifier = DecisionTreeClassifier(max_features=maxFeatures, random_state=matrikelNumber)

        # Train the classifier; perf_counter is monotonic and high-resolution,
        # which matters for runs that only take a few milliseconds.
        startTime = time.perf_counter()
        decisionTreeClassifier.fit(XTrain, trainTargets)
        trainTime = time.perf_counter() - startTime

        # Predict
        startTime = time.perf_counter()
        predicted = decisionTreeClassifier.predict(XTest)
        predictionTime = time.perf_counter() - startTime

        # Effectiveness measurement (F1 is averaged weighted by class support)
        accuracyScore = accuracy_score(yTest, predicted)
        f1Score = f1_score(yTest, predicted, average='weighted')

        formattedTrainTime = f"{trainTime:.3f}s"
        formattedPredictionTime = f"{predictionTime:.3f}s"
        formattedAccuracyScore = f"{accuracyScore * 100:.3f}%"
        formattedF1Score = f"{f1Score * 100:.3f}%"

        print('Training time:', formattedTrainTime)
        print('Testing time:', formattedPredictionTime)
        print()

        print('Accuracy:', formattedAccuracyScore)
        print('F1 score:', formattedF1Score)
        print('------------------------------------')

        results.append({
            'datasetName': datasetName,
            'algorithmName': 'Decision Tree with "' + str(maxFeatures) + '" max features',
            'maxFeatures': maxFeatures,
            'accuracyScore': accuracyScore,
            'f1Score': f1Score,
            'trainTime': formattedTrainTime,
            'predictionTime': formattedPredictionTime,
            'yTestPredicted': predicted,
            'XTrain': XTrain,
            'XTest': XTest,
            'yTrain': yTrain,
            'yTest': yTest,
            'classifier': decisionTreeClassifier,
        })

    return results

### Support Vector Machine

In [58]:
def supportVectorMachine(datasetName, XTrain, XTest, yTrain, yTest):
    """Train and evaluate a standard-scaled SVC pipeline.

    The pipeline (StandardScaler followed by an SVC seeded with the
    module-level `matrikelNumber`) is fitted on the training split and used to
    predict the test split. Timings and scores are printed as a side effect.

    Args:
        datasetName: Label printed before the run and stored in the result.
        XTrain: Training features.
        XTest: Test features.
        yTrain: Training targets (Series, array or list); flattened to 1-D.
        yTest: Test targets the predictions are scored against.

    Returns:
        A single-element list holding the result dict, so the return shape
        matches the other classifier helpers. 'accuracyScore' and 'f1Score'
        hold the raw scores, while 'trainTime' and 'predictionTime' hold
        strings formatted as seconds.
    """
    print(datasetName)
    print('Support Vector Machine')

    svmClassifier = make_pipeline(StandardScaler(), SVC(random_state=matrikelNumber))

    # Train the classifier; perf_counter is monotonic and high-resolution,
    # which matters for runs that only take a few milliseconds.
    # np.ravel works for Series, arrays and lists alike, whereas
    # Series.ravel() is deprecated since pandas 2.2.
    trainTargets = np.ravel(yTrain)
    startTime = time.perf_counter()
    svmClassifier.fit(XTrain, trainTargets)
    trainTime = time.perf_counter() - startTime

    # Predict
    startTime = time.perf_counter()
    predicted = svmClassifier.predict(XTest)
    predictionTime = time.perf_counter() - startTime

    # Effectiveness measurement (F1 is averaged weighted by class support)
    accuracyScore = accuracy_score(yTest, predicted)
    f1Score = f1_score(yTest, predicted, average='weighted')

    formattedTrainTime = f"{trainTime:.3f}s"
    formattedPredictionTime = f"{predictionTime:.3f}s"
    formattedAccuracyScore = f"{accuracyScore * 100:.3f}%"
    formattedF1Score = f"{f1Score * 100:.3f}%"

    print('Training time:', formattedTrainTime)
    print('Testing time:', formattedPredictionTime)
    print()

    print('Accuracy:', formattedAccuracyScore)
    print('F1 score:', formattedF1Score)
    print('------------------------------------')

    result = {
        'datasetName': datasetName,
        'algorithmName': 'SVM',
        'accuracyScore': accuracyScore,
        'f1Score': f1Score,
        'trainTime': formattedTrainTime,
        'predictionTime': formattedPredictionTime,
        'yTestPredicted': predicted,
        'XTrain': XTrain,
        'XTest': XTest,
        'yTrain': yTrain,
        'yTest': yTest,
        'classifier': svmClassifier,
    }

    return [result]

### Random forests

In [59]:
def randomForest(datasetName, numberOfTrees, maxFeatureValues, XTrain, XTest, yTrain, yTest):
    """Train and evaluate a random forest for every tree-count/max-features pair.

    Each forest is seeded with the module-level `matrikelNumber`, fitted on
    the training split and used to predict the test split. Timings and scores
    are printed as a side effect.

    Args:
        datasetName: Label printed before each run and stored in the result.
        numberOfTrees: Iterable of values passed as `n_estimators`.
        maxFeatureValues: Values passed as `max_features`; iterated once per
            tree count, so it must be re-iterable (e.g. a list).
        XTrain: Training features.
        XTest: Test features.
        yTrain: Training targets (Series, array or list); flattened to 1-D.
        yTest: Test targets the predictions are scored against.

    Returns:
        A list with one dict per (tree count, max features) combination, in
        tree-count-major order (empty if either grid is empty).
        'accuracyScore' and 'f1Score' hold the raw scores, while 'trainTime'
        and 'predictionTime' hold strings formatted as seconds.
    """
    # np.ravel works for Series, arrays and lists alike, whereas
    # Series.ravel() is deprecated since pandas 2.2.
    trainTargets = np.ravel(yTrain)
    results = []

    for numberOfTreeElements in numberOfTrees:
        for maxFeatureValue in maxFeatureValues:
            print(datasetName)
            print('Random forest with', numberOfTreeElements, 'trees and', maxFeatureValue, 'max features')

            randomForestClassifier = RandomForestClassifier(
                n_estimators=numberOfTreeElements,
                max_features=maxFeatureValue,
                random_state=matrikelNumber
            )

            # Train the classifier; perf_counter is monotonic and
            # high-resolution, which matters for millisecond-scale runs.
            startTime = time.perf_counter()
            randomForestClassifier.fit(XTrain, trainTargets)
            trainTime = time.perf_counter() - startTime

            # Predict
            startTime = time.perf_counter()
            predicted = randomForestClassifier.predict(XTest)
            predictionTime = time.perf_counter() - startTime

            # Effectiveness measurement (F1 is averaged weighted by class support)
            accuracyScore = accuracy_score(yTest, predicted)
            f1Score = f1_score(yTest, predicted, average='weighted')

            formattedTrainTime = f"{trainTime:.3f}s"
            formattedPredictionTime = f"{predictionTime:.3f}s"
            formattedAccuracyScore = f"{accuracyScore * 100:.3f}%"
            formattedF1Score = f"{f1Score * 100:.3f}%"

            print('Training time:', formattedTrainTime)
            print('Testing time:', formattedPredictionTime)
            print()

            print('Accuracy:', formattedAccuracyScore)
            print('F1 score:', formattedF1Score)
            print('------------------------------------')

            results.append({
                'datasetName': datasetName,
                'algorithmName': 'Random F. ("' + str(numberOfTreeElements) + '" trees, "' + str(
                    maxFeatureValue) + '" max feature)',
                'numberOfTreeElements': numberOfTreeElements,
                'maxFeatureValue': maxFeatureValue,
                'accuracyScore': accuracyScore,
                'f1Score': f1Score,
                'trainTime': formattedTrainTime,
                'predictionTime': formattedPredictionTime,
                'yTestPredicted': predicted,
                'XTrain': XTrain,
                'XTest': XTest,
                'yTrain': yTrain,
                'yTest': yTest,
                'classifier': randomForestClassifier,
            })

    return results

## Load dataset

In [60]:
datasetName = 'Breast Cancer'

# Training set: read from the Kaggle input directory when isKaggle is set, otherwise from ./data
filePathTrainDataset = (
    '/kaggle/input/mse-bb-2-ss2022-mle-breastcancer/breast-cancer-diagnostic.shuf.lrn.csv'
    if isKaggle
    else './data/breast-cancer/breast-cancer-diagnostic.shuf.lrn.csv'
)
breastCancerTrain = pd.read_csv(filePathTrainDataset, sep=',')

# Input features (X) are all columns from 'radiusMean' onwards; the target (y) is the 'class' column
trainDataX = breastCancerTrain.loc[:, 'radiusMean':]
trainDataY = breastCancerTrain['class']

# Test set: same location rule as the training set
filePathTestData = (
    '/kaggle/input/mse-bb-2-ss2022-mle-breastcancer/breast-cancer-diagnostic.shuf.tes.csv'
    if isKaggle
    else './data/breast-cancer/breast-cancer-diagnostic.shuf.tes.csv'
)
breastCancerTest = pd.read_csv(filePathTestData, sep=',')

# Only features are sliced here; no 'class' column is read from the test file
breastCancerTestX = breastCancerTest.loc[:, 'radiusMean':]

## Apply Classifiers

In [61]:
# Fraction of the labelled data held out for evaluation
testSplitSize = 0.33

# Hyper-parameter grids tried for each classifier family
kNNNeighbours = [2, 4, 6]
decisionTreeMaxFeatureValues = [None, 'sqrt', 'log2']
randomForestTrees = [10, 50, 100]
randomForestMaxFeatureValues = ['sqrt', 'log2']

# Collects the result dicts of every classifier run below
breastCancerResults = []

# Shuffle with a fixed seed, then hold out testSplitSize of the rows using the same seed
X, y = shuffle(trainDataX, trainDataY, random_state=matrikelNumber)
trainData, testData, trainLabels, testLabels = train_test_split(
    X, y, test_size=testSplitSize, random_state=matrikelNumber
)

# Every classifier family is trained and scored on the identical split
kNNResults = kNearestNeighbours(
    datasetName, kNNNeighbours, trainData, testData, trainLabels, testLabels
)
breastCancerResults += kNNResults

decisionTreeResults = decisionTree(
    datasetName, decisionTreeMaxFeatureValues, trainData, testData, trainLabels, testLabels
)
breastCancerResults += decisionTreeResults

supportVectorMachineResults = supportVectorMachine(
    datasetName, trainData, testData, trainLabels, testLabels
)
breastCancerResults += supportVectorMachineResults

randomForestResults = randomForest(
    datasetName, randomForestTrees, randomForestMaxFeatureValues, trainData, testData, trainLabels, testLabels
)
breastCancerResults += randomForestResults

Breast Cancer
kNN with 2 neighbours
Training time: 0.002s
Testing time: 0.005s

Accuracy: 90.526%
F1 score: 90.468%
------------------------------------
Breast Cancer
kNN with 4 neighbours
Training time: 0.002s
Testing time: 0.005s

Accuracy: 90.526%
F1 score: 90.581%
------------------------------------
Breast Cancer
kNN with 6 neighbours
Training time: 0.001s
Testing time: 0.004s

Accuracy: 90.526%
F1 score: 90.468%
------------------------------------
Breast Cancer
Decision Tree with max features None
Training time: 0.003s
Testing time: 0.001s

Accuracy: 96.842%
F1 score: 96.860%
------------------------------------
Breast Cancer
Decision Tree with max features sqrt
Training time: 0.002s
Testing time: 0.001s

Accuracy: 89.474%
F1 score: 89.852%
------------------------------------
Breast Cancer
Decision Tree with max features log2
Training time: 0.001s
Testing time: 0.001s

Accuracy: 91.579%
F1 score: 91.754%
------------------------------------
Breast Cancer
Support Vector Machine


## Display results

In [62]:
# Tabulate the summary fields of every run; the remaining result keys are left out of the table
breastCancerDf = pd.DataFrame(
    data=breastCancerResults,
    columns=['algorithmName', 'accuracyScore', 'f1Score', 'trainTime', 'predictionTime'],
)
display(breastCancerDf)

Unnamed: 0,algorithmName,accuracyScore,f1Score,trainTime,predictionTime
0,"kNN with ""2"" neighbours",0.905263,0.904675,0.002s,0.005s
1,"kNN with ""4"" neighbours",0.905263,0.905811,0.002s,0.005s
2,"kNN with ""6"" neighbours",0.905263,0.904675,0.001s,0.004s
3,"Decision Tree with ""None"" max features",0.968421,0.968604,0.003s,0.001s
4,"Decision Tree with ""sqrt"" max features",0.894737,0.898516,0.002s,0.001s
5,"Decision Tree with ""log2"" max features",0.915789,0.91754,0.001s,0.001s
6,SVM,1.0,1.0,0.003s,0.001s
7,"Random F. (""10"" trees, ""sqrt"" max feature)",0.978947,0.979182,0.015s,0.003s
8,"Random F. (""10"" trees, ""log2"" max feature)",0.947368,0.947673,0.013s,0.002s
9,"Random F. (""50"" trees, ""sqrt"" max feature)",0.947368,0.948688,0.055s,0.007s


## Find best result

In [63]:
# Track the best run per metric separately; the two winners may be different classifiers.
bestResultF1 = None
bestResultAccuracy = None

# The strict '<' keeps the earliest run when several share the top score.
for classifier in breastCancerResults:
    if bestResultF1 is None or bestResultF1['f1Score'] < classifier['f1Score']:
        bestResultF1 = classifier
    if bestResultAccuracy is None or bestResultAccuracy['accuracyScore'] < classifier['accuracyScore']:
        bestResultAccuracy = classifier

print('The classifier with the best f1-score was', bestResultF1['algorithmName'], 'with a score of ', bestResultF1['f1Score'])
# Report the accuracy of the accuracy winner itself, not that of the F1 winner.
print('The classifier with the best accuracy-score was', bestResultAccuracy['algorithmName'], 'with a score of ', bestResultAccuracy['accuracyScore'])

The classifier with the best f1-score was SVM with a score of  1.0
The classifier with the best accuracy-score was SVM with a score of  1.0


## Retrain the best classifier

In [64]:
# Reuse the estimator with the best F1 score; fit() below refits that same stored object in place.
bestClassifier = bestResultF1['classifier']

# Refit on the complete shuffled labelled data (X, y) rather than only the training split.
# np.ravel yields the 1-D target array; Series.ravel() is deprecated since pandas 2.2.
bestClassifier.fit(X, np.ravel(y))

# Predict the labels for the separately loaded test file
testPrediction = bestClassifier.predict(breastCancerTestX)


## Save the prediction

Now we save the prediction.

In [65]:
filePath = 'results/breast-cancer-result.csv'

# np.savetxt does not create missing directories, so make sure the target folder exists.
os.makedirs(os.path.dirname(filePath), exist_ok=True)

# Pair every test ID with its lower-cased predicted label, one (ID, class) tuple per row.
resultCsv = list(zip(breastCancerTest.loc[:, 'ID'], (str(x).lower() for x in testPrediction)))

# comments='' keeps the header line free of numpy's default '# ' prefix.
np.savetxt(filePath,
           resultCsv,
           header='ID,class',
           comments='',
           delimiter=",",
           fmt='%s')