# Stroke

## Import dependencies

In [1]:
import datetime
import glob
import os
import time
from collections import deque
from sklearn.utils import shuffle

from IPython.core.display_functions import display
from progressbar import ProgressBar

import librosa
import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# disable all warnings
import warnings

warnings.filterwarnings('ignore')

matrikelNumber = 21006
isKaggle = False

## Classifiers

### KNN

In [2]:
def kNearestNeighbours(datasetName, kNeighbours, XTrain, XTest, yTrain, yTest):
    results = []

    for k in kNeighbours:
        print(datasetName)
        print('kNN with', k, 'neighbours')
        knnClassifier = KNeighborsClassifier(n_neighbors=k, algorithm='kd_tree')

        # Train the classifier
        startTime = time.time()
        knnClassifier.fit(XTrain, yTrain.ravel())
        endTime = time.time()
        trainTime = endTime - startTime

        # Predict
        startTime = time.time()
        predicted = knnClassifier.predict(XTest)
        endTime = time.time()
        predictionTime = endTime - startTime

        # Effectiveness measurement
        accuracyScore = accuracy_score(yTest, predicted)
        f1Score = f1_score(yTest, predicted, average='weighted')

        formattedTrainTime = str("{:.3f}s".format(trainTime))
        formattedPredictionTime = str("{:.3f}s".format(predictionTime))
        formattedAccuracyScore = str("{:.3f}%".format(accuracyScore * 100))
        formattedF1Score = str("{:.3f}%".format(f1Score * 100))

        print('Training time:', formattedTrainTime)
        print('Testing time:', formattedPredictionTime)
        print()

        print('Accuracy:', formattedAccuracyScore)
        print('F1 score:', formattedF1Score)
        print('------------------------------------')

        result = {
            'datasetName': datasetName,
            'algorithmName': 'kNN with "' + str(k) + '" neighbours',
            'neighbours': k,
            'accuracyScore': accuracyScore,
            'f1Score': f1Score,
            'trainTime': formattedTrainTime,
            'predictionTime': formattedPredictionTime,
            'yTestPredicted': predicted,
            'XTrain': XTrain,
            'XTest': XTest,
            'yTrain': yTrain,
            'yTest': yTest,
            'classifier': knnClassifier,
        }

        results.append(result)

    return results

### Decision Tree

In [3]:
def decisionTree(datasetName, maxFeatureValues, XTrain, XTest, yTrain, yTest):
    results = []

    for maxFeatures in maxFeatureValues:
        print(datasetName)
        print('Decision Tree with max features', maxFeatures)

        decisionTreeClassifier = DecisionTreeClassifier(max_features=maxFeatures, random_state=matrikelNumber)

        # Train the classifier
        startTime = time.time()
        decisionTreeClassifier.fit(XTrain, yTrain.ravel())
        endTime = time.time()
        trainTime = endTime - startTime

        # Predict
        startTime = time.time()
        predicted = decisionTreeClassifier.predict(XTest)
        endTime = time.time()
        predictionTime = endTime - startTime

        # Effectiveness measurement
        accuracyScore = accuracy_score(yTest, predicted)
        f1Score = f1_score(yTest, predicted, average='weighted')

        formattedTrainTime = str("{:.3f}s".format(trainTime))
        formattedPredictionTime = str("{:.3f}s".format(predictionTime))
        formattedAccuracyScore = str("{:.3f}%".format(accuracyScore * 100))
        formattedF1Score = str("{:.3f}%".format(f1Score * 100))

        print('Training time:', formattedTrainTime)
        print('Testing time:', formattedPredictionTime)
        print()

        print('Accuracy:', formattedAccuracyScore)
        print('F1 score:', formattedF1Score)
        print('------------------------------------')

        result = {
            'datasetName': datasetName,
            'algorithmName': 'Decision Tree with "' + str(maxFeatures) + '" max features',
            'maxFeatures': maxFeatures,
            'accuracyScore': accuracyScore,
            'f1Score': f1Score,
            'trainTime': formattedTrainTime,
            'predictionTime': formattedPredictionTime,
            'yTestPredicted': predicted,
            'XTrain': XTrain,
            'XTest': XTest,
            'yTrain': yTrain,
            'yTest': yTest,
            'classifier': decisionTreeClassifier,
        }

        results.append(result)

    return results

### Support Vector Machine

In [4]:
def supportVectorMachine(datasetName, XTrain, XTest, yTrain, yTest):
    print(datasetName)
    print('Support Vector Machine')

    results = []

    svmClassifier = make_pipeline(StandardScaler(), SVC(kernel='sigmoid', random_state=matrikelNumber))

    # Train the classifier
    startTime = time.time()
    svmClassifier.fit(XTrain, yTrain.ravel())
    endTime = time.time()
    trainTime = endTime - startTime

    # Predict
    startTime = time.time()
    predicted = svmClassifier.predict(XTest)
    endTime = time.time()
    predictionTime = endTime - startTime

    # Effectiveness measurement
    accuracyScore = accuracy_score(yTest, predicted)
    f1Score = f1_score(yTest, predicted, average='weighted')

    formattedTrainTime = str("{:.3f}s".format(trainTime))
    formattedPredictionTime = str("{:.3f}s".format(predictionTime))
    formattedAccuracyScore = str("{:.3f}%".format(accuracyScore * 100))
    formattedF1Score = str("{:.3f}%".format(f1Score * 100))

    print('Training time:', formattedTrainTime)
    print('Testing time:', formattedPredictionTime)
    print()

    print('Accuracy:', formattedAccuracyScore)
    print('F1 score:', formattedF1Score)
    print('------------------------------------')

    result = {
        'datasetName': datasetName,
        'algorithmName': 'SVM',
        'accuracyScore': accuracyScore,
        'f1Score': f1Score,
        'trainTime': formattedTrainTime,
        'predictionTime': formattedPredictionTime,
        'yTestPredicted': predicted,
        'XTrain': XTrain,
        'XTest': XTest,
        'yTrain': yTrain,
        'yTest': yTest,
        'classifier': svmClassifier,
    }

    results.append(result)

    return results

### Random forests

In [5]:
def randomForest(datasetName, numberOfTrees, maxFeatureValues, XTrain, XTest, yTrain, yTest):
    results = []

    for numberOfTreeElements in numberOfTrees:
        for maxFeatureValue in maxFeatureValues:
            print(datasetName)
            print('Random forest with', numberOfTreeElements, 'trees and', maxFeatureValue, 'max features')

            randomForestClassifier = RandomForestClassifier(
                n_estimators=numberOfTreeElements,
                max_features=maxFeatureValue,
                random_state=matrikelNumber
            )

            # Train the classifier
            startTime = time.time()
            randomForestClassifier.fit(XTrain, yTrain.ravel())
            endTime = time.time()
            trainTime = endTime - startTime

            # Predict
            startTime = time.time()
            predicted = randomForestClassifier.predict(XTest)
            endTime = time.time()
            predictionTime = endTime - startTime

            # Effectiveness measurement
            accuracyScore = accuracy_score(yTest, predicted)
            f1Score = f1_score(yTest, predicted, average='weighted')

            formattedTrainTime = str("{:.3f}s".format(trainTime))
            formattedPredictionTime = str("{:.3f}s".format(predictionTime))
            formattedAccuracyScore = str("{:.3f}%".format(accuracyScore * 100))
            formattedF1Score = str("{:.3f}%".format(f1Score * 100))

            print('Training time:', formattedTrainTime)
            print('Testing time:', formattedPredictionTime)
            print()

            print('Accuracy:', formattedAccuracyScore)
            print('F1 score:', formattedF1Score)
            print('------------------------------------')

            result = {
                'datasetName': datasetName,
                'algorithmName': 'Random F. ("' + str(numberOfTreeElements) + '" trees, "' + str(
                    maxFeatureValue) + '" max feature)',
                'numberOfTreeElements': numberOfTreeElements,
                'maxFeatureValue': maxFeatureValue,
                'accuracyScore': accuracyScore,
                'f1Score': f1Score,
                'trainTime': formattedTrainTime,
                'predictionTime': formattedPredictionTime,
                'yTestPredicted': predicted,
                'XTrain': XTrain,
                'XTest': XTest,
                'yTrain': yTrain,
                'yTest': yTest,
                'classifier': randomForestClassifier,
            }

            results.append(result)

    return results

## Load dataset

In [6]:
datasetName = 'Stroke'

# Load training dataset
filePathTrainDataset = './data/stroke/stroke.shuf.lrn.csv'
if isKaggle:
    filePathTrainDataset = '/kaggle/input/mse-bb-2-ss2022-mle-stroke/stroke.shuf.lrn.csv'

strokeTrain = pd.read_csv(filePathTrainDataset, sep=',')

# Load test dataset
filePathTestData = './data/stroke/stroke.shuf.tes.csv'
if isKaggle:
    filePathTestData = '/kaggle/input/mse-bb-2-ss2022-mle-stroke/stroke.shuf.tes.csv'

strokeTest = pd.read_csv(filePathTestData, sep=',')

## Prepare dataset

The dataset contains string values, which cannot be processed, we have to change the values to numbers.

In [7]:
def mapToNumeric(data_set, column_name):
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(data_set[column_name])
    data_set[column_name] = label_encoder.transform(data_set[column_name])
    return data_set

def mapStringsToNumbers(data_set):
    data_set = mapToNumeric(data_set, 'gender')
    data_set = mapToNumeric(data_set, 'ever_married')
    data_set = mapToNumeric(data_set, 'work_type')
    data_set = mapToNumeric(data_set, 'Residence_type')
    data_set = mapToNumeric(data_set, 'smoking_status')
    data_set = data_set.fillna(0)
    return data_set

strokeTrain = mapStringsToNumbers(strokeTrain)
strokeTest = mapStringsToNumbers(strokeTest)

## Split

In [8]:
# Split data in input features (X) and target (y) feature
trainDataX = strokeTrain.loc[:, 'gender':'smoking_status']
trainDataY = strokeTrain.loc[:, 'stroke']

strokeTestX = strokeTest.loc[:, 'gender':'smoking_status']

## Apply Classifiers

In [9]:
testSplitSize = 0.33

kNNNeighbours = [2, 4, 6]
decisionTreeMaxFeatureValues = [None, 'sqrt', 'log2']
randomForestTrees = [10, 50, 100]
randomForestMaxFeatureValues = ['sqrt', 'log2']
strokeResults = []

X, y = shuffle(trainDataX, trainDataY, random_state=matrikelNumber)

trainData, testData, trainLabels, testLabels = train_test_split(
    X,
    y,
    test_size=testSplitSize,
    random_state=matrikelNumber
)

kNNResults = kNearestNeighbours(
    datasetName,
    kNNNeighbours,
    trainData,
    testData,
    trainLabels,
    testLabels
)
strokeResults.extend(kNNResults)

decisionTreeResults = decisionTree(
    datasetName,
    decisionTreeMaxFeatureValues,
    trainData,
    testData,
    trainLabels,
    testLabels
)
strokeResults.extend(decisionTreeResults)

supportVectorMachineResults = supportVectorMachine(
    datasetName,
    trainData,
    testData,
    trainLabels,
    testLabels
)
strokeResults.extend(supportVectorMachineResults)

randomForestResults = randomForest(
    datasetName,
    randomForestTrees,
    randomForestMaxFeatureValues,
    trainData,
    testData,
    trainLabels,
    testLabels
)
strokeResults.extend(randomForestResults)

Stroke
kNN with 2 neighbours
Training time: 0.003s
Testing time: 0.023s

Accuracy: 95.498%
F1 score: 93.646%
------------------------------------
Stroke
kNN with 4 neighbours
Training time: 0.003s
Testing time: 0.019s

Accuracy: 95.616%
F1 score: 93.705%
------------------------------------
Stroke
kNN with 6 neighbours
Training time: 0.002s
Testing time: 0.019s

Accuracy: 95.853%
F1 score: 93.824%
------------------------------------
Stroke
Decision Tree with max features None
Training time: 0.005s
Testing time: 0.001s

Accuracy: 92.536%
F1 score: 92.484%
------------------------------------
Stroke
Decision Tree with max features sqrt
Training time: 0.003s
Testing time: 0.001s

Accuracy: 91.706%
F1 score: 92.111%
------------------------------------
Stroke
Decision Tree with max features log2
Training time: 0.003s
Testing time: 0.001s

Accuracy: 91.706%
F1 score: 92.111%
------------------------------------
Stroke
Support Vector Machine
Training time: 0.021s
Testing time: 0.007s

Accur

## Display results

In [10]:
strokeDf = pd.DataFrame(strokeResults,
                        columns=['algorithmName', 'accuracyScore', 'f1Score', 'trainTime', 'predictionTime'])
display(strokeDf)

Unnamed: 0,algorithmName,accuracyScore,f1Score,trainTime,predictionTime
0,"kNN with ""2"" neighbours",0.954976,0.936456,0.003s,0.023s
1,"kNN with ""4"" neighbours",0.956161,0.937049,0.003s,0.019s
2,"kNN with ""6"" neighbours",0.958531,0.938235,0.002s,0.019s
3,"Decision Tree with ""None"" max features",0.925355,0.924838,0.005s,0.001s
4,"Decision Tree with ""sqrt"" max features",0.917062,0.921109,0.003s,0.001s
5,"Decision Tree with ""log2"" max features",0.917062,0.921109,0.003s,0.001s
6,SVM,0.946682,0.93401,0.021s,0.007s
7,"Random F. (""10"" trees, ""sqrt"" max feature)",0.957346,0.941739,0.023s,0.004s
8,"Random F. (""10"" trees, ""log2"" max feature)",0.957346,0.941739,0.021s,0.004s
9,"Random F. (""50"" trees, ""sqrt"" max feature)",0.957346,0.937643,0.095s,0.010s


## Find best result

In [11]:
bestResultF1 = None
bestResultAccuracy = None

for indexClassifier, classifier in enumerate(strokeResults):
    if bestResultF1 is None or bestResultF1['f1Score'] < classifier['f1Score']:
        bestResultF1 = classifier
    if bestResultAccuracy is None or bestResultAccuracy['accuracyScore'] < classifier['accuracyScore']:
        bestResultAccuracy = classifier

print('The classifier with the best f1-score was', bestResultF1['algorithmName'], 'with a score of ', bestResultF1['f1Score'])
print('The classifier with the best accuracy-score was', bestResultAccuracy['algorithmName'], 'with a score of ', bestResultF1['accuracyScore'])

bestScore = bestResultF1 if bestResultF1['f1Score'] > bestResultAccuracy['accuracyScore'] else bestResultAccuracy

The classifier with the best f1-score was Random F. ("10" trees, "sqrt" max feature) with a score of  0.9417389453825978
The classifier with the best accuracy-score was kNN with "6" neighbours with a score of  0.957345971563981


## Retrain the best classifier

In [12]:
bestClassifier = bestScore['classifier']

bestClassifier.fit(X, y.ravel())

# Predict again with the test-set
testPrediction = bestClassifier.predict(strokeTestX)


## Save the prediction

Now we save the prediction.

In [13]:
resultCsv = list(zip(*[strokeTest.loc[:, 'ID'], [str(x).lower() for x in testPrediction]]))
filePath = 'results/stroke-result.csv'
np.savetxt(filePath,
           resultCsv,
           header='ID,stroke',
           comments='',
           delimiter=",",
           fmt='%s')