# Import Dependencies


In [38]:
import pandas as pd
import numpy as np
import time
import statistics

from sklearn.preprocessing import StandardScaler
from tabulate import tabulate
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load dataset


In [39]:
hrDataset = pd.read_csv('../data/HRDataset_v14.csv', sep=',')
hrDataset.sample(frac=1, random_state=42).reset_index(drop=True)

hrDataset.describe()

Unnamed: 0,EmpID,MarriedID,MaritalStatusID,GenderID,EmpStatusID,DeptID,PerfScoreID,FromDiversityJobFairID,Salary,Termd,PositionID,Zip,ManagerID,EngagementSurvey,EmpSatisfaction,SpecialProjectsCount,DaysLateLast30,Absences
count,311.0,311.0,311.0,311.0,311.0,311.0,311.0,311.0,311.0,311.0,311.0,311.0,303.0,311.0,311.0,311.0,311.0,311.0
mean,10156.0,0.398714,0.810289,0.434084,2.392283,4.610932,2.977492,0.093248,69020.684887,0.334405,16.845659,6555.482315,14.570957,4.11,3.890675,1.21865,0.414791,10.237942
std,89.922189,0.490423,0.943239,0.496435,1.794383,1.083487,0.587072,0.291248,25156.63693,0.472542,6.223419,16908.396884,8.078306,0.789938,0.909241,2.349421,1.294519,5.852596
min,10001.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,45046.0,0.0,1.0,1013.0,1.0,1.12,1.0,0.0,0.0,1.0
25%,10078.5,0.0,0.0,0.0,1.0,5.0,3.0,0.0,55501.5,0.0,18.0,1901.5,10.0,3.69,3.0,0.0,0.0,5.0
50%,10156.0,0.0,1.0,0.0,1.0,5.0,3.0,0.0,62810.0,0.0,19.0,2132.0,15.0,4.28,4.0,0.0,0.0,10.0
75%,10233.5,1.0,1.0,1.0,5.0,5.0,3.0,0.0,72036.0,1.0,20.0,2355.0,19.0,4.7,5.0,0.0,0.0,15.0
max,10311.0,1.0,4.0,1.0,5.0,6.0,4.0,1.0,250000.0,1.0,30.0,98052.0,39.0,5.0,5.0,8.0,6.0,20.0


# Clean dataset

I drop all continuous columns from the dataset, I can't encode these columns well. Categorization of these columns will not work, because every value is very different.

In [40]:
filteredHR = hrDataset.drop("EmpID", axis=1)
filteredHR = filteredHR.drop("Employee_Name", axis=1)
filteredHR = filteredHR.drop("DateofTermination", axis=1)
filteredHR = filteredHR.drop("LastPerformanceReview_Date", axis=1)
filteredHR = filteredHR.drop("DateofHire", axis=1)
filteredHR = filteredHR.drop("EngagementSurvey", axis=1)
filteredHR = filteredHR.drop("Salary", axis=1)
filteredHR = filteredHR.drop("DOB", axis=1)

filteredHR['absences_label'] = filteredHR['Absences'].apply(lambda value: '0-14'
if value <= 4 else '0-5'
if value <= 5 else '6-10'
if value <= 10 else '11-15'
if value <= 14 else '15+')

filteredHR = filteredHR.drop("Absences", axis=1)

filteredHR.head()

Unnamed: 0,MarriedID,MaritalStatusID,GenderID,EmpStatusID,DeptID,PerfScoreID,FromDiversityJobFairID,Termd,PositionID,Position,...,EmploymentStatus,Department,ManagerName,ManagerID,RecruitmentSource,PerformanceScore,EmpSatisfaction,SpecialProjectsCount,DaysLateLast30,absences_label
0,0,0,1,1,5,4,0,0,19,Production Technician I,...,Active,Production,Michael Albert,22.0,LinkedIn,Exceeds,5,0,0,0-14
1,1,1,1,5,3,3,0,1,27,Sr. DBA,...,Voluntarily Terminated,IT/IS,Simon Roup,4.0,Indeed,Fully Meets,3,6,0,15+
2,1,1,0,5,5,3,0,1,20,Production Technician II,...,Voluntarily Terminated,Production,Kissy Sullivan,20.0,LinkedIn,Fully Meets,3,0,0,0-14
3,1,1,0,1,5,3,0,0,19,Production Technician I,...,Active,Production,Elijiah Gray,16.0,Indeed,Fully Meets,5,0,0,15+
4,0,2,0,5,5,3,0,1,19,Production Technician I,...,Voluntarily Terminated,Production,Webster Butler,39.0,Google Search,Fully Meets,4,0,0,0-14


# Encode the dataset

In [41]:
# One hot encode the dataset
encodedHR = pd.get_dummies(filteredHR, columns=filteredHR.columns, drop_first=True)
encodedHR.head()

# print(encodedHR.columns)
# encodedHR.to_csv('output.csv', sep=',')

Unnamed: 0,MarriedID_1,MaritalStatusID_1,MaritalStatusID_2,MaritalStatusID_3,MaritalStatusID_4,GenderID_1,EmpStatusID_2,EmpStatusID_3,EmpStatusID_4,EmpStatusID_5,...,DaysLateLast30_1,DaysLateLast30_2,DaysLateLast30_3,DaysLateLast30_4,DaysLateLast30_5,DaysLateLast30_6,absences_label_0-5,absences_label_11-15,absences_label_15+,absences_label_6-10
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


# Split the dataset

In [42]:
# What do we predict?
xAxis = encodedHR.drop('RaceDesc_White', axis=1)
yAxis = encodedHR['RaceDesc_White']

trainData, testData, trainLabels, testLabels = train_test_split(
    xAxis,
    yAxis,
    test_size=0.33,
    random_state=21006
)

scaler = StandardScaler()
scaler.fit(trainData)

xTrain = scaler.transform(trainData)
xTest = scaler.transform(testData)

print("X_train:", xTrain)
print("X_test:", xTest)
print("y_train:", trainLabels)
print("y_test:", testLabels)

X_train: [[ 1.26491106  1.26491106 -0.34391797 ... -0.45749571  1.4668044
  -0.56254395]
 [ 1.26491106  1.26491106 -0.34391797 ... -0.45749571  1.4668044
  -0.56254395]
 [-0.79056942 -0.79056942 -0.34391797 ... -0.45749571 -0.68175416
   1.77763888]
 ...
 [ 1.26491106  1.26491106 -0.34391797 ... -0.45749571 -0.68175416
   1.77763888]
 [ 1.26491106  1.26491106 -0.34391797 ... -0.45749571  1.4668044
  -0.56254395]
 [-0.79056942 -0.79056942 -0.34391797 ... -0.45749571 -0.68175416
   1.77763888]]
X_test: [[ 1.26491106  1.26491106 -0.34391797 ... -0.45749571  1.4668044
  -0.56254395]
 [ 1.26491106  1.26491106 -0.34391797 ... -0.45749571  1.4668044
  -0.56254395]
 [-0.79056942 -0.79056942 -0.34391797 ... -0.45749571 -0.68175416
  -0.56254395]
 ...
 [ 1.26491106  1.26491106 -0.34391797 ... -0.45749571  1.4668044
  -0.56254395]
 [-0.79056942 -0.79056942 -0.34391797 ... -0.45749571 -0.68175416
   1.77763888]
 [-0.79056942 -0.79056942 -0.34391797 ... -0.45749571 -0.68175416
  -0.56254395]]
y_tra

In [43]:
def printTrainTimes(alogrithm, trainTimes, predictionTimes, accuracyTimes, f1Times, accuracyScores, f1Scores):
    meanTrainTime = statistics.mean(trainTimes)
    meanPredictionTime = statistics.mean(predictionTimes)
    meanAccuracyTime = statistics.mean(accuracyTimes)
    meanF1Time = statistics.mean(f1Times)
    meanF1Score = statistics.mean(f1Scores)
    meanAccuracyScore = statistics.mean(accuracyScores)

    print('Algorithm:', alogrithm)

    print('Mean times')

    print('Mean training time:', meanTrainTime, 's')
    print('Mean test time:', meanPredictionTime, 's')
    print('Mean accuracy time:', meanAccuracyTime, 's')
    print('Mean F1 time:', meanF1Time, 's')

    print('Mean scores')

    print('Mean accuracy score:', meanF1Score * 100, '%')
    print('Mean F1 score:', meanAccuracyScore * 100, '%')
    print()

    return [
        alogrithm,
        str("{:.5f}s".format(meanTrainTime)),
        str("{:.5f}s".format(meanPredictionTime)),
        str("{:.5f}s".format(meanAccuracyTime)),
        str("{:.5f}s".format(meanF1Time)),
        str("{:.2f}%".format(meanAccuracyScore * 100)),
        str("{:.2f}%".format(meanF1Score * 100))
    ]


# KNN (k-Nearest-Neighbor Classifier)


In [44]:
# https://python-course.eu/machine-learning/k-nearest-neighbor-classifier-with-sklearn.php

kValues = [2, 4, 6]

f1Scores = []
accuracyScores = []

trainTimes = []
predictionTimes = []
accuracyTimes = []
f1Times = []

for k in kValues:
    print('k-Nearest-Neighbor with', k, 'neighbors')

    knn = KNeighborsClassifier(n_neighbors=k, algorithm='kd_tree')

    # Train the algorithm
    start = time.time()
    knn.fit(xTrain, trainLabels)
    end = time.time()
    trainTime = end - start
    trainTimes.append(end - start)

    # Predict
    start = time.time()
    predicted = knn.predict(xTest)
    end = time.time()
    predictionTime = end - start
    predictionTimes.append(end - start)

    # Effectiveness measurement

    # To evaluate the result, we will use the accuracy score
    start = time.time()
    accuracyScore = accuracy_score(testLabels, predicted)
    end = time.time()
    accuracyScores.append(accuracyScore)
    accuracyTime = (end - start)
    accuracyTimes.append(accuracyTime)

    # To evaluate the result, we will use the f1_score
    start = time.time()
    f1Score = f1_score(testLabels, predicted, average='weighted')
    end = time.time()
    f1Scores.append(f1Score)
    f1Time = end - start
    f1Times.append(f1Time)

    print('Training time:', trainTime, 's')
    print('Testing time:', predictionTime, 's')
    print('Accuracy time:', accuracyTime, 's')
    print('F1 time:', f1Time, 's')
    print()

    print('Accuracy:', accuracyScore * 100, '%')
    print('F1 score:', f1Score * 100, '%')
    print()

knnResults = printTrainTimes('k-Nearest-Neighbor', trainTimes, predictionTimes, accuracyTimes, f1Times,
                             accuracyScores, f1Scores)

k-Nearest-Neighbor with 2 neighbors
Training time: 0.00923776626586914 s
Testing time: 0.026286602020263672 s
Accuracy time: 0.0005044937133789062 s
F1 time: 0.0014603137969970703 s

Accuracy: 46.601941747572816 %
F1 score: 47.8224101874151 %

k-Nearest-Neighbor with 4 neighbors
Training time: 0.011719703674316406 s
Testing time: 0.021233081817626953 s
Accuracy time: 0.00030517578125 s
F1 time: 0.0008969306945800781 s

Accuracy: 54.36893203883495 %
F1 score: 52.54551273968748 %

k-Nearest-Neighbor with 6 neighbors
Training time: 0.007066249847412109 s
Testing time: 0.01934671401977539 s
Accuracy time: 0.0003345012664794922 s
F1 time: 0.0007684230804443359 s

Accuracy: 60.19417475728155 %
F1 score: 55.530771873993544 %

Algorithm: k-Nearest-Neighbor
Mean times
Mean training time: 0.009341239929199219 s
Mean test time: 0.022288799285888672 s
Mean accuracy time: 0.0003813902537027995 s
Mean F1 time: 0.0010418891906738281 s
Mean scores
Mean accuracy score: 51.966231600365376 %
Mean F1 scor

KNN produces very good results. The best accuracy was achieved with 6 neighbors. The execution is the longest of the three algorithms. For comparison reasons I stuck here with kd-algorithm, so that the results are comparable to the big dataset.

The accuracy and the f1-score are not very great.

# Perceptron

In [45]:
f1Scores = []
accuracyScores = []

trainTimes = []
predictionTimes = []
accuracyTimes = []
f1Times = []

alphas = np.logspace(-1, 1, 5)
penalties = ['l2', 'l1']

for a in alphas:
    for p in penalties:
        print('Perceptron with alpha', a, 'and penalty', p)

        perceptron = Perceptron(alpha=a, penalty=p)

        # Train the algorithm
        start = time.time()
        perceptron.fit(xTrain, trainLabels)
        end = time.time()
        trainTime = end - start
        trainTimes.append(end - start)

        # Predict
        start = time.time()
        predicted = perceptron.predict(xTest)
        end = time.time()
        predictionTime = end - start
        predictionTimes.append(end - start)

        # Effectiveness measurement
        start = time.time()
        accuracyScore = accuracy_score(testLabels, predicted)
        end = time.time()
        accuracyScores.append(accuracyScore)
        accuracyTime = (end - start)
        accuracyTimes.append(accuracyTime)

        # To evaluate the result, we will use the f1_score
        start = time.time()
        f1Score = f1_score(testLabels, predicted, average='weighted')
        end = time.time()
        f1Scores.append(f1Score)
        f1Time = end - start
        f1Times.append(f1Time)

        print('Training time:', trainTime, 's')
        print('Testing time:', predictionTime, 's')
        print('Accuracy time:', accuracyTime, 's')
        print('F1 time:', f1Time, 's')
        print()

        print('Accuracy:', accuracyScore * 100, '%')
        print('F1 score:', f1Score * 100, '%')
        print()

perceptronResults = printTrainTimes('Perceptron', trainTimes, predictionTimes, accuracyTimes, f1Times, accuracyScores,
                                    f1Scores)


Perceptron with alpha 0.1 and penalty l2
Training time: 0.00645899772644043 s
Testing time: 0.008118629455566406 s
Accuracy time: 0.0004279613494873047 s
F1 time: 0.0011830329895019531 s

Accuracy: 53.398058252427184 %
F1 score: 54.43486960278302 %

Perceptron with alpha 0.1 and penalty l1
Training time: 0.003297090530395508 s
Testing time: 0.00019860267639160156 s
Accuracy time: 0.00022101402282714844 s
F1 time: 0.001003265380859375 s

Accuracy: 96.11650485436894 %
F1 score: 96.05856694725166 %

Perceptron with alpha 0.31622776601683794 and penalty l2
Training time: 0.0017681121826171875 s
Testing time: 0.0036118030548095703 s
Accuracy time: 0.010434389114379883 s
F1 time: 0.0012750625610351562 s

Accuracy: 65.0485436893204 %
F1 score: 51.27355796687607 %

Perceptron with alpha 0.31622776601683794 and penalty l1
Training time: 0.0141448974609375 s
Testing time: 0.000469207763671875 s
Accuracy time: 0.0004410743713378906 s
F1 time: 0.0009529590606689453 s

Accuracy: 65.0485436893204 %


The runtime of this algorithm is faster than the runtime of the knn algorithm. It performs the second-best out of the three algorithms. Just like with the knn-algorithm the f1-score and the precision are not very great. The two scores are better than in the knn algorithm, but worse than the decision tree.


# Decision Tree


In [46]:
f1Scores = []
accuracyScores = []

trainTimes = []
predictionTimes = []
accuracyTimes = []
f1Times = []

# algo input parameter variation lists
minSamplesSplits = [2, 25, 50, 100, 250, 1000]
minSamplesLeafs = [1, 25, 50, 100, 1000]

for mSS in minSamplesSplits:
    for mSL in minSamplesLeafs:
        print('Decision Tree with min sample splits', mSS, 'and min sample leafs', mSL)

        decisionTree = DecisionTreeClassifier(
            min_samples_split=mSS,
            min_samples_leaf=mSL
        )

        # Train the algorithm
        start = time.time()
        decisionTree.fit(xTrain, trainLabels)
        end = time.time()
        trainTime = end - start
        trainTimes.append(end - start)

        # Predict
        start = time.time()
        predicted = decisionTree.predict(xTest)
        end = time.time()
        predictionTime = end - start
        predictionTimes.append(end - start)

        # Effectiveness measurement
        start = time.time()
        accuracyScore = accuracy_score(testLabels, predicted)
        end = time.time()
        accuracyScores.append(accuracyScore)
        accuracyTime = (end - start)
        accuracyTimes.append(accuracyTime)

        # To evaluate the result, we will use the f1_score
        start = time.time()
        f1Score = f1_score(testLabels, predicted, average='weighted')
        end = time.time()
        f1Scores.append(f1Score)
        f1Time = end - start
        f1Times.append(f1Time)

        print('Training time:', trainTime, 's')
        print('Testing time:', predictionTime, 's')
        print('Accuracy time:', accuracyTime, 's')
        print('F1 time:', f1Time, 's')
        print()

        print('Accuracy:', accuracyScore * 100, '%')
        print('F1 score:', f1Score * 100, '%')
        print()

decisionTreeResults = printTrainTimes('DecisionTree', trainTimes, predictionTimes, accuracyTimes, f1Times,
                                      accuracyScores, f1Scores)



Decision Tree with min sample splits 2 and min sample leafs 1
Training time: 0.0020885467529296875 s
Testing time: 0.00019311904907226562 s
Accuracy time: 0.00027370452880859375 s
F1 time: 0.0009369850158691406 s

Accuracy: 98.05825242718447 %
F1 score: 98.04479073182671 %

Decision Tree with min sample splits 2 and min sample leafs 25
Training time: 0.0012862682342529297 s
Testing time: 0.0001678466796875 s
Accuracy time: 0.0004062652587890625 s
F1 time: 0.001913309097290039 s

Accuracy: 88.3495145631068 %
F1 score: 87.6632530921665 %

Decision Tree with min sample splits 2 and min sample leafs 50
Training time: 0.0010445117950439453 s
Testing time: 0.0001583099365234375 s
Accuracy time: 0.00023889541625976562 s
F1 time: 0.0022232532501220703 s

Accuracy: 88.3495145631068 %
F1 score: 87.6632530921665 %

Decision Tree with min sample splits 2 and min sample leafs 100
Training time: 0.0009253025054931641 s
Testing time: 0.00020122528076171875 s
Accuracy time: 0.00024962425231933594 s
F1

The decision tree has the second-best runtime of the algorithms. The accuracy and the f1-score are not as high as in the census-income example, but these are the greatest of the three algorithms performed on the hr-dataset.

# Results

Here I compared the mean times and mean scores of all three algorithms.


In [47]:
print(
    tabulate(
        [
            knnResults,
            perceptronResults,
            decisionTreeResults
        ],
        ["Algorithm", "Mean Train Time", "Mean Prediction Time", "Mean Accuracy Time", "Mean F1 Time", "Accuracy",
         "F1"],
        tablefmt="grid"
    )
)

+--------------------+-------------------+------------------------+----------------------+----------------+------------+--------+
| Algorithm          | Mean Train Time   | Mean Prediction Time   | Mean Accuracy Time   | Mean F1 Time   | Accuracy   | F1     |
| k-Nearest-Neighbor | 0.00934s          | 0.02229s               | 0.00038s             | 0.00104s       | 53.72%     | 51.97% |
+--------------------+-------------------+------------------------+----------------------+----------------+------------+--------+
| Perceptron         | 0.00507s          | 0.00448s               | 0.00137s             | 0.00149s       | 66.99%     | 56.07% |
+--------------------+-------------------+------------------------+----------------------+----------------+------------+--------+
| DecisionTree       | 0.00259s          | 0.00075s               | 0.00056s             | 0.00267s       | 75.66%     | 67.21% |
+--------------------+-------------------+------------------------+----------------------+

Above we see that knn is the slowest of the three algorithms. But with the smaller dataset the difference is not as great as in the census-income example. knn performs the worst accuracy and f1-score. The perceptron is the second fastest of the three algorithms and produces accuracy- and f1-scores that are in between knn and decision-tree. The decision tree produces the best results. The accuracy- and f1-score are greater compared to the other two algorithms, but are not as good as in the census-income example.

By removing some columns from the dataset the computation time was reduced drastically and the accuracy and f1-score increased.