# Import Dependencies


In [227]:
import pandas as pd
import numpy as np
import time
import statistics

from sklearn.preprocessing import StandardScaler
from tabulate import tabulate
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load dataset

In [228]:
censusIncome = pd.read_csv('../data/census-income.data', sep=',')
censusIncome.sample(frac=1, random_state=42).reset_index(drop=True)

censusIncome.describe()

Unnamed: 0,age,detailed industry recode,detailed occupation recode,wage per hour,capital gains,capital losses,dividends from stocks,instance weight,num persons worked for employer,own business or self employed,veterans benefits,weeks worked in year,year
count,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0
mean,34.494199,15.35232,11.306556,55.426908,434.71899,37.313788,197.529533,1740.380269,1.95618,0.175438,1.514833,23.174897,94.499672
std,22.310895,18.067129,14.454204,274.896454,4697.53128,271.896428,1984.163658,993.768156,2.365126,0.553694,0.851473,24.411488,0.500001
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.87,0.0,0.0,0.0,0.0,94.0
25%,15.0,0.0,0.0,0.0,0.0,0.0,0.0,1061.615,0.0,0.0,2.0,0.0,94.0
50%,33.0,0.0,0.0,0.0,0.0,0.0,0.0,1618.31,1.0,0.0,2.0,8.0,94.0
75%,50.0,33.0,26.0,0.0,0.0,0.0,0.0,2188.61,4.0,0.0,2.0,52.0,95.0
max,90.0,51.0,46.0,9999.0,99999.0,4608.0,99999.0,18656.3,6.0,2.0,2.0,52.0,95.0


# Clean dataset

In [229]:
# Categories the age label otherwise, the encoding won't work very well
censusIncome['age_label'] = censusIncome['age'].apply(lambda value: '0-14'
if value <= 14 else '15-24'
if value <= 24 else '25-54'
if value <= 54 else '55-64'
if value <= 64 else '65 years and over')

censusIncome['wage_label'] = censusIncome['wage per hour'].apply(lambda value: '0-3000'
if value <= 3000 else '3001-6000'
if value <= 6000 else '6001-7000'
if value <= 7000 else '7001-9000+')

filteredCensus = censusIncome[
    (censusIncome['class of worker'] != ' Not in universe') &
    (censusIncome['education'] != ' Children') &
    (censusIncome['wage per hour'] > 0) &
    (censusIncome['weeks worked in year'] > 0)
    ]

filteredCensus = filteredCensus.drop("age", axis=1)
filteredCensus = filteredCensus.drop("wage per hour", axis=1)
filteredCensus = filteredCensus.drop("year", axis=1)
filteredCensus = filteredCensus.drop("ignore", axis=1)
filteredCensus = filteredCensus.drop("instance weight", axis=1)

filteredCensus.head()

Unnamed: 0,class of worker,detailed industry recode,detailed occupation recode,education,enroll in edu inst last wk,marital stat,major industry code,major occupation code,race,hispanic origin,...,country of birth father,country of birth mother,country of birth self,citizenship,own business or self employed,fill inc questionnaire for veteran's admin,veterans benefits,weeks worked in year,age_label,wage_label
5,Private,40,10,Some college but no degree,Not in universe,Married-civilian spouse present,Entertainment,Professional specialty,Amer Indian Aleut or Eskimo,All other,...,Philippines,United-States,United-States,Native- Born in the United States,2,Not in universe,2,52,25-54,0-3000
8,Local government,43,26,Some college but no degree,Not in universe,Married-civilian spouse present,Education,Adm support including clerical,White,All other,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,25-54,0-3000
22,Private,35,22,Some college but no degree,Not in universe,Married-civilian spouse present,Finance insurance and real estate,Adm support including clerical,White,All other,...,United-States,United-States,United-States,Native- Born in the United States,2,Not in universe,2,32,55-64,0-3000
49,Private,37,36,Some college but no degree,Not in universe,Married-civilian spouse present,Business and repair services,Machine operators assmblrs & inspctrs,White,All other,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,65 years and over,0-3000
147,State government,43,23,Associates degree-occup /vocational,College or university,Married-A F spouse present,Education,Adm support including clerical,White,All other,...,United-States,United-States,United-States,Native- Born in the United States,2,Not in universe,2,48,15-24,0-3000


# Encode the dataset

In [230]:
# One hot encode the dataset
encodedIncome = pd.get_dummies(filteredCensus, columns=filteredCensus.columns, drop_first=True)
encodedIncome.head()

# print(encodedIncome.columns)

# encodedIncome.to_csv('output.csv', sep=',')

Unnamed: 0,class of worker_ Local government,class of worker_ Private,class of worker_ State government,detailed industry recode_2,detailed industry recode_3,detailed industry recode_4,detailed industry recode_5,detailed industry recode_6,detailed industry recode_7,detailed industry recode_8,...,weeks worked in year_49,weeks worked in year_50,weeks worked in year_51,weeks worked in year_52,age_label_25-54,age_label_55-64,age_label_65 years and over,wage_label_3001-6000,wage_label_6001-7000,wage_label_7001-9000+
5,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
8,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
22,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
49,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
147,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Split the dataset

In [231]:
# What do we predict?
xAxis = encodedIncome.drop('race_ White', axis=1)
yAxis = encodedIncome['race_ White']

trainData, testData, trainLabels, testLabels = train_test_split(
    xAxis,
    yAxis,
    test_size=0.33,
    random_state=21006
)

scaler = StandardScaler()
scaler.fit(trainData)

xTrain = scaler.transform(trainData)
xTest = scaler.transform(testData)

print("X_train:", xTrain)
print("X_test:", xTest)
print("y_train:", trainLabels)
print("y_test:", testLabels)

X_train: [[-0.27467111  0.40828054 -0.19816186 ... -0.07917536 -0.02328101
  -0.04034577]
 [-0.27467111  0.40828054 -0.19816186 ... -0.07917536 -0.02328101
  -0.04034577]
 [-0.27467111  0.40828054 -0.19816186 ... -0.07917536 -0.02328101
  -0.04034577]
 ...
 [-0.27467111 -2.44929625  5.04637987 ... -0.07917536 -0.02328101
  -0.04034577]
 [-0.27467111  0.40828054 -0.19816186 ... -0.07917536 -0.02328101
  -0.04034577]
 [-0.27467111  0.40828054 -0.19816186 ... -0.07917536 -0.02328101
  -0.04034577]]
X_test: [[-0.27467111  0.40828054 -0.19816186 ... -0.07917536 -0.02328101
  -0.04034577]
 [-0.27467111  0.40828054 -0.19816186 ... -0.07917536 -0.02328101
  -0.04034577]
 [-0.27467111  0.40828054 -0.19816186 ... -0.07917536 -0.02328101
  -0.04034577]
 ...
 [-0.27467111  0.40828054 -0.19816186 ... -0.07917536 -0.02328101
  -0.04034577]
 [-0.27467111  0.40828054 -0.19816186 ... -0.07917536 -0.02328101
  -0.04034577]
 [-0.27467111  0.40828054 -0.19816186 ... -0.07917536 -0.02328101
  -0.04034577]]

In [232]:
def printTrainTimes(alogrithm, trainTimes, predictionTimes, accuracyTimes, f1Times, accuracyScores, f1Scores):
    meanTrainTime = statistics.mean(trainTimes)
    meanPredictionTime = statistics.mean(predictionTimes)
    meanAccuracyTime = statistics.mean(accuracyTimes)
    meanF1Time = statistics.mean(f1Times)
    meanF1Score = statistics.mean(f1Scores)
    meanAccuracyScore = statistics.mean(accuracyScores)

    print('Algorithm:', alogrithm)

    print('Mean times')

    print('Mean training time:', meanTrainTime, 's')
    print('Mean test time:', meanPredictionTime, 's')
    print('Mean accuracy time:', meanAccuracyTime, 's')
    print('Mean F1 time:', meanF1Time, 's')

    print('Mean scores')

    print('Mean accuracy score:', meanF1Score * 100, '%')
    print('Mean F1 score:', meanAccuracyScore * 100, '%')
    print()

    return [
        alogrithm,
        str("{:.5f}s".format(meanTrainTime)),
        str("{:.5f}s".format(meanPredictionTime)),
        str("{:.5f}s".format(meanAccuracyTime)),
        str("{:.5f}s".format(meanF1Time)),
        str("{:.2f}%".format(meanAccuracyScore * 100)),
        str("{:.2f}%".format(meanF1Score * 100))
    ]


# KNN (k-Nearest-Neighbor Classifier)

In [233]:
# https://python-course.eu/machine-learning/k-nearest-neighbor-classifier-with-sklearn.php

kValues = [2, 4, 6]

f1Scores = []
accuracyScores = []

trainTimes = []
predictionTimes = []
accuracyTimes = []
f1Times = []

for k in kValues:
    print('k-Nearest-Neighbor with', k, 'neighbors')

    knn = KNeighborsClassifier(n_neighbors=k, algorithm='kd_tree')

    # Train the algorithm
    start = time.time()
    knn.fit(xTrain, trainLabels)
    end = time.time()
    trainTime = end - start
    trainTimes.append(end - start)

    # Predict
    start = time.time()
    predicted = knn.predict(xTest)
    end = time.time()
    predictionTime = end - start
    predictionTimes.append(end - start)

    # Effectiveness measurement

    # To evaluate the result, we will use the accuracy score
    start = time.time()
    accuracyScore = accuracy_score(testLabels, predicted)
    end = time.time()
    accuracyScores.append(accuracyScore)
    accuracyTime = (end - start)
    accuracyTimes.append(accuracyTime)

    # To evaluate the result, we will use the f1_score
    start = time.time()
    f1Score = f1_score(testLabels, predicted, average='weighted')
    end = time.time()
    f1Scores.append(f1Score)
    f1Time = end - start
    f1Times.append(f1Time)

    print('Training time:', trainTime, 's')
    print('Testing time:', predictionTime, 's')
    print('Accuracy time:', accuracyTime, 's')
    print('F1 time:', f1Time, 's')
    print()

    print('Accuracy:', accuracyScore * 100, '%')
    print('F1 score:', f1Score * 100, '%')
    print()

knnResults = printTrainTimes('k-Nearest-Neighbor', trainTimes, predictionTimes, accuracyTimes, f1Times,
                             accuracyScores, f1Scores)


k-Nearest-Neighbor with 2 neighbors
Training time: 1.6319575309753418 s
Testing time: 34.24049091339111 s
Accuracy time: 0.0005550384521484375 s
F1 time: 0.0017056465148925781 s

Accuracy: 80.53340665383558 %
F1 score: 81.73393161683573 %

k-Nearest-Neighbor with 4 neighbors
Training time: 1.6315422058105469 s
Testing time: 38.50601148605347 s
Accuracy time: 0.0005671977996826172 s
F1 time: 0.0013189315795898438 s

Accuracy: 86.60984327742645 %
F1 score: 85.24623652355848 %

k-Nearest-Neighbor with 6 neighbors
Training time: 1.5365197658538818 s
Testing time: 41.92724323272705 s
Accuracy time: 0.000507354736328125 s
F1 time: 0.0014867782592773438 s

Accuracy: 87.8746219411603 %
F1 score: 85.60203264144111 %

Algorithm: k-Nearest-Neighbor
Mean times
Mean training time: 1.6000065008799236 s
Mean test time: 38.22458187739054 s
Mean accuracy time: 0.0005431969960530599 s
Mean F1 time: 0.0015037854512532551 s
Mean scores
Mean accuracy score: 84.19406692727844 %
Mean F1 score: 85.00595729080

KNN produces very good results. The best accuracy was achieved with 6 neighbors. The execution is the longest of the three algorithms. Therefore, I switched to kd-tree algorithm, otherwise the runtime was not acceptable. WIth kd-tree the runtime is acceptable, but still the slowest.

Overall the accuracy and the f1-score are very good along all neighbors.

# Perceptron

In [234]:
f1Scores = []
accuracyScores = []

trainTimes = []
predictionTimes = []
accuracyTimes = []
f1Times = []

alphas = np.logspace(-1, 1, 5)
penalties = ['l2', 'l1']

for a in alphas:
    for p in penalties:
        print('Perceptron with alpha', a, 'and penalty', p)

        perceptron = Perceptron(alpha=a, penalty=p)

        # Train the algorithm
        start = time.time()
        perceptron.fit(xTrain, trainLabels)
        end = time.time()
        trainTime = end - start
        trainTimes.append(end - start)

        # Predict
        start = time.time()
        predicted = perceptron.predict(xTest)
        end = time.time()
        predictionTime = end - start
        predictionTimes.append(end - start)

        # Effectiveness measurement
        start = time.time()
        accuracyScore = accuracy_score(testLabels, predicted)
        end = time.time()
        accuracyScores.append(accuracyScore)
        accuracyTime = (end - start)
        accuracyTimes.append(accuracyTime)

        # To evaluate the result, we will use the f1_score
        start = time.time()
        f1Score = f1_score(testLabels, predicted, average='weighted')
        end = time.time()
        f1Scores.append(f1Score)
        f1Time = end - start
        f1Times.append(f1Time)

        print('Training time:', trainTime, 's')
        print('Testing time:', predictionTime, 's')
        print('Accuracy time:', accuracyTime, 's')
        print('F1 time:', f1Time, 's')
        print()

        print('Accuracy:', accuracyScore * 100, '%')
        print('F1 score:', f1Score * 100, '%')
        print()

perceptronResults = printTrainTimes('Perceptron', trainTimes, predictionTimes, accuracyTimes, f1Times, accuracyScores,
                                    f1Scores)


Perceptron with alpha 0.1 and penalty l2
Training time: 0.1498417854309082 s
Testing time: 0.0036530494689941406 s
Accuracy time: 0.0005824565887451172 s
F1 time: 0.0017695426940917969 s

Accuracy: 82.59554577948859 %
F1 score: 79.24263226963689 %

Perceptron with alpha 0.1 and penalty l1
Training time: 0.22814011573791504 s
Testing time: 0.004194974899291992 s
Accuracy time: 0.0005702972412109375 s
F1 time: 0.0034019947052001953 s

Accuracy: 85.09760791861424 %
F1 score: 78.2463150647983 %

Perceptron with alpha 0.31622776601683794 and penalty l2
Training time: 0.15801501274108887 s
Testing time: 0.0037915706634521484 s
Accuracy time: 0.0005228519439697266 s
F1 time: 0.0015716552734375 s

Accuracy: 85.09760791861424 %
F1 score: 78.2463150647983 %

Perceptron with alpha 0.31622776601683794 and penalty l1
Training time: 0.2955293655395508 s
Testing time: 0.004208087921142578 s
Accuracy time: 0.00046825408935546875 s
F1 time: 0.0012392997741699219 s

Accuracy: 14.902392081385758 %
F1 sco

The runtime of this algorithm is much faster than the runtime of the knn algorithm. It performs the best out of the three algorithms. THe accuracy and the f1-score are very good and constant over all parameters.

# Decision Tree

In [235]:
f1Scores = []
accuracyScores = []

trainTimes = []
predictionTimes = []
accuracyTimes = []
f1Times = []

# algo input parameter variation lists
minSamplesSplits = [2, 25, 50, 100, 250, 1000]
minSamplesLeafs = [1, 25, 50, 100, 1000]

for mSS in minSamplesSplits:
    for mSL in minSamplesLeafs:
        print('Decision Tree with min sample splits', mSS, 'and min sample leafs', mSL)

        decisionTree = DecisionTreeClassifier(
            min_samples_split=mSS,
            min_samples_leaf=mSL
        )

        # Train the algorithm
        start = time.time()
        decisionTree.fit(xTrain, trainLabels)
        end = time.time()
        trainTime = end - start
        trainTimes.append(end - start)

        # Predict
        start = time.time()
        predicted = decisionTree.predict(xTest)
        end = time.time()
        predictionTime = end - start
        predictionTimes.append(end - start)

        # Effectiveness measurement
        start = time.time()
        accuracyScore = accuracy_score(testLabels, predicted)
        end = time.time()
        accuracyScores.append(accuracyScore)
        accuracyTime = (end - start)
        accuracyTimes.append(accuracyTime)

        # To evaluate the result, we will use the f1_score
        start = time.time()
        f1Score = f1_score(testLabels, predicted, average='weighted')
        end = time.time()
        f1Scores.append(f1Score)
        f1Time = end - start
        f1Times.append(f1Time)

        print('Training time:', trainTime, 's')
        print('Testing time:', predictionTime, 's')
        print('Accuracy time:', accuracyTime, 's')
        print('F1 time:', f1Time, 's')
        print()

        print('Accuracy:', accuracyScore * 100, '%')
        print('F1 score:', f1Score * 100, '%')
        print()

decisionTreeResults = printTrainTimes('DecisionTree', trainTimes, predictionTimes, accuracyTimes, f1Times,
                                      accuracyScores, f1Scores)



Decision Tree with min sample splits 2 and min sample leafs 1
Training time: 0.5715327262878418 s
Testing time: 0.005532979965209961 s
Accuracy time: 0.0005567073822021484 s
F1 time: 0.0015556812286376953 s

Accuracy: 97.82788012097883 %
F1 score: 97.82036431907973 %

Decision Tree with min sample splits 2 and min sample leafs 25
Training time: 0.3790569305419922 s
Testing time: 0.006223440170288086 s
Accuracy time: 0.0005769729614257812 s
F1 time: 0.0013267993927001953 s

Accuracy: 98.7627165246082 %
F1 score: 98.74038754547475 %

Decision Tree with min sample splits 2 and min sample leafs 50
Training time: 0.3062736988067627 s
Testing time: 0.004942178726196289 s
Accuracy time: 0.0006055831909179688 s
F1 time: 0.002058744430541992 s

Accuracy: 98.7627165246082 %
F1 score: 98.74038754547475 %

Decision Tree with min sample splits 2 and min sample leafs 100
Training time: 0.2425541877746582 s
Testing time: 0.004795551300048828 s
Accuracy time: 0.0006315708160400391 s
F1 time: 0.0016608

The decision tree has the second-best runtime of the algorithms. The accuracy and the f1-score is constantly very high and has the best results of all three algorithms.

# Results

Here I compared the mean times and mean scores of all three algorithms.

In [236]:
print(
    tabulate(
        [
            knnResults,
            perceptronResults,
            decisionTreeResults
        ],
        ["Algorithm", "Mean Train Time", "Mean Prediction Time", "Mean Accuracy Time", "Mean F1 Time", "Accuracy",
         "F1"],
        tablefmt="grid"
    )
)

+--------------------+-------------------+------------------------+----------------------+----------------+------------+--------+
| Algorithm          | Mean Train Time   | Mean Prediction Time   | Mean Accuracy Time   | Mean F1 Time   | Accuracy   | F1     |
| k-Nearest-Neighbor | 1.60001s          | 38.22458s              | 0.00054s             | 0.00150s       | 85.01%     | 84.19% |
+--------------------+-------------------+------------------------+----------------------+----------------+------------+--------+
| Perceptron         | 0.23120s          | 0.01242s               | 0.00066s             | 0.00184s       | 77.83%     | 70.91% |
+--------------------+-------------------+------------------------+----------------------+----------------+------------+--------+
| DecisionTree       | 0.31189s          | 0.00545s               | 0.00061s             | 0.00174s       | 95.76%     | 94.36% |
+--------------------+-------------------+------------------------+----------------------+

Above we see that knn is the slowest of the three algorithms. knn performs the second-best accuracy and f1-score. The perceptron is the fastest of the three algorithms, but produces the least accurate accuracy- and f1-scores. Decision tree produces the best results. It has a very good accuracy and f1-score.

By removing some columns from the dataset the computation time was reduced drastically and the and the accuracy and f1-score increased.