# Libraries

In [44]:
from itertools import product

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

# Load Dataset

In [5]:
# Both splits are read with header=None, so pandas labels the columns with
# integers instead of taking names from the first row.
train_csv = "./dataset/data_train.csv"
test_csv = "./dataset/data_test.csv"

data_train = pd.read_csv(train_csv, header=None)
data_test = pd.read_csv(test_csv, header=None)

In [12]:
# Preview the training frame as the cell output.
data_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,47,100,27,81,57,37,26,0,0,23,56,53,100,90,40,98,8
1,0,89,27,100,42,75,29,45,15,15,37,0,69,2,100,6,2
2,0,57,31,68,72,90,100,100,76,75,50,51,28,25,16,0,1
3,0,100,7,92,5,68,19,45,86,34,100,45,74,23,67,0,4
4,0,67,49,83,100,100,81,80,60,60,40,40,33,20,47,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7489,0,82,9,59,56,34,41,0,10,30,3,67,42,96,100,100,5
7490,49,100,0,70,24,56,100,65,86,85,44,77,21,38,6,0,4
7491,100,98,60,100,24,87,3,58,35,51,58,26,36,0,0,5,5
7492,59,65,91,100,84,96,72,50,51,8,0,0,45,1,100,0,1


In [11]:
# Distinct values found in column 16 (the column later used as the target).
label_values = data_train[16].values
set(label_values)

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}

# Random Forest

In [29]:
def deep_evaluation(true_list, predict_list):
    """
    Print the confusion matrix followed by accuracy and macro-averaged
    recall, precision and F1.

    Every score is printed as a percentage rounded to two decimals.

    :param true_list: array-like of true labels
    :param predict_list: array-like of predicted labels, aligned with true_list
    :return: Nothing (the report is printed)
    """
    def as_percent(score):
        # Scale a 0..1 score to a percentage with two decimals.
        return round(score * 100, 2)

    # A class that is never predicted (or never present) has an undefined
    # per-class score. zero_division=0 counts it as 0 -- the same value the
    # default "warn" setting uses -- without emitting UndefinedMetricWarning
    # in the middle of the printed report.
    macro = {"average": "macro", "zero_division": 0}

    print(f"Confusion Matrix:\n {confusion_matrix(true_list, predict_list)}")
    print(f"Accuracy = {as_percent(accuracy_score(true_list, predict_list))}%")
    print(f"Recall = {as_percent(recall_score(true_list, predict_list, **macro))}%")
    print(f"Precision = {as_percent(precision_score(true_list, predict_list, **macro))}%")
    print(f"F1 = {as_percent(f1_score(true_list, predict_list, **macro))}%")

In [40]:
# Column 16 is the target; every other column is used as a feature.
X_train = data_train.drop(columns=[16])
y_train = data_train[16]

# 15 depth-3 trees, each split drawing from 3 candidate features; seeded for repeatability.
random_forest = RandomForestClassifier(
    n_estimators=15,
    max_depth=3,
    max_features=3,
    random_state=0,
)
random_forest.fit(X_train, y_train)

RandomForestClassifier(max_depth=3, max_features=3, n_estimators=15,
                       random_state=0)

In [41]:
# Score the fitted forest on the test split: column 16 holds the true labels,
# the remaining columns are the model inputs.
true_list = data_test[16].values
test_inputs = data_test.drop(columns=[16])
predict_list = random_forest.predict(test_inputs)

deep_evaluation(true_list, predict_list)

Confusion Matrix:
 [[315   4   0   0   0   0   1   3  40   0]
 [  0 188 135   1   2   0   0   0   0  38]
 [  0  18 342   1   1   0   0   2   0   0]
 [  0   7   0 328   0   0   0   0   0   1]
 [  0   2   0   1 354   0   0   0   0   7]
 [  0   0   0 116  13 162   0   0   6  38]
 [  3   0   2   3   0   0 297  29   2   0]
 [  0  22  10   3   0   3   3 294   0  29]
 [ 16   0   0   0   0  12   2  22 283   1]
 [  0  52   0  63  13   0   2   0   2 204]]
Accuracy = 79.1%
Recall = 78.97%
Precision = 80.64%
F1 = 78.49%
Collecting xgboost
  Downloading xgboost-1.5.1-py3-none-win_amd64.whl (106.6 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.5.1


# AdaBoost

In [37]:
# The feature/label views do not depend on the ensemble size, so build them
# once instead of re-dropping the label column on every iteration.
ada_train_features = data_train.drop(columns=[16])
ada_train_labels = data_train[16]
ada_test_features = data_test.drop(columns=[16])
true_list = data_test[16].values

# Compare AdaBoost ensembles of increasing size on the test split.
for number_trees in [5, 10, 20, 50]:
    print(f"\nNumber Trees = {number_trees}")
    ada_boost = AdaBoostClassifier(n_estimators=number_trees, random_state=0)
    ada_boost.fit(ada_train_features, ada_train_labels)

    predict_list = ada_boost.predict(ada_test_features)

    deep_evaluation(true_list, predict_list)


Number Trees = 5
Confusion Matrix:
 [[328   0   1   1   1   0  25   0   7   0]
 [  0 135 216   3   9   0   1   0   0   0]
 [  0   0 338   5   0   0  13   8   0   0]
 [  0   2 329   4   0   0   0   0   0   1]
 [  0   5   0   3 336   0  18   0   0   2]
 [166   5  75  84   3   0   0   0   1   1]
 [  9   0   8   5   5   0 297  12   0   0]
 [  3  57  17   5  23   0   0 250   5   4]
 [165   0   2   0   1   0   1  63 104   0]
 [  0  33 188  41  34   0   0   0   1  39]]
Accuracy = 52.34%
Recall = 51.34%
Precision = 54.87%
F1 = 46.44%

Number Trees = 10
Confusion Matrix:
 [[298   0   1   0   0   0  62   2   0   0]
 [  0 215 143   0   0   0   1   1   0   4]
 [  0   8 330   4   0   0  13   8   0   1]
 [  0   2 328   4   0   0   0   1   0   1]
 [  0   0   0   0 327   0  18   0   0  19]
 [  7   5  75  48   2   4  31 132   0  31]
 [  1   0   8   0   1   0 313  11   0   2]
 [  0  31  12   2   0   0   0 297   0  22]
 [ 59   0   1   0   0   1 104 150  21   0]
 [  0  13 186  12  12   0   2   1   0 110]

  _warn_prf(average, modifier, msg_start, len(result))


Accuracy = 54.86%
Recall = 53.78%
Precision = 63.66%
F1 = 47.8%

Number Trees = 20
Confusion Matrix:
 [[323   0   1   0   1   0  26   0  12   0]
 [  0 207 138   5   0   0   2   0   0  12]
 [  0   8 334   1   0   0  13   8   0   0]
 [  0   2  60 245   0  19   9   0   0   1]
 [  1   9   3   0 324   0  19   0   0   8]
 [165  12   0 119   2  11  15   1   2   8]
 [  0   0   0   8   0   0 320   5   1   2]
 [  0  31  12   2   0   0   9 281   7  22]
 [184   0  11   0   0   0   6  32 103   0]
 [  0  16   9 168  12  30  20   0   1  80]]
Accuracy = 63.69%
Recall = 62.97%
Precision = 63.88%
F1 = 59.36%

Number Trees = 50
Confusion Matrix:
 [[336   0   1   0   0   0  23   0   3   0]
 [  0 212 138   5   4   0   2   0   0   3]
 [  0   8 334   1   0   0  13   8   0   0]
 [  0   2  60 245   0  19   9   0   0   1]
 [  1   9   3   0 325   0  19   0   0   7]
 [166  12   0 119   2  11  15   0   2   8]
 [  0   0   0   8   0   0 308   5  13   2]
 [  0  31  12   2   0   0   9 270  18  22]
 [264   0  10   0   

# XGBoost

In [58]:
# Baseline: XGBoost with its default hyperparameters, trained on the full
# training frame (column 16 is the target) and scored on the test split.
xgb_inputs, xgb_targets = data_train.drop(columns=[16]), data_train[16]

xgboost = XGBClassifier()
xgboost.fit(xgb_inputs, xgb_targets)

true_list = data_test[16].values
predict_list = xgboost.predict(data_test.drop(columns=[16]))

deep_evaluation(true_list, predict_list)



Confusion Matrix:
 [[343   0   0   0   0   0   0   0  20   0]
 [  0 345  18   0   0   0   0   1   0   0]
 [  0   1 363   0   0   0   0   0   0   0]
 [  0   2   0 332   0   0   0   1   0   1]
 [  0   1   0   0 361   2   0   0   0   0]
 [  0   0   0   5   0 320   1   0   2   7]
 [  0   0   1   0   0   1 334   0   0   0]
 [  0  20   3   1  13   0   1 326   0   0]
 [  2   0   0   0   0   0   0   0 334   0]
 [  0   4   0   6   0   0   0   1   1 324]]
Accuracy = 96.68%
Recall = 96.73%
Precision = 96.79%
F1 = 96.71%


In [57]:
# Display the estimator's repr to inspect the hyperparameters it is using.
xgboost

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [78]:
# Hold out the last 30% of the training rows for validation. The split is
# positional (no shuffling): the first 70% of rows are used for fitting.
split_at = int(len(data_train) * 0.7)
train = data_train.iloc[:split_at]
validation = data_train.iloc[split_at:]

# Loop-invariant feature/label views, built once for every candidate.
fit_features, fit_labels = train.drop(columns=[16]), train[16]
validation_features = validation.drop(columns=[16])
validation_labels = validation[16].values

best_accuracy = 0
best_xgboost = None
best_params = None  # hyperparameters of best_xgboost, kept so the winner can be reproduced

# product() varies its last argument fastest, so gamma is the outermost and
# learning_rate the innermost factor of the grid.
for gamma, max_depth, subsample, learning_rate in product(
    [0, 0.5, 1], [3, 6, 9], [0.8, 1], [0.1, 0.3, 0.5]
):
    # A separate name is used for each candidate so the model bound to
    # `xgboost` by the baseline cell is not overwritten by the search.
    candidate = XGBClassifier(learning_rate=learning_rate, n_estimators=100,
                              max_depth=max_depth, subsample=subsample, colsample_bytree=1,
                              gamma=gamma, random_state=0)
    candidate.fit(fit_features, fit_labels)

    new_accuracy = accuracy_score(validation_labels, candidate.predict(validation_features))
    # Strict ">" keeps the first candidate among ties.
    if new_accuracy > best_accuracy:
        best_accuracy = new_accuracy
        best_xgboost = candidate
        best_params = {
            "gamma": gamma,
            "max_depth": max_depth,
            "subsample": subsample,
            "learning_rate": learning_rate,
        }

# Best validation accuracy, then the selected model's report on the test split.
print(best_accuracy)
predict_list = best_xgboost.predict(data_test.drop(columns=[16]))
true_list = data_test[16].values

deep_evaluation(true_list, predict_list)





















































































































































































































0.9902178746109382
Confusion Matrix:
 [[342   0   0   0   0   0   0   0  21   0]
 [  0 344  16   0   1   2   0   1   0   0]
 [  0   4 359   0   0   0   0   1   0   0]
 [  0   2   0 332   0   0   0   1   0   1]
 [  0   0   1   0 363   0   0   0   0   0]
 [  0   0   0   6   0 321   0   0   2   6]
 [  0   0   1   0   0   1 332   0   2   0]
 [  0  29   3   3   2   0   0 318   0   9]
 [  1   0   0   0   0   0   0   0 335   0]
 [  0   2   0   9   0   0   0   1   1 323]]
Accuracy = 96.31%
Recall = 96.37%
Precision = 96.44%
F1 = 96.33%
