# 1. Libraries

In [None]:
import pandas as pd

# https://xgboost.readthedocs.io/en/latest/
import xgboost

# https://scikit-learn.org/stable/modules/svm.html
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
# defining scoring strategy:
# https://scikit-learn.org/stable/modules/model_evaluation.html#defining-your-scoring-strategy-from-metric-functions
# scoring needs to be changed with string, ie : LogisticRegressionCV(cv=10, random_state=0,multi_class='multinomial', scoring="f1_score").fit(samples, labels)
# https://scikit-learn.org/stable/modules/cross_validation.html
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# 2. Dataset

In [None]:
# Read the scaled dataset (scaling is assumed from the file name - TODO confirm).
# index_col=False tells pandas not to use the first CSV column as the row index.
dataset_csv_path = "Tennessee_Event-Driven/datasets/dataset_standard_scaled.csv"
dataset_scaled = pd.read_csv(dataset_csv_path, index_col=False)

In [None]:
# Split the frame into a feature matrix and a target vector.
# The label column is removed *by name* so the features can never contain the
# target, regardless of where "fault_id" sits in the CSV (the previous
# positional `columns[:-1]` slice was only correct if it was the last column).
LABEL_COLUMN = "fault_id"
samples = dataset_scaled.drop(columns=LABEL_COLUMN).values
labels = dataset_scaled[LABEL_COLUMN].values

In [None]:
# Hold out 10% of the rows as a test set.
# - random_state pins the shuffle so the split, and every metric reported on
#   it, is the same after Restart Kernel -> Run All.
# - stratify=labels keeps the class proportions equal in both splits, so no
#   fault class ends up under-represented in train or test by chance.
samples_train, samples_test, labels_train, labels_test = train_test_split(
    samples,
    labels,
    test_size=0.1,
    random_state=42,
    stratify=labels,
)

# 3. Classifiers

In [None]:
def evaluate_classifier(name, clf):
    """Cross-validate, fit and report a single classifier.

    Runs 10-fold cross-validation scored with weighted F1 on the notebook-level
    ``samples`` / ``labels`` arrays, then fits ``clf`` on
    ``samples_train`` / ``labels_train`` and prints the classification report
    and confusion matrix for its predictions on ``samples_test``.

    Parameters
    ----------
    name : str
        Prefix used on every printed report line (e.g. ``"LR"``).
    clf : estimator
        scikit-learn compatible classifier. It is fitted in place on the
        training split, so it can be inspected afterwards.

    Returns
    -------
    tuple
        ``(cv_scores, predicted)``: the array of per-fold cross-validation
        scores and the predicted labels for ``samples_test``.
    """
    cv_scores = cross_val_score(clf, samples, labels, cv=10, scoring="f1_weighted")
    clf.fit(samples_train, labels_train)
    predicted = clf.predict(samples_test)
    print(name + " 10CV f1_weighted scores : " + str(cv_scores))
    print(name + " classification report :\n" + str(classification_report(labels_test, predicted)))
    print(name + " confusion matrix :\n" + str(confusion_matrix(labels_test, predicted)))
    return cv_scores, predicted


# Logistic regression
# For multiclass problems, only 'newton-cg', 'sag', 'saga' and 'lbfgs' handle multinomial loss; 'liblinear' is limited to one-versus-rest schemes.
# TODO: the choice of solver is still open; 'lbfgs' is used for now.
LR_clf = LogisticRegression(multi_class="multinomial", solver="lbfgs")
LRscores, LR_predicted = evaluate_classifier("LR", LR_clf)

# SVM
# about gamma='scale' issue : https://stackoverflow.com/questions/52582796/support-vector-regression-typeerror-must-be-real-number-not-str
SVM_clf = svm.SVC(decision_function_shape="ovo")
SVMscores, SVM_predicted = evaluate_classifier("SVM", SVM_clf)

# XGBoost (seeded so repeated runs use the same random state)
XGBOOST_clf = xgboost.XGBClassifier(random_state=42)
XGBOOSTscores, XGBOOST_predicted = evaluate_classifier("XGBOOST", XGBOOST_clf)
print("XGBOOST features importances :\n" + str(XGBOOST_clf.feature_importances_))

# Random forest (seeded: its bootstrap and feature sampling are random, so
# without a fixed random_state the scores change on every run)
RF_clf = RandomForestClassifier(random_state=42)
RFscores, RF_predicted = evaluate_classifier("Random Forest", RF_clf)
print("Random Forest features importances :\n" + str(RF_clf.feature_importances_))

## 3.1. Results
LR 10CV f1_weighted scores : 
    
    [0.42448604 0.45865517 0.47881965 0.24104686 0.25607518 0.43588939 0.46481111 0.48854089 0.49014324 0.41764563]
LR classification report :

                  precision    recall  f1-score   support

               0       0.11      0.07      0.09       135
               1       0.94      0.84      0.89       151
               2       0.88      0.81      0.84       130
               3       0.12      0.14      0.13       137
               4       0.66      0.90      0.76       153
               5       0.88      0.92      0.90       137
               6       1.00      0.89      0.94       164
               7       0.98      0.91      0.94       139
               8       0.28      0.41      0.33       151
               9       0.12      0.11      0.12       134
              10       0.54      0.39      0.45       161
              11       0.05      0.07      0.06       134
              12       0.29      0.27      0.28       138
              13       0.46      0.51      0.49       139
              14       0.03      0.01      0.02       134
              15       0.12      0.14      0.13       132
              16       0.62      0.33      0.43       138
              17       0.72      0.71      0.71       154
              18       0.87      0.81      0.84       161
              19       0.23      0.19      0.21       160
              20       0.54      0.61      0.58       147
              21       0.27      0.43      0.33       141

        accuracy                           0.49      3170
       macro avg       0.49      0.48      0.48      3170
    weighted avg       0.50      0.49      0.49      3170

LR confusion matrix :

    [[ 10   0   0  25   0   0   0   0  16   9   1  16   3   4   7  13   1   0    2  12   1  15]
     [  3 127   0   1   0   0   0   0   0   1   1   4   0   2   2   5   1   0    0   2   0   2]
     [  1   0 105   2   0   0   0   0   1   3   1   4   0   0   2   1   1   0    0   3   2   4]
     [  9   0   0  19   0   0   0   0  12   7   6  21   4   5   7  14   0   0    2  13   2  16]
     [  0   0   0   4 138   0   0   0   1   3   1   2   0   0   0   2   0   0    0   1   1   0]
     [  0   0   0   3   0 126   0   0   0   1   1   3   0   0   0   0   0   0    0   1   0   2]
     [  1   0   0   1   0   0 146   0   3   3   0   3   1   0   1   0   0   0    0   0   0   5]
     [  1   0   0   0   0   0   0 127   1   2   0   1   0   0   1   5   0   0    0   0   0   1]
     [  2   7  14   2   1   0   0   0  62   1   4   5  17  12   1   3   4   1    0   4  10   1]
     [  9   0   0  12   0   0   0   0  12  15   4  18   6   2   7  10   2   0    2   9   7  19]
     [  3   0   0   8   0   0   0   0   9   9  63  10  10   5   2  14   0   0    0   6   4  18]
     [  6   0   0   5  28   0   0   1  15   8   3   9   3   8   2  16   1   5    1   9   4  10]
     [  2   0   0  10   0  15   0   1  13   5   3   5  37  21   1   5   1   0    9   4   2   4]
     [  2   0   1   2   2   0   0   0  14   0   0   5  13  71   2   2  10   0    1   1   0  13]
     [  4   1   0   3  33   0   0   0   4  21   3   8   1   0   2   7   2  37    0   4   2   2]
     [ 10   0   0  14   0   0   0   1  17   7   8  10   2   8   6  19   1   0    1   7   2  19]
     [  7   0   0  12   0   0   0   0   7   4   7   6   4   9   4  10  45   0    0  11   2  10]
     [  1   0   0   1   6   0   0   0   3   6   1   6   5   0   1   3   0 109    0   4   3   5]
     [  2   0   0   5   0   3   0   0   7   1   1   4   1   0   0   4   0   0  130   1   0   2]
     [ 10   0   0  11   0   0   0   0  14   9   3   6  18   0   3  11   4   0    1  31  30   9]
     [  1   0   0   8   0   0   0   0   5   3   2  13   1   1   6   4   0   0    1   5  90   7]
     [  3   0   0   6   1   0   0   0   6   7   3  14   2   5   9  15   0   0    0   5   4  61]]
SVM 10CV f1_weighted scores : 

    [0.5435794  0.6021387  0.5920755  0.30431432 0.32529879 0.5764909 0.59781791 0.62112622 0.63663635 0.66662602]
SVM classification report :

                  precision    recall  f1-score   support

               0       0.13      0.27      0.17       135
               1       1.00      0.83      0.91       151
               2       1.00      0.81      0.89       130
               3       0.13      0.28      0.17       137
               4       0.87      0.88      0.87       153
               5       0.96      0.88      0.92       137
               6       1.00      0.89      0.94       164
               7       1.00      0.91      0.95       139
               8       0.98      0.87      0.92       151
               9       0.14      0.24      0.18       134
              10       0.53      0.30      0.38       161
              11       0.36      0.19      0.25       134
              12       0.95      0.77      0.85       138
              13       0.97      0.85      0.90       139
              14       0.98      0.71      0.82       134
              15       0.13      0.33      0.18       132
              16       0.40      0.18      0.25       138
              17       0.88      0.75      0.81       154
              18       0.99      0.78      0.88       161
              19       0.60      0.56      0.58       160
              20       0.83      0.50      0.62       147
              21       0.93      0.27      0.42       141

        accuracy                           0.60      3170
       macro avg       0.72      0.59      0.63      3170
    weighted avg       0.72      0.60      0.64      3170

SVM confusion matrix :

    [[ 37   0   0  36   0   0   0   0   0  19   1   3   0   0   0  27   4   0    0   5   3   0]
     [  6 126   0   5   0   0   0   0   3   3   0   1   0   0   0   5   1   0    0   1   0   0]
     [  6   0 105   8   0   0   0   0   0   3   0   2   0   0   0   5   0   0    0   1   0   0]
     [ 22   0   0  39   0   0   0   0   0  24   2   4   0   0   0  35   2   0    0   6   2   1]
     [  3   0   0   4 135   0   0   0   0   4   0   2   0   0   0   4   0   0    0   1   0   0]
     [  2   0   0   3   0 121   0   0   0   0   0   1   0   0   0   4   1   0    0   5   0   0]
     [  3   0   0   0   0   0 146   0   0   6   0   2   0   0   0   6   0   0    0   0   0   1]
     [  1   0   0   2   0   0   0 127   0   3   1   0   0   0   0   5   0   0    0   0   0   0]
     [  4   0   0   3   0   0   0   0 132   3   1   1   2   1   0   2   2   0    0   0   0   0]
     [ 29   0   0  20   0   0   0   0   0  32   4   5   0   0   0  36   3   0    0   5   0   0]
     [ 23   0   0  25   0   0   0   0   0  13  48   4   0   1   0  25  10   0    0   7   5   0]
     [ 21   0   0  21  21   0   0   0   0  10   4  26   0   0   1  20   4   1    0   5   0   0]
     [  6   0   0   5   0   4   0   0   0   4   4   1 106   2   0   4   1   0    1   0   0   0]
     [  3   0   0   2   0   0   0   0   0   4   0   2   1 118   0   7   2   0    0   0   0   0]
     [  4   0   0   3   0   0   0   0   0   6   0   4   0   0  95   5   0  15    0   1   1   0]
     [ 34   0   0  26   0   0   0   0   0  19   1   1   0   0   0  43   2   0    0   5   1   0]
     [ 21   0   0  26   0   0   0   0   0  14  14   3   0   0   0  26  25   0    0   6   2   1]
     [  5   0   0  10   0   0   0   0   0   7   1   1   0   0   1  10   1 116    0   2   0   0]
     [  6   0   0   9   0   0   0   0   0   5   0   1   3   0   0   7   1   0  126   3   0   0]
     [ 17   0   0  22   0   0   0   0   0  17   1   1   0   0   0  11   1   0    0  90   0   0]
     [ 13   0   0  18   0   1   0   0   0  13   6   2   0   0   0  18   0   0    0   3  73   0]
     [ 22   0   0  24   0   0   0   0   0  14   2   5   0   0   0  28   2   0    0   5   1  38]]
XGBOOST 10CV f1_weighted scores : 
    
    [0.68392688 0.74462579 0.74093651 0.36879107 0.41598073 0.72445736 0.7336726  0.7663613  0.70552416 0.75007832]
XGBOOST classification report :

                  precision    recall  f1-score   support

               0       0.25      0.47      0.33       135
               1       1.00      0.85      0.92       151
               2       1.00      0.83      0.91       130
               3       0.34      0.52      0.41       137
               4       0.94      0.90      0.92       153
               5       0.94      0.91      0.93       137
               6       1.00      0.89      0.94       164
               7       0.99      0.91      0.95       139
               8       0.96      0.85      0.91       151
               9       0.34      0.43      0.38       134
              10       0.75      0.71      0.73       161
              11       0.86      0.69      0.77       134
              12       0.92      0.77      0.84       138
              13       0.93      0.85      0.89       139
              14       1.00      0.86      0.92       134
              15       0.26      0.43      0.32       132
              16       0.82      0.56      0.66       138
              17       0.94      0.84      0.89       154
              18       0.98      0.81      0.89       161
              19       0.71      0.72      0.72       160
              20       0.75      0.64      0.69       147
              21       1.00      0.91      0.95       141

        accuracy                           0.75      3170
       macro avg       0.80      0.74      0.77      3170
    weighted avg       0.81      0.75      0.77      3170
XGBOOST confusion matrix :

    [[ 63   0   0  23   0   0   0   0   0  22   1   2   0   0   0  17   1   0    0   6   0   0]
     [ 11 129   0   2   0   0   0   0   2   1   0   0   0   0   0   5   0   0    0   1   0   0]
     [ 12   0 108   2   0   0   0   0   0   2   0   0   0   0   0   2   0   0    0   2   2   0]
     [ 17   0   0  71   0   1   0   0   0  16   2   2   0   0   0  16   2   1    1   6   2   0]
     [  3   0   0   2 137   0   0   0   0   1   0   2   0   0   0   7   0   0    0   1   0   0]
     [  5   0   0   1   0 125   0   0   0   0   0   0   0   0   0   4   0   1    0   1   0   0]
     [  8   0   0   1   0   0 146   0   0   2   0   0   0   0   0   7   0   0    0   0   0   0]
     [  5   0   0   0   0   0   0 127   0   4   0   0   0   0   0   3   0   0    0   0   0   0]
     [  4   0   0   3   0   0   0   0 129   3   1   0   4   5   0   1   0   0    0   1   0   0]
     [ 15   0   0  18   0   0   0   0   0  58   3   1   0   0   0  26   2   1    0   6   4   0]
     [  9   0   0  10   0   0   0   0   0   6 114   0   1   0   0  12   7   0    0   1   1   0]
     [ 10   0   0   7   8   0   0   0   0   7   0  93   0   0   0   5   0   1    0   1   2   0]
     [  6   0   0   2   0   2   0   1   2   1   3   0 106   4   0   9   0   0    0   1   1   0]
     [  7   0   0   1   0   0   0   0   1   1   0   0   4 118   0   4   2   0    0   1   0   0]
     [  5   0   0   3   0   0   0   0   0   2   0   0   0   0 115   4   0   4    0   1   0   0]
     [ 24   0   0  17   0   0   0   0   0  14   7   2   0   0   0  57   1   0    0   5   5   0]
     [  7   0   0  12   0   1   0   0   0   7  16   0   0   0   0  10  77   0    0   2   6   0]
     [  6   0   0   2   0   0   0   0   0   6   0   0   0   0   0   7   0 130    0   1   2   0]
     [  4   0   0   3   0   4   0   0   0   5   0   0   0   0   0   8   0   0  131   5   1   0]
     [ 13   0   0  14   0   0   0   0   0   3   0   5   0   0   0   4   1   0    0 115   5   0]
     [ 14   0   0   8   0   0   0   0   0  10   4   1   0   0   0  10   1   0    1   4  94   0]
     [  4   0   0   4   0   0   0   0   0   2   0   0   0   0   0   2   0   0    0   0   1 128]]
 
XGBOOST features importances :
    
    [0.12379248 0.0047626  0.00850815 0.04373371 0.0364475  0.00078652 0.01043513 0.01692818 0.03933102 0.10539821 0.01340495 0.0005318 0.01391175 0.0010541  0.00046664 0.01039928 0.0308312  0.01555407 0.02439173 0.01242347 0.0381037  0.0296824  0.00856636 0.0043253 0.00754108 0.00109454 0.00474678 0.00864964 0.00869501 0.00915563 0.01988467 0.00742988 0.01122067 0.0132043  0.00754636 0.00789573 0.00406679 0.00796529 0.00421074 0.00391345 0.00462407 0.00291873 0.00429754 0.03122465 0.10151894 0.02033001 0.02878321 0.00032342 0.         0.01245736 0.03271366 0.03981758]
Random Forest 10CV f1_weighted scores : 

    [0.69059684 0.74095688 0.74139596 0.34923431 0.42912053 0.71319971 0.7265349  0.76775074 0.67521654 0.72273346]
Random Forest classification report :

                  precision    recall  f1-score   support

               0       0.39      0.72      0.51       135
               1       1.00      0.89      0.94       151
               2       1.00      0.83      0.91       130
               3       0.59      0.74      0.66       137
               4       0.88      0.89      0.88       153
               5       0.86      0.85      0.85       137
               6       0.99      0.90      0.95       164
               7       1.00      0.94      0.97       139
               8       1.00      0.96      0.98       151
               9       0.52      0.65      0.58       134
              10       0.96      0.84      0.90       161
              11       0.91      0.73      0.81       134
              12       1.00      0.90      0.95       138
              13       0.99      0.92      0.96       139
              14       1.00      0.87      0.93       134
              15       0.49      0.65      0.56       132
              16       0.93      0.82      0.87       138
              17       0.94      0.88      0.91       154
              18       1.00      0.84      0.92       161
              19       0.80      0.88      0.84       160
              20       0.85      0.72      0.78       147
              21       0.99      0.91      0.95       141

        accuracy                           0.84      3170
       macro avg       0.87      0.83      0.85      3170
    weighted avg       0.87      0.84      0.85      3170

Random Forest confusion matrix :

    [[ 97   0   0  10   0   0   0   0   0  14   0   1   0   0   0   9   0   0    0   4   0   0]
     [  8 135   0   1   0   0   0   0   0   2   0   0   0   0   0   5   0   0    0   0   0   0]
     [ 13   0 108   1   0   0   0   0   0   1   0   0   0   0   0   6   0   0    0   1   0   0]
     [ 12   0   0 102   0   1   0   0   0   4   0   1   0   0   0   8   1   1    0   5   2   0]
     [  0   0   0   0 136  14   0   0   0   1   0   2   0   0   0   0   0   0    0   0   0   0]
     [  4   0   0   2  10 116   0   0   0   3   0   0   0   0   0   1   0   1    0   0   0   0]
     [  9   0   0   1   0   0 148   0   0   1   0   0   0   0   0   5   0   0    0   0   0   0]
     [  3   0   0   0   0   0   0 131   0   1   0   0   0   0   0   4   0   0    0   0   0   0]
     [  3   0   0   2   0   0   0   0 145   0   0   0   0   0   0   1   0   0    0   0   0   0]
     [ 13   0   0  11   0   0   0   0   0  87   0   1   0   0   0   9   2   1    0   5   4   1]
     [  9   0   0   5   0   0   0   0   0   6 136   0   0   0   0   2   2   0    0   1   0   0]
     [  7   0   0   6   9   0   0   0   0   5   0  98   0   0   0   6   0   2    0   1   0   0]
     [  6   0   0   0   0   1   0   0   0   2   0   0 124   1   0   3   0   0    0   0   1   0]
     [  6   0   0   0   0   0   0   0   0   3   0   0   0 128   0   1   0   0    0   1   0   0]
     [  6   0   0   2   0   0   0   0   0   1   0   0   0   0 117   3   0   4    0   1   0   0]
     [ 14   0   0   7   0   1   0   0   0  15   2   0   0   0   0  86   1   0    0   3   3   0]
     [  7   0   0   4   0   0   0   0   0   4   2   0   0   0   0   2 113   0    0   2   4   0]
     [  4   0   0   4   0   0   0   0   0   4   0   0   0   0   0   5   0 135    0   1   1   0]
     [  6   0   0   2   0   1   0   0   0   5   0   0   0   0   0   7   0   0  136   3   1   0]
     [  1   0   0   7   0   0   0   0   0   3   0   3   0   0   0   3   2   0    0 140   1   0]
     [ 15   0   0   7   0   1   1   0   0   4   1   2   0   0   0   6   0   0    0   4 106   0]
     [  4   0   0   0   0   0   0   0   0   2   0   0   0   0   0   3   0   0    0   2   1 129]]
Random Forest features importances :

    [0.0373927  0.00810206 0.01020342 0.02006777 0.01795898 0.01199323 0.02037662 0.01540028 0.0343393  0.02601041 0.02151706 0.0064665 0.02009364 0.0069221  0.006431   0.02152803 0.0142307  0.02657733 0.03480786 0.02227191 0.04406107 0.01665294 0.01672006 0.01113766 0.01661354 0.00877211 0.01333297 0.01801175 0.01938749 0.01216083 0.01905983 0.0100468  0.01381571 0.01823266 0.01333095 0.01168955 0.01188482 0.01856537 0.01270057 0.01215902 0.01280374 0.00821088 0.01059093 0.03070568 0.06515119 0.02355875 0.02400868 0.00638174 0.00640568 0.0309878  0.06080765 0.01936067]

# 4. GridSearchCV for better parameter values

In [None]:
# Exhaustive grid search over random-forest hyper-parameters, each combination
# evaluated with 10-fold cross-validation on the full dataset.
param_grid = [
    {
        "n_estimators": [100, 200, 500],
        # "sqrt" replaces the old "auto" alias, which meant sqrt(n_features)
        # for classifiers and is no longer accepted by current scikit-learn.
        "max_features": ["sqrt", "log2"],
        "max_depth": [5, 10, 50, 100, None],
        "criterion": ["gini", "entropy"],
    }
]

RF_clf_gs = GridSearchCV(
    # n_estimators here is overridden by the grid; random_state makes all
    # parameter combinations comparable under the same seed.
    estimator=RandomForestClassifier(n_estimators=100, random_state=42),
    param_grid=param_grid,
    # The target is multiclass: plain "f1" is the binary scorer and is not
    # valid here. "f1_weighted" matches the metric used for the other models.
    scoring="f1_weighted",
    n_jobs=4,
    cv=10,
)
RF_clf_gs.fit(samples, labels)
means = RF_clf_gs.cv_results_["mean_test_score"]
stds = RF_clf_gs.cv_results_["std_test_score"]
# mean +/- 2 * std of the fold scores is used as a rough 95% interval.
print("RF 10CV f1_weighted score mean with 95% confidence interval : ")
for mean, std, params in zip(means, stds, RF_clf_gs.cv_results_["params"]):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))