In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from time import time
import random
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import f1_score as f1_score_rep
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import matplotlib as plt

In [2]:
train = pd.read_csv("F://CyberLab/MQTT/Data/FINAL_CSV/train70.csv")
test = pd.read_csv("F://CyberLab/MQTT/Data/FINAL_CSV/test30.csv")

In [3]:
class_names = train.target.unique()
train = train.astype('category')
cat_col = train.select_dtypes(['category']).columns
train[cat_col] = train[cat_col].apply(lambda x: x.cat.codes)
x_col = train.columns.drop('target')
x_train = train[x_col].values
y_train = train['target']

In [4]:
class_names = test.target.unique()
test = test.astype('category')
cat_col = test.select_dtypes(['category']).columns
test[cat_col] = test[cat_col].apply(lambda x: x.cat.codes)
x_col = test.columns.drop('target')
x_test = test[x_col].values
y_test = test['target']

In [5]:
from sklearn import preprocessing
X_train_scaled = preprocessing.scale(x_train)
X_test_scaled = preprocessing.scale(x_test)

In [6]:
x_train = np.copy(X_train_scaled)
x_test = np.copy(X_test_scaled)

In [7]:
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [8]:
def report(test, pred):
    cm = confusion_matrix(test, pred)
    cm_df = pd.DataFrame(cm)
    
    tp_arr = []
    fp_arr = []
    tn_arr = []
    fn_arr = []
    for i in range(cm_df.shape[0]):
      tp = cm_df[i][i]
      fp = sum(cm_df[i])-tp
      fn = sum(cm_df.iloc[i])-tp
      total = cm_df.sum().sum()
      tn = total-fp-fn-tp
      tpr = tp/(tp+fn)
      fpr = fp/(tn+fp)
      fnr = fn/(fn+tp)
      tnr = tn/(tn+fp)
      tp_arr.append(tpr)
      tn_arr.append(tnr)
      fp_arr.append(fpr)
      fn_arr.append(fnr)
    
    print("TPR: ", np.mean(tp_arr))
    print("FPR: ", np.mean(fp_arr))
    print("FNR: ", np.mean(fn_arr))
    print("TNR: ", np.mean(tn_arr))
    return

In [8]:
### DesicionTree
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=0)
clf = clf.fit(x_train, y_train)
dt_pred = clf.predict(x_test)

In [9]:
print('****************** Decision Tree prediction results ******************')
print("Accuracy: ", accuracy_score(y_test, dt_pred))
print("Micro F1 Score: ", f1_score_rep(y_test, dt_pred, average="micro"))
print("Macro F1 Score: ", f1_score_rep(y_test, dt_pred, average="macro"))

****************** Decision Tree prediction results ******************
Accuracy:  0.6355815058412975
Micro F1 Score:  0.6355815058412975
Macro F1 Score:  0.2573396797163891


In [11]:
report(y_test, dt_pred)

NameError: name 'dt_pred' is not defined

In [11]:
### Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
gb = GaussianNB()
gb = gb.fit(x_train, y_train)
gb_pred = gb.predict(x_test)

In [12]:
print('****************** Gaussian NB prediction results ******************')
print("Accuracy: ", accuracy_score(y_test, gb_pred))
print("Micro F1 Score: ", f1_score_rep(y_test, gb_pred, average="micro"))
print("Macro F1 Score: ", f1_score_rep(y_test, gb_pred, average="macro"))

****************** Gaussian NB prediction results ******************
Accuracy:  0.011025100665881978
Micro F1 Score:  0.011025100665881978
Macro F1 Score:  0.06140052595282137


In [13]:
report(y_test, gb_pred)

TPR:  0.2010287808183532
FPR:  0.16662548287694698
FNR:  0.7989712191816468
TNR:  0.833374517123053


In [None]:
### k-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn = knn.fit(x_train, y_train)
knn_pred = knn.predict(x_test)

In [None]:
print('****************** Gaussian NB prediction results ******************')
print("Accuracy: ", accuracy_score(y_test, knn_pred))
print("Micro F1 Score: ", f1_score_rep(y_test, knn_pred, average="micro"))
print("Macro F1 Score: ", f1_score_rep(y_test, knn_pred, average="macro"))

In [None]:
report(y_test, knn_pred)

### Support Vector Machine
from sklearn.svm import SVC
svm = SVC(kernel='rbf', C = 1)
t_s = time()
svm = svm.fit(x_train, y_train)
print(time() - t_s)
svm_pred = svm.predict(x_test)

print('****************** SVM prediction results ******************')
print("Accuracy: ", accuracy_score(y_test, svm_pred))
print("Micro F1 Score: ", f1_score_rep(y_test, svm_pred, average="micro"))
print("Macro F1 Score: ", f1_score_rep(y_test, svm_pred, average="macro"))

In [8]:
### Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=0)
t_s = time()
lr = lr.fit(x_train, y_train)
print(time() - t_s)
lr_pred = lr.predict(x_test)

196.60500383377075


In [9]:
print('****************** Logistic Regression prediction results ******************')
print("Accuracy: ", accuracy_score(y_test, lr_pred))
print("Micro F1 Score: ", f1_score_rep(y_test, lr_pred, average="micro"))
print("Macro F1 Score: ", f1_score_rep(y_test, lr_pred, average="macro"))

****************** Logistic Regression prediction results ******************
Accuracy:  0.9947047290477838
Micro F1 Score:  0.9947047290477838
Macro F1 Score:  0.6106085502074924


In [12]:
def report(test, pred):
    cm = confusion_matrix(test, pred)
    cm_df = pd.DataFrame(cm)
    
    tp_arr = []
    fp_arr = []
    tn_arr = []
    fn_arr = []
    for i in range(cm_df.shape[0]):
      tp = cm_df[i][i]
      fp = sum(cm_df[i])-tp
      fn = sum(cm_df.iloc[i])-tp
      total = cm_df.sum().sum()
      tn = total-fp-fn-tp
      tpr = tp/(tp+fn)
      fpr = fp/(tn+fp)
      fnr = fn/(fn+tp)
      tnr = tn/(tn+fp)
      tp_arr.append(tpr)
      tn_arr.append(tnr)
      fp_arr.append(fpr)
      fn_arr.append(fnr)
    
    print("TPR: ", np.mean(tp_arr))
    print("FPR: ", np.mean(fp_arr))
    print("FNR: ", np.mean(fn_arr))
    print("TNR: ", np.mean(tn_arr))
    return
report(y_test, lr_pred)

TPR:  0.4970425933361226
FPR:  0.06236704128121057
FNR:  0.5029574066638774
TNR:  0.9376329587187894


In [None]:
### Multi-layer Perceptron
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(random_state=1, max_iter=100)
mlp = mlp.fit(x_train, y_train)
mlp_pred = mlp.predict(x_test)

In [None]:
print('****************** MLP prediction results ******************')
print("Accuracy: ", accuracy_score(y_test, mlp_pred))
print("Micro F1 Score: ", f1_score_rep(y_test, mlp_pred, average="micro"))
print("Macro F1 Score: ", f1_score_rep(y_test, mlp_pred, average="macro"))

In [None]:
report(y_test, mlp_pred)

In [9]:
### Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_depth=5, random_state=0)
rf = rf.fit(x_train, y_train)
rf_pred = rf.predict(x_test)

In [10]:
print('****************** RF prediction results ******************')
print("Accuracy: ", accuracy_score(y_test, rf_pred))
print("Micro F1 Score: ", f1_score_rep(y_test, rf_pred, average="micro"))
print("Macro F1 Score: ", f1_score_rep(y_test, rf_pred, average="macro"))

****************** RF prediction results ******************
Accuracy:  0.9943054868078997
Micro F1 Score:  0.9943054868078997
Macro F1 Score:  0.40614848049847385


In [11]:
report(y_test, rf_pred)

TPR:  0.34227308742191437
FPR:  0.0690584344212764
FNR:  0.6577269125780857
TNR:  0.9309415655787237


In [12]:
### AdaBoost Classifier
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(n_estimators=100)
ada = ada.fit(x_train, y_train)
ada_pred = ada.predict(x_test)

In [13]:
print('****************** ADA prediction results ******************')
print("Accuracy: ", accuracy_score(y_test, ada_pred))
print("Micro F1 Score: ", f1_score_rep(y_test, ada_pred, average="micro"))
print("Macro F1 Score: ", f1_score_rep(y_test, ada_pred, average="macro"))

****************** ADA prediction results ******************
Accuracy:  0.7190854897104763
Micro F1 Score:  0.7190854897104763
Macro F1 Score:  0.14474168785613004


In [14]:
report(y_test, ada_pred)

TPR:  0.15309499822291842
FPR:  0.20649498780664255
FNR:  0.8469050017770816
TNR:  0.7935050121933576


In [None]:
### Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
gb = gb.fit(x_train, y_train)
gb_pred = gb.predict(x_test)

In [None]:
print('****************** GB prediction results ******************')
print("Accuracy: ", accuracy_score(y_test, gb_pred))
print("Micro F1 Score: ", f1_score_rep(y_test, gb_pred, average="micro"))
print("Macro F1 Score: ", f1_score_rep(y_test, gb_pred, average="macro"))

In [None]:
report(y_test, gb_pred)

In [15]:
### XGBoost
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb = xgb.fit(x_train, y_train)
xgb_pred = xgb.predict(x_test)

In [16]:
print('****************** GB prediction results ******************')
print("Accuracy: ", accuracy_score(y_test, xgb_pred))
print("Micro F1 Score: ", f1_score_rep(y_test, xgb_pred, average="micro"))
print("Macro F1 Score: ", f1_score_rep(y_test, xgb_pred, average="macro"))

****************** GB prediction results ******************
Accuracy:  0.6319052766745963
Micro F1 Score:  0.6319052766745963
Macro F1 Score:  0.39569944656616


In [17]:
report(y_test, xgb_pred)

TPR:  0.4995115399751138
FPR:  0.062004103249792615
FNR:  0.5004884600248861
TNR:  0.9379958967502074
