Import the libraries we'll be using

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, zero_one_loss
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the two provided CSV files into DataFrames (testing file first, then training).
test_data, train_data = (
    pd.read_csv(csv_name)
    for csv_name in ('CMP417_testing dataset.csv', 'CMP417_training dataset.csv')
)

In [None]:
sns.set()

# Encode the categorical columns with ONE shared codebook for both splits.
# pd.factorize numbers categories in order of first appearance, so factorizing
# each frame on its own can give the same value (a protocol, a service, an
# attack category) different integer codes in the training and testing data.
# The model would then be evaluated on features/labels whose codes mean
# something different from what it was fitted on.
n_train_rows = len(train_data)
shared_uniques = {}
for column in ("proto", "service", "state", "attack_cat"):
    both_splits = pd.concat([train_data[column], test_data[column]], ignore_index=True)
    # Training rows come first, so they receive exactly the codes that
    # factorizing the training column alone would produce; categories that
    # only occur in the test split are numbered after them.
    codes, shared_uniques[column] = pd.factorize(both_splits)
    train_data[column] = codes[:n_train_rows]
    test_data[column] = codes[n_train_rows:]

# Lookup tables (integer code -> original value). The train_* and test_* names
# are both kept for later cells; they now refer to the same shared table.
train_protocols = test_protocols = shared_uniques["proto"]
train_services = test_services = shared_uniques["service"]
train_states = test_states = shared_uniques["state"]
train_attacks = test_attacks = shared_uniques["attack_cat"]

# NOTE: this cell overwrites the columns in place, so re-running it without
# reloading the CSVs would replace the lookup tables with plain integers.
train_data.head()

In [None]:
# Features are every column except the last; the target is the last column,
# flattened to a 1-D array.
X_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1:].values.ravel()

X_test = test_data.iloc[:, :-1]
y_test = test_data.iloc[:, -1:].values.ravel()

# Last column of both frames stacked together (training rows first); shown as the cell output.
labels = pd.concat([train_data, test_data]).iloc[:, -1:].values.ravel()
labels


In [None]:
# Baseline random forest fitted on the provided training split.
clf_default = RandomForestClassifier(
    n_estimators=102,
    random_state=3,
    n_jobs=-1,
)  # params provided by Dr. Kang and Dr. Kavianpour
trained_model = clf_default.fit(X_train, y_train)

# Per-class probabilities for the test rows.
y_probabilities = clf_default.predict_proba(X_test)

# Note: this score is computed on the same rows the model was fitted on.
train_score = trained_model.score(X_train, y_train)
print(f'Score: {train_score}')

In [None]:
# Score the baseline forest on the test rows.
y_pred = clf_default.predict(X_test)
error = zero_one_loss(y_test, y_pred)
results = confusion_matrix(y_test, y_pred)  # printed in the next cell
print(f'Error: {error}')
print(type(y_test))

In [None]:
# Cross-tabulate actual (rows) against predicted (columns) classes for the heatmap,
# then print the raw confusion matrix computed in the previous cell.
cd = pd.crosstab(
    y_test,
    y_pred,
    rownames=['Actual'],
    colnames=['Predicted'],
)
print(f'Confusion matrix:\n {results}')

In [None]:
# `cd` has the actual classes as rows and the predicted classes as columns,
# so the heatmap's x-axis is the predicted label and its y-axis the actual label.
# (The previous axis captions mixed the two up.)
sns.heatmap(cd, square=True, annot=True, fmt='d', cbar=True)
plt.title('Confusion matrix (baseline model)')
plt.xlabel('Predicted label')
plt.ylabel('Actual label')
plt.show()

The trained model's results on the provided test data are rather hit-and-miss. The two supplied files are not an exact 80:20 split, so let's concatenate the DataFrames and then re-split them using `train_test_split` instead...

In [None]:
# train_data / test_data had their categorical columns replaced by integer
# codes earlier in the notebook, each frame with its own lookup table.
# Map the codes back to the original values before combining the frames, so
# that the combined frame is encoded consistently and the lookup tables built
# below (`protocols`, `services`, `flags`, `attacks`) hold the real category
# values rather than the earlier integer codes.
def _restore_categories(frame, lookups):
    """Return a copy of `frame` with each encoded column mapped back through its lookup table."""
    restored = frame.copy()
    for column, uniques in lookups.items():
        # from_codes turns the code -1 (factorize's marker for missing values) into NaN.
        restored[column] = pd.Categorical.from_codes(frame[column], categories=uniques).astype(object)
    return restored


df = pd.concat(
    [
        _restore_categories(train_data, {
            "proto": train_protocols,
            "service": train_services,
            "state": train_states,
            "attack_cat": train_attacks,
        }),
        _restore_categories(test_data, {
            "proto": test_protocols,
            "service": test_services,
            "state": test_states,
            "attack_cat": test_attacks,
        }),
    ],
    ignore_index=True,
)

df["proto"], protocols = pd.factorize(df["proto"])
df["service"], services = pd.factorize(df["service"])
df["state"], flags = pd.factorize(df["state"])
df["attack_cat"], attacks = pd.factorize(df["attack_cat"])

# Every column except the last is a feature; the last column is the target.
features = df.iloc[:, :-1]
labels = df.iloc[:, -1].to_numpy()

# Split the FEATURES, not the whole frame: `df` still contains the target
# column, and passing it would let the classifier read the answer directly.
# The seed makes the split reproducible; change it to see how much the
# results vary from one random split to another.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, train_size=0.8, test_size=0.2, random_state=3
)
print ("X_train, y_train:", X_train.shape, y_train.shape)
print ("X_test, y_test:", X_test.shape, y_test.shape)

In [None]:
# Same forest configuration as the baseline, fitted on the re-split training rows.
clf = RandomForestClassifier(
    n_estimators=102,
    random_state=3,
    n_jobs=-1,
)  # params provided by Dr. Kang and Dr. Kavianpour
trained_model = clf.fit(X_train, y_train)

# Note: this score is computed on the same rows the model was fitted on.
resplit_train_score = trained_model.score(X_train, y_train)
print(f'Score: {resplit_train_score}')

In [None]:
# Score the re-split model on its test rows.
y_pred = clf.predict(X_test)
error = zero_one_loss(y_test, y_pred)
results = confusion_matrix(y_test, y_pred)  # printed two cells further down
print(f'Error: {error}')
print(type(y_test))

That error value looks much better! However, it still depends on which rows the split places in the training set and which in the test set, so it could come out a little lower or a little higher with a different split. This is because the assignment of rows to the training and test sets is stochastic, i.e. made at random.

In [None]:
# Cross-tabulate actual (rows) against predicted (columns) classes for the heatmap,
# then print the raw confusion matrix computed earlier.
cd = pd.crosstab(
    y_test,
    y_pred,
    rownames=['Actual'],
    colnames=['Predicted'],
)
print(f'Confusion matrix:\n {results}')

In [None]:
# `cd` has the actual classes as rows and the predicted classes as columns,
# so the heatmap's x-axis is the predicted label and its y-axis the actual label.
# (The previous axis captions mixed the two up.)
sns.heatmap(cd, square=True, annot=True, fmt='d', cbar=True)
plt.title('Confusion matrix (model trained on the re-split data)')
plt.xlabel('Predicted label')
plt.ylabel('Actual label')
plt.show()

Now, let's optimise this and see whether tuned hyper-parameters can reduce the error rate further...

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the random forest. Only settings that change the
# fitted model are searched:
# - "auto" is not listed for max_features: for classifiers it was an alias of
#   "sqrt" (a duplicate candidate), it was deprecated in scikit-learn 1.1 and
#   removed in 1.3, where it makes the fits fail.
# - n_jobs only controls how many workers are used, not what is learned.
# - warm_start only matters when fit() is called repeatedly on the same
#   estimator object; the search fits a fresh clone for every candidate.
# Searching over the last two multiplied the number of fits by six without
# being able to change any score.
params = {
    "n_estimators": [5, 10, 50, 100, 250],
    "max_depth": [2, 4, 8, 16],
    "max_features": ["sqrt", "log2", None],
    "criterion": ["gini", "entropy"],
}

# 5-fold cross-validated search; parameters not in the grid keep the values set on `clf`.
cv = GridSearchCV(clf, params, cv=5)
cv.fit(X_train, y_train)

In [None]:
def display(result):
    """Print a summary of a finished parameter search.

    Prints the best parameter set followed by a blank line, then one line per
    candidate with its mean test score and the standard deviation of that
    score (both rounded to three decimals) and the candidate's parameters.

    `result` must expose `best_params_` and a `cv_results_` mapping with the
    keys 'mean_test_score', 'std_test_score' and 'params'.
    """
    print(f'Best params: {result.best_params_}\n')

    outcomes = result.cv_results_
    rows = zip(
        outcomes['mean_test_score'],
        outcomes['std_test_score'],
        outcomes['params'],
    )
    for avg, spread, candidate in rows:
        print(f'{round(avg, 3)} + or - {round(spread, 3)} for the {candidate}')
        
# Print the grid-search summary using the `display` helper defined in this cell.
# NOTE(review): that helper's name shadows IPython's built-in `display` for the
# rest of the kernel session - consider renaming it.
display(cv)

In [None]:
# Forest built with hard-coded hyper-parameters.
# NOTE(review): the original comment attributes these values to Dr. Kang and
# Dr. Kavianpour, but every value is an option from the grid searched above -
# presumably they were copied from the search's best parameters; confirm.
# random_state is fixed (same seed as the other forests in this notebook) so
# that the tuned model's scores are reproducible from run to run.
clf_tuned = RandomForestClassifier(
    criterion="entropy",
    max_depth=8,
    max_features=None,
    n_estimators=10,
    n_jobs=1,
    warm_start=True,
    random_state=3,
)
trained_model_tuned = clf_tuned.fit(X_train, y_train)

# Note: this score is computed on the same rows the model was fitted on.
print(f'Score: {trained_model_tuned.score(X_train, y_train)}')

In [None]:
# Class predictions and per-class probabilities from the tuned forest on the test rows.
y_probabilities = clf_tuned.predict_proba(X_test)
y_pred_tuned = clf_tuned.predict(X_test)

error = zero_one_loss(y_test, y_pred_tuned)
results = confusion_matrix(y_test, y_pred_tuned)  # printed in the next cell
print(f'Error: {error}')
print(type(y_test))

In [None]:
# Cross-tabulate actual (rows) against the tuned model's predictions (columns),
# then print the raw confusion matrix computed in the previous cell.
cd = pd.crosstab(
    y_test,
    y_pred_tuned,
    rownames=['Actual'],
    colnames=['Predicted'],
)
print(f'Confusion matrix:\n {results}')

In [None]:
# `cd` has the actual classes as rows and the predicted classes as columns,
# so the heatmap's x-axis is the predicted label and its y-axis the actual label.
# (The previous axis captions mixed the two up.)
sns.heatmap(cd, square=True, annot=True, fmt='d', cbar=True)
plt.title('Confusion matrix (tuned model)')
plt.xlabel('Predicted label')
plt.ylabel('Actual label')
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, roc_curve, auc, precision_recall_curve

# Accuracy of the tuned model's test predictions, shown to three decimals.
acc = accuracy_score(y_test, y_pred_tuned)
print(f"Accuracy: {acc:.3f}")

While the confusion matrix shows the classifier could correctly classify every point in the data set, we can evaluate performance further using ROC curves and the area under that curve (AUC).

The Receiver Operating Characteristic (ROC) curve plots the model's True Positive Rate (TPR) against its False Positive Rate (FPR) as the decision threshold is varied. A perfect classifier produces no false positives and no false negatives, so the area under its ROC curve (AUC) is 1. In other words, the more area under the curve, the better the classifier.

In [None]:
# Collapse the multi-class problem into "normal" (0) versus "any attack" (1).
# The score of each row is the total probability of every class except the one
# in column 0, i.e. the row's probabilities summed minus the first entry.
y_test_binary = []
y_score = []
for true_code, class_probs in zip(y_test, y_probabilities):
    y_score.append(sum(class_probs) - class_probs[0])
    is_normal = attacks[true_code] == 'Normal'
    if is_normal:
        # The score above relies on 'Normal' being class code 0.
        assert true_code == 0
    y_test_binary.append(0 if is_normal else 1)

y_test_binary = np.array(y_test_binary)
y_score = np.array(y_score)

In [None]:
# ROC curve for the binary labels/scores, and the area under it.
# (The misspelt name `threstholds` is kept as-is in case other cells refer to it.)
fpr, tpr, threstholds = roc_curve(
    y_test_binary,
    y_score,
    pos_label=1,
)
roc_auc = auc(fpr, tpr)

# Precision-recall curve for the same labels/scores, and the area under it.
precision, recall, thresholds_precision_recall = precision_recall_curve(
    y_test_binary,
    y_score,
    pos_label=1,
)
auc_pr = auc(recall, precision)

In [None]:
# Plot the ROC curve with its AUC in the legend; both axes are padded slightly
# beyond [0, 1] so the curve does not sit on the frame.
axis_limits = [-0.05, 1.05]

plt.figure()
plt.plot(fpr, tpr, label=f'AUC = {roc_auc:0.6f}')
plt.title('Receiver Operating Characteristic (ROC) curve (for detecting classified disruptive events)')
plt.xlabel('False Positive Rate (1-specificity)')
plt.ylabel('True Positive Rate (sensitivity)')
plt.xlim(axis_limits)
plt.ylim(axis_limits)
plt.legend(loc="lower right", prop={'size': 'small'})
plt.show()

In [None]:
# Plot the precision-recall curve. The legend reports the area under THIS
# curve (`auc_pr`); it previously printed the ROC AUC (`roc_auc`) by mistake.
plt.figure()
plt.plot(recall, precision, label="AUC = %0.6f" % (auc_pr))
plt.xlabel("Recall")
plt.ylabel("Precision")
# Pad both axes slightly beyond [0, 1] so the curve does not sit on the frame.
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.title('Precision Recall Curve (for detecting attacks of any kind)')
plt.legend(loc="lower right", prop={'size': 'small'})
plt.show()