In [None]:
import pandas as pd
import numpy as np
from pandas import read_csv
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn import preprocessing
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans
from sklearn.feature_selection import SelectKBest, f_classif

In [None]:
# Load the traffic records, attach the attack type for each attack category,
# integer-encode every text column and standardise the feature columns.
#
# Produces:
#   df        - encoded, unscaled frame (features + integer 'attack_type')
#   scaler    - StandardScaler fitted on the feature columns only
#   df_scaled - standardised features plus the untouched integer 'attack_type'
traffic_raw = pd.read_csv('Dataset.txt', sep=',')
attack_types = pd.read_csv('Attack_types.txt', sep=' ')
df = traffic_raw.merge(attack_types, on='attack_category', how='left')

# Rows whose category has no entry in the mapping file are treated as normal traffic.
df['attack_type'] = df['attack_type'].fillna('normal')
df = df.drop(columns='attack_category')

# Encode each text column with its own encoder so the columns stay independent.
object_cols = df.select_dtypes(include=['object']).columns
df[object_cols] = df[object_cols].apply(lambda column: LabelEncoder().fit_transform(column))

# Standardise the features only. The target must remain a discrete class
# label: scaling it would turn it into continuous floats, which classifiers
# reject and which cannot be compared against predicted class labels.
feature_cols = df.columns.drop('attack_type')
scaler = StandardScaler().fit(df[feature_cols])
scaled_features = pd.DataFrame(
    scaler.transform(df[feature_cols]), columns=feature_cols, index=df.index
)
# Re-attach the target and restore the original column order.
df_scaled = scaled_features.assign(attack_type=df['attack_type'])[df.columns]

In [None]:
# Train/test split followed by correlation-based feature selection.
#
# Produces X_train/X_test/y_train/y_test (70/30 split) and
# X_train_new/X_test_new restricted to the features whose absolute
# correlation with the target exceeds CORR_THRESHOLD.
target_col = 'attack_type'
CORR_THRESHOLD = 0.4

y = df_scaled[target_col]
X_train, X_test, y_train, y_test = train_test_split(
    df_scaled.drop(target_col, axis=1), y, test_size=0.3, random_state=0
)

# Measure the correlations on the training rows only, so the held-out test
# rows do not influence which features are selected.
corr_matrix = X_train.assign(**{target_col: y_train}).corr()
corr_with_target = corr_matrix[target_col].drop(target_col)

# Constant columns give NaN correlations; NaN > threshold is False, so they
# are excluded automatically.
relevant_features = corr_with_target[corr_with_target.abs() > CORR_THRESHOLD].index.tolist()
if not relevant_features:
    raise ValueError(
        f"No feature has |correlation| > {CORR_THRESHOLD} with '{target_col}'; "
        "lower CORR_THRESHOLD."
    )

# Keep the relevant features (the previous code dropped them, leaving only
# the weakly correlated columns for the models).
X_train_new = X_train[relevant_features]
X_test_new = X_test[relevant_features]

In [None]:
# Distribution plots for every column of the unscaled frame `df`:
# a 5-bin histogram, a kernel-density estimate and a box plot.
for col in df.columns:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.hist(df[col], bins=5)
    ax.set_ylabel("Frequency")
    ax.set_title(col + " Histogram of Original Dataframe")
    plt.show()
    plt.close(fig)

    # A kernel-density estimate needs some spread in the data; a constant
    # column has zero variance and makes the estimator fail, so skip it.
    if df[col].nunique() > 1:
        fig, ax = plt.subplots(figsize=(10, 5))
        df[col].plot(kind='density', ax=ax)
        ax.set_title(f'Density plot of {col} in Original Dataframe')
        plt.show()
        plt.close(fig)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.boxplot(df[col])
    ax.set_title(f'Box plot of {col} in Original Dataframe')
    plt.show()
    # Close explicitly so a wide frame does not accumulate open figures.
    plt.close(fig)

In [None]:
# Distribution plots for every column of the standardised frame `df_scaled`:
# a 5-bin histogram, a kernel-density estimate and a box plot.
# (The previous version iterated df_scaled's columns but plotted `df`, so the
# "scaled" figures actually showed the unscaled data.)
for col in df_scaled.columns:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.hist(df_scaled[col], bins=5)
    ax.set_ylabel("Frequency")
    ax.set_title(col + " Histogram of Scaled Dataframe")
    plt.show()
    plt.close(fig)

    # A kernel-density estimate needs some spread in the data; a constant
    # column has zero variance and makes the estimator fail, so skip it.
    if df_scaled[col].nunique() > 1:
        fig, ax = plt.subplots(figsize=(10, 5))
        df_scaled[col].plot(kind='density', ax=ax)
        ax.set_title(f'Density plot of {col} in Scaled Dataframe')
        plt.show()
        plt.close(fig)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.boxplot(df_scaled[col])
    ax.set_title(f'Box plot of {col} in Scaled Dataframe')
    plt.show()
    # Close explicitly so a wide frame does not accumulate open figures.
    plt.close(fig)
    

K-Nearest Neighbors

In [None]:

# K-nearest-neighbours: compare test accuracy for several neighbourhood sizes.

# KNN cannot score rows with missing feature values; drop them once, before
# the loop, instead of on every iteration.
X_test_new = X_test_new.dropna()

# Look the class labels up in the unscaled frame `df` by row index. This
# guarantees discrete integer classes (never standardised values), keeps
# y_test aligned with the rows that survived dropna(), and makes the true and
# predicted labels directly comparable.
# NOTE(review): relies on `df` and the split frames sharing the same row
# index, which holds when both come from the default RangeIndex.
y_train = df.loc[X_train_new.index, 'attack_type']
y_test = df.loc[X_test_new.index, 'attack_type']

k, accuracy = [], []
for i in [3, 5, 7]:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train_new, y_train)
    y_pred = knn.predict(X_test_new)
    k.append(i)
    # Default normalize=True gives the fraction of correct predictions;
    # normalize=False would report a raw count under an "accuracy" label.
    accuracy.append(accuracy_score(y_test, y_pred))
    print("Accuracy (K = ", i, "):", accuracy[-1])

plt.scatter(k, accuracy)
plt.xlabel('Value of K for KNN')
plt.ylabel('Accuracy Score')
plt.title('Accuracy Scores for Values of K in KNN')
plt.show()

Decision Tree

In [None]:
# Decision trees: train one tree per split criterion (Gini, entropy) on the
# selected features, report test metrics and draw both trees side by side.

# Look the class labels up in the unscaled frame `df` by row index so that
# training labels, test labels and predictions all use the same integer
# class encoding.
# NOTE(review): relies on `df` and the split frames sharing the same row
# index, which holds when both come from the default RangeIndex.
y_train = df.loc[X_train_new.index, 'attack_type']
y_test = df.loc[X_test_new.index, 'attack_type']

# Fixed seeds make tie-breaking between equally good splits reproducible.
clf_gin = DecisionTreeClassifier(criterion="gini", random_state=0)
clf_ent = DecisionTreeClassifier(criterion="entropy", random_state=0)
clf_gin.fit(X_train_new, y_train)
clf_ent.fit(X_train_new, y_train)

y_pred_ent = clf_ent.predict(X_test_new)
y_pred_gin = clf_gin.predict(X_test_new)

acc_ent = accuracy_score(y_test, y_pred_ent)
acc_gin = accuracy_score(y_test, y_pred_gin)

print("Entropy Accuracy:", acc_ent)
print("Gini Accuracy:", acc_gin)

rep_ent = classification_report(y_test, y_pred_ent)
rep_gin = classification_report(y_test, y_pred_gin)

print(rep_ent)
print(rep_gin)

# Node labels must describe the columns the trees were trained on, and
# plot_tree expects class names in ascending class order, which is exactly
# the order of `classes_` (both trees were fitted on the same labels).
feature_names = list(X_train_new.columns)
class_names = [str(label) for label in clf_gin.classes_]

fig, (ax_gin, ax_ent) = plt.subplots(1, 2, figsize=(40, 10))

tree.plot_tree(clf_gin, feature_names=feature_names, class_names=class_names,
               filled=True, ax=ax_gin)
ax_gin.set_title("Decision Tree (Gini Criterion)")

tree.plot_tree(clf_ent, feature_names=feature_names, class_names=class_names,
               filled=True, ax=ax_ent)
ax_ent.set_title("Decision Tree (Entropy Criterion)")

plt.show()


Multi-layer Perceptron (MLP)

In [None]:
# Multi-layer perceptron: score a small baseline network, then tune its
# hyperparameters with 5-fold cross-validated grid search and score the best
# model on the held-out test rows.

# Look the class labels up in the unscaled frame `df` by row index so that
# training labels, test labels and predictions all use the same integer
# class encoding.
# NOTE(review): relies on `df` and the split frames sharing the same row
# index, which holds when both come from the default RangeIndex.
y_train = df.loc[X_train_new.index, 'attack_type']
y_test = df.loc[X_test_new.index, 'attack_type']

mlp = MLPClassifier(hidden_layer_sizes=(10,), max_iter=1000, random_state=42)
mlp.fit(X_train_new, y_train)
y_pred = mlp.predict(X_test_new)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
print("Before Fine Tuning")
print("Accuracy: ", accuracy)
print("Precision Score: ", precision)
print("Recall Score: ", recall)
print("F1 Score: ", f1)

hidden_layer_options = [(10,), (50,), (100,), (10, 10), (50, 50), (100, 100)]
activation_options = ['identity', 'logistic', 'tanh', 'relu']

# `learning_rate` is only used by the 'sgd' solver. Crossing it with 'lbfgs'
# and 'adam' would train the same model three times per configuration, so the
# grid is split in two: the schedule is searched for 'sgd' only.
param_grid = [
    {'hidden_layer_sizes': hidden_layer_options,
     'activation': activation_options,
     'solver': ['sgd'],
     'learning_rate': ['constant', 'invscaling', 'adaptive']},
    {'hidden_layer_sizes': hidden_layer_options,
     'activation': activation_options,
     'solver': ['lbfgs', 'adam']},
]

# n_jobs=-1 spreads the candidate fits over all available cores.
grid_search = GridSearchCV(mlp, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train_new, y_train)

print("After Fine Tuning")
print("Best hyperparameters:", grid_search.best_params_)
# best_score_ is the mean cross-validated score of the best candidate.
print("Best accuracy score:", grid_search.best_score_)
new_y_pred = grid_search.predict(X_test_new)

accuracy = accuracy_score(y_test, new_y_pred)
precision = precision_score(y_test, new_y_pred, average='weighted')
recall = recall_score(y_test, new_y_pred, average='weighted')
f1 = f1_score(y_test, new_y_pred, average='weighted')

print("After Fine Tuning")
print("Accuracy of Fine-Tuned MLP: ", accuracy)
print("Precision Score of Fine-Tuned MLP: ", precision)
print("Recall Score of Fine-Tuned MLP: ", recall)
print("F1 Score of Fine-Tuned MLP: ", f1)



K-Means

In [None]:
# K-Means: pick the two features that best separate the attack types
# (ANOVA F-test), use the elbow method to inspect the SSE for 1-10 clusters,
# then cluster with the chosen count and plot the result.

# Two features are kept because the final scatter plot is two-dimensional.
n_top_features = 2
cluster = df.drop(columns=['attack_type'])
y = df['attack_type']

selector = SelectKBest(f_classif, k=n_top_features)
selector.fit(cluster, y)
data = selector.transform(cluster)
best_features = np.array(cluster.columns)[selector.get_support()]

print("Top Features: ", best_features)

# NOTE(review): clustering runs on the unscaled values of `df`; K-Means is
# distance based, so confirm this is intended rather than using df_scaled.
number_of_clusters = range(1, 11)
sse = []
for n_clusters in number_of_clusters:
    # n_init is set explicitly because its default differs between
    # scikit-learn versions, which would change the fitted inertia.
    kmeans = KMeans(n_clusters=n_clusters, max_iter=300, n_init=10, random_state=0)
    kmeans.fit(data)
    sse.append(kmeans.inertia_)

plt.plot(number_of_clusters, sse, marker='x')
plt.title('Elbow Method')
plt.xlabel("Number of clusters")
plt.ylabel("SSE")
plt.show()

# Cluster count chosen after inspecting the elbow plot above.
new_k = 5
kmeans = KMeans(n_clusters=new_k, max_iter=300, n_init=10, random_state=0)
kmeans.fit(data)
labels = kmeans.labels_

scatter = plt.scatter(data[:, 0], data[:, 1], c=labels, cmap='rainbow')
plt.xlabel(best_features[0])
plt.ylabel(best_features[1])
plt.title('Cyber-attacks in network traffic')
plt.colorbar(scatter)
plt.show()