In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay

This part runs the same test over each of the datasets in order to compare their performance with different possible sets of features and parameters. The results allow us to find out which particular combination gives the best results in terms of accuracy.

In [None]:

# Compare KNN accuracy across all datasets for one feature set / metric.
# For every CSV file, a 10-fold cross-validation is run for each K in 1..K_max
# and the average accuracy, best accuracy and best K are collected in `output`.
files = ['dataset_0.1.csv',
         'dataset_0.3.csv',
         'dataset_0.5.csv',
         'dataset_1.csv',
         'dataset_2.csv',
         'dataset_5.csv']

# All possible features:
# 'avg datalen dl'
# 'std datalen dl'
# 'n_packets dl',
# 'avg iat dl',
# 'std iat dl',
# 'avg datalen ul',
# 'std datalen ul',
# 'n_packets ul',
# 'avg iat ul',
# 'std iat ul'

# Here it is possible to select different features
features = [ 'avg datalen dl', 'std datalen dl', 'avg iat dl', 'std iat dl', 'n_packets dl']

metric = 'euclidean'

# Maximum K tested (inclusive) --> accuracy usually peaks for K between 1 and 25
# and tends to decrease as K increases
K_max = 50

# Fixed seed so that the shuffle (and therefore the fold composition) is
# reproducible across Restart & Run All
SEED = 42

print(f'Params: {features}')
print(f'Metric: {metric}')

# NOTE(review): the features are fed to KNN unscaled; distance-based classifiers
# are sensitive to feature scale, so consider a StandardScaler pipeline.

# We use a dict in order to easily check the results we are looking for
output = {}

# Every K from 1 up to and including K_max
k_values = range(1, K_max + 1)

for file_name in files:
  df = pd.read_csv(file_name)

  # Discard the rows with fewer than 2 packets in either direction: mean and std
  # are only meaningful with at least 2 packets (such rows hold 0 / -1 placeholders).
  # `~(col < 2)` keeps NaN rows exactly like the original drop-by-index did.
  df = df[~(df['n_packets dl'] < 2)]
  df = df[~(df['n_packets ul'] < 2)]

  # Select the features and remove rows with missing values; chaining avoids
  # calling an in-place method on a column slice (SettingWithCopyWarning)
  df = df[features + ['supervised']].dropna(subset=features)

  # Randomly scramble the dataset (seeded for reproducibility)
  df = df.sample(frac=1, random_state=SEED)

  X = df.drop(columns=['supervised'])
  Y = df['supervised']

  acc = []
  # We repeat for all K up to K_max (included)
  for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k, weights='distance', metric=metric)
    # We use cross-validation 10 fold
    score = cross_val_score(knn, X, Y, cv=10)
    # We add to the accuracy list the mean of the cross-validation scores
    acc.append(np.mean(score))

  # Plain Python numbers keep the printed dict readable
  output[file_name] = {
      'Avg Accuracy': float(np.mean(acc)),
      'Max Accuracy': float(max(acc)),
      'Best K': k_values[int(np.argmax(acc))],
  }

for name, result in output.items():
  print(f"For {name} we have :")
  print(result)
  print('')

The following part shows the results graphically; we used it to analyze the performance of each dataset individually.

In [None]:
# Single-dataset analysis: pick one feature set, select the best K by 10-fold
# cross-validation on the training split, then evaluate the chosen model on a
# held-out 10% test split (confusion matrix + accuracy-vs-K plot).
# Each candidate feature set below carries a note on its observed performance/rank.

# ONLY UL PARAMS:
params = ['avg datalen ul','std datalen ul', 'avg iat ul', 'std iat ul', 'n_packets ul'] #SCARCELY PERFORMING
# ONLY DL PARAMS:
#params = ['avg datalen dl','std datalen dl', 'avg iat dl', 'std iat dl', 'n_packets dl'] #1 BEST PERFORMING
# ALL PARAMS :
#params = ['avg datalen dl', 'std datalen dl', 'n_packets dl', 'avg iat dl', 'std iat dl', 'avg datalen ul', 'std datalen ul', 'n_packets ul', 'avg iat ul', 'std iat ul'] #2
# ONLY DATALEN PARAMS :
#params = ['avg datalen dl','std datalen dl','avg datalen ul','std datalen ul'] #3

# Maximum K tested (inclusive)
K_max = 50
metric = 'euclidean'

# Fixed seed so that shuffle and train/test split are reproducible
SEED = 42

# Here the dataset can be selected --> on average the dataset_5 is the one giving best performances
df = pd.read_csv('dataset_5.csv')

# Discard rows with fewer than 2 packets in either direction (mean/std are not
# meaningful there); `~(col < 2)` keeps NaN rows exactly like drop-by-index did
df = df[~(df['n_packets dl'] < 2)]
df = df[~(df['n_packets ul'] < 2)]
# Chained instead of in-place to avoid SettingWithCopyWarning on the column slice
df = df[params + ['supervised']].dropna(subset=params)
df = df.sample(frac=1, random_state=SEED)

# Hold-out split: 90% for model selection/training, 10% kept aside for testing
X = df.drop(columns=['supervised'])
Y = df['supervised']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=SEED)

# Every K from 1 up to and including K_max
k_values = range(1, K_max + 1)

acc = []
for k in k_values:
  knn = KNeighborsClassifier(n_neighbors=k, weights='distance', metric=metric)
  # Cross-validate on the training split only: using the full X, Y here would
  # let the test rows influence the choice of K and bias the final evaluation
  score = cross_val_score(knn, X_train, y_train, cv=10)
  acc.append(np.mean(score))

# Computing best k (first K reaching the maximum mean CV accuracy)
bestk = k_values[int(np.argmax(acc))]

print(f'Max Accuracy: {max(acc)}')
print(f'Avg Accuracy: {sum(acc)/len(acc)}')
print(f'Best K: {bestk}')

# Re-running the algorithm with the selected K and the same metric used in the sweep
knn = KNeighborsClassifier(n_neighbors=bestk, weights='distance', metric=metric)
knn.fit(X_train, y_train)

# Compute predictions on the held-out test split
knn_predict = knn.predict(X_test)
print(f'Hold-out Accuracy: {accuracy_score(y_test, knn_predict)}')

# Plot the confusion matrix. from_predictions expects (y_true, y_pred): with
# normalize='true' each row (true class) sums to 1.
fig, ax = plt.subplots(figsize=(15, 15))
ConfusionMatrixDisplay.from_predictions(y_test, knn_predict, ax=ax, normalize='true');

# Mean cross-validation accuracy as a function of K
fig, acc_ax = plt.subplots()
acc_ax.plot(k_values, acc)
acc_ax.grid(visible=True)
acc_ax.set_xlabel('K')
acc_ax.set_ylabel('Accuracy');