In [None]:
import pandas as pd
import numpy as np

import itertools
import multiprocessing as mp

import findspark
import pyspark

from pyspark.sql import SparkSession
from pyspark import SparkContext

import matplotlib.pyplot as plt
from matplotlib import style
style.use("ggplot")

import scipy.cluster.hierarchy as sch
from sklearn.cluster import KMeans

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql.types import StructType, StructField, NumericType
from pyspark.sql.functions import col #To drop stuff

# Collapse the non-attack labels assigned by clustering (2, 3, 4, 5) to 0.
# Per the original note, attack traffic is labelled 1; this cell leaves
# every label other than 2-5 untouched.
# NOTE(review): the result is written to BOUN_TCP_NB_LABELS.csv, while the
# Spark cell further down reads BOUN_TCP_LABELS.csv - confirm which file
# the classifier is meant to consume.
panda_df = pd.read_csv(r'C:\Users\cayos\OneDrive\Desktop\BOUN_DDoS dataset\BOUN_TCP_LABELS.csv')
# One list-based replace instead of four sequential single-value passes.
panda_df['label'] = panda_df['label'].replace([2, 3, 4, 5], 0)
panda_df.to_csv(r'C:\Users\cayos\OneDrive\Desktop\BOUN_DDoS dataset\BOUN_TCP_NB_LABELS.csv', index=False)

In [None]:
# Create (or reuse) the Spark session: local master on all cores, with the
# driver memory option set to 15g.
spark = (
    SparkSession.builder
    .master('local[*]')
    .config("spark.driver.memory", "15g")
    .appName('BOUN_TCP_Anon')
    .getOrCreate()
)

# Load the labelled CSV (first row is the header, column types inferred).
Labelled_data = spark.read.csv(
    r'C:\Users\cayos\OneDrive\Desktop\BOUN_DDoS dataset\BOUN_TCP_LABELS.csv',
    sep=",", header=True, quote='"', inferSchema=True,
)

In [None]:
# Remove the five PCA component columns (PCA_1 .. PCA_5) and preview the frame.
Labelled_data_drop = ['PCA_%d' % component for component in range(1, 6)]
Labelled_data = Labelled_data.drop(*Labelled_data_drop)
Labelled_data.show(5)

In [None]:
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col


# Map each distinct Dst_IP, Src_ip and TTL value to an index stored in a new
# column. The indexers are applied one after another, each on the output of
# the previous step, so the final frame carries all three *_index columns.
indexer_dst = StringIndexer(inputCol="Dst_IP", outputCol="Dst_IP_index")
indexer_src = StringIndexer(inputCol="Src_ip", outputCol="Src_IP_index")
indexer_ttl = StringIndexer(inputCol="TTL", outputCol="TTL_index")

# Step 1: destination IP
indexer_model = indexer_dst.fit(Labelled_data)
indexed_data_dst = indexer_model.transform(Labelled_data)

# Step 2: source IP
indexer_model_src = indexer_src.fit(indexed_data_dst)
indexed_data_src = indexer_model_src.transform(indexed_data_dst)

# Step 3: TTL
indexer_model_ttl = indexer_ttl.fit(indexed_data_src)
indexed_data = indexer_model_ttl.transform(indexed_data_src)

indexed_data.show(5)

In [None]:
# Drop every row that is labelled as attack (label == 1) but whose source IP
# is not 10.50.199.86; all other rows are kept.
#
# Spark DataFrames cannot be walked row by row with len()/range(), and
# DataFrame.drop() removes columns rather than rows, so the rule is written
# as one boolean row filter: keep rows where NOT (label == 1 AND Src_ip != IP).
# NOTE(review): the rule is applied to Src_ip as in the original code, while
# the later cells filter on Dst_ip for the same address - confirm which
# column is intended.
dropped_indexed_data = indexed_data.filter(
    ~((col('label') == 1) & (col('Src_ip') != '10.50.199.86'))
)

# indexed_data itself is left unchanged; preview the filtered result.
dropped_indexed_data.show(5)

In [None]:
# Rows whose destination address is 10.50.199.86.
# NOTE(review): indexed_data1 is not assigned in any cell visible above -
# confirm where it is defined before relying on this cell.
IP_filter6 = indexed_data1.filter(col('Dst_ip') == '10.50.199.86')

# NOTE(review): this applies the opposite condition to IP_filter6, which is
# already restricted to that destination, so it looks like it can never
# contain rows - confirm the intended source frame.
IP_filter7 = IP_filter6.filter(col('Dst_ip') != '10.50.199.86')

IP_filter6.show(5)

In [None]:
#Extract features
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
# Columns that are combined into the single 'features' vector column.
features_columns = (
    'SYN',
    'Frame_lng',
    'Dst_IP_index',
    'TTL_index',
    'ACK',
    'Dst_Port',
)

# Assembler that packs the columns above into one vector per row.
assemble = VectorAssembler(outputCol='features', inputCols=features_columns)

# Frame with the 'features' vector column appended.
assembled_data = assemble.transform(indexed_data)
assembled_data.show(5)
print(assemble)

In [None]:
# Remove the raw packet columns and the intermediate index columns now that
# the selected ones are packed into 'features'.
# Presumably this leaves only 'label' and 'features' - verify with the preview.
columns_to_remove = [
    'SYN', 'Frame_lng', 'Dst_IP_index', 'TTL_index', 'ACK', 'Dst_Port',
    'Src_ip', 'Dst_IP', 'Src_Port', 'RST', 'TTL', 'Pro', 'Src_IP_index',
    'Frame_No', 'Time',
]
label_features = assembled_data.drop(*columns_to_remove)
label_features.show(5)

In [None]:
# Random 70/30 split into training and test sets, with a fixed seed (1234)
# so the split is repeatable.
splits = label_features.randomSplit([0.7, 0.3], seed=1234)
train, test = splits

In [None]:
#Smoothing = 1 to account for 0 probability score of some attributes

from pyspark.ml.classification import NaiveBayes
# Multinomial Naive Bayes over the assembled 'features' vector, with
# smoothing set to 1; fitted on the training split.
Naive_Bayes = NaiveBayes(
    featuresCol="features",
    labelCol="label",
    modelType="multinomial",
    smoothing=1,
)
model = Naive_Bayes.fit(train)

In [None]:
# Score the test split and preview the true label next to the model output.
predictions = model.transform(test)
preview_columns = ["label", "probability", "prediction"]
predictions.select(*preview_columns).show(5)

In [None]:
# Accuracy previously recorded for this test split: 0.7068809374191402
# Recompute it here so the figure is reproducible from the notebook itself:
# the share of rows whose prediction equals the label.
total_predictions = predictions.count()
correct_predictions = predictions.filter(col('prediction') == col('label')).count()
# Guard against an empty prediction frame instead of dividing by zero.
accuracy = correct_predictions / total_predictions if total_predictions else float('nan')
print("Test set accuracy =", accuracy)

# Preview a small sample only; printing 20000 rows floods the notebook output.
predictions.show(20)

In [None]:
# Number of test rows the model predicted as class 1.
# Stored under its own name: the original assigned this integer to `test`,
# overwriting the test-split DataFrame, so re-running the prediction cell
# (model.transform(test)) afterwards would fail.
predicted_attack_count = predictions.filter(col('prediction') == 1).count()
predicted_attack_count

In [None]:
# Count, for the rows destined to 10.50.199.86, how many fall into each
# predicted class 0..5.
IP_filter = predictions.filter(col('Dst_ip') == '10.50.199.86')
# Alternative selection kept from the original: rows labelled as attack.
#IP_filter = predictions.filter(col('label').isin('1') == True)

# One aggregation instead of six separate filter + count jobs.
prediction_counts = {
    row['prediction']: row['count']
    for row in IP_filter.groupBy('prediction').count().collect()
}

# Classes the model never predicted are absent from the aggregation; report
# them as 0, exactly as a filter + count would.
IP_Filter_Array = [prediction_counts.get(float(klass), 0) for klass in range(6)]

# Keep the individual names available for any later cell that uses them.
IP_filter0, IP_filter1, IP_filter2, IP_filter3, IP_filter4, IP_filter5 = IP_Filter_Array
IP_Filter_Array

In [None]:
#K CLUSTERS

#100% Total points: 297953

#29.999% Total points: 89197

#Predicted attacks: 13356
#Correctly predicted attacks: 12987
#False positive rate: 0.2206

#Predicted legitimate: 75658
#All correct