In [1]:
%load_ext nb_black
import findspark

findspark.init()
from pyspark.sql import SparkSession
import pyspark.mllib.feature as fs
from pyspark.mllib.stat import _statistics
from pyspark.ml.feature import VectorAssembler
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pandas as pd
import numpy as np


spark = SparkSession.builder.getOrCreate()

<IPython.core.display.Javascript object>

In [2]:
# SparkSession.builder.getOrCreate() instead.

<IPython.core.display.Javascript object>

In [3]:
spark

<IPython.core.display.Javascript object>

In [4]:
data = spark.read.csv("DDoS attack Big DataF.csv", inferSchema=True, header=True)

<IPython.core.display.Javascript object>

In [5]:
# df.show()

<IPython.core.display.Javascript object>

In [6]:
# df.dtypes

<IPython.core.display.Javascript object>

In [7]:
# df.printSchema()

<IPython.core.display.Javascript object>

In [8]:
# print(df.describe().show())

<IPython.core.display.Javascript object>

In [9]:
df = data.select(*(F.col(c).cast("float") for c in data.columns))

<IPython.core.display.Javascript object>

In [10]:
df.printSchema()

root
 |-- Flow_Bytes_Sec0: float (nullable = true)
 |-- Flow_Bytes_Sec1: float (nullable = true)
 |-- SYN_Flag_Count: float (nullable = true)
 |-- Init_Win_bytes_backward: float (nullable = true)
 |-- Idle_Max: float (nullable = true)
 |-- PSH_Flag_Count: float (nullable = true)
 |-- ACK_Flag_Count: float (nullable = true)
 |-- URG_Flag_Count: float (nullable = true)
 |-- Packet_Length_Mean: float (nullable = true)
 |-- Packet_Length_Std: float (nullable = true)
 |-- Packet_Length_Variance: float (nullable = true)
 |-- Bwd_IAT_Total: float (nullable = true)
 |-- Fwd_IAT_Std: float (nullable = true)
 |-- Fwd_IAT_Max: float (nullable = true)
 |-- Flow_IAT_Mean: float (nullable = true)
 |-- Flow_IAT_Std: float (nullable = true)
 |-- Flow_IAT_Max: float (nullable = true)
 |-- Fwd_Packet_Length_Std: float (nullable = true)
 |-- Bwd_Packet_Length_Max: float (nullable = true)
 |-- Bwd_Packet_Length_Min: float (nullable = true)
 |-- Bwd_Packet_Length_Mean: float (nullable = true)
 |-- Bwd_Pack

<IPython.core.display.Javascript object>

In [11]:
required_features = [
    "Flow_Bytes_Sec0",
    "Flow_Bytes_Sec1",
    "SYN_Flag_Count",
    "Init_Win_bytes_backward",
    "Idle_Max",
    "PSH_Flag_Count",
    "ACK_Flag_Count",
    "URG_Flag_Count",
    "Packet_Length_Mean",
    "Packet_Length_Std",
    "Packet_Length_Variance",
    "Bwd_IAT_Total",
    "Fwd_IAT_Std",
    "Fwd_IAT_Max",
    "Flow_IAT_Mean",
    "Flow_IAT_Std",
    "Flow_IAT_Max",
    "Fwd_Packet_Length_Std",
    "Bwd_Packet_Length_Max",
    "Bwd_Packet_Length_Min",
    "Bwd_Packet_Length_Mean",
    "Bwd_Packet_Length_Std",
    "Fwd_Packet_Length_Min",
    "Destination_Port",
    "Flow_Duration",
]

<IPython.core.display.Javascript object>

In [12]:
vectoe_assembler = VectorAssembler(inputCols=required_features, outputCol="features")

<IPython.core.display.Javascript object>

In [13]:
vec_df = vectoe_assembler.transform(df)

<IPython.core.display.Javascript object>

In [14]:
vec_df.show(5, truncate=False)

+---------------+---------------+--------------+-----------------------+--------+--------------+--------------+--------------+------------------+-----------------+----------------------+-------------+-----------+-----------+-------------+------------+------------+---------------------+---------------------+---------------------+----------------------+---------------------+---------------------+----------------+-------------+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Flow_Bytes_Sec0|Flow_Bytes_Sec1|SYN_Flag_Count|Init_Win_bytes_backward|Idle_Max|PSH_Flag_Count|ACK_Flag_Count|URG_Flag_Count|Packet_Length_Mean|Packet_Length_Std|Packet_Length_Variance|Bwd_IAT_Total|Fwd_IAT_Std|Fwd_IAT_Max|Flow_IAT_Mean|Flow_IAT_Std|Flow_IAT_Max|Fwd_Packet_Length_Std|Bwd_Packet_Length_Max|Bwd_Packet_Length_

<IPython.core.display.Javascript object>

In [15]:
vec_df.count()

809361

<IPython.core.display.Javascript object>

In [16]:
train_df, test_df = vec_df.randomSplit([0.7, 0.3])

<IPython.core.display.Javascript object>

In [17]:
train_df.count()

566165

<IPython.core.display.Javascript object>

In [18]:
test_df.count()

243196

<IPython.core.display.Javascript object>

In [19]:
from pyspark.ml.classification import (
    RandomForestClassifier,
    MultilayerPerceptronClassifier,
)

<IPython.core.display.Javascript object>

In [20]:
rf = RandomForestClassifier(featuresCol="features", labelCol="Label")

<IPython.core.display.Javascript object>

In [21]:
%%time
rf_model = rf.fit(train_df)

CPU times: total: 15.6 ms
Wall time: 20.7 s


<IPython.core.display.Javascript object>

In [22]:
%%time
y_pred = rf_model.transform(test_df)

CPU times: total: 0 ns
Wall time: 178 ms


<IPython.core.display.Javascript object>

In [23]:
y_pred.show(truncate=False)

+---------------+---------------+--------------+-----------------------+--------+--------------+--------------+--------------+------------------+-----------------+----------------------+-------------+-----------+-----------+-------------+------------+------------+---------------------+---------------------+---------------------+----------------------+---------------------+---------------------+----------------+-------------+-----+---------------------------------------------+----------------------------------------+-----------------------------------------+----------+
|Flow_Bytes_Sec0|Flow_Bytes_Sec1|SYN_Flag_Count|Init_Win_bytes_backward|Idle_Max|PSH_Flag_Count|ACK_Flag_Count|URG_Flag_Count|Packet_Length_Mean|Packet_Length_Std|Packet_Length_Variance|Bwd_IAT_Total|Fwd_IAT_Std|Fwd_IAT_Max|Flow_IAT_Mean|Flow_IAT_Std|Flow_IAT_Max|Fwd_Packet_Length_Std|Bwd_Packet_Length_Max|Bwd_Packet_Length_Min|Bwd_Packet_Length_Mean|Bwd_Packet_Length_Std|Fwd_Packet_Length_Min|Destination_Port|Flow_Durati

<IPython.core.display.Javascript object>

In [24]:
print(y_pred.columns)

['Flow_Bytes_Sec0', 'Flow_Bytes_Sec1', 'SYN_Flag_Count', 'Init_Win_bytes_backward', 'Idle_Max', 'PSH_Flag_Count', 'ACK_Flag_Count', 'URG_Flag_Count', 'Packet_Length_Mean', 'Packet_Length_Std', 'Packet_Length_Variance', 'Bwd_IAT_Total', 'Fwd_IAT_Std', 'Fwd_IAT_Max', 'Flow_IAT_Mean', 'Flow_IAT_Std', 'Flow_IAT_Max', 'Fwd_Packet_Length_Std', 'Bwd_Packet_Length_Max', 'Bwd_Packet_Length_Min', 'Bwd_Packet_Length_Mean', 'Bwd_Packet_Length_Std', 'Fwd_Packet_Length_Min', 'Destination_Port', 'Flow_Duration', 'Label', 'features', 'rawPrediction', 'probability', 'prediction']


<IPython.core.display.Javascript object>

In [25]:
y_pred.select("rawPrediction", "probability", "prediction").show()

+--------------------+--------------------+----------+
|       rawPrediction|         probability|prediction|
+--------------------+--------------------+----------+
|[15.0743428030911...|[0.75371714015455...|       0.0|
|[15.0743428030911...|[0.75371714015455...|       0.0|
|[15.0743428030911...|[0.75371714015455...|       0.0|
|[16.6058475401928...|[0.83029237700964...|       0.0|
|[16.6058475401928...|[0.83029237700964...|       0.0|
|[0.11856119439408...|[0.00592805971970...|       1.0|
|[0.11856119439408...|[0.00592805971970...|       1.0|
|[0.11856119439408...|[0.00592805971970...|       1.0|
|[0.11856119439408...|[0.00592805971970...|       1.0|
|[0.11856119439408...|[0.00592805971970...|       1.0|
|[0.11856119439408...|[0.00592805971970...|       1.0|
|[0.11856119439408...|[0.00592805971970...|       1.0|
|[0.11856119439408...|[0.00592805971970...|       1.0|
|[0.11856119439408...|[0.00592805971970...|       1.0|
|[0.11856119439408...|[0.00592805971970...|       1.0|
|[0.118561

<IPython.core.display.Javascript object>

In [26]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

<IPython.core.display.Javascript object>

In [27]:
multi_evaluator = MulticlassClassificationEvaluator(
    labelCol="Label", metricName="accuracy"
)

<IPython.core.display.Javascript object>

In [28]:
%%time
multi_evaluator.evaluate(y_pred)

CPU times: total: 0 ns
Wall time: 4.73 s


0.9995024589220217

<IPython.core.display.Javascript object>

In [29]:
from pyspark.mllib.evaluation import MulticlassMetrics

<IPython.core.display.Javascript object>

In [30]:
rf_metric = MulticlassMetrics(y_pred["Label", "prediction"].rdd)



<IPython.core.display.Javascript object>

In [31]:
dir(rf_metric)

['__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_java_model',
 '_sc',
 'accuracy',
 'call',
 'confusionMatrix',
 'fMeasure',
 'falsePositiveRate',
 'logLoss',
 'precision',
 'recall',
 'truePositiveRate',
 'weightedFMeasure',
 'weightedFalsePositiveRate',
 'weightedPrecision',
 'weightedRecall',
 'weightedTruePositiveRate']

<IPython.core.display.Javascript object>

In [32]:
print("precision", rf_metric.precision(1.0))

precision 0.9994237185319987


<IPython.core.display.Javascript object>

In [33]:
print("accuracy", rf_metric.accuracy)

accuracy 0.9995024589220217


<IPython.core.display.Javascript object>

In [34]:
rf_metric.weightedFalsePositiveRate

0.0005263093832384718

<IPython.core.display.Javascript object>