In [57]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer
from pyspark.ml import Pipeline



In [58]:
# Read the CSV located one directory above the working directory into a pandas DataFrame.
df = pd.read_csv(filepath_or_buffer='../UNSW_NB15_testing-set.csv')

## Pre-processing data

In [59]:
# Obtain the Spark session for this notebook (getOrCreate reuses a running one).
spark = (
    SparkSession.builder
    .appName("MySparkApp")
    .getOrCreate()
)

# Column names passed to Spark as the schema of the frame built from the pandas
# `df`. After this cell, `df` refers to the Spark DataFrame, not the pandas one.
columns = [
    "id", "dur", "proto", "service", "state",
    "spkts", "dpkts", "sbytes", "dbytes", "rate",
    "sttl", "dttl", "sload", "dload", "sloss",
    "dloss", "sinpkt", "dinpkt", "sjit", "djit",
    "swin", "stcpb", "dtcpb", "dwin", "tcprtt",
    "synack", "ackdat", "smean", "dmean", "trans_depth",
    "response_body_len", "ct_srv_src", "ct_state_ttl", "ct_dst_ltm", "ct_src_dport_ltm",
    "ct_dst_sport_ltm", "ct_dst_src_ltm", "is_ftp_login", "ct_ftp_cmd", "ct_flw_http_mthd",
    "ct_src_ltm", "ct_srv_dst", "is_sm_ips_ports", "attack_cat", "label",
]
df = spark.createDataFrame(df, columns)


In [60]:
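# NOTE: disabled draft of a single-Pipeline version of the per-column indexing
# performed in the cells below. If it is ever re-enabled, fix the final loop first:
# each pass rebuilds `df` from `indexed_df`, so only the last column's replacement
# would survive.
# # List of columns to index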
# columns_to_index = ["proto", "service", "state", "attack_cat"]

# # Create a list of StringIndexer stages for each column
# indexers = [StringIndexer(inputCol=col, outputCol=col + "_index") for col in columns_to_index]

# # Create a pipeline
# pipeline = Pipeline(stages=indexers)

# # Fit and transform the data
# indexed_df = pipeline.fit(df).transform(df)

# # Replace the original columns with their respective indexed columns
# for col in columns_to_index:
#     indexed_col = col + "_index"
#     df = indexed_df.drop(col).withColumnRenamed(indexed_col, col)



In [61]:
# Encode the categorical 'proto' strings as indices in a new 'protoIndex' column.
proto_indexer = StringIndexer(outputCol="protoIndex", inputCol="proto")
proto_indexed = (
    proto_indexer
    .fit(df)
    .transform(df)
)

# Continue with a frame in which the index column takes over the 'proto' name.
df = (
    proto_indexed
    .drop("proto")
    .withColumnRenamed("protoIndex", "proto")
)

# Pandas copy of the indexed frame (still has both 'proto' and 'protoIndex');
# toPandas() collects every row to the driver.
proto_pandas = proto_indexed.toPandas()


In [62]:
# Encode the categorical 'service' strings as indices in a new 'serviceIndex' column.
service_indexer = StringIndexer(outputCol="serviceIndex", inputCol="service")
service_indexed = (
    service_indexer
    .fit(df)
    .transform(df)
)

# Continue with a frame in which the index column takes over the 'service' name.
df = (
    service_indexed
    .drop("service")
    .withColumnRenamed("serviceIndex", "service")
)

# Pandas copy of the indexed frame (still has both 'service' and 'serviceIndex');
# toPandas() collects every row to the driver.
service_pandas = service_indexed.toPandas()


In [63]:
# Encode the categorical 'state' strings as indices in a new 'stateIndex' column.
state_indexer = StringIndexer(outputCol="stateIndex", inputCol="state")
state_indexed = (
    state_indexer
    .fit(df)
    .transform(df)
)

# Continue with a frame in which the index column takes over the 'state' name.
df = (
    state_indexed
    .drop("state")
    .withColumnRenamed("stateIndex", "state")
)

# Pandas copy of the indexed frame (still has both 'state' and 'stateIndex');
# toPandas() collects every row to the driver.
state_pandas = state_indexed.toPandas()


In [64]:
# Encode the categorical 'attack_cat' strings as indices in a new 'attack_catIndex' column.
attack_cat_indexer = StringIndexer(outputCol="attack_catIndex", inputCol="attack_cat")
attack_cat_indexed = (
    attack_cat_indexer
    .fit(df)
    .transform(df)
)

# Continue with a frame in which the index column takes over the 'attack_cat' name.
df = (
    attack_cat_indexed
    .drop("attack_cat")
    .withColumnRenamed("attack_catIndex", "attack_cat")
)

# Pandas copy of the indexed frame (still has both 'attack_cat' and 'attack_catIndex');
# toPandas() collects every row to the driver.
attack_cat_pandas = attack_cat_indexed.toPandas()


In [66]:
# Keep every column except 'label' and arrange them in the order listed here.
# Note that this rebinds `columns` to the list without 'label'.
columns = [
    "id", "dur", "proto", "service", "state",
    "spkts", "dpkts", "sbytes", "dbytes", "rate",
    "sttl", "dttl", "sload", "dload", "sloss",
    "dloss", "sinpkt", "dinpkt", "sjit", "djit",
    "swin", "stcpb", "dtcpb", "dwin", "tcprtt",
    "synack", "ackdat", "smean", "dmean", "trans_depth",
    "response_body_len", "ct_srv_src", "ct_state_ttl", "ct_dst_ltm", "ct_src_dport_ltm",
    "ct_dst_sport_ltm", "ct_dst_src_ltm", "is_ftp_login", "ct_ftp_cmd", "ct_flw_http_mthd",
    "ct_src_ltm", "ct_srv_dst", "is_sm_ips_ports", "attack_cat",
]
df = df.select(*columns)

# Print the leading rows of the resulting frame.
df.show()

+---+--------+-----+-------+-----+-----+-----+------+------+-----------+----+----+-----------+-----------+-----+-----+----------+----------+-----------+-----------+----+----------+----------+----+--------+--------+--------+-----+-----+-----------+-----------------+----------+------------+----------+----------------+----------------+--------------+------------+----------+----------------+----------+----------+---------------+----------+
| id|     dur|proto|service|state|spkts|dpkts|sbytes|dbytes|       rate|sttl|dttl|      sload|      dload|sloss|dloss|    sinpkt|    dinpkt|       sjit|       djit|swin|     stcpb|     dtcpb|dwin|  tcprtt|  synack|  ackdat|smean|dmean|trans_depth|response_body_len|ct_srv_src|ct_state_ttl|ct_dst_ltm|ct_src_dport_ltm|ct_dst_sport_ltm|ct_dst_src_ltm|is_ftp_login|ct_ftp_cmd|ct_flw_http_mthd|ct_src_ltm|ct_srv_dst|is_sm_ips_ports|attack_cat|
+---+--------+-----+-------+-----+-----+-----+------+------+-----------+----+----+-----------+-----------+-----+-----+--