# Group 4 Assignment 2 

### Authors: 
-  Chin Yee Wan 
-  Darrel Koh
-  Nguyen Gia Khanh 
-  Ngo Vu Anh	

# Discover and Visualise the Data

## Import Libraries

In [34]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler, VectorAssembler

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors

from pyspark.mllib.regression import LabeledPoint

### Read in file

In [None]:
# Load the training and testing CSV files into pandas DataFrames.
train_df, test_df = (
    pd.read_csv('GA2Datasets/UNSW_NB15_training-set.csv'),
    pd.read_csv('GA2Datasets/UNSW_NB15_testing-set.csv'),
)

In [None]:
# Create (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("CSCI316GP2")
    .getOrCreate()
)

In [None]:
# Build a Spark DataFrame from the raw (not yet pre-processed) pandas
# training frame and print a preview of its rows.
spark_df = spark.createDataFrame(train_df)
spark_df.show()

## Functions definition 

#### Custom pipeline for data pre-processing

In [None]:
class PreProcessPipeline:
    """Pandas pre-processing applied to the dataset frames in this notebook.

    ``transform`` drops the ``id`` column, optionally replaces the
    categorical columns (``proto``, ``service``, ``state``) by integer codes
    and optionally replaces ``attack_cat`` by a numeric ``label`` column.

    Parameters
    ----------
    label_encode : bool
        Integer-encode the categorical columns.
    process_label : bool
        Derive ``label`` from ``attack_cat`` and drop ``attack_cat``.

    Notes
    -----
    Call ``fit(train_df)`` before transforming several frames (e.g. train and
    test) so that all of them share the same category -> code mapping.
    Without a fitted mapping the codes are derived from the frame being
    transformed, which makes codes from different frames incomparable.
    """

    # Columns that receive integer codes when ``label_encode`` is True.
    CATEGORICAL_COLUMNS = ('proto', 'service', 'state')

    # Code given to a category that was not present in the fitted frame.
    UNSEEN_CODE = -1

    # attack_cat -> numeric label. The grouping is reproduced unchanged.
    # NOTE(review): 'Normal' shares label 0 with several attack categories -
    # confirm this grouping is the intended target definition.
    LABEL_GROUPS = {
        'Normal': 0,
        'Reconnaissance': 0,
        'Analysis': 0,
        'Fuzzers': 0,
        'Shellcode': 0,
        'Generic': 0,
        'Backdoor': 1,
        'DoS': 1,
        'Exploits': 1,
        'Worms': 1,
    }

    def __init__(self, label_encode = True, process_label = True):
        self.label_encode = label_encode
        self.process_label = process_label
        # Per-column {category: code} dicts learned by fit(); None until then.
        self.mappings_ = None

    @staticmethod
    def _build_mapping(series):
        """Return {value: code}, numbering values in order of first appearance."""
        return {value: index for index, value in enumerate(series.unique())}

    def fit(self, df=None):
        """Learn the category codes from ``df`` and return ``self``.

        ``df`` is optional so that existing ``fit()`` calls keep working; when
        it is omitted (or ``label_encode`` is False) nothing is learned.
        """
        if df is not None and self.label_encode:
            self.mappings_ = {
                column: self._build_mapping(df[column])
                for column in self.CATEGORICAL_COLUMNS
            }
        return self

    def transform(self, df):
        """Return a pre-processed copy of ``df``; the input frame is not modified.

        Raises ``KeyError`` if ``id`` (or a column required by an enabled
        step) is missing from ``df``.
        """
        df = df.drop('id', axis=1)
        # Explicit copy so the column assignments below work on our own data.
        df = df.copy()

        if self.label_encode:
            learned = getattr(self, 'mappings_', None) or {}
            for column in self.CATEGORICAL_COLUMNS:
                mapping = learned.get(column)
                if mapping is None:
                    # Not fitted: derive the codes from this frame.
                    df[column] = df[column].map(self._build_mapping(df[column]))
                else:
                    # Fitted: reuse the learned codes and flag unseen values.
                    codes = df[column].map(mapping)
                    df[column] = codes.fillna(self.UNSEEN_CODE).astype(int)

        if self.process_label:
            # Categories missing from LABEL_GROUPS give None, i.e. a missing label.
            df['label'] = df['attack_cat'].apply(self.LABEL_GROUPS.get)
            df.drop('attack_cat', axis=1, inplace=True)

        return df

#### For visualisation

In [None]:
# Enable the Spark SQL REPL eager-evaluation setting on the session.
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

## Data Exploration

### Convert Spark to Pandas

In [None]:
# Convert Spark DataFrames to Pandas
# train_df = train_df.toPandas()
# test_df_pandas = test_df.toPandas()
# features_df_pandas = features_df.toPandas()

### Explore the Features set

In [None]:
# features_df.head(20)
# features_df.show()

### Explore Training Dataset

In [None]:
# Number of missing values in each column of the training frame.
train_df.isna().sum()

In [None]:
# Print the pandas summary of the training frame.
train_df.info()
# Spark equivalent, left disabled because train_df is a pandas frame here:
# train_df.printSchema()

In [None]:
# Count the zero entries in each column.
# A value counts as zero when it is the number 0 or the literal string "0".
# Comparing only against "0" never matches numeric columns.
zero_counts = {}

for column in train_df.columns:
    values = train_df[column]
    zero_counts[column] = ((values == 0) | (values == "0")).sum()

# Create a two-column DataFrame (column name, zero count) from the dictionary
zero_counts_df = pd.DataFrame(list(zero_counts.items()), columns=['Column', 'Zero Count'])

# Display the DataFrame
print(zero_counts_df)


In [None]:
# Summary statistics of the training frame.
train_df.describe()

#### There are 4 Objects that will require Encoding
- proto 
- service
- state
- attack_cat (1 of target variables)

##### Proto Attribute:

In [None]:
# Distinct values found in the 'proto' column of train_df.
unique_values = pd.unique(train_df['proto'])
unique_values

In [None]:
# Number of records per 'proto' value (row count of each group).
proto_sum = train_df.groupby('proto').size()

# Bar chart of the counts. The x-axis shows protocols, so it is labelled
# 'Proto' (it was previously mislabelled 'Service').
plt.figure(figsize=(10, 6))
proto_sum.plot(kind='bar')
plt.title('Sum of Records by Proto')
plt.xlabel('Proto')
plt.ylabel('Sum of Records')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Show the per-protocol record counts as a table.
proto_sum

##### Service Attributes:
- Convert '-' to 0
- The rest normal

In [None]:
# Distinct values found in the 'service' column of train_df.
unique_values = pd.unique(train_df['service'])
unique_values

In [None]:
# Number of records per 'service' value (row count of each group).
service_sum = train_df.groupby('service').size()

# Bar chart of the counts on a 10x6 figure, x tick labels rotated 45 degrees.
plt.figure(figsize=(10, 6))
service_sum.plot.bar(rot=45).set(
    title='Sum of Records by Service',
    xlabel='Service',
    ylabel='Sum of Records',
)
plt.tight_layout()
plt.show()

In [None]:
# Show the per-service record counts as a table.
service_sum

##### State Attribute:

In [None]:
# Distinct values found in the 'state' column of train_df.
unique_values = pd.unique(train_df['state'])
unique_values

In [None]:
# Number of records per 'state' value (row count of each group).
state_sum = train_df.groupby('state').size()

# Bar chart of the counts on a 10x6 figure, x tick labels rotated 45 degrees.
plt.figure(figsize=(10, 6))
state_sum.plot.bar(rot=45).set(
    title='Sum of Records by State',
    xlabel='State',
    ylabel='Sum of Records',
)
plt.tight_layout()
plt.show()

In [None]:
# Show the per-state record counts as a table.
state_sum

##### Attack_cat Attribute:

In [None]:
# Distinct values found in the 'attack_cat' column of train_df.
# 'attack_cat' is the target variable.
unique_values = pd.unique(train_df['attack_cat'])
unique_values

In [None]:
# Number of records per 'attack_cat' value (row count of each group).
attack_cat_sum = train_df.groupby('attack_cat').size()

# Bar chart of the counts on a 10x6 figure, x tick labels rotated 45 degrees.
plt.figure(figsize=(10, 6))
attack_cat_sum.plot.bar(rot=45).set(
    title='Sum of Records by Attack Category',
    xlabel='Attack Category',
    ylabel='Sum of Records',
)
plt.tight_layout()
plt.show()

In [None]:
# Show the per-attack-category record counts as a table.
attack_cat_sum

In [None]:
# Hard-coded record count for each attack category.
# NOTE(review): presumably transcribed from attack_cat_sum - confirm the
# numbers still match the loaded data.
data = {
    'attack_cat': [
        'Analysis', 'Backdoor', 'DoS', 'Exploits', 'Fuzzers',
        'Generic', 'Normal', 'Reconnaissance', 'Shellcode', 'Worms',
    ],
    'count': [677, 583, 4089, 11132, 6062, 18871, 37000, 3496, 378, 44],
}

# Tabulate the counts and the overall number of records.
df = pd.DataFrame(data)
total_count = df['count'].sum()

# Add each category's share of all records, expressed in percent.
df = df.assign(percentage=df['count'].div(total_count).mul(100))

print(df)

#### Can consider dropping

- tcprtt,Float,"TCP connection setup round-trip time, the sum of ’synack’ and ’ackdat’." - keeping this feature alone is sufficient; about 41k '0' values
- synack,Float,"TCP connection setup time, the time between the SYN and the SYN_ACK packets." - drop
- ackdat,Float,"TCP connection setup time, the time between the SYN_ACK and the ACK packets." -  drop
- ct_ftp_cmd: No. of flows that have a command in an ftp session. 81652 '0' values - drop
- ct_flw_http_mthd: No. of flows that have methods such as Get and Post in the http service. 74752 '0' values - drop

#### Can consider User-Transformed features
- Involved in the creation of the following features:
    - srcip: Source IP address - not in the dataset
    - dstip: Destination IP address - not in the dataset
    - sport: Source port number - not in the dataset
    - dsport: Destination port number - not in the dataset
    - sttl: Source to destination time to live value 
    - dttl: Destination to source time to live value
    - state: "Indicates to the state and its dependent protocol, e.g. ACC, CLO, CON, ECO, ECR, FIN, INT, MAS, PAR, REQ, RST, TST, TXD, URH, URN, and (-) (if not used state)"
    - service: "http, ftp, smtp, ssh, dns, ftp-data ,irc  and (-) if not much used service"
    - response_body_len: Actual uncompressed content size of the data transferred from the server’s http service
    
##### Attributes not in the dataset:
- is_sm_ips_ports,Binary,"If source (srcip) and destination (dstip) IP addresses equal and port numbers (sport)(dsport)  equal then, this variable takes value 1 else 0"
- ct_state_ttl,Integer,No. for each state (state) according to specific range of values for source/destination time to live (sttl) (dttl).
##### Attributes still in the dataset:
- ct_srv_src,integer,No. of connections that contain the same service (service) and source address (srcip) in 100 connections according to the last time (response_body_len).
- ct_srv_dst,integer,No. of connections that contain the same service (service) and destination address (dstip) in 100 connections according to the last time (response_body_len).
- ct_dst_ltm,integer,No. of connections of the same destination address (dstip) in 100 connections according to the last time (response_body_len).
- ct_src_ltm,integer,No. of connections of the same source address (srcip) in 100 connections according to the last time (response_body_len).
- ct_src_dport_ltm,integer,No of connections of the same source address (srcip) and the destination port (dsport) in 100 connections according to the last time (response_body_len).
- ct_dst_sport_ltm,integer,No of connections of the same destination address (dstip) and the source port (sport) in 100 connections according to the last time (response_body_len).
- ct_dst_src_ltm,integer,No. of connections of the same source (srcip) and the destination (dstip) address in 100 connections according to the last time (response_body_len).

  


##### tcprtt attribute:

In [None]:
# Count how many rows share each distinct 'tcprtt' value.
# NOTE: despite its name, unique_values holds per-value row counts here,
# not just the distinct values.
unique_values = train_df.groupby('tcprtt').size()
unique_values

# Alternative view, left disabled: the raw column itself.
# train_df['tcprtt']

##### is_sm_ips_ports attribute:

In [None]:
# Distinct values found in the 'is_sm_ips_ports' column of train_df.
unique_values = pd.unique(train_df['is_sm_ips_ports'])
unique_values

## Data Visualisation

In [None]:
# Use a large default figure size so the grid of histograms stays readable.
rcParams["figure.figsize"] = (20, 22)

# One histogram per numeric column, each drawn with grid lines.
# The former bare plt.grid() call is dropped: it only toggled the grid of the
# current (last) axes, switching off the grid hist() had drawn there.
train_df.hist(grid=True)
plt.show()

# Prepare the data

In [None]:
# Pre-process the training frame with both the categorical encoding and the
# label step enabled; train_df is rebound to the transformed result.
# NOTE: transform() drops 'id', so re-running this cell on the already
# transformed frame fails because that column is gone.
pipeline = PreProcessPipeline(label_encode=True, process_label=True)
train_df = pipeline.transform(train_df)

### Convert Pandas back to Spark

In [38]:
# Convert the pre-processed pandas training frame back to a Spark DataFrame.
# Earlier variants of this conversion, left disabled:
# train_df_spark = spark.createDataFrame(train_df)
# test_df_spark = spark.createDataFrame(test_df_pandas)

# spark_df is rebound here, replacing the Spark frame built from the raw data.
spark_df = spark.createDataFrame(train_df)
spark_df.show()

# Print the Python type of the converted object.
print(type(spark_df))

+------+-----+-------+-----+-----+-----+------+------+-----------+----+----+-------------+-----+-----+-----+---------+------+----+----+----+-----+-----+----+------+------+------+-----+-----+-----------+-----------------+----------+------------+----------+----------------+----------------+--------------+------------+----------+----------------+----------+----------+---------------+-----+
|   dur|proto|service|state|spkts|dpkts|sbytes|dbytes|       rate|sttl|dttl|        sload|dload|sloss|dloss|   sinpkt|dinpkt|sjit|djit|swin|stcpb|dtcpb|dwin|tcprtt|synack|ackdat|smean|dmean|trans_depth|response_body_len|ct_srv_src|ct_state_ttl|ct_dst_ltm|ct_src_dport_ltm|ct_dst_sport_ltm|ct_dst_src_ltm|is_ftp_login|ct_ftp_cmd|ct_flw_http_mthd|ct_src_ltm|ct_srv_dst|is_sm_ips_ports|label|
+------+-----+-------+-----+-----+-----+------+------+-----------+----+----+-------------+-----+-----+-----+---------+------+----+----+----+-----+-----+----+------+------+------+-----+-----+-----------+-----------------+

23/08/11 12:32:59 WARN TaskSetManager: Stage 8 contains a task of very large size (1366 KiB). The maximum recommended task size is 1000 KiB.


# Model selection and training
- Select machine learning models (Logistic Regression , Decision Tree, Random Forest, Multilayer perceptron).
- Split the data into training and validation sets.
- Train the selected models using the training data.

## Logistic Regression model

### Default Model

In [53]:
# Every column except the target becomes a model input.
# 'label' is excluded by name rather than by position, so the target cannot
# leak into the features if the column order ever changes.
input_columns = [column for column in spark_df.columns if column != 'label']

# Create a VectorAssembler to combine the input columns into a single 'features' column
assembler = VectorAssembler(inputCols=input_columns, outputCol='features')

# Transform the DataFrame to add the 'features' column
assembled_df = assembler.transform(spark_df)

In [54]:
# 80/20 random split of the assembled data, seeded for repeatability.
train_data, val_data = assembled_df.randomSplit([0.8, 0.2], seed=123)

# Scorer for the validation predictions, reading the true class from 'label'.
evaluator = BinaryClassificationEvaluator(labelCol='label')

# Logistic regression capped at 10 iterations, as the only pipeline stage.
lr = LogisticRegression(maxIter=10, labelCol='label', featuresCol='features')
pipeline = Pipeline(stages=[lr])

# Train on the training split, then predict on the validation split.
model = pipeline.fit(train_data)
predictions = model.transform(val_data)

# Score the validation predictions and report the result.
auc = evaluator.evaluate(predictions)

print("AUC = ", auc)

23/08/11 12:41:10 WARN TaskSetManager: Stage 16 contains a task of very large size (1366 KiB). The maximum recommended task size is 1000 KiB.
23/08/11 12:41:12 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/08/11 12:41:12 WARN TaskSetManager: Stage 18 contains a task of very large size (1366 KiB). The maximum recommended task size is 1000 KiB.
23/08/11 12:41:14 WARN TaskSetManager: Stage 20 contains a task of very large size (1366 KiB). The maximum recommended task size is 1000 KiB.
23/08/11 12:41:14 WARN TaskSetManager: Stage 22 contains a task of very large size (1366 KiB). The maximum recommended task size is 1000 KiB.
23/08/11 12:41:14 WARN TaskSetManager: Stage 24 contains a task of very large size (1366 KiB). The maximum recommended task size is 1000 KiB.
23/08/11 12:41:14 WARN TaskSetManager: Stage 26 contains a task of very large size (1366 KiB). The maximum recommended task size is 1000 KiB.
23/08/11 12:41:14 WARN TaskSetManager: St

AUC =  0.9402999357832275


###  Fine-Tuning 

### Evaluation

## Decision Tree Model

### Default Model

###  Fine-Tuning 

### Evaluation

## Random Forest Model

### Default Model

###  Fine-Tuning 

### Evaluation

## Multilayer perceptron Model

### Default Model

###  Fine-Tuning 

### Evaluation

# Final comparison between Models