In [None]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler, VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator


In [None]:
# Read the training and testing CSV files (in that order) into pandas frames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'GA2Datasets/UNSW_NB15_training-set.csv',
        'GA2Datasets/UNSW_NB15_testing-set.csv',
    )
)

### Custom pipeline for data pre-processing

In [None]:
class PreProcessPipeline:
    """Pandas pre-processing step applied to the raw frames before modelling.

    ``transform`` drops the ``id`` column, optionally replaces the categorical
    columns with integer codes, and optionally derives a binary ``label``
    column from ``attack_cat`` (which is then dropped).

    The value -> code mappings are stored on the instance and shared by every
    frame that goes through it, so the same category always receives the same
    integer in the training and the testing data. Mappings are learned either
    explicitly with ``fit(df)`` or implicitly from the first frame passed to
    ``transform``; values first seen in a later frame are given new codes
    appended after the existing ones.

    Parameters
    ----------
    label_encode : bool, default True
        Replace ``proto``, ``service`` and ``state`` with integer codes.
    process_label : bool, default True
        Build ``label`` from ``attack_cat`` and drop ``attack_cat``.
    """

    # Categorical columns that are replaced by integer codes.
    CATEGORICAL_COLUMNS = ('proto', 'service', 'state')

    # attack_cat value -> binary label. Categories missing from this table end up as NaN.
    LABEL_BY_CATEGORY = {
        'Normal': 0,
        'Reconnaissance': 0,
        'Analysis': 0,
        'Fuzzers': 0,
        'Shellcode': 0,
        'Generic': 0,
        'Backdoor': 1,
        'DoS': 1,
        'Exploits': 1,
        'Worms': 1,
    }

    def __init__(self, label_encode = True, process_label = True):
        self.label_encode = label_encode
        self.process_label = process_label
        # column name -> {category value: integer code}
        self.mappings_ = {}

    def fit(self, df=None):
        """Learn the categorical mappings from ``df`` and return ``self``.

        Calling ``fit()`` without a frame is a no-op kept for backward
        compatibility; the mappings are then learned on the first ``transform``.
        """
        if df is not None and self.label_encode:
            self.mappings_ = {}
            for column in self.CATEGORICAL_COLUMNS:
                self._learn(column, df[column])
        return self

    def _learn(self, column, values):
        """Add codes for values of ``column`` not seen before; return the mapping."""
        mapping = self.mappings_.setdefault(column, {})
        # NaN never compares equal to itself, so track it explicitly to keep a single key.
        has_missing = any(pd.isna(known) for known in mapping)
        for value in values.unique():
            if pd.isna(value):
                if has_missing:
                    continue
                has_missing = True
            elif value in mapping:
                continue
            # Codes follow order of first appearance, starting at 0.
            mapping[value] = len(mapping)
        return mapping

    def transform(self, df):
        """Return a processed copy of ``df``; the input frame is left untouched.

        Raises ``KeyError`` if ``id`` (or a column required by the enabled
        steps) is missing from ``df``.
        """
        # drop() already returns a new frame, so no extra copy is needed.
        df = df.drop(columns='id')

        if self.label_encode:
            for column in self.CATEGORICAL_COLUMNS:
                mapping = self._learn(column, df[column])
                df[column] = df[column].map(mapping)

        if self.process_label:
            df['label'] = df['attack_cat'].map(self.LABEL_BY_CATEGORY)
            df = df.drop(columns='attack_cat')

        return df

In [None]:
# Count missing values per column of the raw training frame.
train_df.isna().sum()

In [None]:
# Pre-processor with both categorical encoding and label derivation switched on.
pipeline = PreProcessPipeline(process_label=True, label_encode=True)

# Process the training frame first, then the testing frame; each name is
# rebound to its processed copy.
train_df = pipeline.transform(df=train_df)
test_df = pipeline.transform(df=test_df)

In [None]:
# Histogram of every numeric column in the processed training frame.
rcParams["figure.figsize"] = (20, 22)

# Request the grid through DataFrame.hist so it is drawn on every subplot.
# A bare plt.grid() only acts on the current axes, and without arguments it
# toggles that axes' grid instead of switching it on.
train_df.hist(grid=True)
plt.show()

In [None]:
# Fixed seed value used by this notebook.
seed = 42

# Obtain a Spark session for this application.
spark = (
    SparkSession.builder
    .appName("CSCI316GP2")
    .getOrCreate()
)

# NOTE(review): "spark.seed" does not look like a documented Spark setting, so
# this call probably has no effect on model randomness. Confirm, and pass
# `seed` to the estimators that accept one instead.
spark.conf.set("spark.seed", seed)

In [None]:
# Hand the processed pandas frames (training first, then testing) over to Spark.
spark_train_df, spark_test_df = (
    spark.createDataFrame(pandas_frame) for pandas_frame in (train_df, test_df)
)

# Print the first 20 rows of each Spark DataFrame as a quick sanity check.
spark_train_df.show(n=20)
spark_test_df.show(n=20)

In [None]:
# Every column except the target is a model input. The target is excluded by
# name rather than by position, so the selection does not depend on "label"
# being the last column of the frame.
feature_columns = [column for column in spark_train_df.columns if column != "label"]

In [None]:
# Pack all feature columns into one vector column named "features".
feature_assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)

# Append that vector column to both the training and the testing split.
spark_train_df, spark_test_df = (
    feature_assembler.transform(split) for split in (spark_train_df, spark_test_df)
)

In [None]:
# Index the label column. The indexer is fitted on the training split only:
# fitting it on the test split would derive the label -> index mapping from
# held-out data and leak test information into the model pipeline.
label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(spark_train_df)

In [None]:
# Baseline random forest with default hyper-parameters.
# The seed is passed to the estimator itself so that repeated runs build the
# same forest; defining `seed` alone does not make the classifier deterministic.
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", seed=seed)

In [None]:
# Chain the fitted label indexer and the classifier into a single estimator.
pipeline_stages = [label_indexer, rf]
pipeline = Pipeline(stages=pipeline_stages)

In [None]:
# Fit every pipeline stage on the training split.
model = pipeline.fit(dataset=spark_train_df)

In [None]:
# Score the held-out test split with the fitted pipeline.
predictions = model.transform(dataset=spark_test_df)

In [None]:
# One evaluator per metric; all three read the same label and prediction columns.
accuracy_evaluator, f1_evaluator, precision_evaluator = (
    MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName=metric_name)
    for metric_name in ("accuracy", "f1", "weightedPrecision")
)

# Evaluate `predictions` and report each metric.
accuracy = accuracy_evaluator.evaluate(predictions)
print("Accuracy = %g" % accuracy)

f1_score = f1_evaluator.evaluate(predictions)
print("F1 Score = %g" % f1_score)

# "Precision" here is the weighted precision across classes.
precision = precision_evaluator.evaluate(predictions)
print("Precision = %g" % precision)

In [None]:
# Fresh classifier for the hyper-parameter search, seeded for repeatable forests.
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", seed=seed)

# The grid below is keyed by the Param objects of this `rf` instance, so the
# estimator given to CrossValidator must contain this same instance. A pipeline
# wrapping a different RandomForestClassifier would not pick the grid values up
# and every candidate would be trained with identical settings.
cv_pipeline = Pipeline(stages=[label_indexer, rf])

# Candidate settings: 3 forest sizes x 3 tree depths = 9 combinations.
param_grid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [10, 20, 30])
    .addGrid(rf.maxDepth, [5, 10, 15])
    .build()
)

# 3-fold cross-validation scored on accuracy; the seed fixes the fold assignment.
crossval = CrossValidator(estimator=cv_pipeline,
                          estimatorParamMaps=param_grid,
                          evaluator=accuracy_evaluator,
                          numFolds=3,
                          seed=seed)

# Run cross-validation and keep the best-scoring parameter combination.
cv_model = crossval.fit(spark_train_df)

# Score the test split with the selected model.
best_model = cv_model.bestModel
best_predictions = best_model.transform(spark_test_df)

In [None]:
# Column settings shared by the three evaluators below.
evaluator_columns = {"labelCol": "indexedLabel", "predictionCol": "prediction"}

# Accuracy of the cross-validated model on the test split.
accuracy_evaluator = MulticlassClassificationEvaluator(metricName="accuracy", **evaluator_columns)
best_accuracy = accuracy_evaluator.evaluate(best_predictions)
print("Accuracy = %g" % best_accuracy)

# F1 score of the cross-validated model.
f1_evaluator = MulticlassClassificationEvaluator(metricName="f1", **evaluator_columns)
best_f1_score = f1_evaluator.evaluate(best_predictions)
print("F1 Score = %g" % best_f1_score)

# Weighted precision of the cross-validated model.
precision_evaluator = MulticlassClassificationEvaluator(metricName="weightedPrecision", **evaluator_columns)
best_precision = precision_evaluator.evaluate(best_predictions)
print("Precision = %g" % best_precision)

In [None]:
# Uncomment to shut down the Spark session once the notebook is finished:
# spark.stop()