In [None]:
# import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# The scikit-learn import above rebinds the bare name `StandardScaler`
# (the later import wins), so the Spark ML scaler is also exposed under an
# unambiguous alias.
from pyspark.ml.feature import StandardScaler as SparkStandardScaler

In [None]:
# Locations of the two dataset splits, relative to the notebook's working directory.
train_csv_path = 'GA2Datasets/UNSW_NB15_training-set.csv'
test_csv_path = 'GA2Datasets/UNSW_NB15_testing-set.csv'

# Load each split into its own pandas DataFrame.
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

### Custom pipeline for data pre-processing

In [None]:
class PreProcessPipeline:
    """Pre-process a traffic DataFrame for modelling.

    ``transform`` drops the ``id`` column, optionally replaces the values of
    the categorical columns with integer codes, and optionally derives a
    binary ``label`` column from ``attack_cat`` (which is then dropped).

    The integer encoding is learned once and reused for every later frame, so
    the same category receives the same code in the training and the test
    split. It is learned either explicitly with ``fit(df)`` or, if ``fit`` was
    never given a frame, from the first frame passed to ``transform``.

    Parameters
    ----------
    label_encode : bool
        Integer-encode the columns listed in ``CATEGORICAL_COLUMNS``.
    process_label : bool
        Replace ``attack_cat`` with a binary ``label`` column.
    """

    # Columns that are integer-encoded when ``label_encode`` is true.
    CATEGORICAL_COLUMNS = ('proto', 'service', 'state')

    # attack_cat value -> binary label.
    # NOTE(review): several attack categories share label 0 with 'Normal';
    # confirm that this grouping is the intended target definition.
    LABEL_MAP = {
        'Normal': 0,
        'Reconnaissance': 0,
        'Analysis': 0,
        'Fuzzers': 0,
        'Shellcode': 0,
        'Generic': 0,
        'Backdoor': 1,
        'DoS': 1,
        'Exploits': 1,
        'Worms': 1,
    }

    # Code given to a category that was absent when the encoding was learned.
    UNSEEN_CODE = -1

    # Learned ``{column: {value: code}}`` encoding; ``None`` until learned.
    mappings_ = None

    def __init__(self, label_encode = True, process_label = True):
        self.label_encode = label_encode
        self.process_label = process_label
        self.mappings_ = None

    def _learn_mappings(self, df):
        """Return ``{column: {value: code}}`` with codes in order of first appearance."""
        return {
            column: {value: index for index, value in enumerate(df[column].unique())}
            for column in self.CATEGORICAL_COLUMNS
        }

    def fit(self, df=None):
        """Learn the categorical encoding from ``df`` and return ``self``.

        Calling ``fit()`` without a frame is still accepted and leaves any
        previously learned encoding untouched.
        """
        if df is not None and self.label_encode:
            self.mappings_ = self._learn_mappings(df)
        return self

    def transform(self, df):
        """Return a pre-processed copy of ``df``; the input frame is not modified.

        Raises
        ------
        KeyError
            If a required column (``id``, a categorical column, ``attack_cat``)
            is missing.
        ValueError
            If ``attack_cat`` contains a value that has no entry in ``LABEL_MAP``.
        """
        # ``drop`` returns a new frame, so the caller's frame stays intact.
        df = df.drop(columns='id')

        if self.label_encode:
            if self.mappings_ is None:
                # First frame seen defines the encoding for all later frames.
                self.mappings_ = self._learn_mappings(df)
            encoded = {
                column: df[column].map(mapping).fillna(self.UNSEEN_CODE).astype('int64')
                for column, mapping in self.mappings_.items()
            }
            df = df.assign(**encoded)

        if self.process_label:
            labels = df['attack_cat'].map(self.LABEL_MAP)
            unknown = df.loc[labels.isna(), 'attack_cat'].unique().tolist()
            if unknown:
                # Fail loudly instead of letting NaN labels reach the model.
                raise ValueError(f"Unmapped attack_cat values: {unknown!r}")
            # Assigning to an existing 'label' column keeps its position.
            df = df.assign(label=labels.astype('int64')).drop(columns='attack_cat')

        return df

In [None]:
# Per-column count of missing values in the training split.
train_df.isna().sum()

In [None]:
# A single pipeline instance pre-processes both splits; the training split
# is transformed first, then the test split.
pipeline = PreProcessPipeline(process_label=True, label_encode=True)
train_df, test_df = pipeline.transform(train_df), pipeline.transform(test_df)

In [None]:
# One histogram per column of the pre-processed training frame.
rcParams["figure.figsize"] = (20, 22)
# Grid lines are requested through `hist` itself. A bare `plt.grid()` call
# afterwards acts only on the current axes and toggles its grid, rather than
# switching the grid on for every subplot.
train_df.hist(grid=True)
plt.show()

In [None]:
# Configure the application name, then obtain the session for this notebook.
spark_builder = SparkSession.builder.appName("CSCI316GP2")
spark = spark_builder.getOrCreate()

In [None]:
# Convert the pre-processed pandas frames into Spark DataFrames.
spark_train_df = spark.createDataFrame(train_df)
spark_test_df = spark.createDataFrame(test_df)

# Every column except the target is a feature. Selecting by name (rather than
# slicing off the last column) keeps this correct even if "label" is not the
# final column of the frame.
feature_columns = [name for name in spark_train_df.columns if name != "label"]

# Assemble the feature columns into a single vector column called "features".
feature_assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
train = feature_assembler.transform(spark_train_df)
test = feature_assembler.transform(spark_test_df)

In [None]:
# Initialize the Spark ML StandardScaler.
# The bare name `StandardScaler` is bound to scikit-learn's class in this
# notebook (its import comes after the pyspark one), so the Spark class is
# referenced through its alias. The Spark scaler operates on ONE vector column
# (`inputCol`), here the assembled "features" column, not on a list of columns.
scaler = SparkStandardScaler(inputCol="features", outputCol="scaled_features")

# Fit the scaler on the training data only, then apply it to both splits.
scaler_model = scaler.fit(train)
train_scaled = scaler_model.transform(train)
test_scaled = scaler_model.transform(test)

# Initialize SMOTE
# NOTE(review): `SMOTE` is not imported or defined in any cell visible in this
# notebook, so on a fresh kernel this line raises NameError unless the name is
# provided elsewhere. Presumably a custom Spark-compatible implementation was
# intended (it is used with fit(...).transform(...) below) - confirm its source.
smote = SMOTE(samplingStrategy="auto", k=5, percentage=100)

# Apply SMOTE to the scaled training data
# NOTE(review): `train_resampled` is not read by any later cell shown here; the
# SVM cells fit on `train`. Confirm whether the resampled data should be used.
train_resampled = smote.fit(train_scaled).transform(train_scaled)


In [None]:
# Tools for SVM
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.mllib.evaluation import BinaryClassificationMetrics

In [None]:
# Baseline linear SVM: fit on the assembled "features" column against "label".
svm = LinearSVC(
    featuresCol="features",
    labelCol="label",
    maxIter=100,
)
svm_model = svm.fit(train)

In [None]:
# Score the test split with the trained SVM.
predictions = svm_model.transform(test)

# Accuracy is computed from the hard class predictions.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

print("Accuracy:", accuracy)

# Area under the ROC curve. This is a ranking metric, so it is fed the
# continuous scores in "rawPrediction" rather than the thresholded 0/1
# "prediction" column, which would reduce the curve to a single threshold.
roc_evaluator = BinaryClassificationEvaluator(
    labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
roc_score = roc_evaluator.evaluate(predictions)
print("Area under ROC = %g" % roc_score)

# Area under the precision-recall curve, likewise computed from the raw scores.
pr_evaluator = BinaryClassificationEvaluator(
    labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderPR")
pr_score = pr_evaluator.evaluate(predictions)
print("Area under PR = %g" % pr_score)


In [None]:
# Second SVM fit, this time with an explicit regularisation parameter.
# This rebinds `svm` and `svm_model`, replacing the model fitted earlier.
svm_params = dict(featuresCol="features", labelCol="label", maxIter=100, regParam=0.01)
svm = LinearSVC(**svm_params)
svm_model = svm.fit(train)

In [None]:
# spark.stop()