In [1]:
# Spark imports
import pyspark
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Python imports
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer

# Initialize Spark session on the standalone cluster master.
# The builder chain is parenthesised instead of using backslash
# continuations; every option keeps its original key and value.
spark = (
    SparkSession.builder
    .appName("Pre-Preprocess Mission Log")
    .master("spark://192.168.1.2:7077")
    # Driver resources
    .config("spark.driver.cores", "2")
    .config("spark.driver.memory", "10g")
    # Per-executor resources
    .config("spark.executor.cores", "3")
    .config("spark.executor.memory", "12g")
    # Executor count: 5 requested, dynamic allocation bounded to 5..8
    .config("spark.executor.instances", "5")
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.shuffleTracking.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "5")
    .config("spark.dynamicAllocation.maxExecutors", "8")
    .getOrCreate()
)

# Paths containing 2024 network data.
# Each entry is (date-range directory, parquet part file) below the shared
# UWF-TestZeekData24 parquet root; the full HDFS URLs are assembled from them.
_PARQUET_ROOT_2024 = "hdfs://192.168.1.2:9000/datasets-uwf-edu/UWF-TestZeekData24/parquet"
_PARQUET_PARTS_2024 = (
    ("2024-02-25 - 2024-03-03", "part-00000-8b838a85-76eb-4896-a0b6-2fc425e828c2-c000.snappy.parquet"),
    ("2024-03-03 - 2024-03-10", "part-00000-0955ed97-8460-41bd-872a-7375a7f0207e-c000.snappy.parquet"),
    ("2024-03-10 - 2024-03-17", "part-00000-071774ae-97f3-4f31-9700-8bfcdf41305a-c000.snappy.parquet"),
    ("2024-03-17 - 2024-03-24", "part-00000-5f556208-a1fc-40a1-9cc2-a4e24c76aeb3-c000.snappy.parquet"),
    ("2024-03-24 - 2024-03-31", "part-00000-ea3a47a3-0973-4d6b-a3a2-8dd441ee7901-c000.snappy.parquet"),
    ("2024-10-27 - 2024-11-03", "part-00000-69700ccb-c1c1-4763-beb7-cd0f1a61c268-c000.snappy.parquet"),
    ("2024-11-03 - 2024-11-10", "part-00000-f078acc1-ab56-40a6-a6e1-99d780645c57-c000.snappy.parquet"),
)
data_paths_2024 = [
    f"{_PARQUET_ROOT_2024}/{date_range}/{part_file}"
    for date_range, part_file in _PARQUET_PARTS_2024
]

# Paths containing 2022 network data.
# Each entry is (date-range directory, parquet part file) below the shared
# UWF-ZeekData22 parquet root; the full HDFS URLs are assembled from them.
_PARQUET_ROOT_2022 = "hdfs://192.168.1.2:9000/datasets-uwf-edu/UWF-ZeekData22/parquet"
_PARQUET_PARTS_2022 = (
    ("2022-01-16 - 2022-01-23", "part-00000-cbf26680-106d-40e7-8278-60520afdbb0e-c000.snappy.parquet"),
    ("2022-02-06 - 2022-02-13", "part-00000-df678a79-4a73-452b-8e72-d624b2732f17-c000.snappy.parquet"),
    ("2022-02-13 - 2022-02-20", "part-00000-1da06990-329c-4e38-913a-0f0aa39b388d-c000.snappy.parquet"),
)
data_paths_2022 = [
    f"{_PARQUET_ROOT_2022}/{date_range}/{part_file}"
    for date_range, part_file in _PARQUET_PARTS_2022
]

# Function to load and preprocess data
def load_and_preprocess_data(data_paths):
    """Load parquet files, keep the modelling columns and stack them.

    Each path is read with the module-level ``spark`` session, reduced to
    the timestamp, the five numeric traffic columns and ``label_tactic``,
    and has nulls in the numeric columns replaced by 0. The per-file
    DataFrames are then combined with ``DataFrame.union`` (positional, which
    is safe here because every frame is selected in the same column order).

    Args:
        data_paths: Iterable of parquet paths. A single path given as a
            plain string is treated as a one-element list.

    Returns:
        A single DataFrame containing the rows of every input file.

    Raises:
        ValueError: If ``data_paths`` is empty.
    """
    # A bare string would otherwise be iterated character by character.
    paths = [data_paths] if isinstance(data_paths, str) else list(data_paths)
    if not paths:
        raise ValueError("data_paths must contain at least one parquet path")

    # Loop-invariant column selection and null replacement map.
    numeric_columns = ["duration", "orig_bytes", "resp_bytes", "orig_ip_bytes", "resp_ip_bytes"]
    selected_columns = ["ts", *numeric_columns, "label_tactic"]
    zero_fill = dict.fromkeys(numeric_columns, 0)

    combined_df = None
    for path in paths:
        df = spark.read.parquet(path).select(*selected_columns).fillna(zero_fill)
        combined_df = df if combined_df is None else combined_df.union(df)
    return combined_df

# Load and preprocess both capture years (2024 first, then 2022); each call
# yields one combined DataFrame for that year's parquet files.
combined_df_2024, combined_df_2022 = (
    load_and_preprocess_data(year_paths)
    for year_paths in (data_paths_2024, data_paths_2022)
)

# Function to train and evaluate models
def train_and_evaluate(train_df, test_df, tactic):
    """Train a binary decision tree for one tactic and report test metrics.

    Rows whose ``label_tactic`` equals ``tactic`` are labelled 1, all other
    rows 0. A ``DecisionTreeClassifier`` is fitted on ``train_df`` using the
    five numeric traffic columns and scored on ``test_df``. The metrics are
    printed and also returned so callers can collect them.

    "Precision" and "Recall" are the evaluator's ``weightedPrecision`` and
    ``weightedRecall`` (averaged over both classes, weighted by class
    frequency), not the precision/recall of the positive class alone.
    NOTE(review): weighted recall is arithmetically the same quantity as
    accuracy, so those two printed values are expected to coincide.

    Args:
        train_df: DataFrame with the feature columns and ``label_tactic``.
        test_df: DataFrame with the same columns, used for evaluation.
        tactic: The ``label_tactic`` value treated as the positive class.

    Returns:
        dict with keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"`` holding the printed values.
    """
    from pyspark.ml.feature import VectorAssembler
    from pyspark.ml.classification import DecisionTreeClassifier
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator

    feature_columns = ["duration", "orig_bytes", "resp_bytes", "orig_ip_bytes", "resp_ip_bytes"]
    # handleInvalid="skip" drops rows whose feature values are invalid.
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features", handleInvalid="skip")
    # One-vs-rest label expression, shared by the train and test frames.
    is_tactic = F.when(F.col("label_tactic") == tactic, 1).otherwise(0)

    train_df = assembler.transform(train_df).withColumn("label", is_tactic)
    test_df = assembler.transform(test_df).withColumn("label", is_tactic)

    dt = DecisionTreeClassifier(featuresCol="features", labelCol="label")
    model = dt.fit(train_df)

    # The four metrics below each trigger a Spark job over the predictions.
    # Keep only the two columns the evaluator reads and cache them so the
    # model is applied to the test set once instead of once per metric.
    predictions = model.transform(test_df).select("label", "prediction").cache()
    try:
        evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
        accuracy, precision, recall, f1_score = (
            evaluator.evaluate(predictions, {evaluator.metricName: metric_name})
            for metric_name in ("accuracy", "weightedPrecision", "weightedRecall", "f1")
        )
    finally:
        # Release the cached predictions even if an evaluation fails.
        predictions.unpersist()

    print(f"\nModel for label_tactic: {tactic}")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1_score}")

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1_score,
    }

# Get distinct values of label_tactic for 2024 data
distinct_label_tactics_2024 = [
    row["label_tactic"]
    for row in combined_df_2024.select("label_tactic").distinct().collect()
]

# Get distinct values of label_tactic for 2022 data
# (collected for reference; the loop below iterates the 2024 tactics only)
distinct_label_tactics_2022 = [
    row["label_tactic"]
    for row in combined_df_2022.select("label_tactic").distinct().collect()
]

# Year -> combined DataFrame, and the (train year, test year) pairings to run
# for every tactic: both same-year fits, then both cross-year transfers.
_frames_by_year = {"2022": combined_df_2022, "2024": combined_df_2024}
_train_test_years = (
    ("2022", "2022"),
    ("2024", "2024"),
    ("2022", "2024"),
    ("2024", "2022"),
)

# Train and evaluate models for each combination
for tactic in distinct_label_tactics_2024:
    # Skip the "none" label, and a null label if distinct() returned one:
    # comparing a column with None can never match, so such a model would
    # be trained on labels that are all 0.
    if tactic is None or tactic == "none":
        continue

    print(f"\nTraining and testing for tactic: {tactic}")

    for train_year, test_year in _train_test_years:
        print(f"\nTrain {train_year} data, Test {test_year} data")
        train_and_evaluate(_frames_by_year[train_year], _frames_by_year[test_year], tactic)

25/02/04 19:40:46 WARN Utils: Your hostname, ubuntu-virtual-machine resolves to a loopback address: 127.0.1.1; using 192.168.1.112 instead (on interface ens33)
25/02/04 19:40:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/04 19:40:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                


Training and testing for tactic: Privilege Escalation

Train 2022 data, Test 2022 data


                                                                                


Model for label_tactic: Privilege Escalation
Accuracy: 0.9999987545367297
Precision: 0.9999987545382809
Recall: 0.9999987545367297
F1 Score: 0.9999982207671445

Train 2024 data, Test 2024 data





Model for label_tactic: Privilege Escalation
Accuracy: 0.996844670451184
Precision: 0.9936992970069295
Recall: 0.996844670451184
F1 Score: 0.9952694986359703

Train 2022 data, Test 2024 data


                                                                                


Model for label_tactic: Privilege Escalation
Accuracy: 0.996844670451184
Precision: 0.9936992970069295
Recall: 0.996844670451184
F1 Score: 0.9952694986359703

Train 2024 data, Test 2022 data


                                                                                


Model for label_tactic: Privilege Escalation
Accuracy: 0.9999986507481239
Precision: 0.9999973014980682
Recall: 0.9999986507481239
F1 Score: 0.9999979761226409

Training and testing for tactic: Reconnaissance

Train 2022 data, Test 2022 data


                                                                                


Model for label_tactic: Reconnaissance
Accuracy: 0.996806839752157
Precision: 0.9967981817089702
Recall: 0.996806839752157
F1 Score: 0.9967470439447002

Train 2024 data, Test 2024 data


                                                                                


Model for label_tactic: Reconnaissance
Accuracy: 0.9960631420675652
Precision: 0.9960082103880623
Recall: 0.9960631420675652
F1 Score: 0.9960222779731265

Train 2022 data, Test 2024 data


                                                                                


Model for label_tactic: Reconnaissance
Accuracy: 0.35073929559145994
Precision: 0.9710427841708207
Recall: 0.35073929559145994
F1 Score: 0.48427676165403083

Train 2024 data, Test 2022 data


                                                                                


Model for label_tactic: Reconnaissance
Accuracy: 0.634604636506874
Precision: 0.960208512424857
Recall: 0.6346046365068742
F1 Score: 0.7443173324894445

Training and testing for tactic: Credential Access

Train 2022 data, Test 2022 data


                                                                                


Model for label_tactic: Credential Access
Accuracy: 0.9999976128620652
Precision: 0.9999972842024448
Recall: 0.9999976128620652
F1 Score: 0.9999970854723029

Train 2024 data, Test 2024 data


                                                                                


Model for label_tactic: Credential Access
Accuracy: 0.9999979131418328
Precision: 0.9999979131418328
Recall: 0.9999979131418328
F1 Score: 0.9999979131418328

Train 2022 data, Test 2024 data


                                                                                


Model for label_tactic: Credential Access
Accuracy: 0.545488551756952
Precision: 0.29755776009789686
Recall: 0.545488551756952
F1 Score: 0.3850662753336159

Train 2024 data, Test 2022 data


                                                                                


Model for label_tactic: Credential Access
Accuracy: 0.9942037177286196
Precision: 0.99999417307005
Recall: 0.9942037177286196
F1 Score: 0.9970902260102475

Training and testing for tactic: Persistence

Train 2022 data, Test 2022 data


                                                                                


Model for label_tactic: Persistence
Accuracy: 0.9999998962113942
Precision: 0.9999997924227991
Recall: 0.9999998962113942
F1 Score: 0.9999998443170939

Train 2024 data, Test 2024 data


                                                                                


Model for label_tactic: Persistence
Accuracy: 0.996844670451184
Precision: 0.9936992970069295
Recall: 0.996844670451184
F1 Score: 0.9952694986359703

Train 2022 data, Test 2024 data


                                                                                


Model for label_tactic: Persistence
Accuracy: 0.996844670451184
Precision: 0.9936992970069295
Recall: 0.996844670451184
F1 Score: 0.9952694986359703

Train 2024 data, Test 2022 data


                                                                                


Model for label_tactic: Persistence
Accuracy: 0.9999998962113942
Precision: 0.9999997924227991
Recall: 0.9999998962113942
F1 Score: 0.9999998443170939

Training and testing for tactic: Initial Access

Train 2022 data, Test 2022 data


                                                                                


Model for label_tactic: Initial Access
Accuracy: 0.9999998962113942
Precision: 0.9999997924227991
Recall: 0.9999998962113942
F1 Score: 0.9999998443170939

Train 2024 data, Test 2024 data


                                                                                


Model for label_tactic: Initial Access
Accuracy: 0.9966276372017945
Precision: 0.9963959642100139
Recall: 0.9966276372017945
F1 Score: 0.9960234822227075

Train 2022 data, Test 2024 data


                                                                                


Model for label_tactic: Initial Access
Accuracy: 0.9944374795553114
Precision: 0.9889059007443204
Recall: 0.9944374795553114
F1 Score: 0.9916639763155786

Train 2024 data, Test 2022 data


                                                                                


Model for label_tactic: Initial Access
Accuracy: 0.9997606634748902
Precision: 0.9999997923979635
Recall: 0.9997606634748902
F1 Score: 0.9998802136390539

Training and testing for tactic: Exfiltration

Train 2022 data, Test 2022 data


                                                                                


Model for label_tactic: Exfiltration
Accuracy: 0.999999273479759
Precision: 0.999998546960046
Recall: 0.999999273479759
F1 Score: 0.9999989102197703

Train 2024 data, Test 2024 data


                                                                                


Model for label_tactic: Exfiltration
Accuracy: 0.9997083615711329
Precision: 0.9994168081952389
Recall: 0.9997083615711329
F1 Score: 0.9995625636230436

Train 2022 data, Test 2024 data


                                                                                


Model for label_tactic: Exfiltration
Accuracy: 0.9997083615711329
Precision: 0.9994168081952389
Recall: 0.9997083615711329
F1 Score: 0.9995625636230436

Train 2024 data, Test 2022 data


                                                                                


Model for label_tactic: Exfiltration
Accuracy: 0.999999273479759
Precision: 0.999998546960046
Recall: 0.999999273479759
F1 Score: 0.9999989102197703

Training and testing for tactic: Defense Evasion

Train 2022 data, Test 2022 data


                                                                                


Model for label_tactic: Defense Evasion
Accuracy: 0.9999998962113942
Precision: 0.9999997924227991
Recall: 0.9999998962113942
F1 Score: 0.9999998443170939

Train 2024 data, Test 2024 data


                                                                                


Model for label_tactic: Defense Evasion
Accuracy: 0.996844670451184
Precision: 0.9936992970069295
Recall: 0.996844670451184
F1 Score: 0.9952694986359703

Train 2022 data, Test 2024 data


                                                                                


Model for label_tactic: Defense Evasion
Accuracy: 0.996844670451184
Precision: 0.9936992970069295
Recall: 0.996844670451184
F1 Score: 0.9952694986359703

Train 2024 data, Test 2022 data





Model for label_tactic: Defense Evasion
Accuracy: 0.9999998962113942
Precision: 0.9999997924227991
Recall: 0.9999998962113942
F1 Score: 0.9999998443170939


                                                                                