# Spark Tasks

## Imports and environment initialization

In [69]:
import os
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, regexp_replace, posexplode, struct, collect_list, posexplode_outer, hash as spark_hash, concat_ws, when, abs
from pyspark.sql.types import ArrayType, StructType, StructField, StringType, LongType
from pyspark.sql.functions import udf
import numpy as np
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans




In [29]:
# Set environment variables
# NOTE(review): these are machine-specific absolute paths; adjust them for your
# own installation before running the notebook.
os.environ["JAVA_HOME"] = "C:/Program Files/Java/jdk-22"
os.environ["SPARK_HOME"] = "C:/Spark/spark-3.5.1-bin-hadoop3"
os.environ["HADOOP_HOME"] = "C:/Hadoop"
os.environ["PYSPARK_PYTHON"] = "C:/Users/chris/AppData/Local/Programs/Python/Python311/python.exe"

# Directories that must be reachable through PATH. PATH entries are directories,
# so the folder containing the Python executable is added, not the executable itself.
_tool_dirs = [
    os.path.join(os.environ["JAVA_HOME"], "bin"),
    os.path.join(os.environ["SPARK_HOME"], "bin"),
    os.path.join(os.environ["HADOOP_HOME"], "bin"),
    os.path.dirname(os.environ["PYSPARK_PYTHON"]),
]
_path_prefix = os.pathsep.join(_tool_dirs)
_current_path = os.environ.get("PATH", "")

# Only prepend once, so re-running this cell does not keep growing PATH.
if not _current_path.startswith(_path_prefix):
    os.environ["PATH"] = _path_prefix + (os.pathsep + _current_path if _current_path else "")

## Create & Configure Session

In [30]:
def create_session():
    """Create (or reuse) the local Spark context and session used by this notebook.

    The configuration runs Spark in local mode on all available cores with
    2G driver memory, a 2G driver result-size limit and 1G executor memory.

    Returns:
        tuple: ``(sc, spark)`` - the SparkContext and the SparkSession.
    """
    conf = SparkConf()
    conf.setAppName("DIS-lab-1")    # Sets name of the Spark Application
    conf.setMaster("local[*]")    # Master URL. In this case local[*] uses all the available cores in the machine
    conf.set("spark.driver.memory", "2G")   # Memory allocated to driver process
    conf.set("spark.driver.maxResultSize", "2G")    # Maximum size of results that can be returned to driver
    conf.set("spark.executor.memory", "1G")    # Memory allocated to each executor
    # getOrCreate returns the already-active context if there is one, instead of
    # raising because two SparkContexts cannot run at the same time.
    sc = pyspark.SparkContext.getOrCreate(conf=conf)
    spark = SparkSession.builder.getOrCreate()    # Creates (or returns) the Spark session on top of that context

    return sc, spark

In [31]:
# Shut down whatever Spark objects a previous run of this notebook left behind,
# so the session below always starts from a clean state.
try:
    if globals().get('sc') is not None:
        sc.stop()
        print("--Stopped existing SparkContext")
    if isinstance(globals().get('spark'), SparkSession):
        spark.stop()
        print("--Stopped existing SparkSession")
except Exception as stop_error:
    # Best effort only: report the problem and still try to create a new session.
    print(f"Error stopping existing Spark session or context: {stop_error}")

# Start the fresh context/session pair and display the session summary
sc, spark = create_session()
print("Spark session created successfully!")
spark

--Stopped existing SparkContext
--Stopped existing SparkSession
Spark session created successfully!


## Load and Split Data

### Load the data from the CSVs

In [32]:
# Read the raw log file: the first row is the header and column types are inferred
logs = spark.read.options(header=True, inferSchema=True).csv('datasets/dataset_1.csv')
logs.show(5)

+--------------------+
|                Logs|
+--------------------+
|<user, ui_server_...|
|<user, ui_server_...|
|<user, ui_server_...|
|<user, ui_server_...|
|<user, ui_server_...|
+--------------------+
only showing top 5 rows



### Split the data into 5 separate columns

In [33]:
# Split every "Logs" value on ", " once and reuse the resulting array expression.
# The first and last pieces carry the surrounding "<" / ">" characters, which are stripped.
log_fields = split(col("Logs"), ", ")

logs_splitted = (
    logs
    .withColumn("from_server", regexp_replace(log_fields.getItem(0), "[<>]", ""))
    .withColumn("to_server", log_fields.getItem(1))
    .withColumn("time", log_fields.getItem(2))
    .withColumn("action", log_fields.getItem(3))
    .withColumn("process_id", regexp_replace(log_fields.getItem(4), "[<>]", ""))
    .drop("Logs")
)

# "time" and "process_id" come out of the split as strings; convert both to integers
logs_casted = (
    logs_splitted
    .withColumn("time", col("time").cast("integer"))
    .withColumn("process_id", col("process_id").cast("integer"))
)

logs_casted.show(20)

+-----------+------------+----+-------+----------+
|from_server|   to_server|time| action|process_id|
+-----------+------------+----+-------+----------+
|       user| ui_server_2|   0|Request|      7127|
|       user|ui_server_14|   0|Request|      6463|
|       user|ui_server_14|   0|Request|      8002|
|       user|ui_server_13|   0|Request|      6557|
|       user|ui_server_10|   0|Request|      8193|
|       user| ui_server_8|   0|Request|      4888|
|       user| ui_server_8|   0|Request|      1412|
|       user| ui_server_5|   0|Request|      3842|
|       user|ui_server_16|   1|Request|      3161|
|       user|ui_server_10|   1|Request|      5707|
|       user| ui_server_5|   1|Request|      8909|
|       user| ui_server_8|   1|Request|        53|
|       user| ui_server_2|   1|Request|      9748|
|       user| ui_server_4|   1|Request|      3333|
|       user| ui_server_8|   1|Request|      6116|
|       user|ui_server_17|   2|Request|      5921|
|       user|ui_server_11|   2|

## Group data

### Group by process_id

In [67]:
# For every process_id, gather each logged field into one list per column
list_aggregations = [
    collect_list(source_column).alias(list_column)
    for source_column, list_column in (
        ("from_server", "from_servers"),
        ("to_server", "to_servers"),
        ("time", "times"),
        ("action", "actions"),
    )
]
logs_grouped = logs_casted.groupBy("process_id").agg(*list_aggregations)
logs_grouped.show(20, truncate=True)


+----------+--------------------+--------------------+--------------------+--------------------+
|process_id|        from_servers|          to_servers|               times|             actions|
+----------+--------------------+--------------------+--------------------+--------------------+
|        28|[user, ui_server_...|[ui_server_1, pur...|[771, 774, 782, 7...|[Request, Request...|
|        31|[user, ui_server_...|[ui_server_6, pur...|[972, 974, 981, 1...|[Request, Request...|
|        34|[user, ui_server_...|[ui_server_19, pu...|[856, 867, 874, 8...|[Request, Request...|
|        53|[limit_check_amer...|[american_express...|[358, 362, 370, 3...|[Response, Respon...|
|        65|[inventory_update...|[seasonal_adjustm...|[375, 404, 428, 4...|[Request, Request...|
|        78|[user, ui_server_...|[ui_server_12, pu...|[381, 385, 390, 3...|[Request, Request...|
|        81|[user, ui_server_...|[ui_server_3, pur...|[797, 800, 809, 8...|[Request, Request...|
|        85|[review_server_4,.

### Sort all the columns based on times so the sub-processes are in the right order

In [74]:
def sort_lists(times, from_servers, to_servers, actions):
    """Sort four parallel lists by ascending time while keeping them aligned.

    Element ``i`` of every list is treated as one event. The events are ordered
    by their time value; events with equal times keep their original relative
    order because the sort is stable.

    Args:
        times: Time values used as the sort key (may be empty or None).
        from_servers: Source servers, parallel to ``times``.
        to_servers: Destination servers, parallel to ``times``.
        actions: Actions, parallel to ``times``.

    Returns:
        tuple: ``(times, from_servers, to_servers, actions)`` as four sorted lists.
        Four empty lists are returned when there are no events.
    """
    events = list(zip(times or [], from_servers or [], to_servers or [], actions or []))
    if not events:
        # zip(*[]) produces nothing, so the four-way unpacking below would raise ValueError.
        return [], [], [], []

    events.sort(key=lambda event: event[0])
    times_sorted, from_servers_sorted, to_servers_sorted, actions_sorted = zip(*events)
    return list(times_sorted), list(from_servers_sorted), list(to_servers_sorted), list(actions_sorted)

# Return type of the sorting UDF: a struct holding the four sorted arrays.
# "times" is an array of longs; the other three fields are arrays of strings.
sorted_lists_schema = StructType(
    [StructField("times", ArrayType(LongType()), nullable=True)]
    + [
        StructField(field_name, ArrayType(StringType()), nullable=True)
        for field_name in ("from_servers", "to_servers", "actions")
    ]
)

# Wrap the plain Python function so it can be applied to DataFrame columns
sort_lists_udf = udf(f=sort_lists, returnType=sorted_lists_schema)


In [85]:
# Apply the UDF to sort the lists based on the "times" column
# Run the sorting UDF: it returns one struct per row holding the four time-ordered lists
logs_with_sorted = logs_grouped.withColumn(
    "sorted_lists",
    sort_lists_udf("times", "from_servers", "to_servers", "actions"),
)

# Overwrite each original list column with its sorted counterpart from the struct
for list_column in ("times", "from_servers", "to_servers", "actions"):
    logs_with_sorted = logs_with_sorted.withColumn(list_column, col(f"sorted_lists.{list_column}"))

# The helper struct is no longer needed once its fields have been copied out
logs_grouped = logs_with_sorted.drop("sorted_lists")

# Show the sorted logs
logs_grouped.select("from_servers", "to_servers", "times", "actions").show(truncate=True)

+--------------------+--------------------+--------------------+--------------------+
|        from_servers|          to_servers|               times|             actions|
+--------------------+--------------------+--------------------+--------------------+
|[user, ui_server_...|[ui_server_1, pur...|[771, 774, 782, 7...|[Request, Request...|
|[user, ui_server_...|[ui_server_6, pur...|[972, 974, 981, 1...|[Request, Request...|
|[user, ui_server_...|[ui_server_19, pu...|[856, 867, 874, 8...|[Request, Request...|
|[user, ui_server_...|[ui_server_8, pur...|[1, 8, 18, 22, 32...|[Request, Request...|
|[user, ui_server_...|[ui_server_8, pur...|[253, 256, 266, 2...|[Request, Request...|
|[user, ui_server_...|[ui_server_12, pu...|[381, 385, 390, 3...|[Request, Request...|
|[user, ui_server_...|[ui_server_3, pur...|[797, 800, 809, 8...|[Request, Request...|
|[user, ui_server_...|[ui_server_14, pu...|[166, 172, 181, 1...|[Request, Request...|
|[user, ui_server_...|[ui_server_12, pu...|[379, 385, 

### Keep the from_servers sequence as feature for hashing

In [86]:
# Build the string that represents each process and is later hashed.
# Only the ordered "from_servers" sequence is used as the feature: concat_ws joins
# the elements of that array into one comma-separated string, which is stored in
# the new column "process_string". The "to_servers", "times" and "actions"
# columns are NOT part of the string.
logs_grouped = logs_grouped.withColumn(
    "process_string",
    concat_ws(",", col("from_servers")),
)

logs_grouped.show(10, truncate=True)

+----------+--------------------+--------------------+--------------------+--------------------+--------------------+------------+------+
|process_id|        from_servers|          to_servers|               times|             actions|      process_string|process_hash|bucket|
+----------+--------------------+--------------------+--------------------+--------------------+--------------------+------------+------+
|        28|[user, ui_server_...|[ui_server_1, pur...|[771, 774, 782, 7...|[Request, Request...|user,ui_server_1,...|   189349086|    86|
|        31|[user, ui_server_...|[ui_server_6, pur...|[972, 974, 981, 1...|[Request, Request...|user,ui_server_6,...|  1025501846|    46|
|        34|[user, ui_server_...|[ui_server_19, pu...|[856, 867, 874, 8...|[Request, Request...|user,ui_server_19...|   932983399|   199|
|        53|[user, ui_server_...|[ui_server_8, pur...|[1, 8, 18, 22, 32...|[Request, Request...|user,ui_server_8,...|  1849055925|   125|
|        65|[user, ui_server_...|[

### Determine the number of buckets (2·√n heuristic) and hash each process based on process_string

In [87]:
# Compute a hash value for each process string.
# `abs` and `spark_hash` are the pyspark.sql.functions versions imported at the top,
# so this builds a column expression rather than calling Python's built-ins.
logs_grouped = logs_grouped.withColumn("process_hash", abs(spark_hash(col("process_string"))))

# Calculate the number of processes (this triggers a Spark job)
num_processes = logs_grouped.count()

# Determine the number of buckets as twice the square root of the number of processes.
# Note: this is a square-root heuristic, not Sturges' formula (ceil(log2(n)) + 1).
# Fall back to a single bucket for empty input so the modulo below never divides by zero.
num_buckets = int((num_processes ** 0.5) * 2) or 1
print(f"Num buckets: {num_buckets}")

# Assign each process to a bucket in the range [0, num_buckets).
# The 32-bit hash can be the most negative integer, whose absolute value does not fit
# in 32 bits and (with ANSI mode off) stays negative; since % keeps the sign of the
# dividend, the remainder is shifted and reduced again so it can never be negative.
# For non-negative hashes this gives exactly process_hash % num_buckets.
logs_grouped = logs_grouped.withColumn(
    "bucket",
    (((col("process_hash") % num_buckets) + num_buckets) % num_buckets).cast("int"),
)

# Show the grouped logs with bucket assignments
logs_grouped.select("process_id", "process_string", "process_hash", "bucket").show(10, truncate=True)

# Group processes into buckets: collect the process_ids that fall into each bucket
# into a list stored in the new column "process_ids".
bucketed_processes = logs_grouped.groupBy("bucket").agg(collect_list("process_id").alias("process_ids"))

# Show the bucketed processes
bucketed_processes.show(25, truncate=True)

Num buckets: 200
+----------+--------------------+------------+------+
|process_id|      process_string|process_hash|bucket|
+----------+--------------------+------------+------+
|        28|user,ui_server_1,...|   189349086|    86|
|        31|user,ui_server_6,...|  1025501846|    46|
|        34|user,ui_server_19...|   932983399|   199|
|        53|user,ui_server_8,...|  1849055925|   125|
|        65|user,ui_server_8,...|  1160153491|    91|
|        78|user,ui_server_12...|   408352463|    63|
|        81|user,ui_server_3,...|  2018519219|    19|
|        85|user,ui_server_14...|  1932070369|   169|
|       101|user,ui_server_12...|  1679723677|    77|
|       108|user,ui_server_13...|   771513310|   110|
+----------+--------------------+------------+------+
only showing top 10 rows

