# Spark Tasks

## Imports and environment initialization

In [48]:
import os
import pyspark
from pyspark import SparkConf
from pyspark.sql.functions import col, split, regexp_replace
from pyspark.sql import SparkSession
import numpy as np
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans



# Set environment variables
# NOTE: these are machine-specific install locations; adjust them to the local setup.
os.environ["JAVA_HOME"] = "C:/Program Files/Java/jdk-22"
os.environ["SPARK_HOME"] = "C:/Spark/spark-3.5.1-bin-hadoop3"
os.environ["HADOOP_HOME"] = "C:/Hadoop"
os.environ["PYSPARK_PYTHON"] = "C:/Users/chris/AppData/Local/Programs/Python/Python311/python.exe"

# Directories that must be searched first when launching Java / Spark / Hadoop / Python.
# PATH entries have to be directories, so the interpreter is represented by the
# folder that contains python.exe rather than by the executable itself.
tool_dirs = [
    os.path.join(os.environ["JAVA_HOME"], "bin"),
    os.path.join(os.environ["SPARK_HOME"], "bin"),
    os.path.join(os.environ["HADOOP_HOME"], "bin"),
    os.path.dirname(os.environ["PYSPARK_PYTHON"]),
]

# Keep the rest of the inherited PATH, minus anything listed above, so that
# re-running this cell does not keep growing PATH with duplicate entries.
inherited_dirs = [
    entry
    for entry in os.environ.get("PATH", "").split(os.pathsep)
    if entry and entry not in tool_dirs
]
os.environ["PATH"] = os.pathsep.join(tool_dirs + inherited_dirs)

## Create & Configure Session

In [40]:
def create_session():
    """Create (or reuse) the local Spark context and session for this notebook.

    The configuration below is applied when a new SparkContext has to be
    started. If a context is already active it is reused as-is instead of
    raising "Cannot run multiple SparkContexts at once"; stop the running
    context first when changed settings need to take effect.

    Returns:
        tuple: ``(sc, spark)`` - the SparkContext and the SparkSession.
    """
    conf = SparkConf()
    conf.setAppName("DIS-lab-1")    # Sets name of the Spark Application
    conf.setMaster("local[*]")    # Master URL. In this case local[*] uses all the available cores in the machine
    conf.set("spark.driver.memory", "2G")   # Memory allocated to driver process
    conf.set("spark.driver.maxResultSize", "2G")    # Maximum size of results that can be returned to driver
    conf.set("spark.executor.memory", "1G")    # Memory allocated to each executor

    # getOrCreate makes the function safe to call again while a context is running.
    sc = pyspark.SparkContext.getOrCreate(conf)    # Initializes the Spark context with this specific configuration
    spark = SparkSession.builder.getOrCreate()    # Creates Spark session

    return sc, spark

In [41]:
# Shut down any Spark objects left over from a previous run of this cell so
# that a fresh context/session can be started below. Failures while stopping
# are reported but do not prevent the new session from being created.
try:
    if globals().get('sc') is not None:
        sc.stop()
        print("--Stopped existing SparkContext")
    if isinstance(globals().get('spark'), SparkSession):
        spark.stop()
        print("--Stopped existing SparkSession")
except Exception as stop_error:
    print(f"Error stopping existing Spark session or context: {stop_error}")

# Create a new Spark session
sc, spark = create_session()
print("Spark session created successfully!")
spark

--Stopped existing SparkContext
--Stopped existing SparkSession
Spark session created successfully!


## Load and Split Data

### Load the data from the CSVs

In [42]:
# Read the raw log file: the first row is the header and column types are inferred.
logs = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('datasets/dataset_1.csv')
)
logs.show(5)

+--------------------+
|                Logs|
+--------------------+
|<user, ui_server_...|
|<user, ui_server_...|
|<user, ui_server_...|
|<user, ui_server_...|
|<user, ui_server_...|
+--------------------+
only showing top 5 rows



### Split the data into 5 separate columns

In [45]:
# Each "Logs" value is one string of comma-separated fields wrapped in angle
# brackets; split it once and pick the individual fields by position.
log_fields = split(col("Logs"), ", ")

logs_splitted = (
    logs
    .withColumn("from_server", regexp_replace(log_fields[0], "[<>]", ""))  # strip the opening "<"
    .withColumn("to_server", log_fields[1])
    .withColumn("time", log_fields[2])
    .withColumn("action", log_fields[3])
    .withColumn("process_id", regexp_replace(log_fields[4], "[<>]", ""))  # strip the closing ">"
    .drop("Logs")
)

logs_splitted.show(5)

+-----------+------------+----+-------+----------+
|from_server|   to_server|time| action|process_id|
+-----------+------------+----+-------+----------+
|       user|ui_server_16|   0|Request|0000000931|
|       user| ui_server_5|   2|Request|0000000237|
|       user|ui_server_17|   3|Request|0000000782|
|       user| ui_server_3|   3|Request|0000000992|
|       user|ui_server_17|   4|Request|0000000074|
+-----------+------------+----+-------+----------+
only showing top 5 rows



### Cast the "time" and "process_id" columns to integers

In [55]:
# The split step produced string columns; convert the two numeric ones to
# integers (this also drops the zero padding shown in process_id).
logs_casted = (
    logs_splitted
    .withColumn("time", col("time").cast("integer"))
    .withColumn("process_id", col("process_id").cast("integer"))
)

logs_casted.show(20)

+------------+--------------------+----+--------+----------+
| from_server|           to_server|time|  action|process_id|
+------------+--------------------+----+--------+----------+
|        user|        ui_server_16|   0| Request|       931|
|        user|         ui_server_5|   2| Request|       237|
|        user|        ui_server_17|   3| Request|       782|
|        user|         ui_server_3|   3| Request|       992|
|        user|        ui_server_17|   4| Request|        74|
|        user|        ui_server_19|   4| Request|       346|
|        user|         ui_server_6|   5| Request|       384|
|        user|         ui_server_3|  10| Request|       613|
|        user|        ui_server_20|  10| Request|       440|
| ui_server_3|                user|  11|Response|       992|
| ui_server_6|                user|  11|Response|       384|
| ui_server_6|purchase_book_ser...|  12| Request|       384|
|ui_server_16|                user|  12|Response|       931|
|ui_server_19|          

## Clustering

### Prepare data using VectorAssembler

In [54]:
# Columns that are fed to the clustering step
feature_columns = ['time', 'process_id']

# Pack the selected columns into one vector column named "features"
assembler = (
    VectorAssembler()
    .setInputCols(feature_columns)
    .setOutputCol("features")
)
logs_with_features = assembler.transform(logs_casted)

logs_with_features.show(20)

+------------+--------------------+----+--------+----------+------------+
| from_server|           to_server|time|  action|process_id|    features|
+------------+--------------------+----+--------+----------+------------+
|        user|        ui_server_16|   0| Request|       931| [0.0,931.0]|
|        user|         ui_server_5|   2| Request|       237| [2.0,237.0]|
|        user|        ui_server_17|   3| Request|       782| [3.0,782.0]|
|        user|         ui_server_3|   3| Request|       992| [3.0,992.0]|
|        user|        ui_server_17|   4| Request|        74|  [4.0,74.0]|
|        user|        ui_server_19|   4| Request|       346| [4.0,346.0]|
|        user|         ui_server_6|   5| Request|       384| [5.0,384.0]|
|        user|         ui_server_3|  10| Request|       613|[10.0,613.0]|
|        user|        ui_server_20|  10| Request|       440|[10.0,440.0]|
| ui_server_3|                user|  11|Response|       992|[11.0,992.0]|
| ui_server_6|                user|  1

### Clustering with K-means

In [53]:
# Configure K-means: 10 clusters, fixed seed, reading the assembled "features" column
kmeans = KMeans(featuresCol="features", k=10, seed=1)
model = kmeans.fit(logs_with_features)

# Assign every row to a cluster; the result lands in the "prediction" column
predictions = model.transform(logs_with_features)

# Preview the cluster assignment of the first rows
predictions.select("process_id", "features", "prediction").show(5)

+----------+-----------+----------+
|process_id|   features|prediction|
+----------+-----------+----------+
|       931|[0.0,931.0]|         1|
|       237|[2.0,237.0]|         2|
|       782|[3.0,782.0]|         1|
|       992|[3.0,992.0]|         1|
|        74| [4.0,74.0]|         2|
+----------+-----------+----------+
only showing top 5 rows

