In [1]:
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=fb0212e80b0908c4bf7b465c39b7cb1e37d533ff713acf58e6b0389705861232
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1
The following additional packages will be installed:
  libxtst6 openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra fonts-nanum fonts-ipafont-gothic
  fonts-ipafont-mincho fonts-wqy-microhei fonts-wqy-zenhei fonts-indic

In [8]:
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession, Row, types as T
from pyspark.sql.functions import col, split, regexp_replace, collect_list, hash as spark_hash, concat_ws, when, abs, explode, desc, size
from pyspark.sql.types import ArrayType, StructType, StructField, StringType, LongType, DoubleType, IntegerType
from pyspark.sql.functions import udf
import numpy as np
from collections import Counter
from random import shuffle
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import MinHashLSH
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [11]:
def create_session():
    # create the session
    conf = SparkConf()
    conf.setAppName("DIS-lab-1")    # Sets name of the Spark Application
    conf.setMaster("local[16]")    # Master URL. In this case local[*] uses all the available cores in the machine
    conf.set("spark.driver.memory", "10G")   # Memory allocated to driver process
    conf.set("spark.driver.maxResultSize", "6G")    # Maximum size of results that can be returned to driver
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "4G")    # Memory allocated to each executor
    conf.set("spark.network.timeout", "600s")  # Increase network timeout
    conf.set("spark.executor.heartbeatInterval", "60s")  # Increase heartbeat interval
    conf.set("spark.rpc.message.maxSize", "512")  # Increase max message size
    conf.set("spark.driver.maxResultSize", "4G")  # Increase driver max result size
    conf.set("spark.sql.broadcastTimeout", "600")  # Increase broadcast timeout
    conf.set("spark.sql.shuffle.partitions", "200")  # Increase the number of shuffle partitions
    conf.set("spark.yarn.executor.memoryOverhead", "2048")  # Increase memory overhead
    conf.set("spark.memory.offHeap.enabled","true")
    conf.set("spark.memory.offHeap.size","10g")
    conf.set("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.0-s_2.12")  # Add GraphFrames to the spark configuration
    conf.set("spark.hadoop.fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem")  # Ensure local file system is used for checkpointing

    sc = pyspark.SparkContext(conf=conf)    # Initializes the Spark context with this specific configuration
    spark = SparkSession.builder.config(conf=sc.getConf()).getOrCreate()    # Creates Spark session

    # Set checkpoint directory
    sc.setCheckpointDir("checkpoints")

    return sc, spark

try:
    if 'sc' in globals() and sc is not None:
        sc.stop()
        print("--Stopped existing SparkContext")
    if 'spark' in globals() and isinstance(spark, SparkSession):
        spark.stop()
        print("--Stopped existing SparkSession")
except Exception as e:
    print(f"Error stopping existing Spark session or context: {e}")

# Create a new Spark session
sc, spark = create_session()
print("Spark session created successfully!")
spark

Spark session created successfully!


In [110]:
logs = spark.read.csv('output2.csv', header=True, inferSchema=True)
logs = logs.dropDuplicates(['process_id'])

In [111]:
logs.limit(1).show(truncate=False)

+---+----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|_c0|process_id|from_servers                  

In [112]:
logs.show(truncate=False)

+---+----------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [113]:
# Split the from_servers string by comma and trim spaces
logs = logs.withColumn('servers_list', F.split(F.trim(logs['from_servers']), ',\s*'))

# Explode the servers_list column to have each server name on a new row
logs = logs.withColumn('server', F.explode('servers_list'))

# Remove any leading or trailing spaces in server names
logs = logs.withColumn('server', F.trim(logs['server']))

# Get unique server names
unique_servers = logs.select('server').distinct().rdd.map(lambda row: row[0]).collect()

# Create one-hot encoded columns for each unique server
for server in unique_servers:
    logs = logs.withColumn(server, F.when(F.array_contains(logs['servers_list'], server), 1).otherwise(0))

In [114]:
logs.limit(1).show(truncate=False)

+---+----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------

In [115]:
# Split the times column, convert to integer and sum the values
logs = logs.withColumn("sum_times",
    F.expr("aggregate(transform(split(times, ','), x -> int(trim(x))), 0, (acc, x) -> acc + x)")
)

In [121]:
final_logs = logs.select(['process_id'] + unique_servers + ['sum_times'])
final_logs = final_logs.dropDuplicates(['process_id'])

In [117]:
final_logs.limit(1).show(truncate=False)

+----------+-------------------------+--------------------------------------+---------------------+-------------------------+------------+-------------------+------------+-----------+-------------------------------------+--------------------------+-------------------------+--------------------------+-----------+-------------------------+--------------------------------------+--------------------+-----------+-------------------------+----------------------------+----------------------+--------------------------+-----------+-------------------------------------+----------------------+--------------------------------+-------------------------+------------------------------+-------------------------------------+---------------+-------------------------+--------------------------+-------------------+-------------------------+--------------------------------------+--------------------------+-----------+----------------------------+-------------------------------------+-----------+-----------

In [120]:
final_logs.show(truncate=False)

+----------+-------------------------+--------------------------------------+---------------------+-------------------------+------------+-------------------+------------+-----------+-------------------------------------+--------------------------+-------------------------+--------------------------+-----------+-------------------------+--------------------------------------+--------------------+-----------+-------------------------+----------------------------+----------------------+--------------------------+-----------+-------------------------------------+----------------------+--------------------------------+-------------------------+------------------------------+-------------------------------------+---------------+-------------------------+--------------------------+-------------------+-------------------------+--------------------------------------+--------------------------+-----------+----------------------------+-------------------------------------+-----------+-----------

In [122]:
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.feature import VectorAssembler

# Select columns for input features
cols_except_process_id = [col for col in final_logs.columns if col != 'process_id']

# Assemble features into a single vector column
assembler = VectorAssembler(inputCols=cols_except_process_id, outputCol="features")
feature_logs = assembler.transform(final_logs)

# Train the Bisecting K-Means model
kmeans = BisectingKMeans(k=3, seed=123)
model = kmeans.fit(feature_logs)

# Make predictions
predictions = model.transform(feature_logs)

# Show cluster centers
print("Cluster Centers:")
centers = model.clusterCenters()
for idx, center in enumerate(centers):
    print(f"Cluster {idx}: {center}")

# Show predictions
predictions.select("process_id", "prediction").show()

Cluster Centers:
Cluster 0: [1.33333333e-01 0.00000000e+00 1.33333333e-01 1.00000000e-01
 0.00000000e+00 1.33333333e-01 1.00000000e-01 1.00000000e-01
 0.00000000e+00 6.66666667e-02 6.66666667e-02 1.00000000e-01
 1.00000000e-01 6.66666667e-02 0.00000000e+00 1.33333333e-01
 3.33333333e-02 3.33333333e-01 3.66666667e-01 6.66666667e-02
 2.00000000e-01 6.66666667e-02 3.33333333e-02 1.00000000e-01
 0.00000000e+00 1.00000000e-01 6.66666667e-02 3.33333333e-02
 2.66666667e-01 1.33333333e-01 6.66666667e-02 1.33333333e-01
 6.66666667e-02 3.33333333e-02 3.33333333e-02 3.33333333e-02
 5.66666667e-01 0.00000000e+00 3.33333333e-02 1.00000000e-01
 0.00000000e+00 3.33333333e-02 3.33333333e-02 1.66666667e-01
 3.00000000e-01 3.33333333e-02 6.66666667e-02 3.66666667e-01
 3.33333333e-02 6.66666667e-02 3.33333333e-02 1.33333333e-01
 1.33333333e-01 2.33333333e-01 1.00000000e-01 3.33333333e-02
 1.33333333e-01 3.33333333e-02 1.33333333e-01 1.33333333e-01
 0.00000000e+00 1.00000000e-01 0.00000000e+00 1.00000000e