# Spark Tasks

## Imports and environment initialization

In [31]:
import os
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, regexp_replace, collect_list, hash as spark_hash, concat_ws, when, abs, explode
from pyspark.sql.types import ArrayType, StructType, StructField, StringType, LongType, DoubleType, IntegerType
from pyspark.sql.functions import udf
import numpy as np
from collections import Counter
from random import shuffle


In [2]:
# Set environment variables
os.environ["JAVA_HOME"] = "C:/Program Files/Java/jdk-22"
os.environ["SPARK_HOME"] = "C:/Spark/spark-3.5.1-bin-hadoop3"
os.environ["HADOOP_HOME"] = "C:/Hadoop"
os.environ["PYSPARK_PYTHON"] = "C:/Users/chris/AppData/Local/Programs/Python/Python311/python.exe" 
os.environ["PATH"] = (
    os.path.join(os.environ["JAVA_HOME"], "bin") + os.pathsep +
    os.path.join(os.environ["SPARK_HOME"], "bin") + os.pathsep +
    os.path.join(os.environ["HADOOP_HOME"], "bin") + os.pathsep +
    os.path.join(os.environ["PYSPARK_PYTHON"]) + os.pathsep +
    os.environ["PATH"])

## Create & Configure Session

In [3]:
def create_session():
    # create the session
    conf = SparkConf()
    conf.setAppName("DIS-lab-1")    # Sets name of the Spark Application
    conf.setMaster("local[*]")    # Master URL. In this case local[*] uses all the available cores in the machine
    conf.set("spark.driver.memory", "2G")   # Memory allocated to driver process
    conf.set("spark.driver.maxResultSize", "2G")    # Maximum size of results that can be returned to driver
    conf.set("spark.executor.memory", "1G")    # Memory allocated to each executor     
    sc = pyspark.SparkContext(conf=conf)    # Initializes tha Spark context with this specific configuration
    spark = SparkSession.builder.getOrCreate()    # Creates Spark session
    
    return sc, spark

try:
    if 'sc' in globals() and sc is not None:
        sc.stop()
        print("--Stopped existing SparkContext")
    if 'spark' in globals() and isinstance(spark, SparkSession):
        spark.stop()
        print("--Stopped existing SparkSession")
except Exception as e:
    print(f"Error stopping existing Spark session or context: {e}")

# Create a new Spark session
sc, spark = create_session()
print("Spark session created successfully!")
spark

Spark session created successfully!


## Load and Split Data

### Load the data from the CSVs

In [4]:
logs = spark.read.csv('datasets/dataset_1.csv', header=True, inferSchema=True)
logs.show(2)

+--------------------+
|                Logs|
+--------------------+
|<user, ui_server_...|
|<user, ui_server_...|
+--------------------+
only showing top 2 rows



### Split the data into 5 separate columns

In [5]:
logs_splitted = logs \
    .withColumn("from_server", regexp_replace(split(col("Logs"), ", ").getItem(0), "[<>]", "")) \
    .withColumn("to_server", split(col("Logs"), ", ").getItem(1)) \
    .withColumn("time", split(col("Logs"), ", ").getItem(2)) \
    .withColumn("action", split(col("Logs"), ", ").getItem(3)) \
    .withColumn("process_id", regexp_replace(split(col("Logs"), ", ").getItem(4), "[<>]", "")) \
    .drop("Logs")

# Cast the "time" and "process_id" columns to integers
logs_casted = logs_splitted \
    .withColumn("time", col("time").cast("integer")) \
    .withColumn("process_id", col("process_id").cast("integer"))

logs_casted.show(5)

+-----------+------------+----+-------+----------+
|from_server|   to_server|time| action|process_id|
+-----------+------------+----+-------+----------+
|       user| ui_server_2|   0|Request|      7127|
|       user|ui_server_14|   0|Request|      6463|
|       user|ui_server_14|   0|Request|      8002|
|       user|ui_server_13|   0|Request|      6557|
|       user|ui_server_10|   0|Request|      8193|
+-----------+------------+----+-------+----------+
only showing top 5 rows



## Group data

### Group by process_id

In [6]:
logs_grouped = logs_casted.groupBy("process_id").agg(
    collect_list("from_server").alias("from_servers"),
    collect_list("to_server").alias("to_servers"),
    collect_list("time").alias("times"),
    collect_list("action").alias("actions")
)
logs_grouped.show(5, truncate=True)


+----------+--------------------+--------------------+--------------------+--------------------+
|process_id|        from_servers|          to_servers|               times|             actions|
+----------+--------------------+--------------------+--------------------+--------------------+
|        28|[user, ui_server_...|[ui_server_1, pur...|[771, 774, 782, 7...|[Request, Request...|
|        31|[user, ui_server_...|[ui_server_6, pur...|[972, 974, 981, 1...|[Request, Request...|
|        34|[user, ui_server_...|[ui_server_19, pu...|[856, 867, 874, 8...|[Request, Request...|
|        53|[limit_check_amer...|[american_express...|[358, 362, 370, 3...|[Response, Respon...|
|        65|[inventory_update...|[seasonal_adjustm...|[375, 404, 428, 4...|[Request, Request...|
+----------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



### Sort all the columns based on times so the sub-processes are in the right order

In [7]:
def sort_lists(times, from_servers, to_servers, actions):
    combined = list(zip(times, from_servers, to_servers, actions))
    sorted_combined = sorted(combined, key=lambda x: x[0])
    times_sorted, from_servers_sorted, to_servers_sorted, actions_sorted = zip(*sorted_combined)
    return list(times_sorted), list(from_servers_sorted), list(to_servers_sorted), list(actions_sorted)

# Define the schema for the sorted columns
sorted_lists_schema = StructType([
    StructField("times", ArrayType(LongType()), nullable=True),
    StructField("from_servers", ArrayType(StringType()), nullable=True),
    StructField("to_servers", ArrayType(StringType()), nullable=True),
    StructField("actions", ArrayType(StringType()), nullable=True)
])

# Register the function as a UDF
sort_lists_udf = udf(sort_lists, sorted_lists_schema)

# Apply the UDF to sort the lists based on the "times" column
logs_grouped = logs_grouped.withColumn("sorted_lists", sort_lists_udf("times", "from_servers", "to_servers", "actions"))

# Split the sorted lists into separate columns
logs_grouped = logs_grouped.withColumn("times", col("sorted_lists.times")) \
                           .withColumn("from_servers", col("sorted_lists.from_servers")) \
                           .withColumn("to_servers", col("sorted_lists.to_servers")) \
                           .withColumn("actions", col("sorted_lists.actions")) \
                           .drop("sorted_lists")

# Show the sorted logs
logs_grouped.select("from_servers", "to_servers", "times", "actions").show(5, truncate=True)

+--------------------+--------------------+--------------------+--------------------+
|        from_servers|          to_servers|               times|             actions|
+--------------------+--------------------+--------------------+--------------------+
|[user, ui_server_...|[ui_server_1, pur...|[771, 774, 782, 7...|[Request, Request...|
|[user, ui_server_...|[ui_server_6, pur...|[972, 974, 981, 1...|[Request, Request...|
|[user, ui_server_...|[ui_server_19, pu...|[856, 867, 874, 8...|[Request, Request...|
|[user, ui_server_...|[ui_server_8, pur...|[1, 8, 18, 22, 32...|[Request, Request...|
|[user, ui_server_...|[ui_server_8, pur...|[253, 256, 266, 2...|[Request, Request...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



### Keep the from_servers sequence as feature for hashing

In [8]:
''' 
Using the withColumn method to create a new column "process_string".
The concat_ws function concatenates multiple column values into a single string, separated by a comma.
col("from_servers"): Selects the column "from_servers".
col("to_servers"): Selects the column "to_servers".
col("times"): Selects the column "times".
col("actions"): Selects the column "actions".
The concatenated string is stored in the new column "process_string".
'''

logs_grouped = logs_grouped.withColumn(
    "process_string",
    concat_ws(
        ",",
        col("from_servers"),
    )
)

logs_grouped.show(5, truncate=True)

+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|process_id|        from_servers|          to_servers|               times|             actions|      process_string|
+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|        28|[user, ui_server_...|[ui_server_1, pur...|[771, 774, 782, 7...|[Request, Request...|user,ui_server_1,...|
|        31|[user, ui_server_...|[ui_server_6, pur...|[972, 974, 981, 1...|[Request, Request...|user,ui_server_6,...|
|        34|[user, ui_server_...|[ui_server_19, pu...|[856, 867, 874, 8...|[Request, Request...|user,ui_server_19...|
|        53|[user, ui_server_...|[ui_server_8, pur...|[1, 8, 18, 22, 32...|[Request, Request...|user,ui_server_8,...|
|        65|[user, ui_server_...|[ui_server_8, pur...|[253, 256, 266, 2...|[Request, Request...|user,ui_server_8,...|
+----------+--------------------+--------------------+--

# Approach 1 (LSH)

## 1. Create shingles

### 1.1. Pre-processing of shingles

In [9]:
def get_all_substrings(string, min_length):
    length = len(string)
    return [string[i:j] for i in range(length) for j in range(i + min_length, length + 1)]

def find_common_substring(server_names, min_length=3, min_occurrences_ratio=0.6):
    substr_counter = Counter()
    total_names = len(server_names)
    min_occurrences = int(total_names * min_occurrences_ratio)
    
    for name in server_names:
        substrings = get_all_substrings(name, min_length)
        substr_counter.update(substrings)
    
    # Filter substrings that occur at least min_occurrences times
    common_substrings = [substr for substr, count in substr_counter.items() if count >= min_occurrences]
    
    if not common_substrings:
        return ""
    
    # Return the longest common substring
    return max(common_substrings, key=len)


def remove_common_substring(strings, common_substring):
    return [string.replace(common_substring, "") for string in strings]

In [10]:
sample_rows = logs_grouped.select("from_servers").limit(5).collect()

# Extract the "from_servers" lists from the sample rows
sample_from_servers = [row["from_servers"] for row in sample_rows]

# Flatten the list of lists to a single list of server names
flattened_server_names = [server for sublist in sample_from_servers for server in sublist]

# Get unique server names
unique_server_names = list(set(flattened_server_names))

print("Unique Server Names:", unique_server_names)

# Use the function on the unique server names
common_substring = find_common_substring(unique_server_names, min_length=3, min_occurrences_ratio=0.6)

print("Most Common Substring:", common_substring)

Unique Server Names: ['inventory_update_server_3', 'user', 'seasonal_adjustments_server_2', 'ui_server_8', 'bundle_offer_server_1', 'user_history_server_3', 'limit_check_american_express_server_17', 'limit_check_american_express_server_8', 'automatic_response_server_4', 'card_check_server_9', 'review_verification_server_1', 'review_analysis_server_1', 'gift_wrap_server_1', 'customer_support_server_8', 'ui_server_19', 'book_availability_server_8', 'bundle_offer_server_2', 'american_express_server_2', 'ui_server_1', 'automatic_response_server_3', 'review_server_3', 'user_credentials_server_4', 'user_credentials_server_5', 'review_verification_server_2', 'review_server_1', 'customer_support_server_10', 'currency_conversion_server_1', 'supplier_notification_server_3', 'discount_server_1', 'ui_server_6', 'inventory_update_server_7', 'seasonal_adjustments_server_1', 'fraud_check_american_express_server_4', 'limit_check_american_express_server_19', 'ad_server_7', 'limit_check_visa_server_2', 

In [11]:
# Create a new DataFrame with the common substring removed from the process_string
logs_grouped_cleaned = logs_grouped.withColumn(
    "process_string_cleaned",
    regexp_replace(col("process_string"), common_substring, "")
)

# Show the cleaned DataFrame
logs_grouped_cleaned.select("process_string", "process_string_cleaned").show(5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------

### 1.2. Define the shingles of each function

In [12]:
# Define the shingling function for process strings
def shingle_process_string(process_string, k):
    shingle_set = set()
    for i in range(len(process_string) - k + 1):
        shingle = process_string[i:i + k]
        shingle_set.add(shingle)
    return list(shingle_set)

# Define the shingle length based on the average length of the servers
rows = 5
first_rows = logs_grouped_cleaned.take(rows)
average_length = 0

# Calculate the average length of server names
total_length = 0
total_servers = 0

for row in first_rows:
    servers = row["from_servers"]
    total_length += sum(len(server) for server in servers)
    total_servers += len(servers)

average_server_name_length = total_length / total_servers
shingle_length = max(2, int(np.ceil(average_server_name_length / 2)))

print(f'Average length of server names: {average_server_name_length}')
print(f'Determined shingle length: {shingle_length}')

# Register the UDF for shingling
shingle_udf = udf(lambda process_string: shingle_process_string(process_string, shingle_length), ArrayType(StringType()))

# Apply the UDF to create shingles from process strings
logs_grouped_cleaned_with_shingles = logs_grouped_cleaned.withColumn("shingles", shingle_udf(col("process_string")))

# Show the resulting DataFrame with shingles
logs_grouped_cleaned_with_shingles.select("process_id", "process_string", "shingles").show(5, truncate=False)

Average length of server names: 23.027027027027028
Determined shingle length: 12
+----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### 1.3. Create Vocabulary from shingles

In [13]:
# Collect all shingles from the DataFrame into a single list
all_unique_shingles = logs_grouped_cleaned_with_shingles.select("shingles").rdd.flatMap(lambda row: row.shingles).distinct().collect()

# Create a vocabulary dictionary where each shingle is assigned a unique index
vocab = {shingle: idx for idx, shingle in enumerate(all_unique_shingles)}
# Print the vocabulary to verify
print(vocab)

{'ver_3,automa': 0, 'view_verific': 1, 'isa_server_2': 2, 'upport_serve': 3, '5,bundle_off': 4, 'ver_5,purcha': 5, 'xpress_deliv': 6, 'rver_4,revie': 7, 'lity_server_': 8, '1,currency_c': 9, 'erver_1,curr': 10, '_conversion_': 11, 'g_options_se': 12, 'ver_1,limit_': 13, 'sa_server_2,': 14, 'imit_check_v': 15, 'rver_2,book_': 16, '_server_2,vi': 17, 'ping_options': 18, 'r_3,automati': 19, ',shipping_op': 20, 'r_1,shipping': 21, 'adjustments_': 22, 'er_1,limit_c': 23, 'esponse_serv': 24, 'ver_4,custom': 25, 'ials_server_': 26, 'analysis_ser': 27, 'rver_4,autom': 28, 'e_book_serve': 29, 'server_3,rev': 30, 'redentials_s': 31, 'ess_delivery': 32, 'r_9,ad_serve': 33, 'erver_1,revi': 34, 'sponse_serve': 35, 's_server_3,r': 36, 'server_4,rev': 37, 'er_8,user_hi': 38, 'y_server_1,s': 39, 'ons_server_1': 40, 'erver_1,expr': 41, 'rver_2,user_': 42, 'k_availabili': 43, 'w_verificati': 44, 'delivery_ser': 45, '_adjustments': 46, 'justments_se': 47, 'er,ui_server': 48, 'story_server': 49, 'ver_1,ex

In [37]:
# Define a function to create sparse vectors based on the vocabulary
def create_sparse_vector(shingles, vocab):
    vector = [0] * len(vocab)
    for shingle in shingles:
        if shingle in vocab:
            idx = vocab[shingle]
            vector[idx] = 1
    return vector

# Register the UDF to create sparse vectors
sparse_vector_udf = udf(lambda shingles: create_sparse_vector(shingles, vocab), ArrayType(IntegerType()))

# Apply the UDF to create sparse vectors
logs_grouped_sparse = logs_grouped_cleaned_with_shingles.withColumn("sparse_vector", sparse_vector_udf(col("shingles")))

# Show the resulting DataFrame with sparse vectors
logs_grouped_sparse.select("process_id", "sparse_vector").show(5, truncate=False)

+----------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### 1.4. Generate randomized MinHash Vectors

In [44]:
def create_hash_func(size):
    indices = list(range(size))
    shuffle(indices)
    return indices

# Generate multiple random hash functions
def build_minhash_func(vocab_size, nbits):
    return [create_hash_func(vocab_size) for _ in range(nbits)]

num_minhash_functions = 10
minhash_functions = build_minhash_func(len(vocab), num_minhash_functions)


### 1.5. Create MinHash signature

In [45]:
def create_minhash_signature(sparse_vector, minhash_functions):
    signature = []
    for func in minhash_functions:
        for i in range(1, len(sparse_vector) + 1):
            idx = func.index(i)
            if sparse_vector[idx] == 1:
                signature.append(idx)
                break
    return signature

# Register the UDF to create minhash signatures
minhash_signature_udf = udf(lambda sparse_vector: create_minhash_signature(sparse_vector, minhash_functions), ArrayType(IntegerType()))

# Apply the UDF to create minhash signatures
logs_grouped_with_signatures = logs_grouped_sparse.withColumn("minhash_signature", minhash_signature_udf(col("sparse_vector")))

# Show the resulting DataFrame with minhash signatures
logs_grouped_with_signatures.select("process_id", "minhash_signature").show(5, truncate=False)


+----------+------------------------------------------------------------+
|process_id|minhash_signature                                           |
+----------+------------------------------------------------------------+
|28        |[3773, 786, 0, 813, 770, 1511, 2268, 1123, 407, 1130]       |
|31        |[2327, 3795, 4188, 813, 770, 4152, 2311, 1541, 407, 1536]   |
|34        |[2327, 2727, 2277, 3107, 3081, 3452, 3847, 1134, 1201, 1567]|
|53        |[2368, 3114, 2277, 1985, 3081, 3024, 4169, 1541, 870, 4225] |
|65        |[3067, 3795, 4188, 813, 770, 4232, 4211, 3484, 1231, 834]   |
+----------+------------------------------------------------------------+
only showing top 5 rows



### 1.6. LSH

In [58]:
# Function to split vector into bands
def split_vector(signature, b):
    r = int(len(signature) / b)
    return [signature[i:i + r] for i in range(0, len(signature), r)]

def split_and_hash_bands(process_id, split_signature):
    rows = []
    for band_index, band in enumerate(split_signature):
        band_hash = hash(tuple(band))
        rows.append(Row(process_id=process_id, band_index=band_index, band_hash=band_hash))
    return rows

# Register UDF to split signatures into bands and hash them
split_and_hash_bands_udf = udf(lambda process_id, split_signature: split_and_hash_bands(process_id, split_signature), ArrayType(StructType([
    StructField("process_id", StringType(), False),
    StructField("band_index", IntegerType(), False),
    StructField("band_hash", LongType(), False)
])))

# Number of bands (tune this parameter)
num_bands = num_minhash_functions // 2

# Split the minhash signatures into bands
split_vector_udf = udf(lambda minhash_signature: split_vector(minhash_signature, num_bands), ArrayType(ArrayType(IntegerType())))
logs_grouped_with_split_signatures = logs_grouped_with_signatures.withColumn("split_signature", split_vector_udf(col("minhash_signature")))


logs_grouped_with_split_signatures.select("process_id", "split_signature").show(5, truncate=False)

+----------+----------------------------------------------------------------------+
|process_id|split_signature                                                       |
+----------+----------------------------------------------------------------------+
|28        |[[3773, 786], [0, 813], [770, 1511], [2268, 1123], [407, 1130]]       |
|31        |[[2327, 3795], [4188, 813], [770, 4152], [2311, 1541], [407, 1536]]   |
|34        |[[2327, 2727], [2277, 3107], [3081, 3452], [3847, 1134], [1201, 1567]]|
|53        |[[2368, 3114], [2277, 1985], [3081, 3024], [4169, 1541], [870, 4225]] |
|65        |[[3067, 3795], [4188, 813], [770, 4232], [4211, 3484], [1231, 834]]   |
+----------+----------------------------------------------------------------------+
only showing top 5 rows



In [60]:
# Apply the UDF to create a DataFrame with band information
bands_df = logs_grouped_with_split_signatures.withColumn("band_data", explode(split_and_hash_bands_udf(col("process_id"), col("split_signature")))).select("band_data.*")

# Show the resulting DataFrame with band information
bands_df.show(50, truncate=False)

+----------+----------+--------------------+
|process_id|band_index|band_hash           |
+----------+----------+--------------------+
|28        |0         |8592741092786240490 |
|28        |1         |-2379899519197344981|
|28        |2         |-4061268462228921400|
|28        |3         |-5787718789710405080|
|28        |4         |-5047217765397752772|
|31        |0         |5226422060323236660 |
|31        |1         |6598252748410545848 |
|31        |2         |8007048908190004980 |
|31        |3         |-2267081639038832560|
|31        |4         |-8763097002251327314|
|34        |0         |5297483383327920061 |
|34        |1         |3200533815691919446 |
|34        |2         |3638706654424526523 |
|34        |3         |-8611145061112535286|
|34        |4         |-7225153615661538250|
|53        |0         |-9107268935361308642|
|53        |1         |8147568492122556986 |
|53        |2         |-6816933379387402398|
|53        |3         |-6315481163944152142|
|53       

In [63]:
# Join bands_df with itself to find candidate pairs
candidate_pairs_df = bands_df.alias("a").join(
    bands_df.alias("b"),
    (col("a.band_hash") == col("b.band_hash")) & (col("a.band_index") == col("b.band_index")) & (col("a.process_id") < col("b.process_id")),
    how="inner"
).select(
    col("a.process_id").alias("process_id_1"),
    col("b.process_id").alias("process_id_2")
)

# Show candidate pairs
candidate_pairs_df.show(truncate=False)


+------------+------------+
|process_id_1|process_id_2|
+------------+------------+
|28          |9773        |
|28          |9673        |
|28          |9100        |
|28          |8518        |
|28          |8057        |
|28          |5636        |
|28          |4783        |
|28          |4707        |
|28          |3581        |
|28          |9810        |
|28          |7950        |
|28          |6041        |
|28          |5607        |
|28          |5168        |
|28          |42          |
|28          |9487        |
|28          |9337        |
|28          |9285        |
|28          |8021        |
|28          |7697        |
+------------+------------+
only showing top 20 rows



In [62]:
# Function to calculate Jaccard similarity
def jaccard_similarity(set1, set2):
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union

# Identify candidate pairs using a join and Jaccard similarity
candidate_pairs = []

bands = bands_df.collect()
for i in range(len(bands)):
    for j in range(i + 1, len(bands)):
        if bands[i]["band_hash"] == bands[j]["band_hash"] and bands[i]["band_index"] == bands[j]["band_index"]:
            id1 = bands[i]["process_id"]
            id2 = bands[j]["process_id"]
            # Retrieve the corresponding minhash signatures
            sig1 = logs_grouped_with_signatures.filter(col("process_id") == id1).select("minhash_signature").collect()[0][0]
            sig2 = logs_grouped_with_signatures.filter(col("process_id") == id2).select("minhash_signature").collect()[0][0]
            # Calculate Jaccard similarity
            if jaccard_similarity(set(sig1), set(sig2)) > 0.5:  # You can adjust the threshold
                candidate_pairs.append((id1, id2))
    print(i)

# Show candidate pairs
print(candidate_pairs)


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "c:\Users\chris\AppData\Local\Programs\Python\Python311\Lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\chris\AppData\Local\Programs\Python\Python311\Lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\chris\AppData\Local\Programs\Python\Python311\Lib\socket.py", line 705, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [48]:
from pyspark.sql.functions import col

# Create pairs of processes that have the same band_hash in the same band_index
candidate_pairs_df = bands_df.alias("a").join(
    bands_df.alias("b"),
    (col("a.band_hash") == col("b.band_hash")) & (col("a.band_index") == col("b.band_index")) & (col("a.process_id") < col("b.process_id")),
    how="inner"
).select(
    col("a.process_id").alias("process_id_1"),
    col("b.process_id").alias("process_id_2"),
    col("a.band_index")
)

# Show the candidate pairs
candidate_pairs_df.show(truncate=False)


+------------+------------+----------+
|process_id_1|process_id_2|band_index|
+------------+------------+----------+
|28          |9773        |0         |
|28          |9673        |0         |
|28          |9100        |0         |
|28          |8518        |0         |
|28          |8057        |0         |
|28          |5636        |0         |
|28          |4783        |0         |
|28          |4707        |0         |
|28          |3581        |0         |
|28          |9810        |0         |
|28          |7950        |0         |
|28          |6041        |0         |
|28          |5607        |0         |
|28          |5168        |0         |
|28          |42          |0         |
|28          |9487        |0         |
|28          |9337        |0         |
|28          |9285        |0         |
|28          |8021        |0         |
|28          |7697        |0         |
+------------+------------+----------+
only showing top 20 rows



In [49]:
from pyspark.sql import functions as F

def adjust_num_bands(bands_df, current_num_bands, target_fill_ratio=0.8, min_bands=5, max_bands=20):
    # Compute the count of entries per band
    band_counts = bands_df.groupBy("band_index").count().collect()
    
    # Calculate the average count per band
    total_entries = sum([row['count'] for row in band_counts])
    average_count_per_band = total_entries / current_num_bands
    
    # Calculate the fill ratio
    fill_ratio = average_count_per_band / total_entries
    
    # Adjust the number of bands
    if fill_ratio > target_fill_ratio:
        # Too many entries per band, increase the number of bands
        new_num_bands = min(current_num_bands + 1, max_bands)
    else:
        # Too few entries per band, decrease the number of bands
        new_num_bands = max(current_num_bands - 1, min_bands)
    
    return new_num_bands

# Initial number of bands
num_bands = 5

# Iteratively adjust the number of bands
for iteration in range(10):
    # Split the minhash signatures into bands
    split_vector_udf = udf(lambda minhash_signature: split_vector(minhash_signature, num_bands), ArrayType(ArrayType(IntegerType())))
    logs_grouped_with_split_signatures = logs_grouped_with_signatures.withColumn("split_signature", split_vector_udf(col("minhash_signature")))
    
    # Apply the UDF to create a DataFrame with band information
    bands_df = logs_grouped_with_split_signatures.withColumn("band_data", explode(split_and_hash_bands_udf(col("process_id"), col("split_signature")))).select("band_data.*")
    
    # Adjust the number of bands based on the current distribution
    num_bands = adjust_num_bands(bands_df, num_bands)
    print(f"Iteration {iteration}: Adjusted number of bands to {num_bands}")

# Final bands DataFrame after adjustment
bands_df.show(5, truncate=False)


Iteration 0: Adjusted number of bands to 5
Iteration 1: Adjusted number of bands to 5
Iteration 2: Adjusted number of bands to 5
Iteration 3: Adjusted number of bands to 5
Iteration 4: Adjusted number of bands to 5


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "c:\Users\chris\AppData\Local\Programs\Python\Python311\Lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\chris\AppData\Local\Programs\Python\Python311\Lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\chris\AppData\Local\Programs\Python\Python311\Lib\socket.py", line 705, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

### 1.6. Banding and Hashing for LSH

In [42]:
# Function to split vector into bands
def split_vector(signature, b):
    r = int(len(signature) / b)
    return [signature[i:i + r] for i in range(0, len(signature), r)]

def split_and_hash_bands(process_id, split_signature):
    rows = []
    for band_index, band in enumerate(split_signature):
        band_hash = hash(tuple(band))
        rows.append(row(process_id=process_id, band_index=band_index, band_hash=band_hash))
    return rows


# Register UDF to split signatures into bands and hash them
split_and_hash_bands_udf = udf(lambda process_id, split_signature: split_and_hash_bands(process_id, split_signature), ArrayType(StructType([
    StructField("process_id", StringType(), False),
    StructField("band_index", IntegerType(), False),
    StructField("band_hash", LongType(), False)
])))

# Number of bands (tune this parameter)
num_bands = num_minhash_functions // 2

# Create minhash signatures (use the minhash function defined earlier)
minhash_func = build_minhash_func(len(vocab), num_bands * 2)  # Assuming each band has length 2




In [43]:
# Function to create minhash signatures
def create_hash(vector):
    signature = []
    for func in minhash_func:
        for i in range(len(func)):
            idx = func[i]
            if vector[idx] == 1:
                signature.append(idx)
                break
    return signature

# Register UDF to create minhash signatures
create_hash_udf = udf(lambda sparse_vector: create_hash(sparse_vector), ArrayType(IntegerType()))

# Split the minhash signatures into bands
split_vector_udf = udf(lambda minhash_signature: split_vector(minhash_signature, num_bands), ArrayType(ArrayType(IntegerType())))
logs_grouped_with_split_signatures = logs_grouped_with_signatures.withColumn("split_signature", split_vector_udf(col("minhash_signature")))

# Apply the UDF to create a DataFrame with band information
bands_df = logs_grouped_with_split_signatures.withColumn("band_data", explode(split_and_hash_bands_udf(col("process_id"), col("split_signature")))).select("band_data.*")

# Show the resulting DataFrame with band information
bands_df.show(5, truncate=False)

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "C:\Users\chris\AppData\Local\Temp\ipykernel_5456\2547438897.py", line 15, in <lambda>
  File "C:\Users\chris\AppData\Local\Temp\ipykernel_5456\2547438897.py", line 10, in split_and_hash_bands
TypeError: Row.__call__() got an unexpected keyword argument 'process_id'


In [19]:
print(len(vocab))

4489


In [20]:
logs_grouped_with_split_signatures.show(2)

+----------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+--------------------+--------------------+--------------------+--------------------+
|process_id|        from_servers|          to_servers|               times|             actions|      process_string|process_string_cleaned|            shingles|       sparse_vector|   minhash_signature|     split_signature|
+----------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+--------------------+--------------------+--------------------+--------------------+
|        28|[user, ui_server_...|[ui_server_1, pur...|[771, 774, 782, 7...|[Request, Request...|user,ui_server_1,...|  user,ui1,purchase...|[urrency_conv, _s...|[1, 1, 1, 1, 1, 1...|[2272, 4151, 435,...|[[2272, 4151], [4...|
|        31|[user, ui_server_...|[ui_server_6, pur...|[972, 974, 981, 1...|[Request, Request...|user

In [21]:
from pyspark.sql.functions import explode, col, monotonically_increasing_id
from pyspark.sql import Row

# Function to split vector into bands and create a DataFrame with band information
def split_and_hash_bands(process_id, split_signature):
    rows = []
    for band_index, band in enumerate(split_signature):
        band_hash = hash(tuple(band))
        rows.append(Row(process_id=process_id, band_index=band_index, band_hash=band_hash))
    return rows

# Register UDF to split signatures into bands and hash them
split_and_hash_bands_udf = udf(lambda process_id, split_signature: split_and_hash_bands(process_id, split_signature), ArrayType(StructType([
    StructField("process_id", StringType(), False),
    StructField("band_index", IntegerType(), False),
    StructField("band_hash", LongType(), False)
])))

# Apply the UDF to create a DataFrame with band information
bands_df = logs_grouped_with_signatures.withColumn("band_data", explode(split_and_hash_bands_udf(col("process_id"), col("split_signature")))).select("band_data.*")

# Show the resulting DataFrame with band information
bands_df.show(5, truncate=False)


AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `split_signature` cannot be resolved. Did you mean one of the following? [`minhash_signature`, `actions`, `sparse_vector`, `from_servers`, `process_id`].;
'Project [process_id#62, from_servers#290, to_servers#298, times#282, actions#306, process_string#497, process_string_cleaned#834, shingles#1158, sparse_vector#1647, minhash_signature#1812, explode(<lambda>(process_id#62, 'split_signature)#2201) AS band_data#2202]
+- Project [process_id#62, from_servers#290, to_servers#298, times#282, actions#306, process_string#497, process_string_cleaned#834, shingles#1158, sparse_vector#1647, <lambda>(sparse_vector#1647)#1811 AS minhash_signature#1812]
   +- Project [process_id#62, from_servers#290, to_servers#298, times#282, actions#306, process_string#497, process_string_cleaned#834, shingles#1158, <lambda>(shingles#1158)#1646 AS sparse_vector#1647]
      +- Project [process_id#62, from_servers#290, to_servers#298, times#282, actions#306, process_string#497, process_string_cleaned#834, <lambda>(process_string#497)#1157 AS shingles#1158]
         +- Project [process_id#62, from_servers#290, to_servers#298, times#282, actions#306, process_string#497, regexp_replace(process_string#497, _server_, , 1) AS process_string_cleaned#834]
            +- Project [process_id#62, from_servers#290, to_servers#298, times#282, actions#306, concat_ws(,, from_servers#290) AS process_string#497]
               +- Project [process_id#62, from_servers#290, to_servers#298, times#282, actions#306]
                  +- Project [process_id#62, from_servers#290, to_servers#298, times#282, sorted_lists#275.actions AS actions#306, sorted_lists#275]
                     +- Project [process_id#62, from_servers#290, sorted_lists#275.to_servers AS to_servers#298, times#282, actions#102, sorted_lists#275]
                        +- Project [process_id#62, sorted_lists#275.from_servers AS from_servers#290, to_servers#98, times#282, actions#102, sorted_lists#275]
                           +- Project [process_id#62, from_servers#96, to_servers#98, sorted_lists#275.times AS times#282, actions#102, sorted_lists#275]
                              +- Project [process_id#62, from_servers#96, to_servers#98, times#100, actions#102, sort_lists(times#100, from_servers#96, to_servers#98, actions#102)#274 AS sorted_lists#275]
                                 +- Aggregate [process_id#62], [process_id#62, collect_list(from_server#26, 0, 0) AS from_servers#96, collect_list(to_server#29, 0, 0) AS to_servers#98, collect_list(time#56, 0, 0) AS times#100, collect_list(action#38, 0, 0) AS actions#102]
                                    +- Project [from_server#26, to_server#29, time#56, action#38, cast(process_id#44 as int) AS process_id#62]
                                       +- Project [from_server#26, to_server#29, cast(time#33 as int) AS time#56, action#38, process_id#44]
                                          +- Project [from_server#26, to_server#29, time#33, action#38, process_id#44]
                                             +- Project [Logs#17, from_server#26, to_server#29, time#33, action#38, regexp_replace(split(Logs#17, , , -1)[4], [<>], , 1) AS process_id#44]
                                                +- Project [Logs#17, from_server#26, to_server#29, time#33, split(Logs#17, , , -1)[3] AS action#38]
                                                   +- Project [Logs#17, from_server#26, to_server#29, split(Logs#17, , , -1)[2] AS time#33]
                                                      +- Project [Logs#17, from_server#26, split(Logs#17, , , -1)[1] AS to_server#29]
                                                         +- Project [Logs#17, regexp_replace(split(Logs#17, , , -1)[0], [<>], , 1) AS from_server#26]
                                                            +- Relation [Logs#17] csv
