In [1]:
import logging
import time
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.fpm import FPGrowth
from pyspark import SparkConf
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StructType, StructField, LongType, StringType, ArrayType
from graphframes import GraphFrame

# --- Logging ---
# Configure the root logger once: INFO and above, timestamp - level - message.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# --- Spark configuration ---
# The SparkConf setters mutate the object in place, so they are applied as
# separate statements here; the key/value pairs are registered in the listed order.
# NOTE(review): the master is "local[*]"; the executor-* entries below presumably
# have no effect in local mode - confirm before relying on them.
conf = SparkConf()
conf.setAppName("SARL_Spark_LPA_Partitioning")
conf.setMaster("local[*]")
conf.setAll([
    # JVM sizing
    ("spark.driver.memory", "24g"),
    ("spark.executor.memory", "24g"),
    ("spark.executor.instances", "6"),
    ("spark.executor.cores", "6"),
    # Off-heap storage
    ("spark.memory.offHeap.enabled", "true"),
    ("spark.memory.offHeap.size", "3g"),
    # Shuffle parallelism (also read later as the FPGrowth partition count)
    ("spark.sql.shuffle.partitions", "200"),
    # Thread stack size for the executor and driver JVMs
    ("spark.executor.extraJavaOptions", "-Xss4m"),
    ("spark.driver.extraJavaOptions", "-Xss4m"),
    ("spark.executor.memoryOverhead", "8g"),
    ("spark.driver.memoryOverhead", "8g"),
    # Serialization
    ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
    ("spark.kryoserializer.buffer.max", "512m"),
    # Unified memory manager split
    ("spark.memory.fraction", "0.6"),
    ("spark.memory.storageFraction", "0.5"),
    # Extra package resolved at session start (GraphFrames)
    ("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.2-s_2.12"),
])

# --- Session / context ---
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext  # kept for textFile() and for setting the log level
sc.setLogLevel("INFO")   # verbosity of Spark's own logging

# Echo the effective settings so they are recorded in the cell output.
logger.info(f"Spark Version: {spark.version}")
logger.info(f"Spark Driver Memory: {spark.conf.get('spark.driver.memory')}")
logger.info(f"Spark Executor Memory: {spark.conf.get('spark.executor.memory')}")
logger.info(f"Spark Executor Cores: {spark.conf.get('spark.executor.cores')}")

2025-06-03 16:56:04,605 - INFO - Spark Version: 3.4.0
2025-06-03 16:56:04,997 - INFO - Spark Driver Memory: 24g
2025-06-03 16:56:04,998 - INFO - Spark Executor Memory: 24g
2025-06-03 16:56:04,999 - INFO - Spark Executor Cores: 6


In [4]:

# --- Load Transactional Data (Consistent with SARL-Spark pipeline) ---
# Path is relative to the kernel's working directory.
transaction_file_path = 'sarl-spark/data/processed/transactional_data_zscore.csv'

# Read the file as an RDD of raw text lines (one element per line).
raw_lines_rdd = sc.textFile(transaction_file_path)
# Capture the first line so it can be filtered out as the header further down.
header = raw_lines_rdd.first()

# Define a helper function to process each line, ensuring 're' is available in worker scope
def _process_transaction_line(line):
    # Import 're' inside the function to ensure it's available on Spark workers
    import re
    items = re.findall(r'[+-]?[a-zA-Z0-9_]+', line)
    # Filter out 'tumor' and 'normal' if they are labels
    filtered_items = [item for item in items if item.lower() not in ['tumor', 'normal']]
    # Ensure items within a transaction are unique by converting to set and back to list
    unique_items = list(set(filtered_items))
    return unique_items

# Drop the header row, tokenise every remaining line, and discard lines that
# yield no items at all.
transactions_rdd = raw_lines_rdd.filter(lambda line: line != header) \
                                 .map(_process_transaction_line) \
                                 .filter(lambda transaction: len(transaction) > 0)

# The array schema produces a single column named "value"; rename it to the
# "items" column that FPGrowth is configured to read below.
transactions_df = spark.createDataFrame(transactions_rdd, ArrayType(StringType()))
transactions_df = transactions_df.withColumnRenamed("value", "items")
transactions_df.cache()
total_transactions = transactions_df.count() # Needed for Lift calculation if you extend this
logger.info(f"Total transactions loaded for baseline: {total_transactions}")


# --- SARL Heuristic Parameters (Consistent with SARL-Spark pipeline) ---
MIN_SUPPORT_PERCENTAGE = 0.03 # minimum support as a fraction of transactions (3%)
MIN_CONFIDENCE = 0.7 # minimum rule confidence (70%)

logger.info(f"Baseline Parameters: min_support_percentage={MIN_SUPPORT_PERCENTAGE}, min_confidence={MIN_CONFIDENCE}")

# --- 1. Run FPGrowth to find Frequent Itemsets (Baseline) ---
logger.info("Running FPGrowth (Baseline) to find Frequent Itemsets...")
start_time_fp = time.time()

# minConfidence is set on the estimator so that the fitted model's
# associationRules (step 2) is generated with the MIN_CONFIDENCE threshold
# rather than the estimator's default.
# spark.conf.get() returns a string, hence the int() cast; the shuffle partition
# count is reused as a heuristic for numPartitions.
fpGrowth = FPGrowth(
    itemsCol="items",
    minSupport=MIN_SUPPORT_PERCENTAGE,
    minConfidence=MIN_CONFIDENCE,
    numPartitions=int(spark.conf.get("spark.sql.shuffle.partitions")),
)

# Fit the model to the entire transactions_df
model_fp = fpGrowth.fit(transactions_df)

# Get frequent itemsets
# NOTE(review): in the recorded run of this cell the JVM went away
# ("Answer from Java side is empty") during a count() after this point was
# reached. Cause not visible here; possibly memory/stack pressure from the
# itemset mining at this support threshold - confirm, and consider raising
# MIN_SUPPORT_PERCENTAGE if it recurs.
frequent_itemsets_baseline = model_fp.freqItemsets
frequent_itemsets_baseline.cache()
num_frequent_itemsets_baseline = frequent_itemsets_baseline.count()
time_fp_itemsets = time.time() - start_time_fp
logger.info(f"FPGrowth Baseline: Found {num_frequent_itemsets_baseline} frequent itemsets in {time_fp_itemsets:.2f} seconds.")
logger.info("Sample of FPGrowth Baseline Frequent Itemsets:")
frequent_itemsets_baseline.show(10, truncate=False)


# --- 2. Generate Association Rules (Baseline) ---
logger.info("Generating Association Rules (Baseline) from Frequent Itemsets...")
start_time_rules = time.time()

# The rules come from the fitted model itself. The previous code called an
# `AssociationRules` class that is neither imported in this notebook nor
# provided by pyspark.ml.fpm, so it failed with NameError. The confidence
# threshold was already supplied to the estimator above.
association_rules_baseline = model_fp.associationRules
association_rules_baseline.cache()

# count() is what actually materialises the rules, so it is inside the timed span.
num_association_rules_baseline = association_rules_baseline.count()
time_rules_baseline = time.time() - start_time_rules
logger.info(f"Association Rules Baseline: Found {num_association_rules_baseline} rules in {time_rules_baseline:.2f} seconds.")
logger.info("Sample of FPGrowth Baseline Association Rules (by confidence):")
association_rules_baseline.orderBy(F.desc("confidence")).show(10, truncate=False)

# --- Clean up baseline caches ---
transactions_df.unpersist()
frequent_itemsets_baseline.unpersist()
association_rules_baseline.unpersist()

# --- End of Baseline Script ---
logger.info("FPGrowth Baseline comparison script finished.")
# Stops the session: cells run after this one need a new SparkSession.
spark.stop()

2025-06-03 16:58:24,128 - INFO - Total transactions loaded for baseline: 2302
2025-06-03 16:58:24,129 - INFO - Baseline Parameters: min_support_percentage=0.03, min_confidence=0.7
2025-06-03 16:58:24,129 - INFO - Running FPGrowth (Baseline) to find Frequent Itemsets...
----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 38002)
2025-06-03 16:58:59,039 - INFO - Error while receiving.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty
Traceback (most recent call last):
2025-06-03 16:58:59,041 - INFO - Closing down clientserver connection
  File "/opt/conda/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.10/socketserver.py

Py4JError: An error occurred while calling o247.count

2025-06-03 16:59:00,031 - INFO - Error while receiving.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty
2025-06-03 16:59:00,033 - INFO - Closing down clientserver connection
2025-06-03 16:59:00,034 - ERROR - Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/l