In [1]:
# --- PySpark Setup Cell ---
import os, sys, findspark

# Export the Spark location and the interpreter settings in one step.
# Both PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are set to the interpreter
# that is running this kernel (sys.executable).
os.environ.update(
    {
        "SPARK_HOME": r"E:\Coding\BDA-PySpark\spark-3.4.1-bin-hadoop3",
        "PYSPARK_PYTHON": sys.executable,
        "PYSPARK_DRIVER_PYTHON": sys.executable,
    }
)

# Must run after SPARK_HOME is set and before any pyspark import.
# NOTE(review): presumably findspark resolves pyspark from SPARK_HOME - confirm.
findspark.init()


In [None]:
# Optimized Topic Clustering with Spark + SentenceTransformer

import os
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import *
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Build (or reuse) the Spark session, applying each setting in turn
_session_builder = SparkSession.builder.appName("TopicClusteringPipelineOptimized")
for _conf_key, _conf_value in (
    ("spark.master", "local[*]"),
    ("spark.executor.memory", "4g"),
    ("spark.driver.memory", "4g"),
    ("spark.sql.shuffle.partitions", "4"),
    ("spark.default.parallelism", "4"),
    ("spark.network.timeout", "1200s"),
    ("spark.executor.heartbeatInterval", "200s"),
):
    _session_builder = _session_builder.config(_conf_key, _conf_value)

spark = _session_builder.getOrCreate()

# Only warnings and errors from Spark reach the cell output
spark.sparkContext.setLogLevel("WARN")

# Optimized embed_partition (batched)

def _encode_batch(encoder, rows, texts):
    """Encode one batch of texts and yield (comment, cleaned_text, embedding list) per row."""
    vectors = encoder.encode(texts, show_progress_bar=False)
    for row, vector in zip(rows, vectors):
        yield (row["comment"], row["cleaned_text"], vector.tolist())


def embed_partition(iterator, model=None, model_name="all-MiniLM-L6-v2", batch_size=32):
    """
    Encode the texts of one partition in batches.

    Rows whose ``cleaned_text`` is not a non-blank string are skipped. The
    encoder is created at most once per call, and only when the first batch
    is ready, so an empty partition never pays the model-loading cost.

    Args:
        iterator: Iterable of rows that support ``row["comment"]`` and
            ``row["cleaned_text"]`` (this is the only argument passed by
            ``rdd.mapPartitions``).
        model: Optional pre-built encoder exposing
            ``encode(texts, show_progress_bar=False)``. When ``None`` a
            ``SentenceTransformer(model_name, device="cpu")`` is created lazily.
        model_name: Model identifier used when ``model`` is ``None``.
        batch_size: Number of texts sent to ``encode`` per call.

    Yields:
        Tuples ``(comment, cleaned_text, embedding)`` where ``embedding`` is a
        plain Python list.

    Notes:
        Errors are handled best-effort: any exception is printed and ends the
        partition early, so rows not yet emitted from that partition are lost.
    """
    try:
        encoder = model
        pending_rows, pending_texts = [], []

        for row in iterator:
            text = row["cleaned_text"]
            if not isinstance(text, str) or not text.strip():
                continue

            pending_rows.append(row)
            pending_texts.append(text)
            if len(pending_texts) < batch_size:
                continue

            if encoder is None:
                encoder = SentenceTransformer(model_name, device="cpu")
            yield from _encode_batch(encoder, pending_rows, pending_texts)
            pending_rows, pending_texts = [], []

        # Flush the final, possibly short, batch
        if pending_texts:
            if encoder is None:
                encoder = SentenceTransformer(model_name, device="cpu")
            yield from _encode_batch(encoder, pending_rows, pending_texts)

    except Exception as e:
        # Deliberate best-effort: report the failure and end this partition
        print(f"[ERROR in partition]: {e}")
        return

# Schema applied to the tuples produced by embed_partition:
# (original comment, cleaned text, embedding as an array of floats)
embed_schema = (
    StructType()
    .add("comment", StringType(), True)
    .add("cleaned_text", StringType(), True)
    .add("features_array", ArrayType(FloatType()), True)
)

# Main Pipeline
def run_spark_kmeans_pipeline(spark, input_path, output_path, k=5, seed=42):
    """
    Cluster text comments into topics and write the result to a CSV file.

    Steps: read one comment per line from ``input_path``, lower-case and trim
    it, keep rows whose cleaned text is longer than 10 characters, embed them
    with ``embed_partition``, fit KMeans, print the cluster sizes and the
    silhouette score, then save the columns ``comment``, ``topic`` and
    ``topic_prob``.

    ``topic_prob`` is the cosine similarity between a row's embedding and the
    centroid of its assigned cluster. It is a similarity score, not a
    normalized probability, and is 0.0 when either vector has zero norm.

    Args:
        spark: Active SparkSession.
        input_path: Source passed to ``spark.read.text``.
        output_path: Destination CSV path; the result is collected with
            ``toPandas()`` and written on the driver.
        k: Number of clusters requested from KMeans.
        seed: Random seed passed to KMeans.

    Returns:
        None. Progress and problems are reported with ``print``; the function
        returns early when there is no usable input or fewer than two
        embeddings.
    """
    print("\n===============================")
    print("[INFO] Starting Topic Clustering Pipeline...")
    print("===============================")

    df = spark.read.text(input_path).withColumnRenamed("value", "comment")
    df = df.withColumn("cleaned_text", F.trim(F.lower(F.col("comment"))))
    df = df.filter(F.length("cleaned_text") > 10)

    # Only existence matters here, so fetch at most one row instead of counting
    if not df.head(1):
        print("[ERROR] No valid rows found.")
        return

    df = df.coalesce(4)

    print("[INFO] Generating embeddings with batching...")
    rdd = df.rdd.mapPartitions(embed_partition)
    df_embed = spark.createDataFrame(rdd, schema=embed_schema)

    array_to_vector_udf = F.udf(lambda arr: Vectors.dense(arr) if arr else None, VectorUDT())
    # Cached because the embeddings are reused by count, fit, transform and export
    df_features = df_embed.withColumn("features", array_to_vector_udf("features_array")).cache()
    centroid_bcast = None

    try:
        valid_count = df_features.count()
        print(f"[INFO] Total rows with valid embeddings: {valid_count}")
        if valid_count < 2:
            print("[ERROR] Not enough embeddings for clustering.")
            return

        # KMeans clustering
        print("[INFO] Running KMeans clustering...")
        kmeans = KMeans(k=k, seed=seed, featuresCol="features", predictionCol="topic")
        model = kmeans.fit(df_features)
        clustered = model.transform(df_features)

        centroids = model.clusterCenters()
        num_clusters = len(centroids)
        print(f"[INFO] Total clusters generated: {num_clusters}")

        cluster_sizes = clustered.groupBy("topic").count().orderBy("topic").collect()
        print("\n[INFO] Cluster distribution:")
        for row in cluster_sizes:
            print(f"   Cluster {row['topic']}: {row['count']} samples")

        # Evaluate clustering quality; the silhouette needs at least two
        # non-empty clusters, otherwise the evaluator raises
        if len(cluster_sizes) > 1:
            evaluator = ClusteringEvaluator(
                featuresCol="features", predictionCol="topic", metricName="silhouette"
            )
            silhouette = evaluator.evaluate(clustered)
            print(f"[RESULT] Silhouette Score: {silhouette:.4f}")
        else:
            print("[WARN] Silhouette Score skipped: all rows fell into a single cluster.")

        # Score each row by cosine similarity to the centroid of its own cluster
        centroids_arr = [np.array(c) for c in centroids]
        centroid_bcast = spark.sparkContext.broadcast(centroids_arr)

        def cosine_similarity(v, idx):
            if v is None or idx is None or len(v) == 0:
                return 0.0
            v = np.array(v)
            c = centroid_bcast.value[int(idx)]
            denom = np.linalg.norm(v) * np.linalg.norm(c)
            # A zero-norm vector has no direction; avoid dividing by zero
            if denom == 0:
                return 0.0
            return float(np.dot(v, c) / denom)

        cosine_udf = F.udf(cosine_similarity, FloatType())

        df_final = (
            clustered
            .withColumn("topic_prob", cosine_udf(F.col("features_array"), F.col("topic")))
            .select("comment", "topic", "topic_prob")
        )

        print(f"[INFO] Saving final results to: {output_path}")
        df_final.toPandas().to_csv(output_path, index=False)
        print("[SUCCESS] Clustering completed and results saved.")
        print("===============================")
    finally:
        # Release the broadcast and the cached embeddings on every exit path
        if centroid_bcast is not None:
            centroid_bcast.destroy()
        df_features.unpersist()



# Input: HDFS directory of raw comment batches. Output: CSV in a local folder.
hdfs_uri = "hdfs://localhost:9000"
hdfs_raw_dir = f"{hdfs_uri}/user/adarsh/realtime_pipeline/raw_batches"
topics_path = "E:/Coding/BDA-PySpark/realtime-pipeline/results_spark/"
os.makedirs(topics_path, exist_ok=True)
final_output_file = os.path.join(topics_path, "topic_results_spark.csv")

try:
    run_spark_kmeans_pipeline(spark, hdfs_raw_dir, final_output_file)
finally:
    # Stop the session even when the pipeline raises, so a failed run does not
    # leave a live session behind for the next getOrCreate() to reuse
    spark.stop()
    print("[INFO] Spark session stopped successfully.")



[INFO] Starting Topic Clustering Pipeline...
[INFO] Generating embeddings with batching...
[INFO] Total rows with valid embeddings: 148
[INFO] Running KMeans clustering...
[INFO] Total clusters generated: 5

[INFO] Cluster distribution:
   Cluster 0: 7 samples
   Cluster 1: 53 samples
   Cluster 2: 33 samples
   Cluster 3: 26 samples
   Cluster 4: 29 samples
[RESULT] Silhouette Score: 0.0736
[INFO] Saving final results to: E:/Coding/BDA-PySpark/realtime-pipeline/results_spark/topic_results_spark.csv
[SUCCESS] Clustering completed and results saved.
[INFO] Spark session stopped successfully.
