In [1]:
# --- PySpark Setup Cell ---
import os, sys, findspark

# Export the Spark location and set both PySpark interpreter variables to the
# interpreter that is running this kernel.
os.environ.update({
    "SPARK_HOME": r"E:\Coding\BDA-PySpark\spark-3.4.1-bin-hadoop3",
    "PYSPARK_PYTHON": sys.executable,
    "PYSPARK_DRIVER_PYTHON": sys.executable,
})

# Initialise findspark only after the environment variables above are set.
findspark.init()

Libraries

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StructType, StructField, StringType, FloatType
from transformers import pipeline
import torch
import threading
import os
import shutil
import glob
import sys

Paths

In [3]:

# Input: directory of filtered batches on the local HDFS namenode
hdfs_uri = "hdfs://localhost:9000"
hdfs_filtered_dir = hdfs_uri + "/user/adarsh/realtime_pipeline/filtered_batches"

# Output: local directory and file name of the consolidated CSV
output_dir = r"E:\Coding\BDA-PySpark\realtime-pipeline\results_spark"
final_output_file = os.path.join(output_dir, "harmful_flags_spark.csv")

Spark Session

In [4]:

# Create (or reuse) the local 4-thread Spark session used by the cells below.
spark = (
    SparkSession.builder
    .appName("HarmfulFlagPipeline")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "16g")
    .config("spark.executor.cores", "4")
    .master("local[4]")
    .getOrCreate()
)


1. `run_harmful_udf` uses the RoBERTa model `SamLowe/roberta-base-go_emotions` to classify each text by emotion.
2. Long texts are split into chunks that are processed in batches; the function returns a `label` and a `score`.

In [5]:
# Cache of loaded classification pipelines, keyed by thread id. A plain dict is
# used instead of a module-level threading.local because thread-local objects
# cannot be pickled, and this function is serialized when Spark ships the UDF.
# NOTE(review): if the function is called on the driver before the Spark job
# runs, the populated cache would be serialized along with the UDF; clear it
# first in that case.
_HARMFUL_MODEL_CACHE = {}


def _get_harmful_model():
    """Return the calling thread's go_emotions pipeline, loading it on first use."""
    cache_key = threading.get_ident()
    model = _HARMFUL_MODEL_CACHE.get(cache_key)
    if model is None:
        device = 0 if torch.cuda.is_available() else -1
        model = pipeline(
            "text-classification",
            model="SamLowe/roberta-base-go_emotions",
            framework="pt",
            device=device,
            truncation=True,
            max_length=512
        )
        _HARMFUL_MODEL_CACHE[cache_key] = model
    return model


def run_harmful_udf(text):
    """Classify one comment with the go_emotions RoBERTa model.

    Texts longer than 400 words are split into 400-word chunks that are
    classified in batches of 16. Each chunk contributes its top label and
    score; the label with the highest summed score wins and its score is that
    sum divided by the total number of chunks.

    The model is loaded lazily and cached, so it is built at most once per
    thread for each deserialized copy of this function rather than once per
    row.

    Args:
        text: The comment to classify. Non-string values and texts with fewer
            than five whitespace-separated words are not sent to the model.

    Returns:
        dict: ``{'label': str, 'score': float}``. ``label`` is ``'neutral'``
        (score 0.0) for rejected input and ``'error'`` (score 0.0) when
        loading the model or running inference raises.
    """
    try:
        # Validate first so rejected rows never trigger a model load.
        if not isinstance(text, str) or len(text.strip().split()) < 5:
            return {'label': 'neutral', 'score': 0.0}

        model = _get_harmful_model()

        text = text.strip()
        words = text.split()
        max_words = 400
        if len(words) <= max_words:
            chunks = [text]
        else:
            chunks = [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]

        batch_size = 16
        total_batches = (len(chunks) + batch_size - 1) // batch_size
        chunk_preds = []
        for batch_num, start in enumerate(range(0, len(chunks), batch_size), start=1):
            batch = chunks[start:start + batch_size]
            print(f"Processing batch {batch_num} of {total_batches}", file=sys.stderr)
            chunk_preds.extend(model(batch, truncation=True, max_length=512))

        if not chunk_preds:
            return {'label': 'neutral', 'score': 0.0}
        if len(chunk_preds) == 1:
            return chunk_preds[0]

        # Sum the top-label scores per label across all chunks.
        label_weights = {}
        for result in chunk_preds:
            label = result['label']
            label_weights[label] = label_weights.get(label, 0.0) + result['score']

        final_label = max(label_weights, key=label_weights.get)
        final_score = label_weights[final_label] / len(chunk_preds)
        return {'label': final_label, 'score': final_score}
    except Exception as exc:
        # Best effort: one bad row must not fail the whole Spark job, but the
        # cause is logged instead of being silently discarded.
        print(f"run_harmful_udf failed: {exc!r}", file=sys.stderr)
        return {'label': 'error', 'score': 0.0}


# Struct produced by the UDF: a nullable string label and a nullable float score.
result_schema = (
    StructType()
    .add("label", StringType(), True)
    .add("score", FloatType(), True)
)

# Spark UDF that applies run_harmful_udf and returns result_schema structs.
harmful_pred_udf = udf(run_harmful_udf, returnType=result_schema)

Write the DataFrame as a single CSV file, recursively removing the temporary directory used for the write.

In [6]:
def save_single_csv(df, output_path):
    """Write ``df`` to ``output_path`` as one CSV file with a header row.

    The frame is coalesced to a single partition and written into a sibling
    ``<output_path>_temp`` directory. The resulting part file is then moved to
    ``output_path``, replacing any existing file. The temporary directory is
    removed afterwards, including when the export fails.

    Args:
        df: DataFrame to export.
        output_path: Destination file path on the local filesystem. Its parent
            directory is created if it does not exist.

    Raises:
        FileNotFoundError: If the write produced no ``part-*.csv`` file.
    """
    temp_dir = output_path + "_temp"
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)

    # The final move fails if the destination directory is missing.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    try:
        df.coalesce(1).write.mode("overwrite").option("header", True).csv(temp_dir)

        # Escape the directory so glob metacharacters in the path are literal.
        pattern = os.path.join(glob.escape(temp_dir), 'part-*.csv')
        part_files = sorted(glob.glob(pattern))
        if not part_files:
            raise FileNotFoundError("No part file found in temp directory")

        # Remove any previous output first so the move cannot hit an existing file.
        if os.path.exists(output_path):
            os.remove(output_path)
        shutil.move(part_files[0], output_path)
    finally:
        # Best-effort cleanup that must not mask an exception raised above.
        shutil.rmtree(temp_dir, ignore_errors=True)


Apply the UDF to every comment and save the results to a `.csv` file.

In [None]:
def run_and_save(df_spark):
    """Label every comment and write the flattened result to the final CSV.

    The UDF output struct is attached as a temporary ``harmful_result`` column,
    unpacked into ``harmful_label`` and ``score``, and then dropped before the
    frame is handed to ``save_single_csv``.

    Args:
        df_spark: DataFrame with a ``comment`` column to classify.
    """
    prediction = harmful_pred_udf(col("comment"))
    with_struct = df_spark.withColumn("harmful_result", prediction)
    with_label = with_struct.withColumn("harmful_label", col("harmful_result.label"))
    with_score = with_label.withColumn("score", col("harmful_result.score"))
    flattened = with_score.drop("harmful_result")

    save_single_csv(flattened, final_output_file)
    print(f"Output saved to {output_dir} as harmful_flags_spark.csv")

Main Function

In [8]:
if __name__ == "__main__":
    # Load every line under the HDFS directory as one "comment" row.
    print(f"Reading data from HDFS path: {hdfs_filtered_dir}")
    df_spark = spark.read.text(hdfs_filtered_dir).toDF("comment")
    print(f"Read {df_spark.count()} rows.")

    # Keep only comments longer than 10 characters.
    df_spark = df_spark.where("length(comment) > 10")
    print(f"After filtering, {df_spark.count()} rows left.")

    # Classify the remaining comments and write the CSV.
    print("Starting harmful content detection UDF")
    run_and_save(df_spark)
    print("Completed harmful content detection.")

    # Shut the session down once the results are on disk.
    spark.stop()

Reading data from HDFS path: hdfs://localhost:9000/user/adarsh/realtime_pipeline/filtered_batches
Read 322 rows.
After filtering, 322 rows left.
Starting harmful content detection UDF
Output saved to {output_dir} as harmful_flags_spark.csv
Completed harmful content detection.
