In [1]:
# --- PySpark Setup Cell ---
import os, sys, findspark

# Point the kernel at the local Spark distribution and make both the driver
# and the Python workers use the interpreter that is running this notebook.
# NOTE: the SPARK_HOME path below is machine-specific.
os.environ.update(
    SPARK_HOME=r"E:\Coding\BDA-PySpark\spark-3.4.1-bin-hadoop3",
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

# Let findspark locate pyspark using the SPARK_HOME set above.
findspark.init()


The Hugging Face `transformers` library is imported here and used for emotion detection.

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StructType, StructField, StringType, FloatType
import torch
import threading

# Import transformer pipeline
from transformers import pipeline

Spark Session

In [3]:

# Build (or reuse) a local-mode Spark session that uses every available core,
# with 16 GB requested for both the driver and the executor.
spark = (
    SparkSession.builder
    .appName("EmotionDetectionPipeline")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "16g")
    .master("local[*]")
    .getOrCreate()
)

Paths

In [4]:
# Input location: text files holding the filtered comment batches, addressed
# here through an HDFS URI (a local path would also work with spark.read.text).
input_path = "hdfs://localhost:9000/user/adarsh/realtime_pipeline/filtered_batches"

# Output location: a machine-specific local folder, plus the full path of the
# single CSV file that the pipeline writes inside it.
output_dir = r"E:\Coding\BDA-PySpark\realtime-pipeline\results_spark"
final_output_file = os.path.join(output_dir, "emotion_flags_spark.csv")

1. The `emotion_udf` function performs emotion detection on each comment using the
   Hugging Face model `j-hartmann/emotion-english-distilroberta-base`. The model is
   loaded lazily inside the UDF and cached, with the aim of not reloading it for every comment.
2. It cleans the text, splits long comments into smaller chunks, processes the chunks in
   batches, and then aggregates the predictions to find the dominant emotion and its
   confidence score.
3. Finally, the output structure (`label`, `score`) is defined with a Spark schema and the
   function is registered as a UDF so that Spark can apply it in parallel across all comments.



In [5]:
# Lazily-built classifier, reused by every later call that shares these module
# globals. A plain module-level slot is used instead of threading.local()
# because it must stay picklable when Spark serializes the UDF.
# NOTE(review): if emotion_udf is ever called on the driver before the UDF is
# shipped, the loaded model would be captured in that pickle -- confirm usage.
_EMOTION_MODEL = None


def _get_emotion_model():
    """Return the cached emotion classifier, creating it on first use."""
    global _EMOTION_MODEL
    if _EMOTION_MODEL is None:
        # Use the first GPU when CUDA is available, otherwise the CPU (-1).
        device = 0 if torch.cuda.is_available() else -1
        _EMOTION_MODEL = pipeline(
            "text-classification",
            model="j-hartmann/emotion-english-distilroberta-base",
            framework="pt",
            device=device,
            truncation=True,
            max_length=512
        )
    return _EMOTION_MODEL


def emotion_udf(text):
    """Classify the dominant emotion of a single comment.

    Args:
        text: The comment to classify. Anything that is not a string, or that
            has fewer than 5 whitespace-separated words, is treated as neutral.

    Returns:
        dict: ``{'label': str, 'score': float}``.
            * ``'neutral'`` / ``0.0`` for missing or too-short input.
            * ``'error'`` / ``0.0`` if loading the model or inference raises.
            * Otherwise the label whose summed score over all chunks is the
              largest, with that sum divided by the number of chunk
              predictions as the score.
    """
    # Validate first so that trivial rows never trigger (or depend on) the
    # expensive model load.
    if not isinstance(text, str) or len(text.strip().split()) < 5:
        return {'label': 'neutral', 'score': 0.0}

    try:
        # Built once and then reused; the previous per-call threading.local()
        # was a fresh object each time, so the model was reloaded for every row.
        model = _get_emotion_model()

        # Split long comments into word-based chunks so each one stays within
        # the model's 512-token limit (longer chunks are truncated anyway).
        text = text.strip()
        words = text.split()
        max_words = 400
        if len(words) <= max_words:
            chunks = [text]
        else:
            chunks = [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]

        # Run inference over the chunks in batches.
        batch_size = 32
        chunk_preds = []
        for i in range(0, len(chunks), batch_size):
            batch = chunks[i:i + batch_size]
            chunk_preds.extend(model(batch, truncation=True, max_length=512))

        if not chunk_preds:
            return {'label': 'neutral', 'score': 0.0}

        # Sum the scores per predicted label across all chunks.
        emotion_weights = {}
        for result in chunk_preds:
            label = result['label']
            emotion_weights[label] = emotion_weights.get(label, 0.0) + result['score']

        final_label = max(emotion_weights, key=emotion_weights.get)
        final_score = emotion_weights[final_label] / len(chunk_preds)
        return {'label': final_label, 'score': final_score}
    except Exception as e:
        # Best-effort: one bad row must not fail the whole Spark job.
        print(f"Emotion detection error: {e}")
        return {'label': 'error', 'score': 0.0}

# Struct returned by the UDF: the predicted emotion label and its score,
# both nullable.
emotion_schema = (
    StructType()
    .add("label", StringType(), True)
    .add("score", FloatType(), True)
)

# Wrap the Python function so Spark can apply it to a DataFrame column.
emotion_predict_udf = udf(emotion_udf, returnType=emotion_schema)

1. Save the results to a single `.csv` file.
2. Clean up the temporary folder that Spark writes its part files into.

In [6]:
def save_single_csv(df, output_path):
    """Write ``df`` as one CSV file at ``output_path``.

    Spark writes a directory of part files, so the frame is coalesced to a
    single partition, written to a sibling ``<output_path>_temp`` directory,
    and the lone part file is then moved to ``output_path``.

    Args:
        df: DataFrame containing the columns ``comment``, ``emotion`` and
            ``score``; only these columns are written, with a header row.
        output_path: Local filesystem path of the final CSV file. Any
            existing file at this path is replaced.

    Raises:
        FileNotFoundError: If the write produced no ``part-*.csv`` file.
    """
    import glob, os, shutil

    temp_dir = output_path + "_temp"
    # Start from a clean slate if a previous run left the temp directory behind.
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)

    try:
        df.select("comment", "emotion", "score") \
            .coalesce(1) \
            .write \
            .mode("overwrite") \
            .option("header", True) \
            .csv(temp_dir)

        part_files = glob.glob(os.path.join(temp_dir, 'part-*.csv'))
        if not part_files:
            raise FileNotFoundError("No part file found in temp directory")

        if os.path.exists(output_path):
            os.remove(output_path)
        shutil.move(part_files[0], output_path)
    finally:
        # Always remove the temp directory, including when the write, the
        # part-file lookup or the move fails; ignore_errors keeps cleanup
        # from masking the original exception.
        shutil.rmtree(temp_dir, ignore_errors=True)

Main Function

In [7]:
if __name__ == "__main__":
    # Load the input as one comment per text line and keep only the comments
    # longer than 10 characters.
    df_spark = (
        spark.read.text(input_path)
        .toDF("comment")
        .filter("length(comment) > 10")
    )
    print(f"Processing {df_spark.count()} comments")

    # Apply the emotion UDF to each comment, then flatten the returned struct
    # into plain "emotion" and "score" columns.
    df_result = (
        df_spark
        .withColumn("emotion_result", emotion_predict_udf(col("comment")))
        .withColumn("emotion", col("emotion_result.label"))
        .withColumn("score", col("emotion_result.score"))
        .drop("emotion_result")
    )

    # Write everything out as one CSV file.
    save_single_csv(df_result.select("comment", "emotion", "score"), final_output_file)

    # Report where the results ended up.
    print(f"Output saved to {output_dir} as emotion_flags_spark.csv")

    # Release the Spark session now that the job is done.
    spark.stop()

Processing 322 comments
Output saved to {output_dir} as emotion_flags_spark.csv
