In [1]:
# --- PySpark Setup Cell ---
import os, sys, findspark

# Environment for the local Spark installation. Both the driver and the worker
# interpreters are pinned to the Python executable running this kernel.
# NOTE(review): the Spark location is an absolute, machine-specific path.
os.environ.update({
    "SPARK_HOME": r"E:\Coding\BDA-PySpark\spark-3.4.1-bin-hadoop3",
    "PYSPARK_PYTHON": sys.executable,
    "PYSPARK_DRIVER_PYTHON": sys.executable,
})

# Called only after the variables above are set (presumably it reads SPARK_HOME).
findspark.init()


Libraries

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StructType, StructField, StringType, FloatType
import torch
import threading
import re
from transformers import pipeline
import os
import shutil
import glob

Spark Session

In [3]:
# Spark session
# Local-mode session using every available core, with 16g configured for both
# the driver and the executor memory settings.
# NOTE(review): with a local[*] master the executor memory setting may have no
# effect, since work runs inside the driver process -- confirm.
spark = (
    SparkSession.builder
    .appName("SentimentDetectionPipeline")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "16g")
    .master("local[*]")
    .getOrCreate()
)


Paths

In [4]:

# HDFS location that the main cell reads as line-oriented text (one comment per line).
input_path = r"hdfs://localhost:9000/user/adarsh/realtime_pipeline/filtered_batches"
# Local directory that receives the final CSV.
# NOTE(review): absolute, machine-specific path.
output_dir = r"E:\Coding\BDA-PySpark\realtime-pipeline\results_spark"
# Full path of the single CSV file written by save_single_csv.
final_output_file = os.path.join(output_dir, "sentiment_flags_spark.csv")

# Model setup
1. Loads a multilingual sentiment model (positive, negative, neutral) for text classification.
2. A thread-local object holds the model so that each Spark worker thread is intended to load it only once.

In [5]:
def sentiment_udf(text):
    """Excerpt of the sentiment UDF that shows only the model-loading helper.

    This cell defines the nested loader and nothing else: ``get_model`` is
    never called here, so the function implicitly returns ``None``. The name
    ``sentiment_udf`` is defined again, in full, in a later cell.

    Args:
        text: The comment text (unused in this excerpt).
    """
    # Created on every call, so an attribute stored on it does not outlive
    # the call that set it.
    tls = threading.local()

    def get_model():
        """Build the text-classification pipeline unless ``tls`` already has one."""
        if hasattr(tls, "model"):
            return tls.model
        # Device index 0 selects the first CUDA device; -1 selects the CPU.
        use_gpu = torch.cuda.is_available()
        tls.model = pipeline(
            "text-classification",
            model="tabularisai/multilingual-sentiment-analysis",
            framework="pt",
            device=0 if use_gpu else -1,
            return_all_scores=True,
            truncation=True,
            max_length=512
        )
        return tls.model

# Text preprocessing
1. Skips non-string or very short comments (fewer than 5 words) and collapses runs of whitespace
2. Splits very long comments into chunks of ≤400 words

In [6]:
    def minimal_clean_sentiment(text):
        """Normalize a comment for sentiment scoring.

        Returns an empty string for non-string input and for text with fewer
        than 5 whitespace-separated words; otherwise returns the stripped text
        with every run of whitespace collapsed to a single space.
        """
        if isinstance(text, str):
            stripped = text.strip()
            if len(stripped.split()) >= 5:
                return re.sub(r'\s+', ' ', stripped)
        return ""

    def split_long_text(text, max_words=400):
        """Split ``text`` into consecutive chunks of at most ``max_words`` words.

        Text that already fits is returned unchanged as a one-element list;
        otherwise each chunk is the space-joined slice of words.
        """
        tokens = text.split()
        if len(tokens) <= max_words:
            return [text]
        pieces = []
        for start in range(0, len(tokens), max_words):
            piece = ' '.join(tokens[start:start + max_words])
            if piece.strip():
                pieces.append(piece)
        return pieces

# Prediction + aggregation
1. Runs inference batch-wise
2. For each batch, picks the highest-confidence label per chunk
3. Aggregates the chunk results into a single `{label, score}` result

In [7]:
# Tunables for cleaning, chunking and inference batching.
_MIN_COMMENT_WORDS = 5
_MAX_CHUNK_WORDS = 400
_INFERENCE_BATCH_SIZE = 32
_MODEL_CACHE_MODULE = "_sentiment_model_cache"


def _thread_local_store():
    """Return the process-wide ``threading.local`` that caches the model.

    The store is kept on a synthetic module registered in ``sys.modules``
    instead of in a notebook global or inside ``sentiment_udf``. Creating the
    ``threading.local`` inside the UDF gives every call a fresh, empty object,
    so the model would be reloaded for each row. A notebook-level global is
    avoided because Spark pickles the UDF together with the globals it
    references, and a ``threading.local`` instance is not picklable.
    """
    import sys
    import types

    holder = sys.modules.get(_MODEL_CACHE_MODULE)
    if holder is None:
        candidate = types.ModuleType(_MODEL_CACHE_MODULE)
        candidate.store = threading.local()
        # setdefault keeps whichever holder was registered first if two
        # threads race to create it.
        holder = sys.modules.setdefault(_MODEL_CACHE_MODULE, candidate)
    return holder.store


def _get_sentiment_model():
    """Return this thread's classification pipeline, loading it on first use."""
    store = _thread_local_store()
    model = getattr(store, "model", None)
    if model is None:
        # Device index 0 selects the first CUDA device; -1 selects the CPU.
        device = 0 if torch.cuda.is_available() else -1
        model = pipeline(
            "text-classification",
            model="tabularisai/multilingual-sentiment-analysis",
            framework="pt",
            device=device,
            return_all_scores=True,
            truncation=True,
            max_length=512
        )
        store.model = model
    return model


def _clean_comment(text):
    """Return whitespace-normalized text, or "" for non-strings and short comments."""
    if not isinstance(text, str):
        return ""
    text = text.strip()
    if len(text.split()) < _MIN_COMMENT_WORDS:
        return ""
    return re.sub(r'\s+', ' ', text)


def _split_into_chunks(text, max_words=_MAX_CHUNK_WORDS):
    """Split ``text`` into consecutive chunks of at most ``max_words`` words."""
    words = text.split()
    if len(words) <= max_words:
        return [text]
    # Every slice holds at least one non-empty word, so no chunk can be blank.
    return [
        ' '.join(words[i:i + max_words])
        for i in range(0, len(words), max_words)
    ]


def _aggregate_chunk_sentiments(chunk_results):
    """Combine per-chunk ``{'label', 'score'}`` predictions into one result.

    The label with the largest summed score wins. Its reported score is that
    sum divided by the total number of chunks, so chunks that voted for a
    different label lower the final score.
    """
    if not chunk_results:
        return {'label': 'neutral', 'score': 0.0}
    if len(chunk_results) == 1:
        return chunk_results[0]
    sentiment_weights = {}
    for result in chunk_results:
        label = result['label']
        sentiment_weights[label] = sentiment_weights.get(label, 0.0) + result['score']
    final_label = max(sentiment_weights, key=sentiment_weights.get)
    final_score = sentiment_weights[final_label] / len(chunk_results)
    return {'label': final_label, 'score': final_score}


def sentiment_udf(text):
    """Classify the sentiment of one comment.

    Args:
        text: The raw comment. Non-string values and comments with fewer than
            5 words are not sent to the model.

    Returns:
        A dict with keys ``'label'`` (str) and ``'score'`` (float):
        ``{'label': 'neutral', 'score': 0.0}`` for skipped input,
        ``{'label': 'error', 'score': 0.0}`` if anything raises, and otherwise
        the aggregated prediction over all chunks of the comment.
    """
    try:
        cleaned = _clean_comment(text)
        if not cleaned:
            return {'label': 'neutral', 'score': 0.0}
        chunks = _split_into_chunks(cleaned, max_words=_MAX_CHUNK_WORDS)
        model = _get_sentiment_model()
        chunk_preds = []
        for start in range(0, len(chunks), _INFERENCE_BATCH_SIZE):
            batch = chunks[start:start + _INFERENCE_BATCH_SIZE]
            for pred_group in model(batch, truncation=True, max_length=512):
                if isinstance(pred_group, list):
                    # All class scores were returned: keep the most confident one.
                    chunk_preds.append(max(pred_group, key=lambda p: p['score']))
                else:
                    chunk_preds.append(pred_group)
        final_result = _aggregate_chunk_sentiments(chunk_preds)
        # Coerce to a plain Python float for the FloatType result column.
        return {'label': final_result['label'], 'score': float(final_result['score'])}
    except Exception as e:
        # Deliberate best-effort: a failing row is flagged rather than
        # aborting the whole Spark job.
        print(f"UDF error: {e}")
        return {'label': 'error', 'score': 0.0}
    
    
# Struct returned by the UDF: a nullable string label and a nullable float score.
result_schema = (
    StructType()
    .add("label", StringType(), True)
    .add("score", FloatType(), True)
)

# Spark-callable wrapper around sentiment_udf that yields the struct above.
sentiment_predict_udf = udf(sentiment_udf, returnType=result_schema)


Saving a single CSV file and cleaning up the temporary folder recursively

In [8]:
def save_single_csv(df, output_path, columns=("comment", "sentiment", "score")):
    """Write the selected columns of ``df`` to one CSV file at ``output_path``.

    The frame is coalesced to a single partition and written with a header
    into ``output_path + "_temp"``. The single part file is then moved to
    ``output_path``, replacing any existing file, and the temporary directory
    is removed whether or not the write succeeded.

    Args:
        df: DataFrame exposing ``select(...).coalesce(1).write``.
        output_path: Destination path of the final CSV file.
        columns: Names of the columns to write, in output order.

    Raises:
        FileNotFoundError: If the write produced no ``part-*.csv`` file.
    """
    temp_dir = output_path + "_temp"
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
    try:
        (
            df.select(*columns)
            .coalesce(1)
            .write
            .mode("overwrite")
            .option("header", True)
            .csv(temp_dir)
        )
        # Escape the directory so glob metacharacters in the path ("[", "]",
        # "*", "?") are matched literally.
        pattern = os.path.join(glob.escape(temp_dir), 'part-*.csv')
        part_files = sorted(glob.glob(pattern))
        if not part_files:
            raise FileNotFoundError("No part file found in temp directory")
        if os.path.exists(output_path):
            os.remove(output_path)
        shutil.move(part_files[0], output_path)
    finally:
        # Best-effort cleanup so a failed run does not leave the temp folder behind.
        shutil.rmtree(temp_dir, ignore_errors=True)

Main function + saving the result as a .csv file

In [9]:
if __name__ == "__main__":
    # End-to-end run: read the comments, score them with the sentiment UDF and
    # write one CSV. The session is stopped in ``finally`` so that a failure in
    # any step does not leave it running.
    try:
        # Each input line becomes one row in a single "comment" column.
        df_raw = spark.read.text(input_path).toDF("comment")
        df_spark = df_raw.filter("length(comment) > 10")
        print(f"Processing {df_spark.count()} comments")

        df_scored = df_spark.withColumn(
            "sentiment_result", sentiment_predict_udf(col("comment"))
        )
        # Flatten the struct returned by the UDF into two plain columns.
        df_result = (
            df_scored
            .withColumn("sentiment", col("sentiment_result.label"))
            .withColumn("score", col("sentiment_result.score"))
            .drop("sentiment_result")
        )
        save_single_csv(df_result, final_output_file)
        print(f"Output saved to {output_dir} as sentiment_flags_spark.csv")
    finally:
        spark.stop()

Processing 322 comments
Output saved to E:\Coding\BDA-PySpark\realtime-pipeline\results_spark as sentiment_flags_spark.csv
