In [68]:
import findspark

findspark.init()
from IPython.display import clear_output

In [69]:
from pyspark.sql import SparkSession, functions as f
from pyspark.sql.functions import udf
from pyspark.sql.streaming import DataStreamWriter, DataStreamReader
from pyspark.ml.pipeline import PipelineModel

spark: SparkSession = (
    SparkSession.builder.appName("stream_sentiment_analysis")
    .master("local[*]")
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
    .getOrCreate()
)
spark.sparkContext.setCheckpointDir("../checkpoints/")

In [70]:
model = PipelineModel.load("../model")

##### Set streamReader

In [71]:
schema = "polarity FLOAT, id LONG, date_time TIMESTAMP, query STRING, user STRING, text STRING"
timestampformat = "EEE MMM dd HH:mm:ss zzz yyyy"

spark_reader: DataStreamReader = spark.readStream.schema(schema)

##### Prune and clean incoming data, also, use trained model to find predictions

In [72]:
import html

user_regex = r"(@\w{1,15})"
url_regex = r"((https?|ftp|file):\/{2,3})+([-\w+&@#/%=~|$?!:,.]*)|(www.)+([-\w+&@#/%=~|$?!:,.]*)"
email_regex = r"[\w.-]+@[\w.-]+\.[a-zA-Z]{1,}"


@udf
def html_unescape(s: str):
    if isinstance(s, str):
        return html.unescape(s)
    return s

In [73]:
dataframe = (
    spark_reader.csv(
        "../stream/", timestampFormat=timestampformat, quote='"', header=False
    )
    .select("id", "date_time", "user", "text")
    .withColumn("text", f.regexp_replace("text", url_regex, ""))
    .withColumn("text", f.regexp_replace("text", email_regex, ""))
    .withColumn("text", f.regexp_replace("text", user_regex, ""))
    .withColumn("text", f.regexp_replace("text", "#", " "))
    .withColumn("text", html_unescape(f.col("text")))
    .withColumn("text", f.regexp_replace("text", "[^a-zA-Z']", " "))
    .withColumn("text", f.regexp_replace("text", " +", " "))
    .withColumn("text", f.trim("text"))
    .filter("text != ''")
    .na.drop(subset="text")
    .coalesce(1)
)

predictions = model.transform(dataframe)

##### Find the number of negative, positive assigned comments with trained model

In [108]:
# user_count = dataframe.select(f.approx_count_distinct("user").alias('number of users'), f.current_timestamp().alias('timestamp'))

result = (
    predictions.select(
        "prediction",
        f.when(f.col("prediction") == 1.0, 1).otherwise(0).alias("positive"),
        f.when(f.col("prediction") == 0.0, 1).otherwise(0).alias("negative"),
        f.lit(1).alias("total"),
    )
    .agg(
        f.sum("positive").alias("positive tweets"),
        f.sum("negative").alias("negative tweets"),
        f.sum("total").alias("total tweets"),
        f.round(f.avg("prediction"), 2).alias("average polarity")
    )
    .select("*", f.current_timestamp().alias("timestamp"))
)

##### Set streamWriter and activate streaming query

In [109]:
if dataframe.isStreaming:
    stream_writer: DataStreamWriter = (
        result.writeStream.queryName("result")
        .trigger(processingTime="5 seconds")
        .outputMode("complete")
        .format("memory")
    )

    query = stream_writer.start()

    print(f"Stream is {'Active' if query.isActive else 'Not Active'}")

Stream is Active


##### Run query every 5 seconds

In [111]:
from time import sleep

while True:
    spark.sql(f"select * from {query.name}").show(truncate=False)
    sleep(5)
    clear_output()

+---------------+---------------+------------+----------------+-----------------------+
|positive tweets|negative tweets|total tweets|average polarity|timestamp              |
+---------------+---------------+------------+----------------+-----------------------+
|22             |28             |50          |0.44            |2023-06-17 15:27:24.547|
+---------------+---------------+------------+----------------+-----------------------+



KeyboardInterrupt: 

In [112]:
query.stop()

In [113]:
spark.stop()