In [None]:
import sys
import time
import datetime

In [None]:
# Kafka topic the stream is read from.
TOPIC_Step2_NAME = "Sahamyab-Session_16_2"

# Bootstrap address (host:port) of the Kafka broker.
KAFKA_SERVER = "kafka-broker:29092"

In [None]:
import os

# Spark <-> Kafka Structured Streaming connector, see
# https://mvnrepository.com/artifact/org.apache.spark/spark-sql-kafka-0-10_2.12
#
# The variable is exported here, ahead of the SparkSession built further down,
# so that the connector package is requested when PySpark is launched.
os.environ["PYSPARK_SUBMIT_ARGS"] = " ".join(
    [
        "--packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1",
        "pyspark-shell",
    ]
)

## Note: 
`org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1` is a package that provides integration between Apache Spark’s Structured Streaming and Apache Kafka. This package allows you to read data from and write data to Kafka using Spark’s Structured Streaming API.

In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [None]:
# Create (or reuse) the session for the "Hashtag-Processing" application on the
# standalone master, limited to one core and 512 MB per executor, with the SQL
# session time zone set to Asia/Tehran.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Hashtag-Processing")
    .config("spark.executor.memory", "512mb")
    .config("spark.executor.cores", "1")
    .config("spark.cores.max", "1")
    .config("spark.sql.session.timeZone", "Asia/Tehran")
    .getOrCreate()
)

`.config("spark.cores.max", "1")`: This sets the maximum number of CPU cores to request for the application from across the cluster (not necessarily from a single machine).

In [None]:
# Layout used to parse each tweet's JSON payload: seven nullable string fields
# followed by a nullable array of hashtag strings.
schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in (
            "id",
            "content",
            "sendTime",
            "sendTimePersian",
            "senderName",
            "senderUsername",
            "type",
        )
    ]
    + [StructField("hashtags", ArrayType(StringType()), True)]
)

In [None]:
# Streaming DataFrame over the Kafka topic, built with Spark's Structured
# Streaming reader. "earliest" starts from the oldest offsets available in the
# topic rather than only from newly arriving records.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_SERVER)
    .option("subscribe", TOPIC_Step2_NAME)
    .option("startingOffsets", "earliest")
    .load()
)

In [None]:
# Print the column names and types of the streaming DataFrame returned by the
# Kafka source, before any casting or JSON parsing is applied.
df.printSchema()

`.option("subscribe", TOPIC_Step2_NAME)`: The DataFrame will read data from the Kafka topic whose name is stored in the variable `TOPIC_Step2_NAME` (here, "Sahamyab-Session_16_2").

In [None]:
# This line of code creates a new DataFrame by selecting and transforming columns from an 
# existing DataFrame using SQL expressions

In [None]:
# Cast the Kafka record key and value columns to strings.
tweetsStringDF = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

# Parse the JSON text in `value` against `schema`, then flatten the resulting
# struct so that every field becomes a top-level column.
parsed_payload = from_json(col("value"), schema).alias("data")
tweetsDF = tweetsStringDF.select(parsed_payload).select("data.*")

# Derive a proper timestamp from `sendTime` and split `sendTimePersian` into
# year / month / day parts by character position.
# Note: substr() start positions are 1-based in Spark; a start of 0 behaves
# like 1, so substr(0, 4) yields the first four characters.
tweetsDF = (
    tweetsDF
    .withColumn(
        "timestamp",
        unix_timestamp("sendTime", "yyyy-MM-dd'T'HH:mm:ssz").cast('timestamp'),
    )
    .withColumn("persianYear", tweetsDF['sendTimePersian'].substr(0, 4))
    .withColumn("persianMonth", tweetsDF['sendTimePersian'].substr(6, 2))
    .withColumn("persianDay", tweetsDF['sendTimePersian'].substr(9, 2))
)

`.select("data.*")`: This expands the struct-typed "data" column into separate top-level columns, one per field. The "*" is a wildcard that matches every field of the struct.

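`.withColumn("persianYear", tweetsDF['sendTimePersian'].substr(0, 4))`: This line adds a new column named "persianYear" to the DataFrame. The values for this column are computed by extracting the first 4 characters of the "sendTimePersian" column. Note that `substr` start positions are 1-based in Spark; a start position of 0 is treated the same as 1, which is why `substr(0, 4)` returns the first four characters, while `substr(6, 2)` and `substr(9, 2)` return the 6th–7th and 9th–10th characters.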

In [None]:
# Columns shown in the console output.
console_columns = [
    "id",
    "sendTime",
    "senderName",
    "persianYear",
    "persianMonth",
    "persianDay",
]

# Start a streaming query that appends each new batch of rows to the console.
query = (
    tweetsDF.select(*console_columns)
    .writeStream
    .format("console")
    .outputMode("append")
    .start()
)

# Block this cell until the streaming query terminates.
query.awaitTermination()

This code (above cell) uses the `writeStream` method of the DataFrame class in PySpark to write the contents of the tweetsDF DataFrame to the console as a stream. 

`.format("console")`: This specifies the format of the data sink. In this case, it is set to "console", which means that the data will be written to the console.