In [1]:
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import Normalizer, StandardScaler
import random

In [2]:
# Kafka topic that the consumer subscribes to and reads messages from.
kafka_topic_name = "Topic4"

# Kafka broker address: a broker on the local machine, reachable on port 9092.
kafka_bootstrap_servers = "localhost:9092"

# Build (or reuse) a Spark session that runs locally on all available cores.
# The Kafka source connector is requested through spark.jars.packages.
# NOTE(review): the connector coordinates (Scala 2.12 / 3.5.1) presumably have
# to match the installed Spark build — confirm against the environment.
spark = (
    SparkSession.builder
    .appName("Structured Streaming ")
    .master("local[*]")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1",
    )
    .getOrCreate()
)

# Restrict Spark's console logging to the ERROR level.
spark.sparkContext.setLogLevel("ERROR")

In [3]:
# Open a streaming read on the Kafka topic, using the broker address and topic
# name configured earlier and starting from the earliest available offsets.
df_transaction = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
    .option("subscribe", kafka_topic_name)
    .option("startingOffsets", "earliest")
    .load()
)

In [9]:
# Print the schema of the streaming DataFrame read from the Kafka topic.
df_transaction.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [4]:
# Keep only the message payload (cast to a string) and the record timestamp.
df1_transaction = df_transaction.selectExpr("CAST(value AS STRING)", "timestamp")

# DDL-style schema describing the CSV fields carried in each message value.
df_schema_string = (
    "id INT,Account_No STRING,TRANSACTION_DETAILS STRING,Cheq_No STRING,"
    "WITHDRAWL_AMOUNT STRING,DEPOSIT_AMT STRING,BALANCE_AMT STRING"
)

# Parse each CSV payload into a single struct column named "transaction".
df2_transaction = df1_transaction.select(
    from_csv(col("value"), df_schema_string).alias("transaction"),
    "timestamp",
)

# Flatten the struct so that every CSV field becomes a top-level column.
df3_transaction = df2_transaction.select("transaction.*", "timestamp")

# Register the flattened stream as a temporary view and read it back via SQL.
df3_transaction.createOrReplaceTempView("transaction")
final_transaction = spark.sql("SELECT * FROM transaction")

# Start a streaming query that appends rows to an in-memory sink, triggered
# every 5 seconds; its query name is "Transaction_Table".
# NOTE(review): "truncate" looks like a console-sink display option and
# probably has no effect on the memory sink — confirm before relying on it.
Transaction_agg_write_stream = (
    final_transaction.writeStream
    .trigger(processingTime='5 seconds')
    .outputMode("append")
    .option("truncate", "false")
    .format("memory")
    .queryName("Transaction_Table")
    .start()
)

# Wait up to the given timeout (1, in seconds per the PySpark API) for the
# query to terminate; the call returns False while the query is still running.
# NOTE(review): a 1-second wait may not be enough for the first batch to land
# before a later cell queries the table — confirm on a fresh Run All.
Transaction_agg_write_stream.awaitTermination(1)

False

In [7]:
# Query everything currently in Transaction_Table (the query name given to the
# memory-sink stream) and print the first rows; show() truncates long values.
df = spark.sql("SELECT * FROM Transaction_Table")
df.show()

+---+-------------+--------------------+-------+----------------+-----------+-----------+--------------------+
| id|   Account_No| TRANSACTION_DETAILS|Cheq_No|WITHDRAWL_AMOUNT|DEPOSIT_AMT|BALANCE_AMT|           timestamp|
+---+-------------+--------------------+-------+----------------+-----------+-----------+--------------------+
|  0|409000611074'|TRF FROM  Indiafo...|   NULL|            0.00| 1000000.00| 1000000.00|2024-06-27 13:53:...|
|  1|409000611074'|TRF FROM  Indiafo...|   NULL|            0.00| 1000000.00| 2000000.00|2024-06-27 13:53:...|
|  2|409000611074'|FDRL/INTERNAL FUN...|   NULL|            0.00|  500000.00| 2500000.00|2024-06-27 13:53:...|
|  3|409000611074'|TRF FRM  Indiafor...|   NULL|            0.00| 3000000.00| 5500000.00|2024-06-27 13:53:...|
|  4|409000611074'|FDRL/INTERNAL FUN...|   NULL|            0.00|  500000.00| 6000000.00|2024-06-27 13:53:...|
|  5|409000611074'|FDRL/INTERNAL FUN...|   NULL|            0.00|  500000.00| 6500000.00|2024-06-27 13:53:...|
|

In [8]:
# Count the rows currently returned by `df` (the Transaction_Table query).
df.count()

50