In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, explode, arrays_zip
from pyspark.sql.types import StructType, StructField, ArrayType, DoubleType, LongType, StringType

In [2]:
# Initialize Spark Session
spark = SparkSession.builder.appName("StockPriceStream").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")  # Reduce logs

25/04/09 10:52:34 WARN Utils: Your hostname, ayushkhaire resolves to a loopback address: 127.0.1.1; using 192.168.105.89 instead (on interface enp1s0)
25/04/09 10:52:34 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/09 10:52:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Define schema for incoming JSON data
schema = StructType([
    StructField("unix_timestamp", ArrayType(LongType()), True),
    StructField("stockname", ArrayType(StringType()), True),
    StructField("open", ArrayType(DoubleType()), True),
    StructField("close", ArrayType(DoubleType()), True),
    StructField("high", ArrayType(DoubleType()), True),
    StructField("low", ArrayType(DoubleType()), True),
    StructField("volume", ArrayType(DoubleType()), True)
])

In [4]:
# Read stream from socket
raw_stream = spark.readStream.format("socket") \
    .option("host", "localhost") \
    .option("port", 3456) \
    .load()

In [5]:
# Parse JSON
parsed_stream = raw_stream.select(from_json(col("value"), schema).alias("data")).select("data.*")

In [6]:
# Zip all arrays together
zipped_stream = parsed_stream.withColumn("zipped", arrays_zip(
    col("unix_timestamp"),
    col("stockname"),
    col("open"),
    col("close"),
    col("high"),
    col("low"),
    col("volume")
))

In [7]:
# Explode into a structured DataFrame
flattened_stream = zipped_stream.select(
    explode(col("zipped")).alias("row")
).select(
    col("row.unix_timestamp").alias("unix_timestamp"),
    col("row.stockname").alias("stockname"),
    col("row.open").alias("open"),
    col("row.close").alias("close"),
    col("row.high").alias("high"),
    col("row.low").alias("low"),
    col("row.volume").alias("volume")
)

In [8]:
update_query = flattened_stream.writeStream \
    .outputMode("append") \
    .format("memory") \
    .queryName("stock_current_data") \
    .start()

append_query = flattened_stream.writeStream \
    .outputMode("append") \
    .format("csv") \
    .option("path", "/home/ayushkhaire/code/dataennginneerinng/stocksly/stream-processing/output") \
    .option("checkpointLocation", "/home/ayushkhaire/code/dataennginneerinng/stocksly/stream-processing/checkpoints") \
    .start()

In [9]:
# Wait for termination
update_query.awaitTermination()
append_query.awaitTermination()

ERROR:root:KeyboardInterrupt while sending command.                             
Traceback (most recent call last):
  File "/home/ayushkhaire/code/dataennginneerinng/stocksly/stream-processing/streamenv/lib/python3.12/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ayushkhaire/code/dataennginneerinng/stocksly/stream-processing/streamenv/lib/python3.12/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/socket.py", line 707, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 