In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StringType, TimestampType


In [4]:
# Initialize Spark Session
spark = SparkSession.builder.appName("NewApp").getOrCreate()

In [5]:
spark

In [6]:
# Define schema for Kafka messages
schema = StructType() \
    .add("client_host", StringType()) \
    .add("http_method", StringType()) \
    .add("url", StringType()) \
    .add("event_time", TimestampType())


In [7]:
path = '../extracted_data/jre_data_2025-01-22.csv'

In [9]:
df = spark.read.format("csv").option("header", True).option("inferSchema", True).load(path)

In [10]:
from pyspark.sql.functions import when
df = df.withColumn('duration_in_minutes', (df['Duration']/60)).drop('Category', 'Tags')\
    .withColumn('jre_episode', when(df['Title'] == 'Joe Rogan Experience%', True).otherwise(False))

In [11]:
df.show()

+--------------------+--------------------+--------+--------------------+-----------+-----------+--------------------+-------------------+-----------+
|               Title|        Publish Date|Duration|          Guest Name|View Counts|Like Counts|          Count Date|duration_in_minutes|jre_episode|
+--------------------+--------------------+--------+--------------------+-----------+-----------+--------------------+-------------------+-----------+
|Joe Rogan Experie...|2025-01-22T18:00:22Z| 12080.0|         Lex Fridman|      24019|       1417|2025-01-22T18:00:22Z| 201.33333333333334|      false|
|Joe Rogan Experie...|2025-01-17T18:00:46Z| 10061.0|     Thomas Campbell|    1578684|      31097|2025-01-17T18:00:46Z| 167.68333333333334|      false|
|Joe Rogan Experie...|2025-01-16T18:00:14Z| 10074.0|      Steven Rinella|    1162294|      19939|2025-01-16T18:00:14Z|              167.9|      false|
|Joe Rogan Experie...|2025-01-15T18:00:44Z|  9981.0|        Bryan Callen|    1217079|      227

In [36]:
# Read from Kafka
df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:9092") \
    .option("subscribe", "ui-event-log") \
    .option("startingOffsets", "earliest") \
    .load()


In [37]:
# Parse the value column and apply schema
parsed_df = df.selectExpr("CAST(value AS STRING)") \
    .select(from_json(col("value"), schema).alias("data")) \
    .select("data.*")

# Example: Transform data (simple transformation)
transformed_df = parsed_df.withColumn("processed_time", col("event_time"))


AnalysisException: Queries with streaming sources must be executed with writeStream.start();
kafka

In [38]:
# Display transformed data to console
query = transformed_df.writeStream \
    .outputMode("append") \
    .format("console") \
    .start()a

# Await termination to keep the stream running
query.awaitTermination()


25/03/12 18:06:08 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-3eeecf2d-feec-44e9-b762-98f534a128b5. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/03/12 18:06:08 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/03/12 18:06:08 ERROR MicroBatchExecution: Query [id = 03bdb7fb-79cb-4d39-be29-2644864a82a5, runId = 22ad06cf-2073-4f59-9e95-e7cdb081067f] terminated with error
java.lang.NoClassDefFoundError: org/apache/spark/kafka010/KafkaConfigUpdater
	at org.apache.spark.sql.kafka010.KafkaSourceProvider$.kafkaParamsForDriver(KafkaSourceProvider.scala:643)
	at org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan.toMicroBatchStream(KafkaSourceProvider.scala:482)
	at org.ap

StreamingQueryException: org/apache/spark/kafka010/KafkaConfigUpdater
=== Streaming Query ===
Identifier: [id = 03bdb7fb-79cb-4d39-be29-2644864a82a5, runId = 22ad06cf-2073-4f59-9e95-e7cdb081067f]
Current Committed Offsets: {}
Current Available Offsets: {}

Current State: INITIALIZING
Thread State: RUNNABLE

In [49]:
spark.conf.get('spark.executor.cores')

'2'