In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StringType, TimestampType


In [4]:
# Initialize Spark Session
spark = SparkSession.builder.appName("NewApp").getOrCreate()

In [5]:
spark

In [6]:
# Define schema for Kafka messages
schema = StructType() \
    .add("client_host", StringType()) \
    .add("http_method", StringType()) \
    .add("url", StringType()) \
    .add("event_time", TimestampType())


In [7]:
path = '../extracted_data/jre_data_2025-01-22.csv'

In [9]:
df = spark.read.format("csv").option("header", True).option("inferSchema", True).load(path)

In [10]:
from pyspark.sql.functions import when
df = df.withColumn('duration_in_minutes', (df['Duration']/60)).drop('Category', 'Tags')\
    .withColumn('jre_episode', when(df['Title'] == 'Joe Rogan Experience%', True).otherwise(False))

In [11]:
df.show()

+--------------------+--------------------+--------+--------------------+-----------+-----------+--------------------+-------------------+-----------+
|               Title|        Publish Date|Duration|          Guest Name|View Counts|Like Counts|          Count Date|duration_in_minutes|jre_episode|
+--------------------+--------------------+--------+--------------------+-----------+-----------+--------------------+-------------------+-----------+
|Joe Rogan Experie...|2025-01-22T18:00:22Z| 12080.0|         Lex Fridman|      24019|       1417|2025-01-22T18:00:22Z| 201.33333333333334|      false|
|Joe Rogan Experie...|2025-01-17T18:00:46Z| 10061.0|     Thomas Campbell|    1578684|      31097|2025-01-17T18:00:46Z| 167.68333333333334|      false|
|Joe Rogan Experie...|2025-01-16T18:00:14Z| 10074.0|      Steven Rinella|    1162294|      19939|2025-01-16T18:00:14Z|              167.9|      false|
|Joe Rogan Experie...|2025-01-15T18:00:44Z|  9981.0|        Bryan Callen|    1217079|      227

In [36]:
# Read from Kafka
df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:9092") \
    .option("subscribe", "ui-event-log") \
    .option("startingOffsets", "earliest") \
    .load()


In [37]:
# Parse the value column and apply schema
parsed_df = df.selectExpr("CAST(value AS STRING)") \
    .select(from_json(col("value"), schema).alias("data")) \
    .select("data.*")

# Example: Transform data (simple transformation)
transformed_df = parsed_df.withColumn("processed_time", col("event_time"))


AnalysisException: Queries with streaming sources must be executed with writeStream.start();
kafka

In [1]:
from pyspark.sql import SparkSession

# Minimal Spark session with explicit JAR
# 'spark-sql-kafka-0-10_2.12-3.2.1.jar''
spark = SparkSession.builder \
    .appName("KafkaTest") \
    .config("spark.jars", "/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.2.1.jar") \
    .getOrCreate() 

# Simple Kafka read (batch, not streaming)
df = spark \
    .read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:9092") \
    .option("subscribe", "AIDHOOSTATION") \
    .load()

df.show()

spark.stop()

25/03/15 07:16:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/15 07:16:28 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Py4JJavaError: An error occurred while calling o38.showString.
: java.lang.NoClassDefFoundError: org/apache/spark/kafka010/KafkaConfigUpdater
	at org.apache.spark.sql.kafka010.KafkaSourceProvider$.kafkaParamsForDriver(KafkaSourceProvider.scala:643)
	at org.apache.spark.sql.kafka010.KafkaRelation.buildScan(KafkaRelation.scala:61)
	at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.apply(DataSourceStrategy.scala:339)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$1(QueryPlanner.scala:63)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93)
	at org.apache.spark.sql.execution.SparkStrategies.plan(SparkStrategies.scala:68)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$3(QueryPlanner.scala:78)
	at scala.collection.TraversableOnce$folder$1.apply(TraversableOnce.scala:196)
	at scala.collection.TraversableOnce$folder$1.apply(TraversableOnce.scala:194)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:199)
	at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:192)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1431)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$2(QueryPlanner.scala:75)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93)
	at org.apache.spark.sql.execution.SparkStrategies.plan(SparkStrategies.scala:68)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$3(QueryPlanner.scala:78)
	at scala.collection.TraversableOnce$folder$1.apply(TraversableOnce.scala:196)
	at scala.collection.TraversableOnce$folder$1.apply(TraversableOnce.scala:194)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:199)
	at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:192)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1431)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$2(QueryPlanner.scala:75)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93)
	at org.apache.spark.sql.execution.SparkStrategies.plan(SparkStrategies.scala:68)
	at org.apache.spark.sql.execution.QueryExecution$.createSparkPlan(QueryExecution.scala:468)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$sparkPlan$2(QueryExecution.scala:157)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:196)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:196)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$sparkPlan$1(QueryExecution.scala:157)
	at org.apache.spark.sql.execution.QueryExecution.withCteMap(QueryExecution.scala:73)
	at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:150)
	at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:150)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executedPlan$2(QueryExecution.scala:170)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:196)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:196)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executedPlan$1(QueryExecution.scala:170)
	at org.apache.spark.sql.execution.QueryExecution.withCteMap(QueryExecution.scala:73)
	at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:163)
	at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:163)
	at org.apache.spark.sql.execution.QueryExecution.simpleString(QueryExecution.scala:214)
	at org.apache.spark.sql.execution.QueryExecution.org$apache$spark$sql$execution$QueryExecution$$explainString(QueryExecution.scala:259)
	at org.apache.spark.sql.execution.QueryExecution.explainString(QueryExecution.scala:228)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:98)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3704)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2728)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2935)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:287)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:326)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.lang.ClassNotFoundException: org.apache.spark.kafka010.KafkaConfigUpdater
	at java.base/jdk.internal.loader.BuiltinClassLoader.loadClass(BuiltinClassLoader.java:581)
	at java.base/jdk.internal.loader.ClassLoaders$AppClassLoader.loadClass(ClassLoaders.java:178)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:522)
	... 81 more


In [2]:
import os
print(os.listdir('/opt/spark/jars/'))

['hive-shims-scheduler-2.3.9.jar', 'hk2-api-2.6.1.jar', 'jakarta.validation-api-2.0.2.jar', 'commons-logging-1.1.3.jar', 'audience-annotations-0.5.0.jar', 'logging-interceptor-3.12.12.jar', 'antlr4-runtime-4.8.jar', 'spark-hive_2.12-3.2.1.jar', 'jaxb-api-2.2.11.jar', 'scala-library-2.12.15.jar', 'jdom-1.1.jar', 'hadoop-cloud-storage-3.3.1.jar', 'spark-mesos_2.12-3.2.1.jar', 'azure-data-lake-store-sdk-2.3.9.jar', 'breeze-macros_2.12-1.2.jar', 'breeze_2.12-1.2.jar', 'aopalliance-repackaged-2.6.1.jar', 'commons-collections-3.2.2.jar', 'kubernetes-model-discovery-5.4.1.jar', 'spire-macros_2.12-0.17.0.jar', 'bonecp-0.8.0.RELEASE.jar', 'dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar', 'spark-unsafe_2.12-3.2.1.jar', 'commons-text-1.6.jar', 'spark-graphx_2.12-3.2.1.jar', 'arpack-2.2.1.jar', 'commons-crypto-1.1.0.jar', 'zjsonpatch-0.3.0.jar', 'pyrolite-4.30.jar', 'metrics-json-4.2.0.jar', 'zstd-jni-1.5.0-4.jar', 'jersey-container-servlet-2.34.jar', 'json4s-core_2.12-3.7.0-M11.jar', 'hk2-

In [3]:
df = spark.read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:9092") \
    .option("subscribe", "AIDHOOSTATION") \
    .option("startingOffsets", "earliest") \
    .load()

In [8]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("KafkaDebug") \
    .config("spark.jars", "/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.2.1.jar") \
    .config("spark.executor.extraJavaOptions", "-Dlog4j.configuration=file:/path/to/log4j-debug.properties") \
    .getOrCreate()

df = spark.read.format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:9092") \
    .option("subscribe", "AIDHOOSTATION") \
    .option("startingOffsets", "earliest") \
    .option("failOnDataLoss", "false") \
    .option("kafka.security.protocol", "PLAINTEXT").load()

# Check if DF is empty first
if df.rdd.isEmpty():
    print("No data found in Kafka topic!")
else:
    df.show(truncate=False)

spark.stop()

25/03/15 07:37:50 DEBUG GenerateUnsafeProjection: code for input[0, binary, true],input[1, binary, true],input[2, string, true],input[3, int, true],input[4, bigint, true],input[5, timestamp, true],input[6, int, true]:
/* 001 */ public java.lang.Object generate(Object[] references) {
/* 002 */   return new SpecificUnsafeProjection(references);
/* 003 */ }
/* 004 */
/* 005 */ class SpecificUnsafeProjection extends org.apache.spark.sql.catalyst.expressions.UnsafeProjection {
/* 006 */
/* 007 */   private Object[] references;
/* 008 */   private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[1];
/* 009 */
/* 010 */   public SpecificUnsafeProjection(Object[] references) {
/* 011 */     this.references = references;
/* 012 */     mutableStateArray_0[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(7, 96);
/* 013 */
/* 014 */   }
/* 015 */
/* 016 */   publ

Py4JJavaError: An error occurred while calling o85.javaToPython.
: java.lang.NoClassDefFoundError: org/apache/spark/kafka010/KafkaConfigUpdater
	at org.apache.spark.sql.kafka010.KafkaSourceProvider$.kafkaParamsForDriver(KafkaSourceProvider.scala:643)
	at org.apache.spark.sql.kafka010.KafkaRelation.buildScan(KafkaRelation.scala:61)
	at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.apply(DataSourceStrategy.scala:339)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$1(QueryPlanner.scala:63)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93)
	at org.apache.spark.sql.execution.SparkStrategies.plan(SparkStrategies.scala:68)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$3(QueryPlanner.scala:78)
	at scala.collection.TraversableOnce$folder$1.apply(TraversableOnce.scala:196)
	at scala.collection.TraversableOnce$folder$1.apply(TraversableOnce.scala:194)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:199)
	at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:192)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1431)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$2(QueryPlanner.scala:75)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93)
	at org.apache.spark.sql.execution.SparkStrategies.plan(SparkStrategies.scala:68)
	at org.apache.spark.sql.execution.QueryExecution$.createSparkPlan(QueryExecution.scala:468)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$sparkPlan$2(QueryExecution.scala:157)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:196)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:196)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$sparkPlan$1(QueryExecution.scala:157)
	at org.apache.spark.sql.execution.QueryExecution.withCteMap(QueryExecution.scala:73)
	at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:150)
	at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:150)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executedPlan$2(QueryExecution.scala:170)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:196)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:196)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executedPlan$1(QueryExecution.scala:170)
	at org.apache.spark.sql.execution.QueryExecution.withCteMap(QueryExecution.scala:73)
	at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:163)
	at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:163)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:185)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:184)
	at org.apache.spark.sql.Dataset.javaToPython(Dataset.scala:3529)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [9]:
spark.stop()

25/03/15 07:41:52 INFO SparkUI: Stopped Spark web UI at http://159.89.165.171:4041
25/03/15 07:41:52 INFO StandaloneSchedulerBackend: Shutting down all executors
25/03/15 07:41:52 INFO CoarseGrainedSchedulerBackend$DriverEndpoint: Asking each executor to shut down
25/03/15 07:41:52 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
25/03/15 07:41:52 INFO MemoryStore: MemoryStore cleared
25/03/15 07:41:52 INFO BlockManager: BlockManager stopped
25/03/15 07:41:52 INFO BlockManagerMaster: BlockManagerMaster stopped
25/03/15 07:41:52 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
25/03/15 07:41:52 INFO SparkContext: Successfully stopped SparkContext


In [10]:
spark = SparkSession.builder \
    .appName("KafkaTest") \
    .config("spark.jars", "/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.2.1.jar") \
    .config("spark.executor.extraClassPath", "/opt/spark/jars/*") \
    .config("spark.driver.extraClassPath", "/opt/spark/jars/*") \
    .getOrCreate()

25/03/15 07:42:22 INFO SparkContext: Running Spark version 3.2.1
25/03/15 07:42:22 INFO ResourceUtils: No custom resources configured for spark.driver.
25/03/15 07:42:22 INFO SparkContext: Submitted application: KafkaTest
25/03/15 07:42:22 INFO ResourceProfile: Default ResourceProfile created, executor resources: Map(cores -> name: cores, amount: 2, script: , vendor: , memory -> name: memory, amount: 4096, script: , vendor: , offHeap -> name: offHeap, amount: 0, script: , vendor: ), task resources: Map(cpus -> name: cpus, amount: 1.0)
25/03/15 07:42:22 INFO ResourceProfile: Limiting resource is cpus at 2 tasks per executor
25/03/15 07:42:22 INFO ResourceProfileManager: Added ResourceProfile id: 0
25/03/15 07:42:22 INFO SecurityManager: Changing view acls to: root
25/03/15 07:42:22 INFO SecurityManager: Changing modify acls to: root
25/03/15 07:42:22 INFO SecurityManager: Changing view acls groups to: 
25/03/15 07:42:22 INFO SecurityManager: Changing modify acls groups to: 
25/03/15 07:

SyntaxError: invalid syntax (3339603491.py, line 1)