In [1]:
# Verify Environment
# The imports double as the environment check: if either one fails, the cell
# raises before the success message below is printed.
import os
from pyspark.sql import SparkSession

# Reached only when both imports above succeeded.
print("✅ Python environment ready")

✅ Python environment ready


In [2]:
# Dependency Check
# JAR file names this notebook expects to find under /opt/spark/jars.
# The same list is reused later to build the session's "spark.jars" setting.
# NOTE(review): Spark's Kafka connector normally also needs
# spark-token-provider-kafka-0-10 at runtime -- confirm whether it should be
# listed here as well.
required_jars = [
    "spark-sql-kafka-0-10_2.12-3.5.0.jar",
    "kafka-clients-3.5.0.jar",
    "kafka_2.12-3.5.0.jar",
    "commons-pool2-2.11.1.jar"
]

print("Checking Kafka dependencies...")

# Keep only the names whose file is absent; the order of required_jars is preserved.
missing = list(
    filter(
        lambda jar_name: not os.path.exists(f"/opt/spark/jars/{jar_name}"),
        required_jars,
    )
)

# Report only; a missing JAR does not stop the notebook here.
if not missing:
    print("✅ All required JARs present")
else:
    print(f"❌ Missing JARs: {missing}")

Checking Kafka dependencies...
✅ All required JARs present


In [3]:
# Minimal Spark Session
# Full paths of the JARs checked earlier, joined below into the comma-separated
# form passed as "spark.jars".
kafka_jar_paths = [f"/opt/spark/jars/{jar_name}" for jar_name in required_jars]

# Build the session step by step; the settings are applied in the same order
# as a single chained expression would apply them.
session_builder = SparkSession.builder.appName("KafkaConnectivityTest")
session_builder = session_builder.config("spark.jars", ",".join(kafka_jar_paths))
session_builder = session_builder.config("spark.driver.extraClassPath", "/opt/spark/jars/*")
session_builder = session_builder.config("spark.executor.extraClassPath", "/opt/spark/jars/*")
session_builder = session_builder.config(
    "spark.sql.streaming.kafka.useDeprecatedOffsetFetching", "false"
)

# getOrCreate() hands back an already-running session if one exists.
# NOTE(review): in that case some of the settings above may not take effect --
# confirm on a fresh kernel.
spark = session_builder.getOrCreate()

print("SparkSession created with Kafka support")

SparkSession created with Kafka support


In [4]:
# Connection Test
def test_kafka_connection(bootstrap_servers="kafka-1:9092,kafka-2:9095", topic="test_topic"):
    """Check that Spark can actually read from Kafka, not just define a source.

    Builds a batch Kafka DataFrame on the notebook-level ``spark`` session and
    then forces a one-row read. Defining the DataFrame is lazy, so without the
    forced read an unreachable broker would still be reported as a success.

    Args:
        bootstrap_servers: Comma-separated ``host:port`` list of Kafka brokers.
        topic: Topic to subscribe to for the probe read.

    Returns:
        bool: True if the probe read completed; False if any exception was
        raised. The error is printed rather than re-raised.
    """
    try:
        test_df = (
            spark.read
            .format("kafka")
            .option("kafka.bootstrap.servers", bootstrap_servers)
            .option("subscribe", topic)
            .option("startingOffsets", "earliest")
            .option("failOnDataLoss", "false")
            .load()
        )

        # load() only describes the source; take(1) runs a real job, so a bad
        # broker list surfaces here as an exception. An empty result is fine:
        # the read itself completing is what counts as a successful probe.
        # NOTE(review): with unreachable brokers this presumably blocks until
        # the Kafka client timeouts expire -- confirm that delay is acceptable.
        test_df.take(1)

        print(f"✅ Successfully connected to Kafka brokers: {bootstrap_servers}")
        print("Schema of Kafka stream:")
        test_df.printSchema()
        return True
    except Exception as e:
        # Diagnostic helper: report the failure and signal it via the return
        # value instead of aborting the notebook.
        print(f"❌ Connection failed: {str(e)}")
        return False

# Run the check once with the default brokers and topic.
# The call is the cell's last expression, so its boolean result is displayed.
print("\nTesting connection to Kafka...")
test_kafka_connection()


Testing connection to Kafka...
✅ Successfully connected to Kafka brokers: kafka-1:9092,kafka-2:9095
Schema of Kafka stream:
root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



True