In [0]:
%pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, LongType

# Build the SparkSession for this job, or reuse one that is already active.
spark = (
    SparkSession.builder
    .appName("KafkaToHiveORC")
    # Select Spark's native ORC implementation.
    .config("spark.sql.orc.impl", "native")
    # Hive support is switched on for this session.
    .enableHiveSupport()
    .getOrCreate()
)

# Kafka source settings used by the streaming reader.
topic = "source_topic-001"           # topic to subscribe to
kafka_server = "kafka-broker:29092"  # bootstrap broker address

# Schema of the JSON payload carried in each Kafka message value.
# Every field is read as a plain string; the order here is the column order
# of the parsed DataFrame.
schema = StructType([
    StructField(field_name, StringType())
    for field_name in (
        "Name",
        "Price",
        "24H_CHANGE",
        "24H_VOLUME",
        "Market_Cap",
        "Datetime",
    )
])

# Streaming DataFrame of parsed records: one column per field in `schema`.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_server)
    .option("subscribe", topic)
    # Do not fail the query when Kafka reports that data may have been lost.
    .option("failOnDataLoss", "false")
    .load()
    # Kafka delivers the payload as binary; decode it to text first.
    .selectExpr("CAST(value AS STRING)")
    # Parse the JSON text into a single struct column named "data" ...
    .select(from_json("value", schema).alias("data"))
    # ... then flatten that struct into top-level columns.
    .select("data.*")
)

   
# Continuously append the parsed records as Parquet files under the given
# path, then block until the streaming query terminates.
#
# NOTE(review): the app name ("KafkaToHiveORC") and the spark.sql.orc.impl
# setting suggest ORC output, but the sink format here is parquet. The format
# is left unchanged -- confirm which one is intended.
#
# failOnDataLoss is a Kafka *source* option (already set on the reader); it is
# not an option of the file sink, so it is not repeated on the writer.
try:
    query = (
        df.writeStream
        .outputMode("append")
        .format("parquet")
        .option("path", "/user/hive/warehouse/crypto_data1")
        .option("checkpointLocation", "/tmp/myNew_checkpoint")
        .start()
    )
    query.awaitTermination()
finally:
    # Always release the session, including when the query fails to start,
    # terminates with an error, or the wait is interrupted.
    spark.stop()
