In [1]:
import os
import sys
from pyspark.sql import SparkSession

# Point the executor to the same Python interpreter the driver is using
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [2]:
spark = SparkSession.builder \
    .appName("HDFS_CSV_Test") \
    .master("local[*]") \
    .getOrCreate()

In [3]:
original_csv_path = "hdfs://localhost:9000/weather_data/LAweather.csv"
streaming_dir = "hdfs://localhost:9000/streaming_weather/"

In [22]:
df = spark.read.csv(original_csv_path, header=True, inferSchema=True)
df.show(5)

+-----------+----------+--------+----------+---------+--------------------+----+---------------+----+---------------+------+--------------+-----+--------------+-----+----------------+----+---------------+-----+-----+----+--------------+----+--------------+----+---------------+-----+------+
|    STATION|      DATE|LATITUDE| LONGITUDE|ELEVATION|                NAME|TEMP|TEMP_ATTRIBUTES|DEWP|DEWP_ATTRIBUTES|   SLP|SLP_ATTRIBUTES|  STP|STP_ATTRIBUTES|VISIB|VISIB_ATTRIBUTES|WDSP|WDSP_ATTRIBUTES|MXSPD| GUST| MAX|MAX_ATTRIBUTES| MIN|MIN_ATTRIBUTES|PRCP|PRCP_ATTRIBUTES| SNDP|FRSHTT|
+-----------+----------+--------+----------+---------+--------------------+----+---------------+----+---------------+------+--------------+-----+--------------+-----+----------------+----+---------------+-----+-----+----+--------------+----+--------------+----+---------------+-----+------+
|74505753130|2025-01-01|34.25917|-118.41333|    305.7|LOS ANGELES WHITE...|62.6|           10.0|28.8|            9.0|9999.9|   

In [6]:
import pandas as pd
import time

In [7]:
pandas_df = df.toPandas()

In [34]:
batch_size = 10
total_rows = len(pandas_df)

In [35]:
rows_used = 0
batch_num = 1


In [36]:
while rows_used < total_rows:
    # Select the next batch of rows
    batch_df = pandas_df.iloc[rows_used:rows_used + batch_size]
    
    # Convert batch back to Spark DataFrame
    spark_batch_df = spark.createDataFrame(batch_df)
    
    # Write batch to HDFS
    batch_path = os.path.join(streaming_dir, f"batch_{batch_num}")
    spark_batch_df.write.mode("overwrite").csv(batch_path, header=True)
    
    print(f"Batch {batch_num} written to HDFS")
    
    # Increment counters
    rows_used += batch_size
    batch_num += 1
    
    # Pause to simulate real-time arrival
    time.sleep(2)  # 2 seconds between batches

print("All batches streamed successfully!")

Batch 1 written to HDFS
Batch 2 written to HDFS
Batch 3 written to HDFS
Batch 4 written to HDFS
Batch 5 written to HDFS
Batch 6 written to HDFS
Batch 7 written to HDFS
Batch 8 written to HDFS
Batch 9 written to HDFS
Batch 10 written to HDFS
Batch 11 written to HDFS
Batch 12 written to HDFS
Batch 13 written to HDFS
Batch 14 written to HDFS
Batch 15 written to HDFS
Batch 16 written to HDFS
Batch 17 written to HDFS
Batch 18 written to HDFS
Batch 19 written to HDFS
Batch 20 written to HDFS
Batch 21 written to HDFS
Batch 22 written to HDFS
Batch 23 written to HDFS
Batch 24 written to HDFS
All batches streamed successfully!


In [37]:
from pyspark.sql.functions import input_file_name
from pyspark.sql.functions import lit

df_stream = spark.readStream \
    .option("header", True) \
    .option("inferSchema", True) \
    .csv("hdfs://localhost:9000/streaming_weather")

# Add a column showing which file Spark read

# Add a constant column to verify streaming is alive
df_test = df_stream.withColumn("status", lit("🚀 Stream is working!"))

query = df_test.writeStream \
    .outputMode("append") \
    .format("console") \
    .option("truncate", False) \
    .start()


IllegalArgumentException: Schema must be specified when creating a streaming source DataFrame. If some files already exist in the directory, then depending on the file format you may be able to create a static DataFrame on that directory with 'spark.read.load(directory)' and infer schema from it.