In [11]:
import os
import sys
from pyspark.sql import SparkSession

# Point the executor to the same Python interpreter the driver is using
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [12]:
spark = SparkSession.builder \
    .appName("HDFS_CSV_Test") \
    .master("local[*]") \
    .getOrCreate()

In [13]:
original_csv_path = "hdfs://localhost:9000/weather_data//weatherbigdata.csv"
streaming_dir = "hdfs://localhost:9000/streaming_weather/"

In [14]:
from pyspark.sql.functions import col
from pyspark.sql.types import *
df = spark.read.csv(original_csv_path, header=True, inferSchema=True)
 # Step 2.1: Drop rows with null temperature values
clean_df = df.na.drop(subset=["TMAX", "TMIN"])

# Step 2.2: Transform temperature to actual degrees Celsius and add a 'range' column
transformed_df = clean_df.withColumn("TMAX_actual", col("TMAX") / 10.0) \
                   .withColumn("TMIN_actual", col("TMIN") / 10.0) \
                   .withColumn("TRANGE", col("TMAX_actual") - col("TMIN_actual"))

# Step 2.3: Add a data quality filter to ensure TMAX is greater than or equal to TMIN
validated_df = transformed_df.filter(col("TMAX_actual") >= col("TMIN_actual"))
validated_df.show()

+-----------+----------+----+----+----+----+--------+---------+---------+-----------+-----------+------------------+
|         ID|      DATE|TMAX|TMIN|EVAP|PRCP|Latitude|Longitude|Elevation|TMAX_actual|TMIN_actual|            TRANGE|
+-----------+----------+----+----+----+----+--------+---------+---------+-----------+-----------+------------------+
|USR0000AGOP|  3/7/2011| -28|-200|NULL|NULL| 64.2381|-145.2669|    463.3|       -2.8|      -20.0|              17.2|
|USC00201675| 9/26/2012| 200| 100|NULL|   0| 41.9622| -84.9925|    299.9|       20.0|       10.0|              10.0|
|USC00389039| 12/3/2000|  33| -11|NULL|   0|    33.9| -80.5206|     76.2|        3.3|       -1.1|               4.4|
|USC00230657| 5/10/2007| 267| 139|NULL|   3| 37.0539| -93.5756|    399.3|       26.7|       13.9|12.799999999999999|
|USC00206510|  8/2/2018| 267| 156|NULL|  28| 45.3614| -84.9511|    228.0|       26.7|       15.6|              11.1|
|USC00236045| 4/16/2008| 150|   6|NULL|   0| 36.5869| -89.5325| 

In [15]:
import pandas as pd
import time
pandas_df = validated_df.sample(fraction=0.01).toPandas()  # only 1% of data


In [16]:
pandas_df.head()

Unnamed: 0,ID,DATE,TMAX,TMIN,EVAP,PRCP,Latitude,Longitude,Elevation,TMAX_actual,TMIN_actual,TRANGE
0,USC00104670,6/12/1993,200,28,,0.0,42.7325,-114.5192,1140.0,20.0,2.8,17.2
1,USC00466284,3/31/2012,217,44,,48.0,38.1878,-80.8483,595.3,21.7,4.4,17.3
2,USC00249187,4/2/2011,56,-6,,130.0,48.9514,-115.6267,940.3,5.6,-0.6,6.2
3,USR0000ALOS,3/17/2021,-111,-239,,,66.0425,-147.9714,213.4,-11.1,-23.9,12.8
4,USR0000CDOW,12/13/1997,33,-150,,,39.6269,-106.4517,2742.6,3.3,-15.0,18.3


In [18]:
from pyspark.sql.functions import col
from pyspark.sql.types import *
batch_size = 10
total_rows = len(pandas_df)
rows_used = 0
batch_num = 1
while rows_used < total_rows:
    # Select the next batch of rows
    batch_df = pandas_df.iloc[rows_used:rows_used + batch_size]
    
    # Convert batch back to Spark DataFrame
    spark_batch_df = spark.createDataFrame(batch_df)
   
    # Write batch to HDFS
    batch_path = os.path.join(streaming_dir, f"batch_{batch_num}")
    spark_batch_df.write.mode("append").csv(batch_path, header=True)
   
    print(f"Batch {batch_num} written to HDFS")
    spark_batch_df.show()
    # Increment counters
    rows_used += batch_size
    batch_num += 1
    
    # Pause to simulate real-time arrival
    time.sleep(2)  # 2 seconds between batches

print("All batches streamed successfully!")

Batch 1 written to HDFS
+-----------+----------+----+----+----+-----+--------+---------+---------+-----------+-----------+------------------+
|         ID|      DATE|TMAX|TMIN|EVAP| PRCP|Latitude|Longitude|Elevation|TMAX_actual|TMIN_actual|            TRANGE|
+-----------+----------+----+----+----+-----+--------+---------+---------+-----------+-----------+------------------+
|USC00104670| 6/12/1993| 200|  28| NaN|  0.0| 42.7325|-114.5192|   1140.0|       20.0|        2.8|              17.2|
|USC00466284| 3/31/2012| 217|  44| NaN| 48.0| 38.1878| -80.8483|    595.3|       21.7|        4.4|17.299999999999997|
|USC00249187|  4/2/2011|  56|  -6| NaN|130.0| 48.9514|-115.6267|    940.3|        5.6|       -0.6| 6.199999999999999|
|USR0000ALOS| 3/17/2021|-111|-239| NaN|  NaN| 66.0425|-147.9714|    213.4|      -11.1|      -23.9|12.799999999999999|
|USR0000CDOW|12/13/1997|  33|-150| NaN|  NaN| 39.6269|-106.4517|   2742.6|        3.3|      -15.0|              18.3|
|USR0000MBIG| 3/15/1997|  22|-58

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "C:\Users\Admin\miniconda3\envs\bigdata\lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "C:\Users\Admin\miniconda3\envs\bigdata\lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "C:\Users\Admin\miniconda3\envs\bigdata\lib\socket.py", line 716, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 