In [1]:
#pip install python-geohash

In [2]:
import json
from pymongo import MongoClient
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType
import geohash
import os

os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.3.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0 pyspark-shell'

#change ip address
host_ip = "10.192.33.112"
#172.16.33.120


spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('Streaming Application')
    .getOrCreate()
)

#append stream from multiple topic together
topic = "Producer1|Producer2|Producer3"
#topic_hotspot = 'A2_hotspot_topic'
# topic_AQUA = 'Scenaries02'
# topic_TERRA  = 'Scenaries03'

In [3]:
kafka_sdf = (
    spark.readStream
    .format('kafka')
    .option('kafka.bootstrap.servers', f'{host_ip}:9092')
    .option('subscribePattern', topic) #listen to multiple topics
    .load()
)
kafka_sdf.printSchema()

#select the value column in dataframe
climatehotspot_sdf = kafka_sdf.select('value')

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [4]:
from datetime import datetime, timedelta

def climate_geohash(lat, lon):
    return geohash.encode(lat, lon, precision=3)

def hotspot_geohash(lat, lon):
    return geohash.encode(lat, lon, precision=5)

def insertOne(result):
    mongo_client = MongoClient(
        host=f'{host_ip}',
        port=27017
    )
    db = mongo_client['fit3182_assignment_db']
    collection = db.a2_partB
    collection.insert_one(result)
    #print("INSERTED !")
    

def func(batch_df,batch_id):
    #retrieve all data from dataframe  (for kafka) into a python list for 
    entries = batch_df.collect()
    #convert each entry into python dictionary
    data = [entry.asDict() for entry in entries]
    
    
    if data is not None:
        #for each entry/row in one batch of 10 seconds
        climate_report = None
        climate_geohash3 = None
        hotspot_hash = {}
        #store the hotspot in precision 3
        climate_hash = {}
        
        #each dict
        for row in data: #select value column from kafka dict
            byte_array = row['value']
            # Decode the bytearray into a string
            json_str = byte_array.decode('utf-8')

            # Parse the JSON string into a dictionary
            data_dict = json.loads(json_str)
            
            #climate report
            if data_dict['producer'] == 'Producer_1':
                #take only the first climate report
                if climate_report is None:
                    climate_report = data_dict
                    
                    
                    climate_geohash3 = climate_geohash(climate_report['latitude'], climate_report['longitude'])
                    #initialise a blank array in set to append hotspot later
                    climate_hash[climate_geohash3] = []
                    climate_report['hotspot'] = []

                    
            #hotspot report
            else:
                hotspot_dict = data_dict
                #print("hotspot_dict: ", hotspot_dict)    #climate geohash as key only ke
                
                #hash the geohash among the hotspots
                hotspot_geohash5 = hotspot_geohash(hotspot_dict['latitude'], hotspot_dict['longitude'])
                
                #maintain hotspot for not the same geohash string in precision 5
                if hotspot_geohash5 not in hotspot_hash.keys():
                    #accumulate the hotspot dictionary until cliamte report come
                    hotspot_hash[hotspot_geohash5] = [hotspot_dict]
                    
                #add to the array with the same geohash key 
                else:
                    hotspot_hash[hotspot_geohash5].append(hotspot_dict)
                    
                
        #if climate report exist for each batch
        if climate_report is not None:
                #for each geohash5 hotspots do average of temp within 10 minutes
            for key, value in hotspot_hash.items():
                last_hp = value[-1]
                #datetime string      #do again strptime to turn into datetime object
                latest_min = datetime.strptime(last_hp['created_time'], '%H:%M:%S')
                avg_temp = 0
                avg_conf = 0
                len_array = len(value)
                #print(len_array)

                #for each key average the surface temperature and confidence of the array in value
                #by reducing array 
                for hp in value:
                    # within 10 minutes from latest hotspot in each batch
                    time_differnce = latest_min - datetime.strptime(hp['created_time'], '%H:%M:%S')
                    if  time_differnce < timedelta(minutes = 10):
                        #print(time_differnce)
                        avg_temp += hp['surface_temperature_celcius']
                        avg_conf += hp['confidence']

                avg_temp = avg_temp/len_array
                avg_conf = avg_conf/len_array
                last_hp['surface_temperature_celcius'] = avg_temp
                last_hp['avg_conf'] = avg_conf
                #set it to only hotspot for this key
                hotspot_hash[key] = last_hp


                # Check if the first 3 characters of the key match the climate_geohash3
                if str(key[:3]) == climate_geohash3:
                    # If the key exists in climate_hash, append to it, otherwise create a new list
                    if climate_geohash3 in climate_hash:
                        climate_hash[climate_geohash3].append(hotspot_hash[key])
                    else:
                        climate_hash[climate_geohash3] = [hotspot_hash[key]]
                    # Append the hotspots to the climate report
                    #print(climate_hash[climate_geohash3])
                    
                    
            climate_report['hotspot'] = climate_hash[climate_geohash3]
                    #print("hotspots: " + str(climate_report['hotspot']))
            #print(len_array)
            #CAUSE OF fire event
            if climate_report['air_temperature_celcius'] > 20 and climate_report['GHI_w/m2'] > 180:
                climate_report['fire_cause'] = 'natural'
            else:
                climate_report['fire_cause'] = 'other'
        
            print(climate_report)
            insertOne(climate_report)        

                
db_writer = (
    climatehotspot_sdf.writeStream
    .outputMode('append')
    .trigger(processingTime ='10 seconds') # one day 
    #apply a function to each aach of data with the internal of processing time
    .foreachBatch(func) #batch and batch id and input
)



In [5]:
try:
    query = db_writer.start()
    query.awaitTermination()
except KeyboardInterrupt:
    print('Interrupted by CTRL-C. Stopping query.')
finally:
    query.stop()


{'latitude': -36.7685, 'longitude': 142.7134, 'air_temperature_celcius': 14, 'relative_humidity': 48.2, 'windspeed_knots': 12.5, 'max_wind_speed': 19.0, 'precipitation': ' 0.03G', 'GHI_w/m2': 122, 'producer': 'Producer_1', 'date': '2024-01-02', 'hotspot': [], 'fire_cause': 'other'}
{'latitude': -37.591, 'longitude': 149.33, 'air_temperature_celcius': 16, 'relative_humidity': 46.7, 'windspeed_knots': 10.0, 'max_wind_speed': 16.9, 'precipitation': ' 0.00I', 'GHI_w/m2': 141, 'producer': 'Producer_1', 'date': '2024-01-03', 'hotspot': [], 'fire_cause': 'other'}


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/conda/lib/python3.8/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


Interrupted by CTRL-C. Stopping query.


In [46]:
import geohash
from pyspark.sql.functions import from_json, col, avg
from pyspark.sql.types import StructType, StructField, DoubleType, StringType


# Define the schema for the Climate and Hotspot data payload
# for from_json to know how to parse the structure

#Schema to inform how parse the JSON object
climate_schema = StructType([   #specify the data typem True to contain null values
    StructField('latitude', StringType(), True),
    StructField('longitude', DoubleType(), True),
    StructField('air_temperature_celcius', IntegerType(), True),
    StructField('relative_humidity', DoubleType(), True),
    StructField('windspeed_knots', DoubleType(), True),
    StructField('max_wind_speed', DoubleType(), True),
    StructField('precipitation', StringType(), True),
    StructField('GHI_w/m2', IntegerType(), True),
    StructField('producer', StringType(), True),
    StructField('date', StringType(), True),
    StructField('created_time', IntegerType(), True)
])

hotspot_schema = StructType([
    StructField("latitude", DoubleType(), True),
    StructField("longitude", DoubleType(), True),
    StructField("confidence", DoubleType(), True),
    StructField('surface_temperature_celcius', DoubleType(), True),
    StructField("producer", StringType(), True),
    StructField("random_time", IntegerType(), True)

])

In [47]:
from pyspark.sql.functions import to_timestamp
# Define the geohash UDF for climate
def climate_geohash(lat, lon):
    return geohash.encode(lat, lon, precision=3)

# # Register UDFs in Spark
climate_geohash_udf = udf(climate_geohash, StringType())
spark.udf.register("climate_geohash", climate_geohash_udf)


# climate_sdf = (
#     spark.readStream
#     .format('kafka')
#     .option('kafka.bootstrap.servers', f'{host_ip}:9092')
#     .option('subscribe', topic)
#     .load() #value store the actual dataframe
#     .select('value')
#      #do processing after .load() to get into the dataframe and deal with row
# #     .withColumn('geohash', climate_geohash_udf(col('latitude'), col('longtitude')))
# #     .limit(1)  # Retains only the first row
    
# )

# Deserialize the JSON payload from the 'value' column
climate_sdf = (
    spark.readStream
    .format('kafka')
    .option('kafka.bootstrap.servers', f'{host_ip}:9092')
    .option('subscribe', topic)
    .load()  #load the JSON object to be in Dataframe, use climate_sdf.printSchema()  to see it
    #deserialize the binary ‘value’ column into a format that allows you to access the ‘latitude’ and ‘longitude’ fields
    .select(from_json(col("value").cast("string"), climate_schema).alias("data"))
    #from_json: parse a JSON string and convert into a DataDrame of complex type StructType or MapType
    #col("value").cast("string"): akes the ‘value’ column, which is in binary format, and 
    #casts it to a string type. This is necessary because from_json expects a JSON string as input.
    #climate_schema: schema that defined which from_json will use to parse the JSON string. 
    #should match the structure of the JSON data you’re working with.
    #.alias("data"): This renames the resulting column from the from_json function to ‘data’.
    .select("data.*")  
    #After JSON string parsed into a structured format
    #select statement used to select all fields from the 'data' column since renamed
    #.select("data.*") select 
    .withColumn('geohash', climate_geohash_udf(col('latitude'), col('longitude')))
    .withColumn('created_time', to_timestamp(col('created_time')))
    .limit(1)  # Retains only the first row
)

#print the schema
climate_sdf.printSchema() 


root
 |-- latitude: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- air_temperature_celcius: integer (nullable = true)
 |-- relative_humidity: double (nullable = true)
 |-- windspeed_knots: double (nullable = true)
 |-- max_wind_speed: double (nullable = true)
 |-- precipitation: string (nullable = true)
 |-- GHI_w/m2: integer (nullable = true)
 |-- producer: string (nullable = true)
 |-- date: string (nullable = true)
 |-- created_time: timestamp (nullable = true)
 |-- geohash: string (nullable = true)



In [92]:
#all producers send to the same topic to make Kafka to join the stream
hotspot_sdf = (
    spark.readStream
    .format('kafka')
    .option('kafka.bootstrap.servers', f'{host_ip}:9092')
    .option('subscribe', topic_hotspot)
    .load()
)

hotspot_sdf = (
    spark.readStream
    .format('kafka')
    .option('kafka.bootstrap.servers', f'{host_ip}:9092')
    .option('subscribe', topic)
    .load()  
    .select(from_json(col("value").cast("string"), hotspot_schema).alias("data"))
    .select("data.*")
)

hotspot_sdf.printSchema() 

root
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- confidence: double (nullable = true)
 |-- surface_temperature_celcius: double (nullable = true)
 |-- producer: string (nullable = true)
 |-- random_time: integer (nullable = true)



In [105]:
from pyspark.sql.functions import first
# Define the geohash UDF for hotspot
def hotspot_geohash(lat, lon):
    return geohash.encode(lat, lon, precision=5)

# Register the UDF for pyspark to group
hotspot_geohash_udf = udf(hotspot_geohash, StringType())
spark.udf.register("hotspot_geohash", hotspot_geohash_udf)



# climate_stream = climate_stream.withColumn('geohash', climate_geohash_udf(col('latitude'), col('longtitude')))
hotspots_merge_sdf = (
    hotspot_sdf
    .withColumn('geohash', hotspot_geohash_udf(col('latitude'), col('longitude')))
    #drop the rows with the same geohash at this time window
    .dropDuplicates(['geohash'])
    #laod data from source in dataframe format
)

# Update the result after dropDuplicates to update geohash to new value
hotspots_merge_sdf = (
    hotspot_sdf
    .withColumn('geohash', hotspot_geohash_udf(col('latitude'), col('longitude')))
    .dropDuplicates(['geohash'])
    # Load data from source in DataFrame format
    # Assuming load() is a method that loads the DataFrame, replace it with the actual method if different
    # after dropDuplicates base on geohash, replace with lower precision geohash value
    # for climat
    .withColumn('geohash', climate_geohash_udf(col('latitude'), col('longitude')))
    
)

# Alias the DataFrames before joining
climate_sdf_alias = climate_sdf.alias("climate")
hotspots_merge_sdf_alias = hotspots_merge_sdf.alias("hotspot")

# Perform the join using the aliased DataFrames
joined_df = climate_sdf_alias.join(hotspots_merge_sdf_alias, ["geohash"], how='left_outer')

# Select columns using the alias  #select all columns 
joined_df = joined_df.select("climate.*", "hotspot.*")

# Apply a watermark to the joined DataFrame using the correct timestamp column
# Replace 'created_time' with the actual timestamp column name from your data
joined_df = joined_df.withWatermark("climate.created_time", "10 seconds")

# Perform aggregation
averaged_df = joined_df.groupBy('climate.geohash').agg(
    avg(col('hotspot.surface_temperature_celcius')).alias('surface_temperature_celcius'),
    avg(col('hotspot.confidence')).alias('confidence'),
    first(col('climate.air_temperature_celcius')).alias('air_temperature_celcius'),
    first(col('climate.relative_humidity')).alias('relative_humidity'),
    first(col('climate.windspeed_knots')).alias('windspeed_knots'),
    first(col('climate.max_wind_speed')).alias('max_wind_speed'),
    first(col('climate.precipitation')).alias('precipitation'),
    first(col('climate.GHI_w/m2')).alias('GHI_w/m2'),
    first(col('hotspot.producer')).alias('producer'),
    first(col('climate.date')).alias('date'),
    first(col('climate.latitude')).alias('latitude'),
    first(col('climate.longitude')).alias('longitude'),
    first(col('climate.created_time')).alias('created_time')
)

averaged_df.printSchema()
# ## Each stream is a DStream of (key, value) pairs where 'key' could be a location 
# # Only include rows that have matching geohashes in both streams
# # inner join since it is important that it has fire

# #must join since from two different streams
# joined_df = climate_sdf.join(hotspots_merge_sdf, on=['geohash'], how='left_outer') 

# # Select the disambiguated 'created_time' column from one of the DataFrames
# # Assuming 'created_time' comes from the 'climate' DataFrame
# joined_df = joined_df.select("climate_sdf.*", "hotspots_merge_sdf.geohash")
# # Apply a watermark to the joined DataFrame
# joined_df = joined_df.withWatermark("created_time", "10 seconds")

# # Perform aggregation
# averaged_df = joined_df.groupBy('geohash').agg(avg(col('surface_temperature_celcius')),avg(col('confidence')))






root
 |-- geohash: string (nullable = true)
 |-- surface_temperature_celcius: double (nullable = true)
 |-- confidence: double (nullable = true)
 |-- air_temperature_celcius: integer (nullable = true)
 |-- relative_humidity: double (nullable = true)
 |-- windspeed_knots: double (nullable = true)
 |-- max_wind_speed: double (nullable = true)
 |-- precipitation: string (nullable = true)
 |-- GHI_w/m2: integer (nullable = true)
 |-- producer: string (nullable = true)
 |-- date: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- created_time: timestamp (nullable = true)



In [114]:
# for debugging, print on console
class DbWriter:
    
    # called at the start of processing each partition in each output micro-batch
    def open(self, partition_id, epoch_id):
        self.mongo_client = MongoClient(
            host=f'{host_ip}',
            port=27017
        )
        #use the same database, name: fit3182_db
        self.db = self.mongo_client['fit3182_db']
        return True
    
    #what ever row it receive at the time just process
    def process(self, row):
                #road of data from JSON string to 
        data = json.loads(row.value)
        
        record = {}
        #climate
        record['air_temperature_celcius'] = data.get('air_temperature_celcius')
        record['relative_humidity'] = data.get('relative_humidity')
        record['windspeed_knots'] = data.get('windspeed_knots')
        record['max_wind_speed'] = data.get('max_wind_speed')
        record['precipitation'] = data.get('precipitation')
        record['GHI_w/m2'] = data.get('GHI_w/m2')
        record['date'] = data.get('date')
        record['latitude'] = data.get('latitude')
        record['longtitude'] = data.get('longtitude')
        
        #hotspots
        record['confidence'] = data.get('confidence')
        record['surface_temperature_celcius'] = data.get('surface_temperature_celcius')
        record['producer'] = data.get('producer')
        print(record)
        
    def close(self, err):
        return True

In [115]:
writer = (
    # Initializes a streaming write for the climate_sdf DataFrame.
    averaged_df.writeStream.format("console")
    # Output will be written to the standard console/output.
    .option("checkpointLocation", "./hotspot_sdf_checkpoints")
    # Specifies the location for checkpointing, which allows streaming 
    # queries to be resilient to failures by storing the state.
    .outputMode('append')  # Only new rows will be written to the output sink since the last trigger.
    .trigger(processingTime = '10 seconds')
    .foreach(DbWriter())  # Applies a foreach writer with DbWriter_climate instance.
    # This indicates that for each row in the output, the DbWriter_climate class’s process method will be called.
)




In [116]:
try:
    query = writer.start()
    query.awaitTermination()
except KeyboardInterrupt:
    print('Interrupted by CTRL-C. Stopping query.')
finally:
    query.stop()




NameError: name 'query' is not defined

In [11]:
class DbWriter_climate:
    """
    Check whether it works in pyspark log
    """
    # called at the start of processing each partition in each output micro-batch
    def open(self, partition_id, epoch_id):
        self.mongo_client = MongoClient(
            host=f'{host_ip}',
            port=27017
        )
        #use the same database, name: fit3182_db
        self.db = self.mongo_client['fit3182_db']
        return True
    
    
    # called once per row of the result dataframe
    # the current code DOES NOT handle duplicate processing
    #   e.g., query fails and restarts just before current micro-batch was fully inserted
    def process(self, row):
        #passing JSON string from row.value
        #into a dictionary data
        data = json.loads(row.value)
        
        db_record = {}
        db_record['air_temperature_celcius'] = data.get('air_temperature_celcius')
        db_record['relative_humidity'] = data.get('relative_humidity')
        db_record['windspeed_knots'] = data.get('windspeed_knots')
        db_record['max_wind_speed'] = data.get('max_wind_speed')
        db_record['precipitation'] = data.get('precipitation')
        db_record['GHI_w/m2'] = data.get('GHI_w/m2')
        db_record['producer'] = data.get('producer')
        db_record['date'] = data.get('date')
        #print(db_record)
        #print heren not working
                  
        #update and insert
        # New database has nothing so just insert
        # later something with the same station then update
        self.db['A2'].replace_one({'station': data.get('kerbsideid')}, db_record, upsert=True)
    
    # called once all rows have been processed (possibly with error)
    def close(self, err):
        self.mongo_client.close()

In [13]:
writer = (
    #initializes a streaming write for the parking_sdf DataFrame.
    climate_sdf.writeStream.format("console")
    #output will be written to the standard console/output.
    .option("checkpointLocation", "./climate_sdf_checkpoints")
    #Specifies the location for checkpointing, which allows streaming 
    #queries to be resilient to failures by storing the state.
    #.outputMode('append'): Sets the output mode to ‘append’, meaning 
    #only new rows will be written to the output sink since the last trigger.
    .outputMode('append').foreach(DbWriter_climate())
    # Applies a foreach writer, which in this case is an instance of DbWriter(). 
    #This indicates that for each row in the output, the DbWriter() class’s process 
    #method will be called
)

In [62]:
try:
    query = writer.start()
    query.awaitTermination()
except KeyboardInterrupt:
    print('Interrupted by CTRL-C. Stopping query.')
finally:
    query.stop()

NameError: name 'query' is not defined

In [80]:
# from pyspark.sql.functions import from_json, col
# climate_stream = (
#     spark.readStream
#     .format('kafka')
#     .option('kafka.bootstrap.servers', f'{host_ip}:9092')
#     .option('subscribe', topic)
#     #load into dataframe for raw data from source(Kafka)
#     .load()    #parse JSON string into structured format (Dataframe) based on schema
#     .select(from_json(col('value').cast('string'), climate_schema).alias('data'))
#     .select('data.*')         
# )           #cast bytes in value column into string
#             #alias rename value column into data
#             #flatten by expandign fields of the data column into separate columns
#             #select data column since renamed
            
# hotspot_AQUA_stream = (
#     spark.readStream \
#     .format("kafka")
#     .option("kafka.bootstrap.servers", "localhost:9092") 
#     .option("subscribe", topic_AQUA)
#     .load()
#     .select(from_json(col("value").cast("string"), hotspot_schema).alias("data"))
#     .select("data.*")
# )

# hotspot_TERRA_stream = (
#     spark.readStream \
#     .format("kafka")
#     .option("kafka.bootstrap.servers", "localhost:9092") 
#     .option("subscribe", topic_TERRA)
#     .load()
#     .select(from_json(col("value").cast("string"), hotspot_schema).alias("data"))
#     .select("data.*")
# )

In [81]:
# import geohash
# from pyspark.sql.functions import col, udf
# from pyspark.sql.types import StringType


# # Define the geohash UDF for hotspot
# def hotspot_geohash(lat, lon):
#     return geohash.encode(lat, lon, precision=5)

# # Register the UDF
# hotspot_geohash_udf = udf(hotspot_geohash, StringType())
# spark.udf.register("hotspot_geohash", hotspot_geohash_udf)


# # Define the geohash UDF for climate
# def climate_geohash(lat, lon):
#     return geohash.encode(lat, lon, precision=3)

# # # Register UDFs in Spark
# climate_geohash_udf = udf(climate_geohash, StringType())
# spark.udf.register("climate_geohash", climate_geohash_udf)



# # Apply the geohash UDF to the DataFrame columns
# climate_stream = climate_stream.withColumn('geohash', climate_geohash_udf(col('latitude'), col('longtitude')))
# hotspot_AQUA_stream = hotspot_AQUA_stream.withColumn('geohash', hotspot_geohash_udf(col('latitude'), col('longtitude')))
# hotspot_TERRA_stream = hotspot_TERRA_stream.withColumn('geohash', hotspot_geohash_udf(col('latitude'), col('longtitude')))


# # Join the streams and drop duplicates
# joined_hotspot_stream = hotspot_AQUA_stream.join(hotspot_TERRA_stream, 'geohash').dropDuplicates(['geohash'])
# final_stream = climate_stream.join(joined_hotspot_stream, 'geohash')


# # # Assuming you have a joined DataFrame called 'joined_df1_df2'
# # for row in joined_hotspot_stream.collect():
# #     geohash = row["geohash"]
# #     temperature = row["temperature"]
# #     hotspot_count = row["hotspot_count"]
# #     # Store relevant information in the dictionary
# #     result_dict[geohash] = {"temperature": temperature, "hotspot_count": hotspot_count}
# #     print(row)
# # # Now 'result_dict' contains the processed data
# # Print the schema of the DataFrame
# climate_stream.printSchema()
# hotspot_AQUA_stream.printSchema()
# hotspot_TERRA_stream.printSchema()

# # Show the contents of the DataFrame
# climate_stream.show()
# hotspot_AQUA_stream.show()
# hotspot_TERRA_stream.show()

# # After joining the streams
# joined_hotspot_stream.printSchema()
# final_stream.printSchema()

# # Show the contents after joining
# joined_hotspot_stream.show()
# final_stream.show()


root
 |-- latitude: string (nullable = true)
 |-- longtitude: double (nullable = true)
 |-- air_temperature_celcius: integer (nullable = true)
 |-- relative_humidity: double (nullable = true)
 |-- windspeed_knots: double (nullable = true)
 |-- max_wind_speed: double (nullable = true)
 |-- precipitation: string (nullable = true)
 |-- GHI_w/m2: integer (nullable = true)
 |-- producer: string (nullable = true)
 |-- date: string (nullable = true)
 |-- geohash: string (nullable = true)

root
 |-- latitude: double (nullable = true)
 |-- longtitude: double (nullable = true)
 |-- confidence: double (nullable = true)
 |-- surface_temperature: double (nullable = true)
 |-- created_time: string (nullable = true)
 |-- created_time: integer (nullable = true)
 |-- geohash: string (nullable = true)

root
 |-- latitude: double (nullable = true)
 |-- longtitude: double (nullable = true)
 |-- confidence: double (nullable = true)
 |-- surface_temperature: double (nullable = true)
 |-- created_time: strin

AnalysisException: Queries with streaming sources must be executed with writeStream.start();
kafka