In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
import os

# Get the active Spark session, or create one, registered under the
# application name "StaticDataFrame".
spark = (
    SparkSession.builder
    .appName("StaticDataFrame")
    .getOrCreate()
)

In [2]:
# Schema applied to the JSON weather records read from user_weather/.
# Every field is declared nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ('is_day', LongType()),
        ('temperature', DoubleType()),
        ('time', StringType()),
        ('weathercode', LongType()),
        ('winddirection', DoubleType()),
        ('windspeed', DoubleType()),
    )
])

# Streaming DataFrame over the JSON files in the user_weather folder.
# NOTE(review): maxFilesPerTrigger=1 limits each trigger to a single file;
# confirm this keeps up with the rate at which files are produced.
input_df = (
    spark.readStream
    .format("json")
    .schema(schema)
    .option("maxFilesPerTrigger", 1)
    .load("user_weather/")
)



In [3]:
# Average temperature over the whole stream: keep only the temperature
# column, then aggregate with an empty grouping so a single row is produced.
temperature_only = input_df.select(col("temperature"))
average_temperature = avg("temperature").alias("avg_temperature")
temp_df = temperature_only.groupBy().agg(average_temperature)

In [4]:
# Configure the sink: "complete" output mode written to the in-memory table
# "avg_temperature", with a processing-time trigger of 30 minutes.
memory_writer = (
    temp_df.writeStream
    .outputMode("complete")
    .format("memory")
    .queryName("avg_temperature")
    .trigger(processingTime="30 minutes")
)

# Start the streaming query and keep its handle for status checks.
query = memory_writer.start()

query.status

{'message': 'Processing new data',
 'isDataAvailable': True,
 'isTriggerActive': True}

In [5]:
# Modify the code below to create a program that makes weather API calls every 30 minutes
# and stores the data in user_weather/[user]_[current_timestamp]_weather.json files. Move it to a separate Python file and start the program.

import time
import requests
import json
import random
import datetime

# Twenty synthetic users ("user0" .. "user19"), each paired with a random
# latitude in [-90, 90] and a random longitude in [-180, 180].
user_locations = [
    ("user" + str(user_index), random.uniform(-90, 90), random.uniform(-180, 180))
    for user_index in range(20)
]


def get_current_weather(url, timeout=10):
    """Fetch a weather API response and return its "current_weather" object.

    Args:
        url: Full request URL of the weather API endpoint.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        The value stored under the "current_weather" key of the JSON response.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status (raised by ``raise_for_status``).
        KeyError: If the response JSON has no "current_weather" key.
    """
    request_response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error payload.
    request_response.raise_for_status()
    weather_request_response_json = request_response.json()
    return weather_request_response_json["current_weather"]

def write_current_weather_to_file(file_name, current_weather):
    """Append one weather record to ``file_name`` as a single JSON line.

    Args:
        file_name: Path of the target file. Missing parent directories are
            created so the first write into a fresh folder does not fail.
        current_weather: JSON-serialisable weather record.

    The file is opened in append mode, so each record is terminated with a
    newline. Repeated writes to the same file then yield one JSON document per
    line instead of back-to-back objects that cannot be parsed.
    """
    import os  # local import keeps this function self-contained

    parent_dir = os.path.dirname(file_name)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_name, "a", encoding="utf-8") as f:
        f.write(json.dumps(current_weather) + "\n")
# Poll the weather API for every user, store each result as a JSON file in
# user_weather/, print the current streaming average, then wait 30 minutes.
while True:
    # One timestamp per polling round, shared by all files of that round.
    current_time = datetime.datetime.now()
    round_timestamp = current_time.strftime('%Y%m%d_%H%M%S')
    for user_name, latitude, longitude in user_locations:
        weather_api_url = f"https://api.open-meteo.com/v1/forecast?latitude={latitude}&longitude={longitude}&current_weather=true&hourly=temperature_2m,relativehumidity_2m,windspeed_10m"
        try:
            current_weather = get_current_weather(weather_api_url)
        except (requests.RequestException, KeyError, ValueError) as error:
            # A failure for one location must not stop collection for the others.
            print(f"Skipping {user_name}: could not fetch current weather ({error})")
            continue
        file_name = f"user_weather/{user_name}_{round_timestamp}_weather.json"
        write_current_weather_to_file(file_name, current_weather)
    # show() prints the table and returns None, so there is nothing to keep.
    spark.sql("SELECT * FROM avg_temperature").show() # Select data from the in-memory table every 30 minutes.
    time.sleep(1800) # wait for 30 minutes

+---------------+
|avg_temperature|
+---------------+
|           12.4|
+---------------+



KeyboardInterrupt: 