In [1]:
from pyspark.sql import SparkSession
import os
import pyspark.sql.functions as F
from pyspark.sql.functions import input_file_name, regexp_extract
from pyspark.ml import PipelineModel

In [2]:
spark = SparkSession.builder \
        .appName("LoadAndPredictApp") \
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
        .config("spark.hadoop.fs.s3a.threads.keepalivetime", "60000") \
        .config("spark.hadoop.fs.s3a.multipart.purge.age", "24") \
        .config("spark.hadoop.fs.s3a.connection.establish.timeout", "30000") \
        .config("spark.hadoop.fs.s3a.connection.timeout", "60000") \
        .config("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.EnvironmentVariableCredentialsProvider") \
        .getOrCreate()

25/09/09 09:18:06 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
hadoop_conf.clear()  # Clear all Hadoop configurations
hadoop_conf.set("fs.s3a.aws.credentials.provider", "com.amazonaws.auth.EnvironmentVariableCredentialsProvider")
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoop_conf.set("fs.s3a.connection.timeout", "60000")
hadoop_conf.set("fs.s3a.socket.timeout", "60000")
hadoop_conf.set("fs.s3a.connection.establish.timeout", "60000")
hadoop_conf.set("fs.s3a.threads.keepalivetime", "60")
hadoop_conf.set("fs.s3a.multipart.purge.age", "24")

In [8]:
from datetime import datetime
date_str = datetime.now().strftime("%Y-%m-%d")
model_path = f"s3a://af-weather-lake-01/models/date={date_str}/model/"
# date_str    
model = PipelineModel.load(f"s3a://af-weather-lake-01/models/date={date_str}/model/")

In [9]:
model

PipelineModel_0f2bc5945cee

In [67]:
current_time = datetime.now().strftime("%Y-%m-%d_%H%M")
current_time

'2025-09-09_1024'

In [40]:
# Simulate new data 

havana_entry = spark.read.json("s3a://af-weather-lake-01/data/weather/date=2025-09-09_1500/havana")
nassau_entry = spark.read.json("s3a://af-weather-lake-01/data/weather/date=2025-09-09_1500/nassau")
miami_entry = spark.read.json("s3a://af-weather-lake-01/data/weather/date=2025-09-09_1500/miami")

all_entries = havana_entry.union(nassau_entry).union(miami_entry)

In [41]:
all_entries.printSchema()

root
 |-- current: struct (nullable = true)
 |    |-- apparent_temperature: double (nullable = true)
 |    |-- cloud_cover: long (nullable = true)
 |    |-- interval: long (nullable = true)
 |    |-- is_day: long (nullable = true)
 |    |-- precipitation: double (nullable = true)
 |    |-- pressure_msl: double (nullable = true)
 |    |-- rain: double (nullable = true)
 |    |-- relative_humidity_2m: long (nullable = true)
 |    |-- showers: double (nullable = true)
 |    |-- surface_pressure: double (nullable = true)
 |    |-- temperature_2m: double (nullable = true)
 |    |-- time: string (nullable = true)
 |    |-- weather_code: long (nullable = true)
 |    |-- wind_direction_10m: long (nullable = true)
 |    |-- wind_gusts_10m: double (nullable = true)
 |    |-- wind_speed_10m: double (nullable = true)
 |-- current_units: struct (nullable = true)
 |    |-- apparent_temperature: string (nullable = true)
 |    |-- cloud_cover: string (nullable = true)
 |    |-- interval: string (nulla

In [42]:
all_df = all_entries.select("current.*", "latitude", "longitude") \
    .withColumn("file_path", input_file_name()) \
    .withColumn("city", regexp_extract("file_path", r"date=[^/]+/([^/]+)/[^/]+\.json$", 1))


In [43]:
all_df.show()

+--------------------+-----------+--------+------+-------------+------------+----+--------------------+-------+----------------+--------------+----------------+------------+------------------+--------------+--------------+--------+---------+--------------------+------+
|apparent_temperature|cloud_cover|interval|is_day|precipitation|pressure_msl|rain|relative_humidity_2m|showers|surface_pressure|temperature_2m|            time|weather_code|wind_direction_10m|wind_gusts_10m|wind_speed_10m|latitude|longitude|           file_path|  city|
+--------------------+-----------+--------+------+-------------+------------+----+--------------------+-------+----------------+--------------+----------------+------------+------------------+--------------+--------------+--------+---------+--------------------+------+
|                88.2|        100|     900|     0|          0.5|      1008.6| 0.0|                  87|    0.5|           951.8|          78.4|2025-09-09T16:00|          95|               28

In [44]:
all_predictions = model.transform(all_df)

In [46]:
all_predictions.select('features', 'city', 'city_index', 'prediction').show()

+--------------------+------+----------+----------+
|            features|  city|city_index|prediction|
+--------------------+------+----------+----------+
|[-0.4064548916150...|havana|       1.0|       1.0|
|[-0.1130529959689...|nassau|       2.0|       2.0|
|[1.01541583343893...| miami|       0.0|       0.0|
+--------------------+------+----------+----------+



In [47]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

accuracty_evaluator = MulticlassClassificationEvaluator(labelCol='city_index', predictionCol='prediction', metricName='accuracy')
accuracy = accuracty_evaluator.evaluate(all_predictions) * 100
print(f'Accuracy = {accuracy:.2f}%')

Accuracy = 100.00%


In [52]:
precision_evaluator = MulticlassClassificationEvaluator(labelCol='city_index', predictionCol='prediction', metricName='precisionByLabel')
precision = precision_evaluator.evaluate(all_predictions) * 100
print(f'Precision = {precision:.2f}%')

Precision = 100.00%


In [70]:
# Write a Report to S3
result = {
    "model_type": "PipelineModel",
    "model_location": f"s3a://af-weather-lake-01/models/date={date_str}/model/",
    "datetime": current_time,
    "accuracy": f'{accuracy:.2f}%',
    "precision": f'{precision:.2f}%',
    "city_indexes": {
        "miami": all_predictions.select("city_index").where(all_predictions.city == "miami").first()[0],
        "massau": all_predictions.select("city_index").where(all_predictions.city == "nassau").first()[0],
        "havana": all_predictions.select("city_index").where(all_predictions.city == "havana").first()[0],
    },
    "city_guesses": {
        "miami": all_predictions.select("prediction").where(all_predictions.city == "miami").first()[0],
        "nassau": all_predictions.select("prediction").where(all_predictions.city == "nassau").first()[0],
        "havana": all_predictions.select("prediction").where(all_predictions.city == "havana").first()[0],
    }
}
result


{'model_type': 'PipelineModel',
 'model_location': 's3a://af-weather-lake-01/models/date=2025-09-09/model/',
 'datetime': '2025-09-09_1024',
 'accuracy': '100.00%',
 'precision': '100.00%',
 'city_indexes': {'miami': 0.0, 'massau': 2.0, 'havana': 1.0},
 'city_guesses': {'miami': 0.0, 'nassau': 2.0, 'havana': 1.0}}

In [74]:
import json
json_string = json.dumps(result, indent=2)
print(json_string)

{
  "model_type": "PipelineModel",
  "model_location": "s3a://af-weather-lake-01/models/date=2025-09-09/model/",
  "datetime": "2025-09-09_1024",
  "accuracy": "100.00%",
  "precision": "100.00%",
  "city_indexes": {
    "miami": 0.0,
    "massau": 2.0,
    "havana": 1.0
  },
  "city_guesses": {
    "miami": 0.0,
    "nassau": 2.0,
    "havana": 1.0
  }
}


In [77]:
import tempfile
import subprocess

with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as temp_file:
                    temp_file.write(json_string)
                    temp_path = temp_file.name
try:
    key = f'reports/hourly/date={current_time}/{current_time}_report.json'
    aws_result = subprocess.run([
        'aws', 's3', 'cp', 
        temp_path, 
        f's3://af-weather-lake-01/{key}',
        '--content-type', 'application/json'
    ], capture_output=True, text=True, timeout=60)
    
    if aws_result.returncode == 0:
        print(f"Successfully uploaded to S3: {key}")
    else:
        print(f"AWS CLI failed: {aws_result.stderr}")
        raise Exception("S3 upload failed")
finally:
    # Clean up temp file
    os.unlink(temp_path)

Successfully uploaded to S3: reports/hourly/date=2025-09-09_1024/2025-09-09_1024_report.json
