In [382]:
from pyspark.sql import SparkSession
import os
import pyspark.sql.functions as F
from pyspark.sql.functions import input_file_name, regexp_extract

In [383]:
spark = SparkSession.builder \
        .appName("S3ToAuroraPostgres") \
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
        .config("spark.hadoop.fs.s3a.threads.keepalivetime", "60000") \
        .config("spark.hadoop.fs.s3a.multipart.purge.age", "24") \
        .config("spark.hadoop.fs.s3a.connection.establish.timeout", "30000") \
        .config("spark.hadoop.fs.s3a.connection.timeout", "60000") \
        .config("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.EnvironmentVariableCredentialsProvider") \
        .getOrCreate()

In [384]:
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
iterator = hadoop_conf.iterator()
while iterator.hasNext():
    entry = iterator.next()
    if "fs.s3a" in entry.getKey():
        print(f"{entry.getKey()}: {entry.getValue()}")

fs.s3a.connection.establish.timeout: 60000
fs.s3a.impl: org.apache.hadoop.fs.s3a.S3AFileSystem
fs.s3a.socket.timeout: 60000
fs.s3a.threads.keepalivetime: 60
fs.s3a.connection.timeout: 60000
fs.s3a.multipart.purge.age: 24
fs.s3a.aws.credentials.provider: com.amazonaws.auth.EnvironmentVariableCredentialsProvider


In [385]:
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
hadoop_conf.clear()  # Clear all Hadoop configurations
hadoop_conf.set("fs.s3a.aws.credentials.provider", "com.amazonaws.auth.EnvironmentVariableCredentialsProvider")
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoop_conf.set("fs.s3a.connection.timeout", "60000")
hadoop_conf.set("fs.s3a.socket.timeout", "60000")
hadoop_conf.set("fs.s3a.connection.establish.timeout", "60000")
hadoop_conf.set("fs.s3a.threads.keepalivetime", "60")
hadoop_conf.set("fs.s3a.multipart.purge.age", "24")

In [386]:
df = spark.read.json("s3a://af-weather-lake-01/data/weather/")

                                                                                

In [387]:
df.select("current.*").show(5)

+--------------------+-----------+--------+------+-------------+------------+----+--------------------+-------+----------------+--------------+----------------+------------+------------------+--------------+--------------+
|apparent_temperature|cloud_cover|interval|is_day|precipitation|pressure_msl|rain|relative_humidity_2m|showers|surface_pressure|temperature_2m|            time|weather_code|wind_direction_10m|wind_gusts_10m|wind_speed_10m|
+--------------------+-----------+--------+------+-------------+------------+----+--------------------+-------+----------------+--------------+----------------+------------+------------------+--------------+--------------+
|                91.1|        100|     900|     1|          0.1|      1003.0| 0.0|                  83|    0.1|           946.8|          81.1|2025-09-02T05:00|          80|               100|          14.8|           4.0|
|                81.2|        100|     900|     0|          1.0|      1000.4| 0.0|                  94|    1

In [388]:
df.agg(
    F.count("*") \
).show()

[Stage 697:>                                                      (0 + 10) / 10]

+--------+
|count(1)|
+--------+
|     188|
+--------+



                                                                                

In [389]:
df.printSchema()

root
 |-- current: struct (nullable = true)
 |    |-- apparent_temperature: double (nullable = true)
 |    |-- cloud_cover: long (nullable = true)
 |    |-- interval: long (nullable = true)
 |    |-- is_day: long (nullable = true)
 |    |-- precipitation: double (nullable = true)
 |    |-- pressure_msl: double (nullable = true)
 |    |-- rain: double (nullable = true)
 |    |-- relative_humidity_2m: long (nullable = true)
 |    |-- showers: double (nullable = true)
 |    |-- surface_pressure: double (nullable = true)
 |    |-- temperature_2m: double (nullable = true)
 |    |-- time: string (nullable = true)
 |    |-- weather_code: long (nullable = true)
 |    |-- wind_direction_10m: long (nullable = true)
 |    |-- wind_gusts_10m: double (nullable = true)
 |    |-- wind_speed_10m: double (nullable = true)
 |-- current_units: struct (nullable = true)
 |    |-- apparent_temperature: string (nullable = true)
 |    |-- cloud_cover: string (nullable = true)
 |    |-- interval: string (nulla

In [390]:
work_df = df.select("current.*", "date", "latitude", "longitude") \
    .withColumn("file_path", input_file_name()) \
    .withColumn("city", regexp_extract("file_path", r"date=[^/]+/([^/]+)/[^/]+\.json$", 1))

In [391]:
work_df.printSchema()

root
 |-- apparent_temperature: double (nullable = true)
 |-- cloud_cover: long (nullable = true)
 |-- interval: long (nullable = true)
 |-- is_day: long (nullable = true)
 |-- precipitation: double (nullable = true)
 |-- pressure_msl: double (nullable = true)
 |-- rain: double (nullable = true)
 |-- relative_humidity_2m: long (nullable = true)
 |-- showers: double (nullable = true)
 |-- surface_pressure: double (nullable = true)
 |-- temperature_2m: double (nullable = true)
 |-- time: string (nullable = true)
 |-- weather_code: long (nullable = true)
 |-- wind_direction_10m: long (nullable = true)
 |-- wind_gusts_10m: double (nullable = true)
 |-- wind_speed_10m: double (nullable = true)
 |-- date: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- file_path: string (nullable = false)
 |-- city: string (nullable = false)



In [392]:
work_df.select("city").show(5)

+------+
|  city|
+------+
|havana|
|havana|
|havana|
|havana|
|havana|
+------+
only showing top 5 rows


In [393]:
work_df.groupBy("city") \
    .agg(
        F.count("*").alias("num_of_cities")
    ) \
    .show()

[Stage 701:>                                                      (0 + 10) / 10]

+------+-------------+
|  city|num_of_cities|
+------+-------------+
|havana|           62|
|nassau|           63|
| miami|           63|
+------+-------------+



                                                                                

In [394]:
work_df.groupBy("date") \
    .agg(
        F.count("*").alias("num_of_dates")
    ) \
    .show(3)

[Stage 704:>                                                      (0 + 10) / 10]

+---------------+------------+
|           date|num_of_dates|
+---------------+------------+
|2025-09-01_1400|           3|
|2025-09-01_2000|           3|
|2025-09-01_0700|           3|
+---------------+------------+
only showing top 3 rows


                                                                                

In [395]:
work_df.groupBy("date") \
    .agg(
        F.count("*").alias("num_of_dates")
    ) \
    .where("num_of_dates < 3") \
    .show()

[Stage 707:=====>                                                  (1 + 9) / 10]

+---------------+------------+
|           date|num_of_dates|
+---------------+------------+
|2025-09-03_0400|           2|
+---------------+------------+



                                                                                

In [396]:
clean_df = work_df.select("*")\
    .where(work_df.date != "2025-09-03_0400")

In [397]:
clean_df.groupBy("city") \
    .agg(
        F.count("*").alias("num_of_cities")
    ) \
    .show()

[Stage 710:>                                                      (0 + 10) / 10]

+------+-------------+
|  city|num_of_cities|
+------+-------------+
|havana|           62|
|nassau|           62|
| miami|           62|
+------+-------------+



                                                                                

In [398]:
clean_df.agg(F.count("*")).show()

[Stage 713:>                                                      (0 + 10) / 10]

+--------+
|count(1)|
+--------+
|     186|
+--------+



                                                                                

In [399]:
clean_df = clean_df.dropna()

In [400]:
clean_df.agg(F.count("*")).show()

[Stage 716:>                                                      (0 + 10) / 10]

+--------+
|count(1)|
+--------+
|     186|
+--------+



                                                                                

In [401]:
clean_df.printSchema()

root
 |-- apparent_temperature: double (nullable = true)
 |-- cloud_cover: long (nullable = true)
 |-- interval: long (nullable = true)
 |-- is_day: long (nullable = true)
 |-- precipitation: double (nullable = true)
 |-- pressure_msl: double (nullable = true)
 |-- rain: double (nullable = true)
 |-- relative_humidity_2m: long (nullable = true)
 |-- showers: double (nullable = true)
 |-- surface_pressure: double (nullable = true)
 |-- temperature_2m: double (nullable = true)
 |-- time: string (nullable = true)
 |-- weather_code: long (nullable = true)
 |-- wind_direction_10m: long (nullable = true)
 |-- wind_gusts_10m: double (nullable = true)
 |-- wind_speed_10m: double (nullable = true)
 |-- date: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- file_path: string (nullable = false)
 |-- city: string (nullable = false)



In [402]:
clean_df.select(clean_df.surface_pressure, clean_df.city).show(60)

[Stage 720:>                                                        (0 + 4) / 4]

+----------------+------+
|surface_pressure|  city|
+----------------+------+
|           946.8|havana|
|           943.7|havana|
|           944.4|havana|
|           946.4|havana|
|           946.2|havana|
|           945.1|havana|
|           946.4|havana|
|           944.6|havana|
|           946.1|havana|
|           947.1|havana|
|           946.6|havana|
|           944.0|havana|
|           943.1|havana|
|           945.5|havana|
|           944.9|havana|
|           960.5|nassau|
|           945.9|havana|
|           944.4|havana|
|           945.3|havana|
|           961.5|nassau|
|           943.7|havana|
|           947.1|havana|
|           945.6|havana|
|           943.9|havana|
|           959.0|nassau|
|           960.3|nassau|
|           960.0|nassau|
|           945.0|havana|
|           945.8|havana|
|           960.6|nassau|
|           945.4|havana|
|           946.1|havana|
|           961.6|nassau|
|           943.8|havana|
|           945.3|havana|
|           

                                                                                

In [403]:
train_data, test_data = clean_df.randomSplit([0.8, 0.2], seed=42)
print("Train size: ", train_data.count())
print("Test size: ", test_data.count())

                                                                                

Train size:  148


[Stage 724:>                                                      (0 + 10) / 10]

Test size:  38


                                                                                

In [404]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="city", outputCol="city_index")
indexer_model = indexer.fit(train_data)
train_data = indexer_model.transform(train_data)

                                                                                

In [405]:
# input_columns = ["apparent_temperature", "cloud_cover", "interval",
#                  "is_day", "precipitation", "pressure_msl",
#                  "rain", "relative_humidity_2m", "showers",
#                  "surface_pressure", "temperature_2m",
#                  "wind_direction_10m", "wind_gusts_10m", "wind_speed_10m"]
input_columns = ["apparent_temperature", "cloud_cover", "interval",
                 "precipitation", "pressure_msl",
                "rain", "relative_humidity_2m", "showers",
                 "temperature_2m",
                  "wind_direction_10m", "wind_gusts_10m", "wind_speed_10m"
                 ]
# input_columns = ["surface_pressure"]

In [406]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=input_columns, outputCol="unscaled_features")
train_data = assembler.transform(train_data)

In [407]:
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol='unscaled_features', outputCol='features', withMean=True, withStd=True)

scaler_model = scaler.fit(train_data)
train_data = scaler_model.transform(train_data)

                                                                                

In [408]:
train_data.show(5)

[Stage 736:>                                                        (0 + 1) / 1]

+--------------------+-----------+--------+------+-------------+------------+----+--------------------+-------+----------------+--------------+----------------+------------+------------------+--------------+--------------+---------------+--------+---------+--------------------+------+----------+--------------------+--------------------+
|apparent_temperature|cloud_cover|interval|is_day|precipitation|pressure_msl|rain|relative_humidity_2m|showers|surface_pressure|temperature_2m|            time|weather_code|wind_direction_10m|wind_gusts_10m|wind_speed_10m|           date|latitude|longitude|           file_path|  city|city_index|   unscaled_features|            features|
+--------------------+-----------+--------+------+-------------+------------+----+--------------------+-------+----------------+--------------+----------------+------------+------------------+--------------+--------------+---------------+--------+---------+--------------------+------+----------+--------------------+-----

                                                                                

In [409]:
from pyspark.ml.classification import DecisionTreeClassifier

dtc = DecisionTreeClassifier(featuresCol='features', labelCol='city_index')
model = dtc.fit(train_data)

                                                                                

In [410]:
test_data = indexer_model.transform(test_data)
test_data = assembler.transform(test_data)
test_data = scaler_model.transform(test_data)
test_data.show(1)

[Stage 751:>                                                        (0 + 1) / 1]

+--------------------+-----------+--------+------+-------------+------------+----+--------------------+-------+----------------+--------------+----------------+------------+------------------+--------------+--------------+---------------+--------+---------+--------------------+------+----------+--------------------+--------------------+
|apparent_temperature|cloud_cover|interval|is_day|precipitation|pressure_msl|rain|relative_humidity_2m|showers|surface_pressure|temperature_2m|            time|weather_code|wind_direction_10m|wind_gusts_10m|wind_speed_10m|           date|latitude|longitude|           file_path|  city|city_index|   unscaled_features|            features|
+--------------------+-----------+--------+------+-------------+------------+----+--------------------+-------+----------------+--------------+----------------+------------+------------------+--------------+--------------+---------------+--------+---------+--------------------+------+----------+--------------------+-----

                                                                                

In [411]:
predictions = model.transform(test_data)
predictions.show(3)

[Stage 752:>                                                        (0 + 1) / 1]

+--------------------+-----------+--------+------+-------------+------------+----+--------------------+-------+----------------+--------------+----------------+------------+------------------+--------------+--------------+---------------+--------+---------+--------------------+------+----------+--------------------+--------------------+--------------+-------------+----------+
|apparent_temperature|cloud_cover|interval|is_day|precipitation|pressure_msl|rain|relative_humidity_2m|showers|surface_pressure|temperature_2m|            time|weather_code|wind_direction_10m|wind_gusts_10m|wind_speed_10m|           date|latitude|longitude|           file_path|  city|city_index|   unscaled_features|            features| rawPrediction|  probability|prediction|
+--------------------+-----------+--------+------+-------------+------------+----+--------------------+-------+----------------+--------------+----------------+------------+------------------+--------------+--------------+---------------+----

                                                                                

In [412]:
predictions.select('features', 'city_index', 'prediction').show(10)

[Stage 754:>                                                        (0 + 3) / 3]

+--------------------+----------+----------+
|            features|city_index|prediction|
+--------------------+----------+----------+
|[-1.3836648325615...|       1.0|       1.0|
|[-0.8436657081673...|       1.0|       1.0|
|[-0.5245753164799...|       1.0|       1.0|
|[0.48178668807278...|       1.0|       1.0|
|[-1.7273006389941...|       1.0|       1.0|
|[-0.8191202934222...|       0.0|       0.0|
|[-0.8191202934222...|       1.0|       0.0|
|[-0.0091216068309...|       1.0|       1.0|
|[0.92360415348617...|       1.0|       1.0|
|[-1.0400290261288...|       1.0|       0.0|
+--------------------+----------+----------+
only showing top 10 rows


                                                                                

In [413]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

accuracty_evaluator = MulticlassClassificationEvaluator(labelCol='city_index', predictionCol='prediction', metricName='accuracy')
accuracy = accuracty_evaluator.evaluate(predictions) * 100
print(f'Accuracy = {accuracy:.2f}%')

[Stage 755:>                                                      (0 + 10) / 10]

Accuracy = 81.58%


                                                                                