In [47]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean, to_date, sum, col, year, month, avg

# Weather report script: load the station readings, impute missing numeric
# values with their column means, then print the schema, the station with the
# highest total precipitation in the report year, the five hottest readings,
# the average temperature per calendar month, and a sample of the cleaned data.
spark = SparkSession.builder.appName("Weather").getOrCreate()

# Everything that uses the session runs inside try/finally so the session is
# stopped even when reading or any aggregation fails.
try:
    df = spark.read.csv(
        "/home/jovyan/work/code/PySpark_test/config/weather_data.csv",
        header=True,
        inferSchema=True,
    )

    df = df.withColumn("date", to_date(df["date"], "yyyy-MM-dd"))

    # Compute all three column means in one aggregation (a single pass over
    # the data) instead of launching one Spark job per column.
    means_row = df.agg(
        mean("temperature").alias("temperature"),
        mean("precipitation").alias("precipitation"),
        mean("wind_speed").alias("wind_speed"),
    ).first()
    mean_temperature = means_row["temperature"]
    mean_precipitation = means_row["precipitation"]
    mean_wind_speed = means_row["wind_speed"]

    # A mean is None when its column has no non-null values; such an entry
    # cannot be used as a fill value, so it is left out of the mapping.
    candidate_fills = {
        'temperature': mean_temperature,
        'precipitation': mean_precipitation,
        'wind_speed': mean_wind_speed,
    }
    fill_values = {
        column: value
        for column, value in candidate_fills.items()
        if value is not None
    }
    if fill_values:
        df = df.fillna(fill_values)

    # Five highest temperature readings with the date they were recorded.
    top5_days = df.orderBy(df["temperature"].desc()).limit(5).select("date", "temperature")

    # NOTE(review): the "last year" is hard-coded; update it (or derive it
    # from the data) when the input file covers a different period.
    report_year = 2023
    last_year_df = df.filter(year("date") == report_year)

    # Station with the largest total precipitation within the report year.
    # `sum` here is pyspark.sql.functions.sum (imported at the top of the
    # cell), not the Python builtin.
    top_precipitation = (
        last_year_df.groupBy("station_id")
        .agg(sum("precipitation").alias("sum(precipitation)"))
        .orderBy(col("sum(precipitation)").desc())
        .limit(1)
    )

    # Average temperature per calendar month, pooled across all years.
    month_avg = (
        df.withColumn("month", month("date"))
        .groupBy("month")
        .agg(avg("temperature"))
        .orderBy("month")
    )

    df.printSchema()

    top_precipitation.show()
    top5_days.show()
    month_avg.show()

    df.show()
finally:
    spark.stop()

root
 |-- station_id: string (nullable = true)
 |-- date: date (nullable = true)
 |-- temperature: double (nullable = false)
 |-- precipitation: double (nullable = false)
 |-- wind_speed: double (nullable = false)

+----------+------------------+
|station_id|sum(precipitation)|
+----------+------------------+
| station_5| 642.9302626767898|
+----------+------------------+

+----------+------------------+
|      date|       temperature|
+----------+------------------+
|2021-08-20|39.982828249354846|
|2023-12-02| 39.96797489293784|
|2022-03-28|  39.8246894248997|
|2019-02-11| 39.76737697836647|
|2020-06-10| 39.69147838355929|
+----------+------------------+

+-----+------------------+
|month|  avg(temperature)|
+-----+------------------+
|    1|11.356518462550754|
|    2| 9.067229891101926|
|    3| 7.244080205633994|
|    4|12.024529009744693|
|    5| 9.902883346912718|
|    6|13.421092297254138|
|    7|6.1857183016954576|
|    8|  10.9678002814186|
|    9| 9.596744236573942|
|   10|  9.