In [0]:
# Set the Spark conf entry holding the account key for the "arulrajgopalshare"
# storage account (dfs endpoint), which the abfss:// paths below point at.
# NOTE(review): the key value is an empty string here - presumably redacted.
# Supply it from a secret store at run time rather than a literal in the notebook.
spark.conf.set(
    "fs.azure.account.key.arulrajgopalshare.dfs.core.windows.net",
    "",
)

# Import necessary modules

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType
from pyspark.sql.functions import to_timestamp, col, input_file_name,regexp_extract


# Data preparation

In [0]:
# Explicit schema for the per-city weather CSV files: an integer row index
# ("_c0"), the observation date kept as a string at read time, and then the
# measurement columns, all nullable doubles, in file order.
weather_schema = StructType(
    [
        StructField("_c0", IntegerType(), True),
        StructField("date", StringType(), True),
    ]
    + [
        StructField(measurement, DoubleType(), True)
        for measurement in (
            "temperature_2m",
            "relative_humidity_2m",
            "dew_point_2m",
            "apparent_temperature",
            "precipitation",
            "rain",
            "snowfall",
            "snow_depth",
            "pressure_msl",
            "surface_pressure",
            "cloud_cover",
            "cloud_cover_low",
            "cloud_cover_mid",
            "cloud_cover_high",
            "wind_speed_10m",
            "wind_speed_100m",
            "wind_direction_10m",
            "wind_direction_100m",
            "wind_gusts_10m",
        )
    ]
)

# Derived columns, applied to the raw rows in a single withColumns() call:
#   date      - the string date parsed into a timestamp
#   file_name - path of the file each row was read from
#   city      - file stem captured from ".../cities/<city>.csv"
#
# withColumns() adds every entry in one projection, so an entry should not
# refer to a column created by a sibling entry: col("file_name") here would
# only resolve on engines that support implicit lateral column aliases and
# raises AnalysisException elsewhere. "city" therefore reads the path from
# input_file_name() directly, which yields the same value as "file_name".
column_mapping = {
    "date": to_timestamp(col("date")),
    "file_name": input_file_name(),
    "city": regexp_extract(input_file_name(), r"/cities/([^/]+)\.csv$", 1),
}

# Storage location of the dataset: container and storage account names,
# combined into the abfss:// root URI (with trailing slash) that the read
# and write paths below are appended to.
container_name = "kaninipro"
storage_account_name = "arulrajgopalshare"
prefix_path = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/"

# Load the CSV files under cities/ using the explicit schema (header option
# enabled), rename the index column "_c0" to "id", then add/replace the
# columns defined in column_mapping.
weather_df = (
    spark.read.format("csv")
    .schema(weather_schema)
    .option("header", "true")
    .load(f"{prefix_path}kaggle_indian_cities_weather_2010_2024/cities/")
    .withColumnRenamed("_c0", "id")
    .withColumns(column_mapping)
)

# Save the prepared data as table "city_weather", with its files stored at an
# explicit path under test_path/; "overwrite" mode replaces existing contents.
(
    weather_df.write
    .mode("overwrite")
    .option("path", f"{prefix_path}test_path/city_weather")
    .saveAsTable("city_weather")
)


# When the filter column differs from the partition column

# When working with large datasets that need deduplication

# Replace GROUP BY + JOIN with window functions

In [0]:
%sql
-- Baseline ("average") query: total the sales per id from orders,
-- then join those totals to customers to pick up the name.
WITH order_totals AS (
    SELECT id, SUM(sales) AS total_sales
    FROM orders
    GROUP BY id
)
SELECT c.id, c.name, t.total_sales
FROM customers AS c
INNER JOIN order_totals AS t
    ON c.id = t.id

In [0]:
# Performant query: instead of aggregating and joining the totals back,
# compute the per-id sales total with a window and keep it on each row.
# NOTE(review): `orders` is not defined in this cell - presumably a DataFrame
# created elsewhere (e.g. from the orders table); confirm before Run All.
# NOTE(review): the SQL version takes `name` from customers; this select needs
# a `name` column on `orders` - confirm, otherwise a join is still required.
from pyspark.sql import functions as F
from pyspark.sql.window import Window

windowSpec = Window.partitionBy("id")

# Sum of sales over all rows sharing the same id.
total_sales_per_id = F.sum("sales").over(windowSpec)

# The window keeps one output row per input row, so distinct() collapses the
# repeated (id, name, total_sales) combinations.
df_with_sales = orders.select(
    "id",
    "name",
    total_sales_per_id.alias("total_sales"),
).distinct()