#import necessary modules

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType
from pyspark.sql.functions import to_timestamp, col, input_file_name,regexp_extract, min, max, round, concat, lit,rand


#configuration

In [0]:
# Register the access key for the "kaninipro" storage account on the Spark
# session. `your_storage_account_access_key` is not defined in this section;
# it must be supplied before this cell runs.
spark.conf.set(
    "fs.azure.account.key.kaninipro.dfs.core.windows.net",
    your_storage_account_access_key,
)

#data preparation

In [0]:
# Explicit schema for the weather CSV files: an integer "_c0" column, the
# "date" column kept as a string, and nullable double-typed measurement
# columns in the order listed below.
_weather_measurement_columns = (
    "temperature_2m",
    "relative_humidity_2m",
    "dew_point_2m",
    "apparent_temperature",
    "precipitation",
    "rain",
    "snowfall",
    "snow_depth",
    "pressure_msl",
    "surface_pressure",
    "cloud_cover",
    "cloud_cover_low",
    "cloud_cover_mid",
    "cloud_cover_high",
    "wind_speed_10m",
    "wind_speed_100m",
    "wind_direction_10m",
    "wind_direction_100m",
    "wind_gusts_10m",
)

weather_schema = StructType(
    [
        StructField("_c0", IntegerType(), True),
        StructField("date", StringType(), True),
        *(
            StructField(measurement, DoubleType(), True)
            for measurement in _weather_measurement_columns
        ),
    ]
)

# Derived columns that are passed to a single `withColumns` call.
# Every expression reads only columns of the source DataFrame or
# `input_file_name()`: expressions given together to `withColumns` should not
# depend on a column created by that same call, so "city" is extracted
# straight from the file path instead of from the new "file_name" column.
column_mapping = {
    # Parse the string "date" column into a timestamp.
    "date": to_timestamp(col("date")),
    # Path of the file each row was read from.
    "file_name": input_file_name(),
    # City = the "<city>" part of a path ending in "/cities/<city>.csv".
    "city": regexp_extract(input_file_name(), r"/cities/([^/]+)\.csv$", 1),
}

# Base URI of the storage location, in the form
# abfss://<container>@<account>.dfs.core.windows.net
storage_account_name = "kaninipro"
container_name = "data"
prefix_path = "abfss://{}@{}.dfs.core.windows.net".format(
    container_name, storage_account_name
)

# Read every CSV under src_city_weather/cities/ with the explicit schema,
# rename "_c0" to "id", then apply the derived columns from `column_mapping`.
weather_source_path = f"{prefix_path}/src_city_weather/cities/"
weather_df = (
    spark.read.format("csv")
    .schema(weather_schema)
    .option("header", "true")
    .load(weather_source_path)
    .withColumnRenamed("_c0", "id")
    .withColumns(column_mapping)
)

# Schema of the city CSV: integer id, city name, and lat/lng as doubles
# (all nullable).
_city_fields = (
    ("city_id", IntegerType()),
    ("city", StringType()),
    ("lat", DoubleType()),
    ("lng", DoubleType()),
)
city_schema = StructType(
    [
        StructField(field_name, field_type, True)
        for field_name, field_type in _city_fields
    ]
)

# Load the city CSV and tag every row with a random "city_group_code".
# `round` and `rand` are the pyspark.sql.functions versions imported at the
# top of the notebook (that import shadows the builtin `round`). `rand()` is
# called without a seed, so the codes differ between runs; rounding
# `rand() * 200` gives whole-number values between 0 and 200.
# The chain is parenthesised so that no trailing backslash continuation can
# splice a following line into this statement.
city_df = (
    spark.read.format("csv")
    .schema(city_schema)
    .option("header", "true")
    .load(f"{prefix_path}/src_city_weather/Weather_Data_Scraping_and_Analysis/weather.csv")
    .withColumn("city_group_code", round(rand() * 200))
)

# Join weather rows to their city record on the shared "city" column, then
# repartition the result by "city_group_code".
weather_with_city_df = weather_df.join(city_df, on="city")
city_code_joined_df = weather_with_city_df.repartition("city_group_code")


# Save the joined data as the Parquet table "city_code_joined", partitioned
# by "city_group_code", with its files under <prefix_path>/city_code_joined.
# Overwrites any existing table contents.
joined_table_path = f"{prefix_path}/city_code_joined"
(
    city_code_joined_df.write
    .format("parquet")
    .mode("overwrite")
    .partitionBy("city_group_code")
    .option("path", joined_table_path)
    .saveAsTable("city_code_joined")
)



#When the filter column differs from the partition column

In [0]:
%sql
-- Find (id, city) pairs that occur more than once in the table.
SELECT id, city, COUNT(*)
FROM city_weather_parquet_city_prt
GROUP BY id, city
HAVING COUNT(*) > 1

In [0]:
%sql
-- Total number of rows in the table.
SELECT COUNT(*)
FROM city_weather_parquet_city_prt


In [0]:
%sql
-- List each distinct city value present in the table.
SELECT DISTINCT city
FROM city_weather_parquet_city_prt


In [0]:
%sql
-- Number of distinct city values (a NULL city, if present, counts as one).
SELECT COUNT(*)
FROM (
    SELECT DISTINCT city
    FROM city_weather_parquet_city_prt
)


In [0]:
%sql
-- Ratio of two literal counts; presumably total rows / distinct cities,
-- i.e. the average number of rows per city (TODO: confirm the figures).
SELECT 577169952 / 4657

In [0]:
%sql
-- Count the rows belonging to one city.
-- The value is single-quoted: double quotes are accepted as a string literal
-- only under some SQL settings and otherwise denote an identifier.
SELECT COUNT(*)
FROM city_weather_parquet_city_prt
WHERE city = 'Alampur_2'


#When working with large datasets that need deduplication

#Replace GROUP BY + JOIN with window functions

In [0]:
# %sql
# -- Baseline query: aggregate in a subquery, then join the result back
# SELECT a.id, a.name, b.total_sales
# FROM customers a
# JOIN (
#     SELECT id, SUM(sales) AS total_sales
#     FROM orders
#     GROUP BY id
# ) b
# ON a.id = b.id

In [0]:
# # Performant query
# from pyspark.sql import functions as F
# from pyspark.sql.window import Window

# windowSpec = Window.partitionBy("id")

# df_with_sales = (
#     orders
#     .withColumn("total_sales", F.sum("sales").over(windowSpec))
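#     .select("id", "name", "total_sales")  # assumes `orders` already has a `name` column; otherwise join `customers` first
#     .distinct()  # keeps one row per distinct (id, name, total_sales); omit only if repeated rows are wanted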
# )