#setting up 

In [0]:
# Register the storage-account access key on the Spark session so that
# abfss:// paths on the `kaninipro` account can be read.
# NOTE(review): "your_access_key" is a placeholder — do not commit a real key
# in the notebook; load it from a secret store instead.
account_key = "your_access_key"
spark.conf.set(
    "fs.azure.account.key.kaninipro.dfs.core.windows.net",
    account_key,
)

#Read file with metadata

In [0]:
# Location of the parquet dataset on the storage account.
weather_source = "abfss://data@kaninipro.dfs.core.windows.net/city_weather_parquet"

# Load the data and project the fields of the `_metadata` struct as ordinary
# columns next to all of the data columns ("*").
city_weather = (
    spark.read.format("parquet")
    .load(weather_source)
    .selectExpr(
        "*",
        "_metadata.file_path as file_path",
        "_metadata.file_name as file_name",
        "_metadata.file_modification_time as file_modification_time",
        "_metadata.file_size as file_size",
        "_metadata.file_block_length as file_block_length",
        "_metadata.file_block_start as file_block_start",
    )
)


In [0]:
# Render the data columns together with the projected file-metadata columns.
display(city_weather)

#eqNullSafe for safe equality checks

In [0]:
from pyspark.sql.functions import expr, col

# One row per NULL combination: both set, both NULL, left NULL, right NULL.
null_pairs = [(1, 1), (None, None), (None, 5), (10, None)]
df = spark.createDataFrame(null_pairs, ["col1", "col2"])

# Null-safe equality: the comparison yields true/false even when one or both
# operands are NULL, instead of yielding NULL.
null_safe_equal = col("col1").eqNullSafe(col("col2"))
col_added_df = df.withColumn("is_equal", null_safe_equal)

display(col_added_df)

In [0]:
# Each side carries one row whose join key (`id`) is NULL.
df1 = spark.createDataFrame(
    [(1, "A"), (None, "B"), (3, "C")],
    ["id", "value1"],
)
df2 = spark.createDataFrame(
    [(1, "X"), (None, "Y"), (4, "Z")],
    ["id", "value2"],
)

# Join on null-safe equality so the NULL-keyed rows can be compared with each
# other rather than being discarded by a plain `==` condition.
null_safe_key = df1["id"].eqNullSafe(df2["id"])
result = df1.join(df2, on=null_safe_key, how="inner")

display(result)


#sort array

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType
from pyspark.sql.functions import sort_array, col

# Element type of the array column: a struct of (field1: int, field2: string).
array_element = StructType([
    StructField("field1", IntegerType(), True),
    StructField("field2", StringType(), True),
])

schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("array_field", ArrayType(array_element), True),
])

# The arrays are deliberately unsorted and contain ties on field1
# (70/70 for id 1, 5/5 for id 3) with different field2 values.
data = [
    (
        1,
        [
            {"field1": 10, "field2": "def"},
            {"field1": 70, "field2": "ghi"},
            {"field1": 70, "field2": "abc"},
        ],
    ),
    (
        2,
        [
            {"field1": 10, "field2": "ijk"},
            {"field1": 20, "field2": "sdg"},
        ],
    ),
    (
        3,
        [
            {"field1": 100, "field2": "abc"},
            {"field1": 5, "field2": "ood"},
            {"field1": 5, "field2": "afe"},
        ],
    ),
]

df = spark.createDataFrame(data, schema=schema)

In [0]:
# Show the arrays in their original (unsorted) order for comparison.
display(df)

In [0]:
# sort_array sorts each array in ascending order by default; the struct
# elements are ordered by field1 first and by field2 on ties.
sorted_arrays_df = df.select("id", sort_array(col("array_field")))
display(sorted_arrays_df)

#Apply inline transform on arrays

##pyspark

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType
from pyspark.sql.functions import col,expr

# left_array and right_array share the same element type: struct(id, value).
id_value_struct = StructType([
    StructField("id", IntegerType(), True),
    StructField("value", StringType(), True),
])

schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("left_array", ArrayType(id_value_struct), True),
    StructField("right_array", ArrayType(id_value_struct), True),
])

# Each tuple is (row id, left_array, right_array).
data = [
    (
        1,
        [
            {"id": 1, "value": "def"},
            {"id": 2, "value": "ghi"},
            {"id": 3, "value": "abc"},
        ],
        [
            {"id": 1, "value": "def"},
            {"id": 2, "value": "abc"},
        ],
    ),
    (
        2,
        [
            {"id": 1, "value": "ijk"},
            {"id": 2, "value": "sdg"},
        ],
        [
            {"id": 1, "value": "ijk"},
        ],
    ),
]

df = spark.createDataFrame(data, schema=schema)

display(df)

In [0]:
# Elements of left_array for which right_array holds no identical struct.
unmatched_left = expr("filter(left_array, x -> NOT array_contains(right_array, x))")
# A row is flagged as changed when at least one left element is unmatched.
has_unmatched = expr("size(inverse_intersection) <> 0")

missing_values_df = df.withColumn("inverse_intersection", unmatched_left).withColumn(
    "is_changed", has_unmatched
)

display(missing_values_df)

In [0]:
# Keep the elements of left_array whose `value` differs from the value of the
# first right_array element that has the same `id`.
#
# The lookup uses try_element_at(..., 1) instead of `[0]`: some left ids have no
# counterpart in right_array (id 3 in row 1, id 2 in row 2), so the inner filter
# returns an empty array. With ANSI mode enabled the `[0]` operator raises on an
# out-of-range index, whereas try_element_at (1-based) returns NULL. The NULL
# makes the `<>` comparison NULL, so such elements are not kept by the outer
# filter — the same result the `[0]` form gives with ANSI mode disabled.
missing_values_df = (
    df.withColumn(
        "inverse_intersection",
        expr("""
            filter(
                left_array,
                x -> x.value <> try_element_at(
                    filter(right_array, y -> y.id = x.id), 1
                ).value
            )
        """),
    )
    # A row is flagged as changed when at least one element was kept above.
    .withColumn("is_changed", expr("size(inverse_intersection) <> 0"))
)
display(missing_values_df)

In [0]:
# Spark SQL higher-order function: rebuild every element of left_array as a new
# struct with id shifted by 5 and '_new' appended to value.
shift_and_tag_sql = """
        transform(
            left_array,
            x -> 
            named_struct (
                "id", x.id + 5,
                "value", concat(x.value, '_new')
                )
        )
    """
transformed_df = df.withColumn("new_array", expr(shift_and_tag_sql))

# Show the input and the transformed array side by side.
before_after = transformed_df.select("left_array", "new_array")
display(before_after)

##scala

In [0]:
%scala
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

// left_array and right_array share one element type: struct(id, value).
val idValueArray = ArrayType(
  StructType(Seq(
    StructField("id", IntegerType, nullable = true),
    StructField("value", StringType, nullable = true)
  ))
)

val schema = StructType(Seq(
  StructField("id", IntegerType, nullable = true),
  StructField("left_array", idValueArray, nullable = true),
  StructField("right_array", idValueArray, nullable = true)
))

// Each Row is (row id, left_array, right_array); nested Rows are the struct elements.
val data = Seq(
  Row(
    1,
    Seq(Row(1, "def"), Row(2, "ghi"), Row(3, "abc")),
    Seq(Row(1, "def"), Row(2, "abc"))
  ),
  Row(
    2,
    Seq(Row(1, "ijk"), Row(2, "sdg")),
    Seq(Row(1, "ijk"))
  )
)

val rowsRdd = spark.sparkContext.parallelize(data)
val df = spark.createDataFrame(rowsRdd, schema)

display(df)




#PySpark DataFrame equality functions for testing

In [0]:
from pyspark.errors import PySparkAssertionError
from pyspark.testing import assertSchemaEqual

schema_actual = "name STRING, amount DOUBLE"

data_expected = [["Alfred", 1500], ["Alfred", 2500], ["Anna", 500], ["Anna", 3000]]
data_actual = [["Alfred", 1500.0], ["Alfred", 2500.0], ["Anna", 500.0], ["Anna", 3000.0]]

# df_expected is built without a schema, so its column names and types are
# inferred; df_actual uses the explicit `schema_actual`. The two schemas
# therefore differ and the assertion below reports the mismatch.
df_expected = spark.createDataFrame(data = data_expected)
df_actual = spark.createDataFrame(data = data_actual, schema = schema_actual)

# Catch and print the assertion error so the schema diff is shown as cell
# output without aborting a "Run All" of the notebook.
try:
    assertSchemaEqual(df_actual.schema, df_expected.schema)
except PySparkAssertionError as e:
    print(e)
else:
    print("Schemas are equal.")

In [0]:
from pyspark.errors import PySparkAssertionError
from pyspark.testing import assertDataFrameEqual

df_expected = spark.createDataFrame(
    data=[("Alfred", 1500), ("Alfred", 2500), ("Anna", 500), ("Anna", 3000)],
    schema=["name", "amount"],
)

# Differs from df_expected in one row: ("Alfred", 1200) instead of ("Alfred", 1500).
df_actual = spark.createDataFrame(
    data=[("Alfred", 1200), ("Alfred", 2500), ("Anna", 500), ("Anna", 3000)],
    schema=["name", "amount"],
)

# Catch and print the assertion error so the row diff is shown as cell output
# without aborting a "Run All" of the notebook.
try:
    assertDataFrameEqual(df_actual, df_expected)
except PySparkAssertionError as e:
    print(e)
else:
    print("DataFrames are equal.")

In [0]:
from pyspark.testing import assertDataFrameEqual
from pyspark.errors import PySparkAssertionError

df_expected = spark.createDataFrame(
    data=[
        ("Alfred", 1500),
        ("Alfred", 2500),
        ("Anna", 500),
        ("Anna", 3000),
    ],
    schema=["name", "amount"],
)

# Both "Alfred" rows differ from df_expected; the "Anna" rows match.
df_actual = spark.createDataFrame(
    data=[
        ("Alfred", 1200),
        ("Alfred", 300),
        ("Anna", 500),
        ("Anna", 3000),
    ],
    schema=["name", "amount"],
)

try:
    # includeDiffRows=True attaches the unequal rows to the raised error.
    assertDataFrameEqual(df_actual, df_expected, includeDiffRows=True)
except PySparkAssertionError as e:
    # `e.data` is a list of (actual_row, expected_row) tuples, e.g.:
    # [(Row(name='Alfred', amount=1200), Row(name='Alfred', amount=1500))]
    errored_records = spark.createDataFrame(e.data, schema=["Actual", "Expected"])
    # Display inside the handler: the DataFrame only exists when the assertion
    # failed, so displaying it after the try block would raise NameError
    # whenever the two frames are equal.
    display(errored_records)
else:
    print("df_actual matches df_expected; there are no differing rows to display.")

#prefer unionByName over union

In [0]:
# Same three columns on both sides, but declared in a different order.
df1 = spark.createDataFrame([(1, "Arul", 30)], ["id", "name", "age"])

# df2 column order is (id, age, name) rather than (id, name, age).
df2 = spark.createDataFrame([(2, 28, "Meena")], ["id", "age", "name"])


In [0]:
# union() combines columns by position, ignoring names: df2's `age` values end
# up under df1's `name` column and vice versa.
positional_union = df1.union(df2)
display(positional_union)

In [0]:
# unionByName() lines columns up by name, so the differing column order of df2
# does not matter.
named_union = df1.unionByName(df2)
display(named_union)

##handling missing columns

In [0]:
# df1 has only (id, age) ...
df1 = spark.createDataFrame([(1, 30)], ["id", "age"])

# ... while df2 carries an additional `name` column that df1 lacks.
df2 = spark.createDataFrame([(2, 28, "Meena")], ["id", "age", "name"])


In [0]:
# allowMissingColumns=True lets the union proceed even though df1 has no `name`
# column; the missing values are filled with NULL.
union_with_missing = df1.unionByName(df2, allowMissingColumns=True)
display(union_with_missing)