In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Obtain the Spark session for this notebook (getOrCreate reuses an active one)
spark = (
    SparkSession.builder
    .appName("SchemaExample")
    .getOrCreate()
)

# Explicit schema, built one field at a time: (column name, type, nullable)
schema = (
    StructType()
    .add("name", StringType(), True)    # may be null
    .add("age", IntegerType(), False)   # required: declared non-nullable
    .add("city", StringType(), True)    # may be null
)

# Two sample rows, with values ordered to match the schema's columns
data = [
    ("Alice", 30, "New York"),
    ("Bob", 25, "San Francisco"),
]

# Build the DataFrame against the explicit schema rather than inferring types
df = spark.createDataFrame(data, schema=schema)

# Print the rows as a text table
df.show()


+-----+---+-------------+
| name|age|         city|
+-----+---+-------------+
|Alice| 30|     New York|
|  Bob| 25|San Francisco|
+-----+---+-------------+



In [0]:
from pyspark.sql.types import ArrayType

# Struct type used for the nested "address" column: two nullable strings
address_schema = (
    StructType()
    .add("city", StringType(), True)
    .add("state", StringType(), True)
)

# Top-level schema: two scalar columns, one nested struct, one string array
schema = StructType([
    StructField("name", StringType(), nullable=True),
    StructField("age", IntegerType(), nullable=False),
    # Struct column whose inner layout is address_schema
    StructField("address", address_schema, nullable=True),
    # Array column holding string elements
    StructField("skills", ArrayType(StringType()), nullable=True),
])

# Each row supplies the address as a (city, state) tuple and skills as a list
data = [
    ("Alice", 30, ("New York", "NY"), ["Python", "Spark"]),
    ("Bob", 25, ("San Francisco", "CA"), ["Java", "Hadoop"]),
]

df = spark.createDataFrame(data, schema=schema)

# truncate=False prints full cell contents instead of shortening long values
df.show(truncate=False)


+-----+---+-------------------+---------------+
|name |age|address            |skills         |
+-----+---+-------------------+---------------+
|Alice|30 |{New York, NY}     |[Python, Spark]|
|Bob  |25 |{San Francisco, CA}|[Java, Hadoop] |
+-----+---+-------------------+---------------+



In [0]:
from pyspark.sql import SparkSession
# Import the functions module under an alias rather than pulling `sum` into the
# notebook namespace: a bare `from pyspark.sql.functions import sum` rebinds
# Python's builtin sum() for every cell that runs afterwards.
from pyspark.sql import functions as F

# Sample employee rows: (name, dept, salary)
data = [("Alice", "HR", 5000), ("Bob", "HR", 6000), ("Charlie", "IT", 7000)]
df = spark.createDataFrame(data, ["name", "dept", "salary"])

# One output row per department with the total and the mean salary.
# No aliases are applied, so the result columns keep Spark's default names
# "sum(salary)" and "avg(salary)".
aggregated_df = df.groupBy("dept").agg(
    F.sum("salary"),
    F.avg("salary"),
)
aggregated_df.show()


+----+-----------+-----------+
|dept|sum(salary)|avg(salary)|
+----+-----------+-----------+
|  HR|      11000|     5500.0|
|  IT|       7000|     7000.0|
+----+-----------+-----------+



In [0]:
# Two small frames sharing an "id" column; only id 1 appears in both
df1 = spark.createDataFrame(
    [(1, "Alice"), (2, "Bob")],
    ["id", "name"],
)
df2 = spark.createDataFrame(
    [(1, "NY"), (3, "LA")],
    ["id", "city"],
)

# Show each keyed join in turn: inner, left, right, then full outer.
# Joining on a list of column names yields a single "id" column in the result.
for join_type in ("inner", "left", "right", "full"):
    df1.join(df2, on=["id"], how=join_type).show()

# Cross join pairs every df1 row with every df2 row; no key is used, so both
# "id" columns appear in the result
df1.crossJoin(df2).show()

+---+-----+----+
| id| name|city|
+---+-----+----+
|  1|Alice|  NY|
+---+-----+----+

+---+-----+----+
| id| name|city|
+---+-----+----+
|  1|Alice|  NY|
|  2|  Bob|null|
+---+-----+----+

+---+-----+----+
| id| name|city|
+---+-----+----+
|  1|Alice|  NY|
|  3| null|  LA|
+---+-----+----+

+---+-----+----+
| id| name|city|
+---+-----+----+
|  1|Alice|  NY|
|  2|  Bob|null|
|  3| null|  LA|
+---+-----+----+

+---+-----+---+----+
| id| name| id|city|
+---+-----+---+----+
|  1|Alice|  1|  NY|
|  1|Alice|  3|  LA|
|  2|  Bob|  1|  NY|
|  2|  Bob|  3|  LA|
+---+-----+---+----+

