In [3]:
from pyspark.sql import SparkSession, functions as F, Window
# Create (or fetch the already-running) Spark session used by the cells below.
spark = (
    SparkSession.builder
    .appName("Demo")
    .getOrCreate()
)

# Small in-memory table: one row per person with their age and department.
df = spark.createDataFrame(
    [
        ("Alice", 30, "Sales"),
        ("Bob", 25, "Sales"),
        ("Carol", 35, "HR"),
        ("David", 29, "HR"),
        ("Eve", 40, "IT"),
    ],
    schema=["name", "age", "dept"],
)
df.show()

+-----+---+-----+
| name|age| dept|
+-----+---+-----+
|Alice| 30|Sales|
|  Bob| 25|Sales|
|Carol| 35|   HR|
|David| 29|   HR|
|  Eve| 40|   IT|
+-----+---+-----+



In [2]:
# Keep only the rows whose age is strictly greater than 30 and display them.
# `where` is an alias of `filter`.
filtered = df.where(F.col("age") > 30)
filtered.show()

+-----+---+----+
| name|age|dept|
+-----+---+----+
|Carol| 35|  HR|
|  Eve| 40|  IT|
+-----+---+----+



In [4]:
# Append a derived column (age + 5), then show it next to each name.
df2 = df.withColumn(
    "age_plus_5",
    df["age"] + 5,
)
df2.select(df2["name"], df2["age_plus_5"]).show()


+-----+----------+
| name|age_plus_5|
+-----+----------+
|Alice|        35|
|  Bob|        30|
|Carol|        40|
|David|        34|
|  Eve|        45|
+-----+----------+



In [5]:
# Lookup table holding one budget figure per department.
other = spark.createDataFrame(
    [("Sales", 1000), ("HR", 800), ("IT", 1200)],
    schema=["dept", "budget"],
)

# Inner join on the shared "dept" column. Joining by column name (rather than
# by an equality expression) yields a single "dept" column in the result.
joined = df.join(other, "dept", "inner")
joined.show()


+-----+-----+---+------+
| dept| name|age|budget|
+-----+-----+---+------+
|   HR|Carol| 35|   800|
|   HR|David| 29|   800|
|   IT|  Eve| 40|  1200|
|Sales|Alice| 30|  1000|
|Sales|  Bob| 25|  1000|
+-----+-----+---+------+



In [6]:
# Per-department summary: row count, mean age and total age.
(
    df.groupBy("dept")
    .agg(
        F.count("*").alias("cnt"),
        F.avg("age").alias("avg_age"),
        F.sum("age").alias("sum_age"),
    )
    .show()
)


+-----+---+-------+-------+
| dept|cnt|avg_age|sum_age|
+-----+---+-------+-------+
|Sales|  2|   27.5|     55|
|   HR|  2|   32.0|     64|
|   IT|  1|   40.0|     40|
+-----+---+-------+-------+



In [7]:
# Total the ages per department, then keep only the departments whose total
# is at least 60 (filtering on the aggregate, like SQL's HAVING).
(
    df.groupBy("dept")
    .agg(F.sum("age").alias("total_age"))
    .where(F.col("total_age") >= 60)
    .show()
)


+----+---------+
|dept|total_age|
+----+---------+
|  HR|       64|
+----+---------+



In [8]:
# Ordered window: rows of one department sorted by ascending age (for ranking).
windowSpec = Window.partitionBy("dept").orderBy("age")

# rank_in_dept uses the ordered window; avg_age_dept uses an unordered window
# over the same partitioning so that every row sees the whole-department mean.
(
    df
    .withColumn("rank_in_dept", F.rank().over(windowSpec))
    .withColumn(
        "avg_age_dept",
        F.avg("age").over(Window.partitionBy("dept")),
    )
    .show()
)


+-----+---+-----+------------+------------+
| name|age| dept|rank_in_dept|avg_age_dept|
+-----+---+-----+------------+------------+
|David| 29|   HR|           1|        32.0|
|Carol| 35|   HR|           2|        32.0|
|  Eve| 40|   IT|           1|        40.0|
|  Bob| 25|Sales|           1|        27.5|
|Alice| 30|Sales|           2|        27.5|
+-----+---+-----+------------+------------+



In [14]:
from pyspark.sql import SparkSession
from operator import add

spark = (
    SparkSession.builder
    .appName("PySparkActionsDemo")
    .getOrCreate()
)

# ---- RDD actions ----------------------------------------------------------
sc = spark.sparkContext
numbers = sc.parallelize([1, 2, 3, 4, 5, 3, 2])
pairs = sc.parallelize([('a', 1), ('b', 2), ('a', 3)])

# Four ways of totalling the elements of `numbers`.
print("reduce →", numbers.reduce(add))
print("fold →", numbers.fold(0, add))
# Both the per-partition step and the merge step are plain addition.
print("aggregate →", numbers.aggregate(0, add, add))
print("treeReduce →", numbers.treeReduce(add))

# Size and element retrieval.
print("count →", numbers.count())
print("first →", numbers.first())
print("take(3) →", numbers.take(3))
print("takeOrdered(3) →", numbers.takeOrdered(3))
# Sampling without replacement, with a fixed seed.
print("takeSample(…,3) →", numbers.takeSample(False, 3, seed=42))

# Frequency counts: by element for `numbers`, by key for `pairs`.
print("countByValue →", numbers.countByValue())
print("countByKey →", pairs.countByKey())

# Extremes.
print("min →", numbers.min())
print("max →", numbers.max())
print("top(2) →", numbers.top(2))

# ---- DataFrame actions ----------------------------------------------------
df = spark.createDataFrame(
    [
        (1, "Alice", 25),
        (2, "Bob", 30),
        (3, "Cathy", 28),
    ],
    schema=["id", "name", "age"],
)

# Transformation only: nothing is computed until one of the actions below runs.
filtered = df.where(df["age"] > 26)

print("\nDataFrame show():")
filtered.show()

print("count →", filtered.count())

print("first →", filtered.first())
print("head(2) →", filtered.head(2))
print("take(2) →", filtered.take(2))

# Bring every remaining row back to the driver as a list of Row objects.
collected = filtered.collect()
print("collect", collected)

# Shut the session down once the demo has finished.
spark.stop()


reduce → 20
fold → 20
aggregate → 20
treeReduce → 20
count → 7
first → 1
take(3) → [1, 2, 3]
takeOrdered(3) → [1, 2, 2]
takeSample(…,3) → [2, 4, 5]
countByValue → defaultdict(<class 'int'>, {1: 1, 2: 2, 3: 2, 4: 1, 5: 1})
countByKey → defaultdict(<class 'int'>, {'a': 2, 'b': 1})
min → 1
max → 5
top(2) → [5, 4]

DataFrame show():
+---+-----+---+
| id| name|age|
+---+-----+---+
|  2|  Bob| 30|
|  3|Cathy| 28|
+---+-----+---+

count → 2
first → Row(id=2, name='Bob', age=30)
head(2) → [Row(id=2, name='Bob', age=30), Row(id=3, name='Cathy', age=28)]
take(2) → [Row(id=2, name='Bob', age=30), Row(id=3, name='Cathy', age=28)]
collect [Row(id=2, name='Bob', age=30), Row(id=3, name='Cathy', age=28)]
