In [2]:
from pyspark.sql import SparkSession
# Build (or reuse, if one already exists) the Spark session used by every
# cell below; "basics" is the application name given to the builder.
spark = (
    SparkSession.builder
    .appName("basics")
    .getOrCreate()
)

In [3]:
# Load students.csv: the first line supplies the column names (header) and
# Spark infers each column's type from the data (inferSchema).
df = spark.read.options(header=True, inferSchema=True).csv("students.csv")

In [4]:
# Preview the first two rows to sanity-check the columns that were loaded.
df.show(n=2)

+-------------+---+-----------------+------------------+----------+---------------+---+
|         Name|Age|Favorite_Language|Favorite_Framework|Student_ID|Enrollment_Year|GPA|
+-------------+---+-----------------+------------------+----------+---------------+---+
|David HaMagid| 22|           Python|            Django|     ST001|           2021|3.8|
| Reuven Cohen| 21|       JavaScript|             React|     ST002|           2021|3.9|
+-------------+---+-----------------+------------------+----------+---------------+---+
only showing top 2 rows



In [5]:
# Number of rows in the DataFrame; as the cell's last expression, the
# returned integer is echoed as the cell output.
df.count()

25

In [7]:
# Keep only the rows whose Name begins with the letter "E" and print them.
df.where(df.Name.startswith("E")).show()

+----------------+---+-----------------+------------------+----------+---------------+---+
|            Name|Age|Favorite_Language|Favorite_Framework|Student_ID|Enrollment_Year|GPA|
+----------------+---+-----------------+------------------+----------+---------------+---+
|Eliyahu Erlanger| 21|              PHP|           Laravel|     ST013|           2022|3.9|
|  Eliezer Shamul| 23|       JavaScript|           Next.js|     ST014|           2021|3.8|
|   Elkana Kalman| 20|           Python|           PyTorch|     ST015|           2022|3.6|
+----------------+---+-----------------+------------------+----------+---------------+---+



In [8]:
# Five youngest students: sort by Age (ascending is the default) and show
# the first five rows.
df.sort("Age").show(n=5)

+----------------+---+-----------------+------------------+----------+---------------+---+
|            Name|Age|Favorite_Language|Favorite_Framework|Student_ID|Enrollment_Year|GPA|
+----------------+---+-----------------+------------------+----------+---------------+---+
|Benjamin Naftali| 19|       JavaScript|            Vue.js|     ST006|           2023|3.5|
|   Shlomo Benati| 19|            Swift|           SwiftUI|     ST017|           2023|3.5|
|   Yair Weissman| 20|           Python|             Flask|     ST004|           2022|3.6|
|  Mordechai Oren| 20|           Python|           FastAPI|     ST010|           2022|3.6|
|   Elkana Kalman| 20|           Python|           PyTorch|     ST015|           2022|3.6|
+----------------+---+-----------------+------------------+----------+---------------+---+
only showing top 5 rows



In [11]:
# Five oldest students: sort by Age in descending order.
# show() truncates strings longer than 20 characters by default, which is why
# long names end in "..." in the output; pass truncate=False to see them whole.
df.sort("Age", ascending=False).show(n=5)

+--------------------+---+-----------------+------------------+----------+---------------+---+
|                Name|Age|Favorite_Language|Favorite_Framework|Student_ID|Enrollment_Year|GPA|
+--------------------+---+-----------------+------------------+----------+---------------+---+
|Menachem Mendel A...| 24|               C#|         .NET Core|     ST005|           2020|3.9|
|   Pinchas Waxberger| 24|             Java|         Hibernate|     ST011|           2020|3.8|
|  Jonathan Rosenthal| 24|              C++|                Qt|     ST016|           2020|3.7|
|Menachem Mendel B...| 24|               Go|              Echo|     ST022|           2020|3.8|
|         Yehuda Zeev| 23|             Java|       Spring Boot|     ST003|           2020|3.7|
+--------------------+---+-----------------+------------------+----------+---------------+---+
only showing top 5 rows



In [12]:
# Average Age over the whole DataFrame, written as a SQL expression and
# exposed under the column name age_avg.
(
    df
    .selectExpr("avg(Age) as age_avg")
    .show()
)

+-------+
|age_avg|
+-------+
|  21.76|
+-------+



In [13]:
# Largest Age over the whole DataFrame, exposed under the column name age_max.
(
    df
    .selectExpr("max(Age) as age_max")
    .show()
)

+-------+
|age_max|
+-------+
|     24|
+-------+



In [14]:
# Smallest Age over the whole DataFrame, exposed under the column name age_min.
(
    df
    .selectExpr("min(Age) as age_min")
    .show()
)

+-------+
|age_min|
+-------+
|     19|
+-------+



In [15]:
# How many students there are at each Age (one output row per distinct Age).
df.groupBy("Age").count().show()

+---+-----+
|Age|count|
+---+-----+
| 22|    5|
| 20|    4|
| 19|    2|
| 23|    5|
| 24|    4|
| 21|    5|
+---+-----+



In [20]:
# Per-Age average of Age.
# NOTE(review): the averaged column is the grouping key itself, so each group
# simply echoes its own Age back (22 -> 22.0, ...). A different numeric column
# (e.g. GPA) was probably intended here -- confirm.
df.groupBy("Age").avg("Age").show()

+---+--------+
|Age|avg(Age)|
+---+--------+
| 22|    22.0|
| 20|    20.0|
| 19|    19.0|
| 23|    23.0|
| 24|    24.0|
| 21|    21.0|
+---+--------+



In [21]:
# Sum of Age over the whole DataFrame (a global aggregate with no grouping
# columns), shown as a single sum(Age) cell.
df.groupBy().sum("Age").show()

+--------+
|sum(Age)|
+--------+
|     544|
+--------+



In [22]:
# Most frequent value in the Name column.
# NOTE(review): the names visible in the previews all look distinct; if every
# Name is unique, all values tie and the one returned is arbitrary -- confirm
# that Name is the column this was meant to run on.
(
    df
    .agg({"Name": "mode"})
    .show()
)

+----------------+
|      mode(Name)|
+----------------+
|Eliyahu Erlanger|
+----------------+



In [23]:
# Two global aggregates in one pass: the smallest Name (string ordering)
# and the largest Age.
(
    df
    .agg({"Name": "min", "Age": "max"})
    .show()
)

+--------+------------+
|max(Age)|   min(Name)|
+--------+------------+
|      24|Aaron Reiner|
+--------+------------+



In [24]:
from pyspark.sql.functions import count_distinct, avg

In [25]:
# Number of distinct Age values in the DataFrame.
df.agg(count_distinct("Age")).show()

+-------------------+
|count(DISTINCT Age)|
+-------------------+
|                  6|
+-------------------+



In [32]:
# Average Age again, this time via the avg() column function rather than a
# SQL expression string.
df.agg(avg("Age")).show()

+--------+
|avg(Age)|
+--------+
|   21.76|
+--------+



In [34]:
# Pull the average Age out of Spark as a plain Python float.
# A global aggregate always yields exactly one row, so first() returns that
# Row directly; a Row can be indexed by field name, so there is no need to
# collect() a list and convert the Row to a dict just to read one field.
df.select(avg("Age").alias("avg")).first()["avg"]

21.76