In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.0/317.0 MB 2.4 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=0bb3bc6bfc807bcfbb1661cfc76979cbcc4a2b062ae75a69908284bdeb7272e7
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr , col

# Start a Spark session for this notebook (getOrCreate reuses one if it already exists).
spark = (
    SparkSession.builder
    .appName("Basic-Rdd")
    .getOrCreate()
)

# Small in-memory sample: one (Name, Age) tuple per person.
columns = ["Name", "Age"]
data = [
    ("Alice", 25),
    ("Bob", 30),
    ("Charlie", 22),
    ("David", 28),
]

# Column names are supplied explicitly; Spark infers the column types from the tuples.
df = spark.createDataFrame(data, schema=columns)
df.show()

+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 22|
|  David| 28|
+-------+---+



In [19]:
""" Keep only the rows where Age is strictly greater than 25 """
# Build the predicate as a column expression, then apply it as the row filter.
older_than_25 = col("Age") > 25
filtered_df = df.filter(older_than_25)
filtered_df.show()

+-----+---+
| Name|Age|
+-----+---+
|  Bob| 30|
|David| 28|
+-----+---+



In [20]:
""" Calculate the average age of all people """
# The aggregation is collected to the driver; the value of interest is the
# first field of the first returned row.
avg_rows = df.agg({"Age": "avg"}).collect()
average_age = avg_rows[0][0]
print(f"Average age: {average_age}")

Average age: 26.25


In [21]:
""" Add a new column "Status": 'Senior' when Age >= 30, otherwise 'Junior' """
# The labelling rule is written as a SQL CASE expression and evaluated per row.
status_rule = expr("CASE WHEN Age >=30 THEN 'Senior' ELSE 'Junior' END")
df_with_status = df.withColumn("Status", status_rule)
df_with_status.show()

+-------+---+------+
|   Name|Age|Status|
+-------+---+------+
|  Alice| 25|Junior|
|    Bob| 30|Senior|
|Charlie| 22|Junior|
|  David| 28|Junior|
+-------+---+------+



In [22]:
""" Group by age and count occurrences """

# One output row per distinct Age value, with the number of rows sharing it.
grouped_by_age = df.groupBy(col("Age"))
age_counts = grouped_by_age.count()
age_counts.show()

+---+-----+
|Age|count|
+---+-----+
| 25|    1|
| 30|    1|
| 22|    1|
| 28|    1|
+---+-----+



In [24]:
""" Rename the columns: Name -> Full_Name, Age -> Years """

# Express both renames as a single old-name -> new-name mapping.
column_renames = {"Name": "Full_Name", "Age": "Years"}
renamed_df = df.withColumnsRenamed(column_renames)
renamed_df.show()


+---------+-----+
|Full_Name|Years|
+---------+-----+
|    Alice|   25|
|      Bob|   30|
|  Charlie|   22|
|    David|   28|
+---------+-----+



In [25]:
""" Sort by age in descending order (oldest first) """
sorted_df = df.sort("Age", ascending=False)
sorted_df.show()

+-------+---+
|   Name|Age|
+-------+---+
|    Bob| 30|
|  David| 28|
|  Alice| 25|
|Charlie| 22|
+-------+---+



In [26]:
""" Drop the "Age" column """
# Select every column except "Age", which leaves the frame without that column.
kept_columns = [name for name in df.columns if name != "Age"]
dropped_df = df.select(*kept_columns)
dropped_df.show()

+-------+
|   Name|
+-------+
|  Alice|
|    Bob|
|Charlie|
|  David|
+-------+

