In [1]:
## Optional: setting the PySpark environment variables
# Every line below is commented out, so this cell currently does nothing.
# NOTE(review): presumably only needed when Spark is not already configured
# in the environment; uncomment and adjust the values for your machine.

# import os

# os.environ['SPARK_HOME'] = "/opt/spark/spark-3.5.1-bin-hadoop3"
# os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
# os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
# os.environ['PYSPARK_PYTHON'] = 'python'

In [2]:
from pyspark.sql import SparkSession

# from pyspark.sql.functions import *
from pyspark.sql.functions import desc

In [3]:
# Build the SparkSession used by every cell below; getOrCreate() hands back
# an already-running session when there is one.
spark = (
    SparkSession.builder
    .appName("DataFrame-operations-demo")
    .getOrCreate()
)

24/03/14 14:41:44 WARN Utils: Your hostname, dhiraj resolves to a loopback address: 127.0.1.1; using 192.168.10.66 instead (on interface wlo1)
24/03/14 14:41:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/14 14:41:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### View the raw data with a bash command

In [4]:
%%bash
# Print the first 10 lines of the raw file (the header row plus 9 records).
head -n 10 ../../data/text/stocks.txt

id,name,category,quantity,price
1,iPhone,Electronics,10,899.99
2,Macbook,Electronics,5,1299.99
3,iPad,Electronics,15,499.99
4,Samsung TV,Electronics,8,799.99
5,LG TV,Electronics,10,699.99
6,Nike Shoes,Clothing,30,99.99
7,Adidas Shoes,Clothing,25,89.99
8,Sony Headphones,Electronics,12,149.99
9,Beats Headphones,Electronics,20,199.99


### Load data into Dataframe

In [5]:
# Location of the comma-separated stock data, relative to this notebook
data_file_path = "../../data/text/stocks.txt"

# header=True      -> use the first row as the column names
# inferSchema=True -> let Spark guess each column's data type from the data
df = spark.read.options(header=True, inferSchema=True).csv(data_file_path)

In [6]:
# Print the column names and the types Spark inferred for them
df.printSchema()

# Preview the first 10 rows of the DataFrame
print("Initial DataFrame:")
df.show(n=10)

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: double (nullable = true)

Initial DataFrame:
+---+----------------+-----------+--------+-------+
| id|            name|   category|quantity|  price|
+---+----------------+-----------+--------+-------+
|  1|          iPhone|Electronics|      10| 899.99|
|  2|         Macbook|Electronics|       5|1299.99|
|  3|            iPad|Electronics|      15| 499.99|
|  4|      Samsung TV|Electronics|       8| 799.99|
|  5|           LG TV|Electronics|      10| 699.99|
|  6|      Nike Shoes|   Clothing|      30|  99.99|
|  7|    Adidas Shoes|   Clothing|      25|  89.99|
|  8| Sony Headphones|Electronics|      12| 149.99|
|  9|Beats Headphones|Electronics|      20| 199.99|
| 10|    Dining Table|  Furniture|      10| 249.99|
+---+----------------+-----------+--------+-------+
only showing top 10 rows



### `select` specific columns using `select()`

In [7]:
# Keep only the id, name and price columns (select() also accepts a list)
selected_columns = df.select(["id", "name", "price"])

print("Selected Columns:")
selected_columns.show(n=10)

Selected Columns:
+---+----------------+-------+
| id|            name|  price|
+---+----------------+-------+
|  1|          iPhone| 899.99|
|  2|         Macbook|1299.99|
|  3|            iPad| 499.99|
|  4|      Samsung TV| 799.99|
|  5|           LG TV| 699.99|
|  6|      Nike Shoes|  99.99|
|  7|    Adidas Shoes|  89.99|
|  8| Sony Headphones| 149.99|
|  9|Beats Headphones| 199.99|
| 10|    Dining Table| 249.99|
+---+----------------+-------+
only showing top 10 rows



### Apply `conditions` to filter rows using `filter()`

In [8]:
# Keep only the rows whose quantity is greater than 20
# (where() is an alias of filter())
filtered_data = df.where(df["quantity"] > 20)

# count() and show() are separate actions on the filtered DataFrame
print("Filtered Data:", filtered_data.count())
filtered_data.show()

Filtered Data: 13
+---+--------------+-----------+--------+-----+
| id|          name|   category|quantity|price|
+---+--------------+-----------+--------+-----+
|  6|    Nike Shoes|   Clothing|      30|99.99|
|  7|  Adidas Shoes|   Clothing|      25|89.99|
| 12|        Apples|       Food|     100|  0.5|
| 13|       Bananas|       Food|     150| 0.25|
| 14|       Oranges|       Food|     120| 0.75|
| 15|Chicken Breast|       Food|      50| 3.99|
| 16| Salmon Fillet|       Food|      30| 5.99|
| 24|    Laptop Bag|Accessories|      25|29.99|
| 25|      Backpack|Accessories|      30|24.99|
| 28|         Jeans|   Clothing|      30|59.99|
| 29|       T-shirt|   Clothing|      50|14.99|
| 30|      Sneakers|   Clothing|      40|79.99|
| 31| Salmon Fillet|       Food|      30| 5.99|
+---+--------------+-----------+--------+-----+



### Grouping data using `groupBy()`

#### `agg()` applies aggregate functions such as sum and avg to the grouped data

In [9]:
# Group the rows by category, then aggregate each group:
# total quantity and average price (dict form: column name -> aggregate)
grouped_data = (
    df.groupBy("category")
    .agg({"quantity": "sum", "price": "avg"})
)

print("Grouped and Aggregated Data:")
grouped_data.show()

Grouped and Aggregated Data:
+-----------+-------------+------------------+
|   category|sum(quantity)|        avg(price)|
+-----------+-------------+------------------+
|       Food|          480|2.9116666666666666|
|     Sports|           70|             34.99|
|Electronics|           98| 586.6566666666665|
|   Clothing|          225|116.10111111111112|
|  Furniture|           41|            141.99|
|Accessories|           55|             27.49|
+-----------+-------------+------------------+



### `join` multiple dataframes using `join()`

In [10]:
# Right-hand side of the join: id and category for 10 rows of df
df2 = df.select("id", "category").limit(10)

# Inner join on the shared "id" column. Joining by column name keeps a single
# "id" column, but both inputs carry "category", so the result has two columns
# with that name (see the output below).
joined_data = df.join(df2, on="id", how="inner")

print("Joined Data:")
joined_data.show()

Joined Data:
+---+----------------+-----------+--------+-------+-----------+
| id|            name|   category|quantity|  price|   category|
+---+----------------+-----------+--------+-------+-----------+
|  1|          iPhone|Electronics|      10| 899.99|Electronics|
|  2|         Macbook|Electronics|       5|1299.99|Electronics|
|  3|            iPad|Electronics|      15| 499.99|Electronics|
|  4|      Samsung TV|Electronics|       8| 799.99|Electronics|
|  5|           LG TV|Electronics|      10| 699.99|Electronics|
|  6|      Nike Shoes|   Clothing|      30|  99.99|   Clothing|
|  7|    Adidas Shoes|   Clothing|      25|  89.99|   Clothing|
|  8| Sony Headphones|Electronics|      12| 149.99|Electronics|
|  9|Beats Headphones|Electronics|      20| 199.99|Electronics|
| 10|    Dining Table|  Furniture|      10| 249.99|  Furniture|
+---+----------------+-----------+--------+-------+-----------+



### `sort` the rows using `orderBy()`

In [11]:
# Order the rows by a single column, 'price', lowest first
# (ascending is the default; it is spelled out here for clarity)
sorted_data = df.orderBy("price", ascending=True)

print("Sorted Data: ")
sorted_data.show(n=10)

Sorted Data: 
+---+--------------+-----------+--------+-----+
| id|          name|   category|quantity|price|
+---+--------------+-----------+--------+-----+
| 13|       Bananas|       Food|     150| 0.25|
| 12|        Apples|       Food|     100|  0.5|
| 14|       Oranges|       Food|     120| 0.75|
| 15|Chicken Breast|       Food|      50| 3.99|
| 16| Salmon Fillet|       Food|      30| 5.99|
| 31| Salmon Fillet|       Food|      30| 5.99|
| 29|       T-shirt|   Clothing|      50|14.99|
| 19|      Yoga Mat|     Sports|      20|19.99|
| 34|      Yoga Mat|     Sports|      20|19.99|
| 25|      Backpack|Accessories|      30|24.99|
+---+--------------+-----------+--------+-----+
only showing top 10 rows



In [12]:
# Sort on multiple columns in descending order using desc():
# highest price first, ties broken by highest id first.
# `desc` is already imported in the notebook's import cell, so the import is
# not repeated here. desc("price") builds the same sort expression as
# col("price").desc().

sorted_data = df.orderBy(desc("price"), desc("id"))

print("Sorted Data Descending:")
sorted_data.show(10)

Sorted Data Descending:
+---+----------------+-----------+--------+-------+
| id|            name|   category|quantity|  price|
+---+----------------+-----------+--------+-------+
|  2|         Macbook|Electronics|       5|1299.99|
|  1|          iPhone|Electronics|      10| 899.99|
|  4|      Samsung TV|Electronics|       8| 799.99|
|  5|           LG TV|Electronics|      10| 699.99|
| 26|          Camera|Electronics|      10| 599.99|
|  3|            iPad|Electronics|      15| 499.99|
| 10|    Dining Table|  Furniture|      10| 249.99|
| 32|  Leather Jacket|   Clothing|      15| 199.99|
| 17|  Leather Jacket|   Clothing|      15| 199.99|
|  9|Beats Headphones|Electronics|      20| 199.99|
+---+----------------+-----------+--------+-------+
only showing top 10 rows



#### get `Distinct / Unique` values in a column using `distinct()`

In [13]:
# Project down to the category column, then keep one row per distinct value
distinct_rows = (
    df.select("category")
    .distinct()
)

print("Distinct Product Categories: ")
distinct_rows.show()

Distinct Product Categories: 
+-----------+
|   category|
+-----------+
|       Food|
|     Sports|
|Electronics|
|   Clothing|
|  Furniture|
|Accessories|
+-----------+



#### get `distinct/unique` rows in a DataFrame

In [14]:
# Remove duplicate rows and order the result by id.
# With no column subset, dropDuplicates() treats two rows as duplicates only
# when every column value matches.
# NOTE(review): rows that differ only in `id` (e.g. the two "Salmon Fillet"
# rows with ids 16 and 31) are therefore kept. Confirm this is intended.
unique_df = df.dropDuplicates().orderBy("id")

# Ask for up to 35 rows (the original note says the input has 35 rows), so
# any removed duplicates show up as a shorter table.
unique_df.show(n=35)

+---+----------------+-----------+--------+-------+
| id|            name|   category|quantity|  price|
+---+----------------+-----------+--------+-------+
|  1|          iPhone|Electronics|      10| 899.99|
|  2|         Macbook|Electronics|       5|1299.99|
|  3|            iPad|Electronics|      15| 499.99|
|  4|      Samsung TV|Electronics|       8| 799.99|
|  5|           LG TV|Electronics|      10| 699.99|
|  6|      Nike Shoes|   Clothing|      30|  99.99|
|  7|    Adidas Shoes|   Clothing|      25|  89.99|
|  8| Sony Headphones|Electronics|      12| 149.99|
|  9|Beats Headphones|Electronics|      20| 199.99|
| 10|    Dining Table|  Furniture|      10| 249.99|
| 11|      Study Desk|  Furniture|       8| 149.99|
| 12|          Apples|       Food|     100|    0.5|
| 13|         Bananas|       Food|     150|   0.25|
| 14|         Oranges|       Food|     120|   0.75|
| 15|  Chicken Breast|       Food|      50|   3.99|
| 16|   Salmon Fillet|       Food|      30|   5.99|
| 17|  Leath

#### `Drop / Remove` specified columns from dataframe

In [15]:
# drop() returns a new DataFrame without the given column.
# Several columns can be removed in one call by name,
# e.g. df.drop("quantity", "category")
dropped_columns = df.drop(df["category"])

print("Dropped Columns:")
dropped_columns.show()

Dropped Columns:
+---+----------------+--------+-------+
| id|            name|quantity|  price|
+---+----------------+--------+-------+
|  1|          iPhone|      10| 899.99|
|  2|         Macbook|       5|1299.99|
|  3|            iPad|      15| 499.99|
|  4|      Samsung TV|       8| 799.99|
|  5|           LG TV|      10| 699.99|
|  6|      Nike Shoes|      30|  99.99|
|  7|    Adidas Shoes|      25|  89.99|
|  8| Sony Headphones|      12| 149.99|
|  9|Beats Headphones|      20| 199.99|
| 10|    Dining Table|      10| 249.99|
| 11|      Study Desk|       8| 149.99|
| 12|          Apples|     100|    0.5|
| 13|         Bananas|     150|   0.25|
| 14|         Oranges|     120|   0.75|
| 15|  Chicken Breast|      50|   3.99|
| 16|   Salmon Fillet|      30|   5.99|
| 17|  Leather Jacket|      15| 199.99|
| 18|     Winter Coat|      10| 149.99|
| 19|        Yoga Mat|      20|  19.99|
| 20|    Dumbbell Set|      15|  49.99|
+---+----------------+--------+-------+
only showing top 20 row

#### `Add` new `Column` using `withColumn()`

##### adding new `calculated column` i.e `revenue`

In [16]:
# Append a calculated column: revenue = quantity * price for each row
df_with_new_column = df.withColumn(
    "revenue",
    df["quantity"] * df["price"],
)

print("DataFrame with New Column:")
df_with_new_column.show(n=10)

DataFrame with New Column:
+---+----------------+-----------+--------+-------+-------+
| id|            name|   category|quantity|  price|revenue|
+---+----------------+-----------+--------+-------+-------+
|  1|          iPhone|Electronics|      10| 899.99| 8999.9|
|  2|         Macbook|Electronics|       5|1299.99|6499.95|
|  3|            iPad|Electronics|      15| 499.99|7499.85|
|  4|      Samsung TV|Electronics|       8| 799.99|6399.92|
|  5|           LG TV|Electronics|      10| 699.99| 6999.9|
|  6|      Nike Shoes|   Clothing|      30|  99.99| 2999.7|
|  7|    Adidas Shoes|   Clothing|      25|  89.99|2249.75|
|  8| Sony Headphones|Electronics|      12| 149.99|1799.88|
|  9|Beats Headphones|Electronics|      20| 199.99| 3999.8|
| 10|    Dining Table|  Furniture|      10| 249.99| 2499.9|
+---+----------------+-----------+--------+-------+-------+
only showing top 10 rows



#### `Rename` columns using `withColumnRenamed()`

In [17]:
# Rename the "price" column to "product_price"; all other columns are kept
df_with_alias = df.withColumnRenamed(existing="price", new="product_price")

print("DataFrame with Aliased Column:")
df_with_alias.show(n=10)

DataFrame with Aliased Column:
+---+----------------+-----------+--------+-------------+
| id|            name|   category|quantity|product_price|
+---+----------------+-----------+--------+-------------+
|  1|          iPhone|Electronics|      10|       899.99|
|  2|         Macbook|Electronics|       5|      1299.99|
|  3|            iPad|Electronics|      15|       499.99|
|  4|      Samsung TV|Electronics|       8|       799.99|
|  5|           LG TV|Electronics|      10|       699.99|
|  6|      Nike Shoes|   Clothing|      30|        99.99|
|  7|    Adidas Shoes|   Clothing|      25|        89.99|
|  8| Sony Headphones|Electronics|      12|       149.99|
|  9|Beats Headphones|Electronics|      20|       199.99|
| 10|    Dining Table|  Furniture|      10|       249.99|
+---+----------------+-----------+--------+-------------+
only showing top 10 rows



In [18]:
# Stop the SparkSession now that the demo is finished.

spark.stop()