In [1]:
# Install PySpark in the current Jupyter environment
!pip install pyspark



In [3]:
# Entry point for DataFrame and SQL in PySpark
from pyspark.sql import SparkSession

In [5]:
# Create a SparkSession
# Syntax: SparkSession.builder.appName("AppName").getOrCreate()
spark = SparkSession.builder\
    .appName("MyApp")\
    .getOrCreate()

In [7]:
# CSV file path
# Syntax: csv_file_path = "your_path_here"
csv_file_path = r"C:\Users\Admin\OneDrive\Desktop\Study\Pandas\order.csv"

# Read CSV as DataFrame
# Syntax: spark.read.csv(path, header=True, inferSchema=True)
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

# Show 5 rows
# Syntax: df.show(n)
df.show(5)

# Stop SparkSession (optional)
# Syntax: spark.stop()
# spark.stop()

+-------------------+--------------------+-------------+----+---+-----------+
|              Order|              Status|     Category|Size|Qty|      State|
+-------------------+--------------------+-------------+----+---+-----------+
|405-8078784-5731545|           Cancelled|          Set|   S|  0|MAHARASHTRA|
|171-9198151-1101146|Shipped - Deliver...|        kurta| 3XL|  1|  KARNATAKA|
|404-0687676-7273146|             Shipped|        kurta|  XL|  1|MAHARASHTRA|
|403-9615377-8133951|           Cancelled|Western Dress|   L|  0| PUDUCHERRY|
|407-1069790-7240320|             Shipped|          Top| 3XL|  1| TAMIL NADU|
+-------------------+--------------------+-------------+----+---+-----------+
only showing top 5 rows



In [9]:
# Returns a list of (column_name, data_type) for the DataFrame
df.dtypes

[('Order', 'string'),
 ('Status', 'string'),
 ('Category', 'string'),
 ('Size', 'string'),
 ('Qty', 'int'),
 ('State', 'string')]

In [11]:
# Print DataFrame schema in tree format
# Syntax: df.printSchema()
df.printSchema()

root
 |-- Order: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Qty: integer (nullable = true)
 |-- State: string (nullable = true)



In [13]:
# Rename column and show 5 rows
# Syntax: df.withColumnRenamed("old_name", "new_name")
df.withColumnRenamed('State', 'Location').show(5)

+-------------------+--------------------+-------------+----+---+-----------+
|              Order|              Status|     Category|Size|Qty|   Location|
+-------------------+--------------------+-------------+----+---+-----------+
|405-8078784-5731545|           Cancelled|          Set|   S|  0|MAHARASHTRA|
|171-9198151-1101146|Shipped - Deliver...|        kurta| 3XL|  1|  KARNATAKA|
|404-0687676-7273146|             Shipped|        kurta|  XL|  1|MAHARASHTRA|
|403-9615377-8133951|           Cancelled|Western Dress|   L|  0| PUDUCHERRY|
|407-1069790-7240320|             Shipped|          Top| 3XL|  1| TAMIL NADU|
+-------------------+--------------------+-------------+----+---+-----------+
only showing top 5 rows



In [15]:
# Select a single column and show 5 rows
# Syntax: df.select("column_name").show(n)
df.select('State').show(5)

+-----------+
|      State|
+-----------+
|MAHARASHTRA|
|  KARNATAKA|
|MAHARASHTRA|
| PUDUCHERRY|
| TAMIL NADU|
+-----------+
only showing top 5 rows



In [17]:
# Select multiple columns and show 5 rows
# Syntax: df.select("col1", "col2", ...).show(n)
df.select('Order', 'State').show(5)

+-------------------+-----------+
|              Order|      State|
+-------------------+-----------+
|405-8078784-5731545|MAHARASHTRA|
|171-9198151-1101146|  KARNATAKA|
|404-0687676-7273146|MAHARASHTRA|
|403-9615377-8133951| PUDUCHERRY|
|407-1069790-7240320| TAMIL NADU|
+-------------------+-----------+
only showing top 5 rows



In [19]:
# Rename all columns using a list
# Syntax: df.toDF(*new_column_names_list).show(n)
col = ['a', 'b', 'c', 'd', 'e', 'f']
df.toDF(*col).show(5)

+-------------------+--------------------+-------------+---+---+-----------+
|                  a|                   b|            c|  d|  e|          f|
+-------------------+--------------------+-------------+---+---+-----------+
|405-8078784-5731545|           Cancelled|          Set|  S|  0|MAHARASHTRA|
|171-9198151-1101146|Shipped - Deliver...|        kurta|3XL|  1|  KARNATAKA|
|404-0687676-7273146|             Shipped|        kurta| XL|  1|MAHARASHTRA|
|403-9615377-8133951|           Cancelled|Western Dress|  L|  0| PUDUCHERRY|
|407-1069790-7240320|             Shipped|          Top|3XL|  1| TAMIL NADU|
+-------------------+--------------------+-------------+---+---+-----------+
only showing top 5 rows



In [21]:
# Drop a column and show 5 rows
# Syntax: df.drop("column_name").show(n)
df.drop('Category').show(5)

+-------------------+--------------------+----+---+-----------+
|              Order|              Status|Size|Qty|      State|
+-------------------+--------------------+----+---+-----------+
|405-8078784-5731545|           Cancelled|   S|  0|MAHARASHTRA|
|171-9198151-1101146|Shipped - Deliver...| 3XL|  1|  KARNATAKA|
|404-0687676-7273146|             Shipped|  XL|  1|MAHARASHTRA|
|403-9615377-8133951|           Cancelled|   L|  0| PUDUCHERRY|
|407-1069790-7240320|             Shipped| 3XL|  1| TAMIL NADU|
+-------------------+--------------------+----+---+-----------+
only showing top 5 rows



In [23]:
# Drop multiple columns and show 5 rows
# Syntax: df.drop("col1", "col2", ...).show(n)
df.drop('Status', 'State').show(5)

+-------------------+-------------+----+---+
|              Order|     Category|Size|Qty|
+-------------------+-------------+----+---+
|405-8078784-5731545|          Set|   S|  0|
|171-9198151-1101146|        kurta| 3XL|  1|
|404-0687676-7273146|        kurta|  XL|  1|
|403-9615377-8133951|Western Dress|   L|  0|
|407-1069790-7240320|          Top| 3XL|  1|
+-------------------+-------------+----+---+
only showing top 5 rows



In [25]:
# Drop columns using a list and show 5 rows
# Syntax: df.drop(*list_of_columns).show(n)
col = ['Order', 'Category', 'Size', 'Qty']
df.drop(*col).show(5)

+--------------------+-----------+
|              Status|      State|
+--------------------+-----------+
|           Cancelled|MAHARASHTRA|
|Shipped - Deliver...|  KARNATAKA|
|             Shipped|MAHARASHTRA|
|           Cancelled| PUDUCHERRY|
|             Shipped| TAMIL NADU|
+--------------------+-----------+
only showing top 5 rows



In [27]:
# Get distinct rows and show n rows  
# Syntax: df.distinct().show(n)
df.distinct().show(5)

+-------------------+--------------------+--------+----+---+----------+
|              Order|              Status|Category|Size|Qty|     State|
+-------------------+--------------------+--------+----+---+----------+
|406-7807733-3785945|Shipped - Deliver...|   kurta|   S|  1| TELANGANA|
|171-9198151-1101146|Shipped - Deliver...|   kurta| 3XL|  1| KARNATAKA|
|406-9379318-6555504|             Shipped|   kurta| XXL|  1| RAJASTHAN|
|407-5443024-5233168|           Cancelled|     Set| 3XL|  0| TELANGANA|
|407-1069790-7240320|             Shipped|     Top| 3XL|  1|TAMIL NADU|
+-------------------+--------------------+--------+----+---+----------+
only showing top 5 rows



In [29]:
# Q: What does col() do in PySpark?
# A: col() is used to access a DataFrame column by name. It allows column-level operations.

# Q: What does lit() do in PySpark?
# A: lit() is used to add constant values (like numbers or strings) to a column or expression.

# Q: What does current_date() return in PySpark?
# A: current_date() returns the current system date (no time component) in the default format.

# Q: What is when() used for in PySpark?
# A: when() is used for conditional logic, similar to if-else or CASE WHEN. It must be paired with otherwise().

# Q: How do you create a column called "Status" which says "OK" if Qty > 50, else "LOW"?
# A: df = df.withColumn("Status", when(col("Qty") > 50, "OK").otherwise("LOW"))

# Q: How do you add a column "Updated" which just contains today’s date?
# A: df = df.withColumn("Updated", current_date())

In [31]:
# Import commonly used PySpark functions
# Syntax: from pyspark.sql.functions import col, lit, current_date, when
from pyspark.sql.functions import col, lit, current_date, when

In [33]:
# Create a new column by multiplying existing column
# Syntax: df.withColumn("new_col", col("existing_col") * value).show(n)
df.withColumn('Net', col('Qty') * 2).show(5)

+-------------------+--------------------+-------------+----+---+-----------+---+
|              Order|              Status|     Category|Size|Qty|      State|Net|
+-------------------+--------------------+-------------+----+---+-----------+---+
|405-8078784-5731545|           Cancelled|          Set|   S|  0|MAHARASHTRA|  0|
|171-9198151-1101146|Shipped - Deliver...|        kurta| 3XL|  1|  KARNATAKA|  2|
|404-0687676-7273146|             Shipped|        kurta|  XL|  1|MAHARASHTRA|  2|
|403-9615377-8133951|           Cancelled|Western Dress|   L|  0| PUDUCHERRY|  0|
|407-1069790-7240320|             Shipped|          Top| 3XL|  1| TAMIL NADU|  2|
+-------------------+--------------------+-------------+----+---+-----------+---+
only showing top 5 rows



In [35]:
# Add a new column with expression and show n rows  
# Syntax: df.withColumn('NewCol', expression).show(n)
df.withColumn('Net', col('Qty') * 2).show(5)

+-------------------+--------------------+-------------+----+---+-----------+---+
|              Order|              Status|     Category|Size|Qty|      State|Net|
+-------------------+--------------------+-------------+----+---+-----------+---+
|405-8078784-5731545|           Cancelled|          Set|   S|  0|MAHARASHTRA|  0|
|171-9198151-1101146|Shipped - Deliver...|        kurta| 3XL|  1|  KARNATAKA|  2|
|404-0687676-7273146|             Shipped|        kurta|  XL|  1|MAHARASHTRA|  2|
|403-9615377-8133951|           Cancelled|Western Dress|   L|  0| PUDUCHERRY|  0|
|407-1069790-7240320|             Shipped|          Top| 3XL|  1| TAMIL NADU|  2|
+-------------------+--------------------+-------------+----+---+-----------+---+
only showing top 5 rows



In [37]:
# Add a new column with current date  
# Syntax: df.withColumn('col_name', lit(current_date()))
df.withColumn('Date', lit(current_date())).show(5)

+-------------------+--------------------+-------------+----+---+-----------+----------+
|              Order|              Status|     Category|Size|Qty|      State|      Date|
+-------------------+--------------------+-------------+----+---+-----------+----------+
|405-8078784-5731545|           Cancelled|          Set|   S|  0|MAHARASHTRA|2025-08-06|
|171-9198151-1101146|Shipped - Deliver...|        kurta| 3XL|  1|  KARNATAKA|2025-08-06|
|404-0687676-7273146|             Shipped|        kurta|  XL|  1|MAHARASHTRA|2025-08-06|
|403-9615377-8133951|           Cancelled|Western Dress|   L|  0| PUDUCHERRY|2025-08-06|
|407-1069790-7240320|             Shipped|          Top| 3XL|  1| TAMIL NADU|2025-08-06|
+-------------------+--------------------+-------------+----+---+-----------+----------+
only showing top 5 rows



In [39]:
# Add new column 'Remark' based on conditions in 'Status'  
# "B" if delivered, "S" if shipped, else "C"
df.withColumn("Remark", 
    when(col("Status") == "Shipped - Delivered to Buyer", "B")
    .when(col("Status") == "Shipped", "S")
    .otherwise("C")
).show(5)

df.withColumn("Remark", when(col("Status") == "Cancelled", "Refunded").otherwise("Paid")).show(5)

+-------------------+--------------------+-------------+----+---+-----------+------+
|              Order|              Status|     Category|Size|Qty|      State|Remark|
+-------------------+--------------------+-------------+----+---+-----------+------+
|405-8078784-5731545|           Cancelled|          Set|   S|  0|MAHARASHTRA|     C|
|171-9198151-1101146|Shipped - Deliver...|        kurta| 3XL|  1|  KARNATAKA|     B|
|404-0687676-7273146|             Shipped|        kurta|  XL|  1|MAHARASHTRA|     S|
|403-9615377-8133951|           Cancelled|Western Dress|   L|  0| PUDUCHERRY|     C|
|407-1069790-7240320|             Shipped|          Top| 3XL|  1| TAMIL NADU|     S|
+-------------------+--------------------+-------------+----+---+-----------+------+
only showing top 5 rows

+-------------------+--------------------+-------------+----+---+-----------+--------+
|              Order|              Status|     Category|Size|Qty|      State|  Remark|
+-------------------+---------------

In [41]:
# Filter rows where State is 'MAHARASHTRA'  
# Syntax: df.filter(df["col_name"] == "value")  
df.filter(df["State"] == "MAHARASHTRA").show()

+-------------------+--------------------+--------+----+---+-----------+
|              Order|              Status|Category|Size|Qty|      State|
+-------------------+--------------------+--------+----+---+-----------+
|405-8078784-5731545|           Cancelled|     Set|   S|  0|MAHARASHTRA|
|404-0687676-7273146|             Shipped|   kurta|  XL|  1|MAHARASHTRA|
|405-5513694-8146768|Shipped - Deliver...|   kurta|  XS|  1|MAHARASHTRA|
|408-7955685-3083534|             Shipped|     Set|  XS|  1|MAHARASHTRA|
|408-1298370-1920302|Shipped - Deliver...|     Set|   L|  1|MAHARASHTRA|
+-------------------+--------------------+--------+----+---+-----------+



In [43]:
# Filtering the DataFrame `df` for rows where:- 'State' is exactly "MAHARASHTRA" - AND 'Remark' is exactly "Refunded"
df.filter((col("State") == "MAHARASHTRA") & (col("Status") == "Cancelled")).show()

+-------------------+---------+--------+----+---+-----------+
|              Order|   Status|Category|Size|Qty|      State|
+-------------------+---------+--------+----+---+-----------+
|405-8078784-5731545|Cancelled|     Set|   S|  0|MAHARASHTRA|
+-------------------+---------+--------+----+---+-----------+



In [45]:
# Read CSV file into DataFrame with headers and inferred schema
csv_file_path = r'C:\Users\Admin\OneDrive\Desktop\Study\Pyspark\Data.csv'
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

In [47]:
# Drop rows with any null values  
# Syntax: df.dropna()
df.dropna().show()

+---+-----+---+-------------+
| ID| Name|Age|         City|
+---+-----+---+-------------+
|  1|Alice| 25|     New York|
|  5|  Eva| 28|San Francisco|
|  8| Hank| 40|     Portland|
+---+-----+---+-------------+



In [49]:
# Drop rows with nulls in specific columns  
# Syntax: df.dropna(subset=['col1', 'col2'])
df.dropna(subset=['City', 'Name']).show()

+---+-----+----+-------------+
| ID| Name| Age|         City|
+---+-----+----+-------------+
|  1|Alice|  25|     New York|
|  2|  Bob|NULL|  Los Angeles|
|  4|David|NULL|             |
|  5|  Eva|  28|San Francisco|
|  7|Grace|NULL|      Seattle|
|  8| Hank|  40|     Portland|
| 10| Jack|NULL|       Boston|
+---+-----+----+-------------+



In [51]:
# Drop duplicate rows  
# Syntax: df.dropDuplicates()
df.dropDuplicates().show()

+---+-----+----+-------------+
| ID| Name| Age|         City|
+---+-----+----+-------------+
|  5|  Eva|  28|San Francisco|
|  8| Hank|  40|     Portland|
|  1|Alice|  25|     New York|
| 10| Jack|NULL|       Boston|
|  6|Frank|  35|         NULL|
|  9|  Ivy|  29|         NULL|
|  4|David|NULL|             |
|  2|  Bob|NULL|  Los Angeles|
|  3| NULL|  30|      Chicago|
|  7|Grace|NULL|      Seattle|
+---+-----+----+-------------+



In [53]:
# Replace all null values with 30  
# Syntax: df.fillna(30)
df.fillna(30).show()

+---+-----+---+-------------+
| ID| Name|Age|         City|
+---+-----+---+-------------+
|  1|Alice| 25|     New York|
|  2|  Bob| 30|  Los Angeles|
|  3| NULL| 30|      Chicago|
|  4|David| 30|             |
|  5|  Eva| 28|San Francisco|
|  6|Frank| 35|         NULL|
|  7|Grace| 30|      Seattle|
|  8| Hank| 40|     Portland|
|  9|  Ivy| 29|         NULL|
| 10| Jack| 30|       Boston|
+---+-----+---+-------------+



In [55]:
# Imports function to calculate the average, max value, min value, mode and median of a column
from pyspark.sql.functions import mean, max, min, mode, median

In [57]:
# Calculate and show the average (mean) of the 'Age' column  
# Syntax: df.select(mean(df.ColumnName)).show()  
df.select(mean(df.Age)).show()

+------------------+
|          avg(Age)|
+------------------+
|31.166666666666668|
+------------------+



In [59]:
# Find and show the maximum value from the 'Age' column  
# Syntax: df.select(max(df["ColumnName"])).show()  
df.select(max(df["Age"])).show()

+--------+
|max(Age)|
+--------+
|      40|
+--------+



In [61]:
# Find and show the minimum value from the 'Age' column  
# Syntax: df.select(min(df["ColumnName"])).show()  
df.select(min(df["Age"])).show()

+--------+
|min(Age)|
+--------+
|      25|
+--------+



In [63]:
# Step 1: Select mean value of Age column
t = df.select(mean(df.Age))

# Step 2: Extract that mean value from the DataFrame
mean_value = t.collect()[0][0]

# Step 3: Replace all null values with mean and show the updated DataFrame
df.fillna(mean_value).show()

# Optional: Replace nulls in Age column only
df.fillna({"Age": mean_value}).show()

+---+-----+---+-------------+
| ID| Name|Age|         City|
+---+-----+---+-------------+
|  1|Alice| 25|     New York|
|  2|  Bob| 31|  Los Angeles|
|  3| NULL| 30|      Chicago|
|  4|David| 31|             |
|  5|  Eva| 28|San Francisco|
|  6|Frank| 35|         NULL|
|  7|Grace| 31|      Seattle|
|  8| Hank| 40|     Portland|
|  9|  Ivy| 29|         NULL|
| 10| Jack| 31|       Boston|
+---+-----+---+-------------+

+---+-----+---+-------------+
| ID| Name|Age|         City|
+---+-----+---+-------------+
|  1|Alice| 25|     New York|
|  2|  Bob| 31|  Los Angeles|
|  3| NULL| 30|      Chicago|
|  4|David| 31|             |
|  5|  Eva| 28|San Francisco|
|  6|Frank| 35|         NULL|
|  7|Grace| 31|      Seattle|
|  8| Hank| 40|     Portland|
|  9|  Ivy| 29|         NULL|
| 10| Jack| 31|       Boston|
+---+-----+---+-------------+



In [65]:
# Find and show the mode (most frequent value) from the 'Age' column
# Syntax: df.select(mode(df["ColumnName"]))
df.select(mode(df.Age)).show()

+---------+
|mode(Age)|
+---------+
|       25|
+---------+



In [67]:
# Find and show the median value from the 'Age' column  
# Syntax: df.select(median(df["ColumnName"])).show()
df.select(median(df.Age)).show()

+-----------+
|median(Age)|
+-----------+
|       29.5|
+-----------+



In [69]:
# Limit the number of rows displayed to 3  
# Syntax: df.limit(n).show()  
df.limit(3).show()

+---+-----+----+-----------+
| ID| Name| Age|       City|
+---+-----+----+-----------+
|  1|Alice|  25|   New York|
|  2|  Bob|NULL|Los Angeles|
|  3| NULL|  30|    Chicago|
+---+-----+----+-----------+



In [71]:
# Rename a column temporarily using alias  
# Syntax: df.select(df["col"].alias("new_name"))  
df.select(df["Age"].alias("Years")).show()

+-----+
|Years|
+-----+
|   25|
| NULL|
|   30|
| NULL|
|   28|
|   35|
| NULL|
|   40|
|   29|
| NULL|
+-----+



In [73]:
# Return all rows as a list of Row objects  
# Syntax: df.collect()  
df.collect()

[Row(ID=1, Name='Alice', Age=25, City='New York'),
 Row(ID=2, Name='Bob', Age=None, City='Los Angeles'),
 Row(ID=3, Name=None, Age=30, City='Chicago'),
 Row(ID=4, Name='David', Age=None, City=' '),
 Row(ID=5, Name='Eva', Age=28, City='San Francisco'),
 Row(ID=6, Name='Frank', Age=35, City=None),
 Row(ID=7, Name='Grace', Age=None, City='Seattle'),
 Row(ID=8, Name='Hank', Age=40, City='Portland'),
 Row(ID=9, Name='Ivy', Age=29, City=None),
 Row(ID=10, Name='Jack', Age=None, City='Boston')]

In [75]:
# Return first 2 rows as a list  
# Syntax: df.take(n)  
df.take(2)

[Row(ID=1, Name='Alice', Age=25, City='New York'),
 Row(ID=2, Name='Bob', Age=None, City='Los Angeles')]

In [77]:
# Return only the first 5 rows from the DataFrame  
# Syntax: df.limit(n).show()  
df.limit(5).show()

+---+-----+----+-------------+
| ID| Name| Age|         City|
+---+-----+----+-------------+
|  1|Alice|  25|     New York|
|  2|  Bob|NULL|  Los Angeles|
|  3| NULL|  30|      Chicago|
|  4|David|NULL|             |
|  5|  Eva|  28|San Francisco|
+---+-----+----+-------------+



In [81]:
# Rename a column temporarily using alias  
# Syntax: df.select(df["col"].alias("alias_name"))  
df.select(df["Age"].alias("PersonAge")).show()

+---------+
|PersonAge|
+---------+
|       25|
|     NULL|
|       30|
|     NULL|
|       28|
|       35|
|     NULL|
|       40|
|       29|
|     NULL|
+---------+



In [83]:
# Collect all rows into a list on driver (beware of large data!)  
# Syntax: df.collect()  
df.collect()

[Row(ID=1, Name='Alice', Age=25, City='New York'),
 Row(ID=2, Name='Bob', Age=None, City='Los Angeles'),
 Row(ID=3, Name=None, Age=30, City='Chicago'),
 Row(ID=4, Name='David', Age=None, City=' '),
 Row(ID=5, Name='Eva', Age=28, City='San Francisco'),
 Row(ID=6, Name='Frank', Age=35, City=None),
 Row(ID=7, Name='Grace', Age=None, City='Seattle'),
 Row(ID=8, Name='Hank', Age=40, City='Portland'),
 Row(ID=9, Name='Ivy', Age=29, City=None),
 Row(ID=10, Name='Jack', Age=None, City='Boston')]

In [85]:
# Take first n rows and return as list  
# Syntax: df.take(n)  
df.take(3)

[Row(ID=1, Name='Alice', Age=25, City='New York'),
 Row(ID=2, Name='Bob', Age=None, City='Los Angeles'),
 Row(ID=3, Name=None, Age=30, City='Chicago')]

In [87]:
# Display top 20 rows of the DataFrame  
# Syntax: df.show()  
df.show()

+---+-----+----+-------------+
| ID| Name| Age|         City|
+---+-----+----+-------------+
|  1|Alice|  25|     New York|
|  2|  Bob|NULL|  Los Angeles|
|  3| NULL|  30|      Chicago|
|  4|David|NULL|             |
|  5|  Eva|  28|San Francisco|
|  6|Frank|  35|         NULL|
|  7|Grace|NULL|      Seattle|
|  8| Hank|  40|     Portland|
|  9|  Ivy|  29|         NULL|
| 10| Jack|NULL|       Boston|
+---+-----+----+-------------+



In [89]:
# Print DataFrame schema in tree format  
# Syntax: df.printSchema()  
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- City: string (nullable = true)



In [91]:
# Select specific columns from the DataFrame  
# Syntax: df.select("col1", "col2")  
df.select("Name", "Age").show()

+-----+----+
| Name| Age|
+-----+----+
|Alice|  25|
|  Bob|NULL|
| NULL|  30|
|David|NULL|
|  Eva|  28|
|Frank|  35|
|Grace|NULL|
| Hank|  40|
|  Ivy|  29|
| Jack|NULL|
+-----+----+



In [93]:
# Drop rows with null values  
# Syntax: df.dropna()  
df.dropna().show()

+---+-----+---+-------------+
| ID| Name|Age|         City|
+---+-----+---+-------------+
|  1|Alice| 25|     New York|
|  5|  Eva| 28|San Francisco|
|  8| Hank| 40|     Portland|
+---+-----+---+-------------+



In [95]:
# Replace all null values with 30  
# Syntax: df.fillna(30)  
df.fillna(30).show()

+---+-----+---+-------------+
| ID| Name|Age|         City|
+---+-----+---+-------------+
|  1|Alice| 25|     New York|
|  2|  Bob| 30|  Los Angeles|
|  3| NULL| 30|      Chicago|
|  4|David| 30|             |
|  5|  Eva| 28|San Francisco|
|  6|Frank| 35|         NULL|
|  7|Grace| 30|      Seattle|
|  8| Hank| 40|     Portland|
|  9|  Ivy| 29|         NULL|
| 10| Jack| 30|       Boston|
+---+-----+---+-------------+



In [97]:
# Remove duplicate rows from the DataFrame  
# Syntax: df.dropDuplicates()  
df.dropDuplicates().show()

+---+-----+----+-------------+
| ID| Name| Age|         City|
+---+-----+----+-------------+
|  5|  Eva|  28|San Francisco|
|  8| Hank|  40|     Portland|
|  1|Alice|  25|     New York|
| 10| Jack|NULL|       Boston|
|  6|Frank|  35|         NULL|
|  9|  Ivy|  29|         NULL|
|  4|David|NULL|             |
|  2|  Bob|NULL|  Los Angeles|
|  3| NULL|  30|      Chicago|
|  7|Grace|NULL|      Seattle|
+---+-----+----+-------------+



In [99]:
# Return unique rows only (like SQL DISTINCT)  
# Syntax: df.distinct()  
df.distinct().show()

+---+-----+----+-------------+
| ID| Name| Age|         City|
+---+-----+----+-------------+
|  5|  Eva|  28|San Francisco|
|  8| Hank|  40|     Portland|
|  1|Alice|  25|     New York|
| 10| Jack|NULL|       Boston|
|  6|Frank|  35|         NULL|
|  9|  Ivy|  29|         NULL|
|  4|David|NULL|             |
|  2|  Bob|NULL|  Los Angeles|
|  3| NULL|  30|      Chicago|
|  7|Grace|NULL|      Seattle|
+---+-----+----+-------------+



In [101]:
# Add or update a column in the DataFrame  
# Syntax: df.withColumn("NewCol", expr)  
df.withColumn("AgePlus5", df["Age"] + 5).show()

+---+-----+----+-------------+--------+
| ID| Name| Age|         City|AgePlus5|
+---+-----+----+-------------+--------+
|  1|Alice|  25|     New York|      30|
|  2|  Bob|NULL|  Los Angeles|    NULL|
|  3| NULL|  30|      Chicago|      35|
|  4|David|NULL|             |    NULL|
|  5|  Eva|  28|San Francisco|      33|
|  6|Frank|  35|         NULL|      40|
|  7|Grace|NULL|      Seattle|    NULL|
|  8| Hank|  40|     Portland|      45|
|  9|  Ivy|  29|         NULL|      34|
| 10| Jack|NULL|       Boston|    NULL|
+---+-----+----+-------------+--------+



In [103]:
# Rename an existing column  
# Syntax: df.withColumnRenamed("old", "new")  
df.withColumnRenamed("Age", "Years").show()

+---+-----+-----+-------------+
| ID| Name|Years|         City|
+---+-----+-----+-------------+
|  1|Alice|   25|     New York|
|  2|  Bob| NULL|  Los Angeles|
|  3| NULL|   30|      Chicago|
|  4|David| NULL|             |
|  5|  Eva|   28|San Francisco|
|  6|Frank|   35|         NULL|
|  7|Grace| NULL|      Seattle|
|  8| Hank|   40|     Portland|
|  9|  Ivy|   29|         NULL|
| 10| Jack| NULL|       Boston|
+---+-----+-----+-------------+



In [105]:
# Filter rows based on condition  
# Syntax: df.filter(condition) or df.where(condition)  
df.filter(df["Age"] > 30).show()

+---+-----+---+--------+
| ID| Name|Age|    City|
+---+-----+---+--------+
|  6|Frank| 35|    NULL|
|  8| Hank| 40|Portland|
+---+-----+---+--------+



In [135]:
# Group data and apply aggregation functions  
# Syntax: df.groupBy("col").agg(F.agg_func("col"))  
from pyspark.sql.functions import sum  
df.groupBy("City").agg(sum("Age")).show()

+-------------+--------+
|         City|sum(Age)|
+-------------+--------+
|  Los Angeles|    NULL|
|San Francisco|      28|
|         NULL|      64|
|     Portland|      40|
|      Chicago|      30|
|      Seattle|    NULL|
|             |    NULL|
|     New York|      25|
|       Boston|    NULL|
+-------------+--------+



In [109]:
# Sort the DataFrame by column(s)  
# Syntax: df.orderBy("col") or df.sort("col")  
df.orderBy("Age").show()

+---+-----+----+-------------+
| ID| Name| Age|         City|
+---+-----+----+-------------+
|  2|  Bob|NULL|  Los Angeles|
|  4|David|NULL|             |
|  7|Grace|NULL|      Seattle|
| 10| Jack|NULL|       Boston|
|  1|Alice|  25|     New York|
|  5|  Eva|  28|San Francisco|
|  9|  Ivy|  29|         NULL|
|  3| NULL|  30|      Chicago|
|  6|Frank|  35|         NULL|
|  8| Hank|  40|     Portland|
+---+-----+----+-------------+



In [111]:
# Generate summary statistics for numeric columns  
# Syntax: df.describe().show()  
df.describe().show()

+-------+------------------+-----+------------------+-------+
|summary|                ID| Name|               Age|   City|
+-------+------------------+-----+------------------+-------+
|  count|                10|    9|                 6|      8|
|   mean|               5.5| NULL|31.166666666666668|   NULL|
| stddev|3.0276503540974917| NULL| 5.419102016632153|   NULL|
|    min|                 1|Alice|                25|       |
|    max|                10| Jack|                40|Seattle|
+-------+------------------+-----+------------------+-------+



In [113]:
# Count total number of rows in the DataFrame  
# Syntax: df.count()  
df.count()

10

In [115]:
# Filter rows with null in a specific column  
# Syntax: df.filter(df["col"].isNull())  
df.filter(df["Age"].isNull()).show()

+---+-----+----+-----------+
| ID| Name| Age|       City|
+---+-----+----+-----------+
|  2|  Bob|NULL|Los Angeles|
|  4|David|NULL|           |
|  7|Grace|NULL|    Seattle|
| 10| Jack|NULL|     Boston|
+---+-----+----+-----------+



In [117]:
# Filter rows where column is not null  
# Syntax: df.filter(df["col"].isNotNull())  
df.filter(df["Age"].isNotNull()).show()

+---+-----+---+-------------+
| ID| Name|Age|         City|
+---+-----+---+-------------+
|  1|Alice| 25|     New York|
|  3| NULL| 30|      Chicago|
|  5|  Eva| 28|San Francisco|
|  6|Frank| 35|         NULL|
|  8| Hank| 40|     Portland|
|  9|  Ivy| 29|         NULL|
+---+-----+---+-------------+



In [119]:
# Convert column to a different data type  
# Syntax: df.withColumn("col", df["col"].cast("new_type"))  
df.withColumn("Age", df["Age"].cast("string")).printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- City: string (nullable = true)



In [121]:
# Apply conditional logic to create new column  
# Syntax: df.withColumn("col", when(condition, value))  
from pyspark.sql.functions import when  
df.withColumn("Senior", when(df["Age"] > 60, "Yes").otherwise("No")).show()

+---+-----+----+-------------+------+
| ID| Name| Age|         City|Senior|
+---+-----+----+-------------+------+
|  1|Alice|  25|     New York|    No|
|  2|  Bob|NULL|  Los Angeles|    No|
|  3| NULL|  30|      Chicago|    No|
|  4|David|NULL|             |    No|
|  5|  Eva|  28|San Francisco|    No|
|  6|Frank|  35|         NULL|    No|
|  7|Grace|NULL|      Seattle|    No|
|  8| Hank|  40|     Portland|    No|
|  9|  Ivy|  29|         NULL|    No|
| 10| Jack|NULL|       Boston|    No|
+---+-----+----+-------------+------+



In [123]:
# Import common functions from pyspark.sql.functions  
# Syntax: from pyspark.sql.functions import col, lit, avg, sum, count  
from pyspark.sql.functions import col, lit, avg, sum, count  
df.select(avg("Age"), count("Name")).show()

+------------------+-----------+
|          avg(Age)|count(Name)|
+------------------+-----------+
|31.166666666666668|          9|
+------------------+-----------+



In [125]:
# Drop one or more columns from DataFrame  
# Syntax: df.drop("col1", "col2")  
df.drop("Phone", "City").show()

+---+-----+----+
| ID| Name| Age|
+---+-----+----+
|  1|Alice|  25|
|  2|  Bob|NULL|
|  3| NULL|  30|
|  4|David|NULL|
|  5|  Eva|  28|
|  6|Frank|  35|
|  7|Grace|NULL|
|  8| Hank|  40|
|  9|  Ivy|  29|
| 10| Jack|NULL|
+---+-----+----+



In [127]:
# Find mode (most frequent value) from a column  
# Syntax: df.groupBy("col").count().orderBy("count", ascending=False).first()  
df.groupBy("Age").count().orderBy("count", ascending=False).first()

Row(Age=None, count=4)

In [129]:
# Find median using approxQuantile  
# Syntax: df.approxQuantile("col", [0.5], 0.01)  
df.approxQuantile("Age", [0.5], 0.01)

[29.0]