**Intialize the Spark Session**

In [0]:
from pyspark.sql import SparkSession
spark=SparkSession.builder\
      .appName("assignment-2")\
      .getOrCreate()
spark

**Basics**

In [0]:
#1.Load data with schema
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
schema = StructType([
    StructField("TransactionID", StringType(), True),
    StructField("Customer", StringType(), True),
    StructField("City", StringType(), True),
    StructField("Product", StringType(), True),
    StructField("Category", StringType(), True),
    StructField("Quantity", IntegerType(), True),
    StructField("UnitPrice", IntegerType(), True),
    StructField("TotalPrice", IntegerType(), True),
    StructField("TransactionDate", DateType(), True),
    StructField("PaymentMode", StringType(), True)
])
retail= spark.read.option("header", True).schema(schema).csv("file:/Workspace/Shared/retail_data.csv")
retail.printSchema()
retail.show()

root
 |-- TransactionID: string (nullable = true)
 |-- Customer: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- TotalPrice: integer (nullable = true)
 |-- TransactionDate: date (nullable = true)
 |-- PaymentMode: string (nullable = true)

+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+------------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate| PaymentMode|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+------------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|     2024-01-15|       Card |
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|     2024-01-20|        UPI |
|        T

**Data Exploration & Filtering**

In [0]:
#3.Filter transactions where TotalPrice > 40000.
print("total price > 40000:")
retail.filter(retail.TotalPrice > 40000).show()
#4.Get unique cities from the dataset.
print("unique cities:")
retail.select("City").distinct().show()
#5.Find all transactions from "Delhi" using .filter() and .where().
print("transactions from Delhi:")
retail.where(retail.City == "Delhi").show()

total price > 40000:
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|     2024-01-15|      Card |
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|     2024-01-20|       UPI |
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|     2024-02-15|      Card |
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+

unique cities:
+---------+
|     City|
+---------+
|Bangalore|
|   Mumbai|
|    Delhi|
|Hyderabad|
+---------+

transactions from Delhi:
+-------------+--------+-----+-------+-----------+

**Data Manipulation**

In [0]:
#6.Add a column DiscountedPrice =TotalPrice - 10%.
retail.withColumn("DiscountedPrice", retail.TotalPrice * 0.9).show()
#7.Rename TransactionDate to TxnDate.
b=retail.withColumnRenamed("TransactionDate", "TxnDate").show()
#8.Drop the column UnitPrice.
retail.drop("UnitPrice").show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+------------+---------------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate| PaymentMode|DiscountedPrice|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+------------+---------------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|     2024-01-15|       Card |        63000.0|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|     2024-01-20|        UPI |        54000.0|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|     2024-02-10|Net Banking |        13500.0|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|     2024-02-12|       Card |        18000.0|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|     2

**Aggregations**

In [0]:
 #9.Get total sales by city.
 print("total sales by city:")
 retail.groupBy("City").sum("TotalPrice").show()
 #10.Get average unit price by category.
 print("average unit price by category:")
 retail.groupBy("Category").avg("UnitPrice").show()
 #11.Count of transactions grouped by PaymentMode.
 print("count of transactions grouped by PaymentMode:")
 retail.groupBy("PaymentMode").count().show()

total sales by city:
+---------+---------------+
|     City|sum(TotalPrice)|
+---------+---------------+
|Bangalore|          60000|
|   Mumbai|         120000|
|    Delhi|          23000|
|Hyderabad|          15000|
+---------+---------------+

average unit price by category:
+-----------+--------------+
|   Category|avg(UnitPrice)|
+-----------+--------------+
|Electronics|       37750.0|
|  Furniture|       10000.0|
+-----------+--------------+

count of transactions grouped by PaymentMode:
+------------+-----+
| PaymentMode|count|
+------------+-----+
|Net Banking |    1|
|        Cash|    1|
|        UPI |    1|
|       Card |    3|
+------------+-----+



**Window Functions**

In [0]:
#12.Use a window partitioned by City to rank transactions by TotalPrice .
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, lag
w= Window.partitionBy("City").orderBy("TotalPrice")
print("rank transactions by TotalPrice:")
retail.withColumn("rank", row_number().over(w)).show()  
#13.Use lag function to get previous transaction amount per city.
print("previous transaction amount per city:")
retail.withColumn("prev_txn", lag("TotalPrice").over(w)).show()

rank transactions by TotalPrice:
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+------------+----+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate| PaymentMode|rank|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+------------+----+
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|     2024-01-20|        UPI |   1|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|     1000|      3000|     2024-02-18|        Cash|   1|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|     2024-02-12|       Card |   2|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|     2024-02-10|Net Banking |   1|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|     2024-02-15|       Card |   1|
|        T1001|

**Joins**

In [0]:
#Join with city-region data
from pyspark.sql.functions import col, sum, avg, count, when, rank, lag, year, month, dayofmonth, udf
region_data = [("Mumbai", "West"), ("Delhi", "North"), ("Bangalore", "South"), ("Hyderabad", "South")]
region_df = spark.createDataFrame(region_data, ["City", "Region"])
a= retail.join(region_df, "City", "left")
print("joined with cities region:")
a.show()
# 15.Join with main DataFrame and group total sales by Region
print("total sales by region:")
a.groupBy("Region").agg(sum("TotalPrice").alias("RegionSales")).show()

joined with cities region:
+---------+-------------+--------+-------+-----------+--------+---------+----------+---------------+------------+------+
|     City|TransactionID|Customer|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate| PaymentMode|Region|
+---------+-------------+--------+-------+-----------+--------+---------+----------+---------------+------------+------+
|   Mumbai|        T1001|     Ali| Laptop|Electronics|       1|    70000|     70000|     2024-01-15|       Card |  West|
|Bangalore|        T1002|    Neha| Tablet|Electronics|       2|    30000|     60000|     2024-01-20|        UPI | South|
|Hyderabad|        T1003|    Ravi|   Desk|  Furniture|       1|    15000|     15000|     2024-02-10|Net Banking | South|
|    Delhi|        T1004|    Zoya|  Chair|  Furniture|       4|     5000|     20000|     2024-02-12|       Card | North|
|   Mumbai|        T1005|   Karan|  Phone|Electronics|       1|    50000|     50000|     2024-02-15|       Card |  West|
|    

**Nulls and Data Cleaning data**

In [0]:
#16.Introduce some nulls and replace them with default values.
print("nulls and replace them with default values:")
a.fillna(value=0,subset=["Quantity"]).show()
#17.Drop rows where Quantity is null.
print("droped rows where Quantity is null:")
a.dropna(subset=["Quantity"]).show()
#18.Fill null PaymentMode with "Unknown".
print("filled null PaymentMode with Unknown:")
a.fillna(value="Unknown",subset=["PaymentMode"]).show()

nulls and replace them with default values:
+---------+-------------+--------+-------+-----------+--------+---------+----------+---------------+------------+------+
|     City|TransactionID|Customer|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate| PaymentMode|Region|
+---------+-------------+--------+-------+-----------+--------+---------+----------+---------------+------------+------+
|   Mumbai|        T1001|     Ali| Laptop|Electronics|       1|    70000|     70000|     2024-01-15|       Card |  West|
|Bangalore|        T1002|    Neha| Tablet|Electronics|       2|    30000|     60000|     2024-01-20|        UPI | South|
|Hyderabad|        T1003|    Ravi|   Desk|  Furniture|       1|    15000|     15000|     2024-02-10|Net Banking | South|
|    Delhi|        T1004|    Zoya|  Chair|  Furniture|       4|     5000|     20000|     2024-02-12|       Card | North|
|   Mumbai|        T1005|   Karan|  Phone|Electronics|       1|    50000|     50000|     2024-02-15|       Ca

**Custom Functions**

In [0]:
#19.Write a UDF to label orders
@udf(StringType())
def label_order(amount):
    return "High" if amount > 50000 else "Medium" if amount >= 30000 else "Low"
u=retail.withColumn("OrderLabel", label_order(col("TotalPrice")))
print("label orders:")
u.show()

label orders:
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+------------+----------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate| PaymentMode|OrderLabel|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+------------+----------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|     2024-01-15|       Card |      High|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|     2024-01-20|        UPI |      High|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|     2024-02-10|Net Banking |       Low|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|     2024-02-12|       Card |       Low|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|     2024-02-15|       Card

**Date & Time**

In [0]:
#20.Extract year, month, and day from TransactionDate.
print("extract year, month, and day from TransactionDate:")
u.withColumn("year", year(col("TransactionDate"))).withColumn("month", month(col("TransactionDate"))).withColumn("day", dayofmonth(col("TransactionDate"))).show()
#21.Filter transactions that happened in February.
print("filtered transactions that happened in February:")
u.filter(month(col("TransactionDate")) == 2).show()

extract year, month, and day from TransactionDate:
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+------------+----------+----+-----+---+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate| PaymentMode|OrderLabel|year|month|day|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+------------+----------+----+-----+---+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|     2024-01-15|       Card |      High|2024|    1| 15|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|     2024-01-20|        UPI |      High|2024|    1| 20|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|     2024-02-10|Net Banking |       Low|2024|    2| 10|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|     2024-02-12|       Ca

**Union & Duplicate Handling**

In [0]:
#22.Duplicate the DataFrame using union() and remove duplicates.
print(" After duplicated the DataFrame using union() and removed duplicates:")
u.union(u).distinct().show()

 After duplicated the DataFrame using union() and removed duplicates:
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+------------+----------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate| PaymentMode|OrderLabel|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+------------+----------+
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|     2024-02-10|Net Banking |       Low|
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|     2024-01-15|       Card |      High|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|     2024-02-12|       Card |       Low|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|     2024-02-15|       Card |    Medium|
|        T1006|   Farah|    Delhi|  Mouse|Electronics| 