Creating spark session

In [100]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
                    .appName("Retail Transactions") \
                    .getOrCreate()
spark


#### Basics


 1. Load retail_data.csv into a PySpark DataFrame and display schema.

In [101]:
from google.colab import drive

drive.mount('/content/drive')

retail_df = spark.read.format('csv') \
                      .option("header", True) \
                      .load('/content/drive/MyDrive/Assignment/Retail_Transactions.csv')

retail_df.printSchema()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
root
 |-- TransactionID: string (nullable = true)
 |-- Customer: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- UnitPrice: string (nullable = true)
 |-- TotalPrice: string (nullable = true)
 |-- TransactionDa: string (nullable = true)
 |-- PaymentMode: string (nullable = true)



 2. Infer schema as False — then manually cast columns.

In [102]:
from pyspark.sql.types import StringType, IntegerType, FloatType, StructType, StructField, DateType
from pyspark.sql.functions import trim, col

custom_schema = StructType([
    StructField("TransactionID", StringType(), True),
    StructField("Customer", StringType(), True),
    StructField("City", StringType(), True),
    StructField("Product", StringType(), True),
    StructField("Category", StringType(), True),
    StructField("Quantity", IntegerType(), True),
    StructField("UnitPrice", FloatType(), True),
    StructField("TotalPrice", FloatType(), True),
    StructField("TransactionDa", DateType(), True),
    StructField("PaymentMode", StringType(), True)
])

retail_custom_df = spark.read.format('csv') \
                             .option("header", True) \
                             .schema(custom_schema) \
                             .load('/content/drive/MyDrive/Assignment/Retail_Transactions.csv')

retail_custom_df.printSchema()

root
 |-- TransactionID: string (nullable = true)
 |-- Customer: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: float (nullable = true)
 |-- TotalPrice: float (nullable = true)
 |-- TransactionDa: date (nullable = true)
 |-- PaymentMode: string (nullable = true)



Trimming extra space in column

In [103]:
for col_name in retail_custom_df.columns:
    retail_custom_df = retail_custom_df.withColumnRenamed(col_name, col_name.strip())

####  Data Exploration & Filtering


3. Filter transactions where
TotalPrice > 40000 .

In [104]:
retail_custom_df.filter(col('TotalPrice') > 40000) \
                .select(
                          'TransactionID',
                          'TotalPrice',
                          'TransactionDa',
                          'PaymentMode'
                ).show()

+-------------+----------+-------------+-----------+
|TransactionID|TotalPrice|TransactionDa|PaymentMode|
+-------------+----------+-------------+-----------+
|        T1001|   70000.0|   2024-01-15|      Card |
|        T1002|   60000.0|   2024-01-20|       UPI |
|        T1005|   50000.0|   2024-02-15|      Card |
+-------------+----------+-------------+-----------+



4.  Get unique cities from the dataset

In [105]:
retail_custom_df.select('City').distinct().show()

+---------+
|     City|
+---------+
|Bangalore|
|   Mumbai|
|    Delhi|
|Hyderabad|
+---------+



5. Find all transactions from "Delhi" using .filter() and .where()

In [106]:
retail_custom_df.filter(col("City") == "Delhi") \
         .select(
                  'TransactionID',
                  'Category',
                  'Product',
                  'Quantity',
                  'UnitPrice',
                  'TotalPrice',
                  'TransactionDa',
                  'PaymentMode'
         ).show()

+-------------+-----------+-------+--------+---------+----------+-------------+-----------+
|TransactionID|   Category|Product|Quantity|UnitPrice|TotalPrice|TransactionDa|PaymentMode|
+-------------+-----------+-------+--------+---------+----------+-------------+-----------+
|        T1004|  Furniture|  Chair|       4|   5000.0|   20000.0|   2024-02-12|      Card |
|        T1006|Electronics|  Mouse|       3|   1000.0|    3000.0|   2024-02-18|       Cash|
+-------------+-----------+-------+--------+---------+----------+-------------+-----------+



In [107]:
retail_custom_df.where(col("City") == "Delhi") \
         .select(
                  'TransactionID',
                  'Category',
                  'Product',
                  'Quantity',
                  'UnitPrice',
                  'TotalPrice',
                  'TransactionDa',
                  'PaymentMode'
         ).show()

+-------------+-----------+-------+--------+---------+----------+-------------+-----------+
|TransactionID|   Category|Product|Quantity|UnitPrice|TotalPrice|TransactionDa|PaymentMode|
+-------------+-----------+-------+--------+---------+----------+-------------+-----------+
|        T1004|  Furniture|  Chair|       4|   5000.0|   20000.0|   2024-02-12|      Card |
|        T1006|Electronics|  Mouse|       3|   1000.0|    3000.0|   2024-02-18|       Cash|
+-------------+-----------+-------+--------+---------+----------+-------------+-----------+



####  Data Manipulation

6.
Add a column DiscountedPrice
TotalPrice
10%.

In [108]:
retail_custom_df = retail_custom_df.withColumn('DiscountAmount', col('TotalPrice') * 0.1)

retail_custom_df.select(
                          'TransactionID',
                          'Product',
                          'Quantity',
                          'UnitPrice',
                          'TotalPrice',
                          'DiscountAmount',
                          'TransactionDa',
                          'PaymentMode'
).show()

+-------------+-------+--------+---------+----------+--------------+-------------+------------+
|TransactionID|Product|Quantity|UnitPrice|TotalPrice|DiscountAmount|TransactionDa| PaymentMode|
+-------------+-------+--------+---------+----------+--------------+-------------+------------+
|        T1001| Laptop|       1|  70000.0|   70000.0|        7000.0|   2024-01-15|       Card |
|        T1002| Tablet|       2|  30000.0|   60000.0|        6000.0|   2024-01-20|        UPI |
|        T1003|   Desk|       1|  15000.0|   15000.0|        1500.0|   2024-02-10|Net Banking |
|        T1004|  Chair|       4|   5000.0|   20000.0|        2000.0|   2024-02-12|       Card |
|        T1005|  Phone|       1|  50000.0|   50000.0|        5000.0|   2024-02-15|       Card |
|        T1006|  Mouse|       3|   1000.0|    3000.0|         300.0|   2024-02-18|        Cash|
+-------------+-------+--------+---------+----------+--------------+-------------+------------+



7.Rename TransactionDate to TxnDate .

In [109]:
retail_custom_df = retail_custom_df.withColumnRenamed("TransactionDa", "TxnDate")
retail_custom_df.printSchema()

root
 |-- TransactionID: string (nullable = true)
 |-- Customer: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: float (nullable = true)
 |-- TotalPrice: float (nullable = true)
 |-- TxnDate: date (nullable = true)
 |-- PaymentMode: string (nullable = true)
 |-- DiscountAmount: double (nullable = true)



8. Drop the column UnitPrice .

In [110]:
retail_custom_df.drop('UnitPrice').printSchema()

root
 |-- TransactionID: string (nullable = true)
 |-- Customer: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- TotalPrice: float (nullable = true)
 |-- TxnDate: date (nullable = true)
 |-- PaymentMode: string (nullable = true)
 |-- DiscountAmount: double (nullable = true)



#### Aggregations

9. Get total sales by city

In [111]:
from pyspark.sql.functions import sum
retail_custom_df.groupBy('City') \
                .agg(sum('TotalPrice').alias('TotalSales')) \
                .sort('TotalSales', ascending=False) \
                .show()

+---------+----------+
|     City|TotalSales|
+---------+----------+
|   Mumbai|  120000.0|
|Bangalore|   60000.0|
|    Delhi|   23000.0|
|Hyderabad|   15000.0|
+---------+----------+



10. Get average unit price by category.

In [112]:
from pyspark.sql.functions import avg
retail_custom_df.groupBy('Category') \
                .agg(avg('UnitPrice')) \
                .show()

+-----------+--------------+
|   Category|avg(UnitPrice)|
+-----------+--------------+
|Electronics|       37750.0|
|  Furniture|       10000.0|
+-----------+--------------+



11. Count of transactions grouped by PaymentMode.


In [113]:
from pyspark.sql.functions import count
retail_custom_df.groupBy('PaymentMode') \
                .count() \
                .withColumnRenamed('count', 'TransactionCount') \
                .sort('TransactionCount', ascending=False) \
                .show()


+------------+----------------+
| PaymentMode|TransactionCount|
+------------+----------------+
|       Card |               3|
|Net Banking |               1|
|        Cash|               1|
|        UPI |               1|
+------------+----------------+



####  Window Functions


12. Use a window partitioned by City to rank transactions by


In [114]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

window_df = Window.partitionBy('City')

retail_custom_df.withColumn('Rank', row_number().over(window_df.orderBy('TxnDate'))) \
                .select(
                          'TransactionID',
                          'City',
                          'Rank',
                          'TxnDate'
                ).show()

+-------------+---------+----+----------+
|TransactionID|     City|Rank|   TxnDate|
+-------------+---------+----+----------+
|        T1002|Bangalore|   1|2024-01-20|
|        T1004|    Delhi|   1|2024-02-12|
|        T1006|    Delhi|   2|2024-02-18|
|        T1003|Hyderabad|   1|2024-02-10|
|        T1001|   Mumbai|   1|2024-01-15|
|        T1005|   Mumbai|   2|2024-02-15|
+-------------+---------+----+----------+



13. Use lag function to get previous transaction amount per city

In [115]:
from pyspark.sql.functions import lag

window_df = Window.partitionBy('City').orderBy('TxnDate')

retail_custom_df.withColumn('PreviousTransaction', lag('TxnDate').over(window_df)) \
                .select(
                          'TransactionID',
                          'City',
                          'TxnDate',
                          'PreviousTransaction'
                ).show()

+-------------+---------+----------+-------------------+
|TransactionID|     City|   TxnDate|PreviousTransaction|
+-------------+---------+----------+-------------------+
|        T1002|Bangalore|2024-01-20|               NULL|
|        T1004|    Delhi|2024-02-12|               NULL|
|        T1006|    Delhi|2024-02-18|         2024-02-12|
|        T1003|Hyderabad|2024-02-10|               NULL|
|        T1001|   Mumbai|2024-01-15|               NULL|
|        T1005|   Mumbai|2024-02-15|         2024-01-15|
+-------------+---------+----------+-------------------+



#### Joins

14. Create a second DataFrame
city_region

In [116]:
city_region_schema = StructType([
    StructField("City", StringType(), True),
    StructField("Region", StringType(), True)
])

data = [
    ("Mumbai", "West"),
    ("Delhi", "North"),
    ("Bangalore", "South"),
    ("Hyderabad", "South")
]

city_region_df = spark.createDataFrame(data, city_region_schema)

city_region_df.printSchema()

root
 |-- City: string (nullable = true)
 |-- Region: string (nullable = true)



15. Join with main DataFrame and group total sales by Region.


In [117]:
# Joining with main dataframe

retail_custom_df = retail_custom_df.join(city_region_df, on='City', how='left')
retail_custom_df.printSchema()
# Group Total Sales by region

retail_custom_df.groupBy('Region') \
                .agg(sum('TotalPrice').alias('TotalSales')) \
                .show()


root
 |-- City: string (nullable = true)
 |-- TransactionID: string (nullable = true)
 |-- Customer: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: float (nullable = true)
 |-- TotalPrice: float (nullable = true)
 |-- TxnDate: date (nullable = true)
 |-- PaymentMode: string (nullable = true)
 |-- DiscountAmount: double (nullable = true)
 |-- Region: string (nullable = true)

+------+----------+
|Region|TotalSales|
+------+----------+
| South|   75000.0|
|  West|  120000.0|
| North|   23000.0|
+------+----------+



#### Nulls and Data Cleaning


16. Introduce some nulls and replace them with default values.


In [121]:
from pyspark.sql.functions import when, rand
retail_custom_df = retail_custom_df.withColumn("Quantity",
                                                when(col('Quantity') < 2, None).otherwise(col('Quantity'))
                                              )
retail_custom_df.select('TransactionID', 'Quantity').show()
retail_custom_df = retail_custom_df.fillna({'Quantity': 1})

retail_custom_df.select('TransactionID', 'Quantity').show()

+-------------+--------+
|TransactionID|Quantity|
+-------------+--------+
|        T1002|       2|
|        T1001|    NULL|
|        T1005|    NULL|
|        T1004|       4|
|        T1006|       3|
|        T1003|    NULL|
+-------------+--------+

+-------------+--------+
|TransactionID|Quantity|
+-------------+--------+
|        T1002|       2|
|        T1001|       1|
|        T1005|       1|
|        T1004|       4|
|        T1006|       3|
|        T1003|       1|
+-------------+--------+



 17. Drop rows where
Quantity is null.

In [122]:
retail_custom_df = retail_custom_df.na.drop(subset=['Quantity'])
retail_custom_df.select('TransactionID', 'Quantity').show()

+-------------+--------+
|TransactionID|Quantity|
+-------------+--------+
|        T1002|       2|
|        T1001|       1|
|        T1005|       1|
|        T1004|       4|
|        T1006|       3|
|        T1003|       1|
+-------------+--------+



 18. Fill null
PaymentMode with "Unknown".

In [123]:
retail_custom_df = retail_custom_df.fillna({'PaymentMode': 'Unknown'})
retail_custom_df.select('TransactionID', 'PaymentMode').show()

+-------------+------------+
|TransactionID| PaymentMode|
+-------------+------------+
|        T1002|        UPI |
|        T1001|       Card |
|        T1005|       Card |
|        T1004|       Card |
|        T1006|        Cash|
|        T1003|Net Banking |
+-------------+------------+



####  Custom Functions

19.  Write a UDF to label orders:

In [125]:
from pyspark.sql.functions import udf
def label_order(amount):
  if amount > 50000: return "High"
  if amount >= 30000: return "Medium"
  return "Low"

label_order_fun = udf(label_order, StringType())
retail_custom_df = retail_custom_df.withColumn('OrderLabel', label_order_fun(col('TotalPrice')))
retail_custom_df.select(
                          'TransactionID',
                          'TotalPrice',
                          'OrderLabel'
).show()


+-------------+----------+----------+
|TransactionID|TotalPrice|OrderLabel|
+-------------+----------+----------+
|        T1002|   60000.0|      High|
|        T1001|   70000.0|      High|
|        T1005|   50000.0|    Medium|
|        T1004|   20000.0|       Low|
|        T1006|    3000.0|       Low|
|        T1003|   15000.0|       Low|
+-------------+----------+----------+



####  Date & Time


20. Extract year, month, and day from TxnDate .

In [129]:
from pyspark.sql.functions import year, month, date_format, dayofmonth as day

retail_custom_df = retail_custom_df.withColumn('Year', year(col('TxnDate'))) \
                                   .withColumn('Month', month(col('TxnDate'))) \
                                   .withColumn('MonthName', date_format(col('TxnDate'), 'MMMM')) \
                                   .withColumn('Day', day(col('TxnDate')))

retail_custom_df.select(
                          'TransactionID',
                          'TxnDate',
                          'Day',
                          'Month',
                          'MonthName',
                          'Year'

).show()

+-------------+----------+---+-----+---------+----+
|TransactionID|   TxnDate|Day|Month|MonthName|Year|
+-------------+----------+---+-----+---------+----+
|        T1002|2024-01-20| 20|    1|  January|2024|
|        T1001|2024-01-15| 15|    1|  January|2024|
|        T1005|2024-02-15| 15|    2| February|2024|
|        T1004|2024-02-12| 12|    2| February|2024|
|        T1006|2024-02-18| 18|    2| February|2024|
|        T1003|2024-02-10| 10|    2| February|2024|
+-------------+----------+---+-----+---------+----+



 21. Filter transactions that happened in February.


In [132]:
retail_custom_df.filter(col('Month') == 2) \
                .select(
                          'TransactionID',
                          'Product',
                          'Category',
                          'Quantity',
                          'UnitPrice',
                          'TotalPrice',
                          'Day',
                          'Month',
                          'Year',
                          'PaymentMode'
                ).show()

+-------------+-------+-----------+--------+---------+----------+---+-----+----+------------+
|TransactionID|Product|   Category|Quantity|UnitPrice|TotalPrice|Day|Month|Year| PaymentMode|
+-------------+-------+-----------+--------+---------+----------+---+-----+----+------------+
|        T1005|  Phone|Electronics|       1|  50000.0|   50000.0| 15|    2|2024|       Card |
|        T1004|  Chair|  Furniture|       4|   5000.0|   20000.0| 12|    2|2024|       Card |
|        T1006|  Mouse|Electronics|       3|   1000.0|    3000.0| 18|    2|2024|        Cash|
|        T1003|   Desk|  Furniture|       1|  15000.0|   15000.0| 10|    2|2024|Net Banking |
+-------------+-------+-----------+--------+---------+----------+---+-----+----+------------+



####  Union & Duplicate Handling

 22. Duplicate the DataFrame using union() and remove duplicates.

In [134]:
duplicate_df = retail_custom_df.union(retail_custom_df)

print("Before Removing Duplicates")
duplicate_df.show()

duplicate_df = duplicate_df.dropDuplicates()

print("Before Removing Duplicates")
duplicate_df.show()


Before Removing Duplicates
+---------+-------------+--------+-------+-----------+--------+---------+----------+----------+------------+--------------+------+----------+----+-----+---+---------+
|     City|TransactionID|Customer|Product|   Category|Quantity|UnitPrice|TotalPrice|   TxnDate| PaymentMode|DiscountAmount|Region|OrderLabel|Year|Month|Day|MonthName|
+---------+-------------+--------+-------+-----------+--------+---------+----------+----------+------------+--------------+------+----------+----+-----+---+---------+
|Bangalore|        T1002|    Neha| Tablet|Electronics|       2|  30000.0|   60000.0|2024-01-20|        UPI |        6000.0| South|      High|2024|    1| 20|  January|
|   Mumbai|        T1001|     Ali| Laptop|Electronics|       1|  70000.0|   70000.0|2024-01-15|       Card |        7000.0|  West|      High|2024|    1| 15|  January|
|   Mumbai|        T1005|   Karan|  Phone|Electronics|       1|  50000.0|   50000.0|2024-02-15|       Card |        5000.0|  West|    Medi

Storing the cleaned data

In [135]:
retail_custom_df.write.mode('overwrite') \
                      .parquet('/content/drive/MyDrive/Assignment/Cleaned_Retail_Transactions')