In [1]:
import pyspark
from pyspark.sql import SparkSession #Necessary for initializing pyspark
from pyspark.sql.functions import *

In [4]:
spark = SparkSession.builder.appName('Pyspark Demo').getOrCreate()

In [6]:
csv_file = "retail_sales_dataset.csv"

df = spark.read \
    .format("csv") \
    .option("inferSchema", "true") \
    .option("header", "true") \
    .load(csv_file)


# Creating a temporary view to run SQL Queries
df.createOrReplaceTempView("u") 

In [7]:
df.show()

+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+
|Transaction ID|      Date|Customer ID|Gender|Age|Product Category|Quantity|Price per Unit|Total Amount|
+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+
|             1|2023-11-24|    CUST001|  Male| 34|          Beauty|       3|            50|         150|
|             2|2023-02-27|    CUST002|Female| 26|        Clothing|       2|           500|        1000|
|             3|2023-01-13|    CUST003|  Male| 50|     Electronics|       1|            30|          30|
|             4|2023-05-21|    CUST004|  Male| 37|        Clothing|       1|           500|         500|
|             5|2023-05-06|    CUST005|  Male| 30|          Beauty|       2|            50|         100|
|             6|2023-04-25|    CUST006|Female| 45|          Beauty|       1|            30|          30|
|             7|2023-03-13|    CUST007|  Male| 46|     

#### Find the Most Sold Category

In [8]:
sales_by_category = df.groupBy("Product Category").agg(sum("Quantity").alias("Total Quantity Sold"))

In [12]:
sales_by_category.orderBy("Total Quantity Sold", ascending = False).show()

+----------------+-------------------+
|Product Category|Total Quantity Sold|
+----------------+-------------------+
|        Clothing|                894|
|     Electronics|                849|
|          Beauty|                771|
+----------------+-------------------+



In [14]:
most_sold_category = sales_by_category.orderBy("Total Quantity Sold", ascending = False).limit(1)
most_sold_category.show()

+----------------+-------------------+
|Product Category|Total Quantity Sold|
+----------------+-------------------+
|        Clothing|                894|
+----------------+-------------------+



#### Find Most revenue generating category

In [15]:
rev_by_category = df.groupBy("Product Category").agg(sum("Total Amount").alias("Total Amount Sold by Product Category"))

In [18]:
rev_by_category.orderBy("Total Amount Sold by Product Category", ascending = False).show()

+----------------+-------------------------------------+
|Product Category|Total Amount Sold by Product Category|
+----------------+-------------------------------------+
|     Electronics|                               156905|
|        Clothing|                               155580|
|          Beauty|                               143515|
+----------------+-------------------------------------+



In [20]:
# Most revenue Generating category
rev_by_category.orderBy("Total Amount Sold by Product Category", ascending = False).limit(1).show()

+----------------+-------------------------------------+
|Product Category|Total Amount Sold by Product Category|
+----------------+-------------------------------------+
|     Electronics|                               156905|
+----------------+-------------------------------------+



#### Most valued top 10 customers

In [22]:
rev_by_customer = df.groupBy("Customer ID").agg(sum("Total Amount").alias("Total Amount Sold by Customer ID"))

In [25]:
rev_by_customer.orderBy("Total Amount Sold by Customer ID", ascending = False).limit(10).show()

+-----------+--------------------------------+
|Customer ID|Total Amount Sold by Customer ID|
+-----------+--------------------------------+
|    CUST412|                            2000|
|    CUST595|                            2000|
|    CUST093|                            2000|
|    CUST946|                            2000|
|    CUST743|                            2000|
|    CUST269|                            2000|
|    CUST832|                            2000|
|    CUST476|                            2000|
|    CUST875|                            2000|
|    CUST487|                            2000|
+-----------+--------------------------------+



#### Find the gender buys which product category the most

In [27]:
gender_product_category =   df.groupBy("Gender", "Product Category").count()
gender_product_category.show()

+------+----------------+-----+
|Gender|Product Category|count|
+------+----------------+-----+
|  Male|        Clothing|  177|
|Female|        Clothing|  174|
|  Male|     Electronics|  172|
|Female|          Beauty|  166|
|  Male|          Beauty|  141|
|Female|     Electronics|  170|
+------+----------------+-----+



#### Create a chart for the age category vs the sales

In [37]:
df = df.withColumn("Age Category",
                     when((df['Age'] >= 1) & (df['Age'] <= 9), "1-9")
                    .when((df['Age'] >= 10) & (df['Age'] <= 19), "10-19")
                    .when((df['Age'] >= 20) & (df['Age'] <= 29), "20-29")
                    .when((df['Age'] >= 30) & (df['Age'] <= 39), "30-39")
                    .when((df['Age'] >= 40) & (df['Age'] <= 49), "40-49")
                    .when((df['Age'] >= 50) & (df['Age'] <= 59), "50-59")
                    .when(df['Age'] > 60, "60+")
                    .otherwise("Unknown"))
df.show()

+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+------------+
|Transaction ID|      Date|Customer ID|Gender|Age|Product Category|Quantity|Price per Unit|Total Amount|Age Category|
+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+------------+
|             1|2023-11-24|    CUST001|  Male| 34|          Beauty|       3|            50|         150|       30-39|
|             2|2023-02-27|    CUST002|Female| 26|        Clothing|       2|           500|        1000|       20-29|
|             3|2023-01-13|    CUST003|  Male| 50|     Electronics|       1|            30|          30|       50-59|
|             4|2023-05-21|    CUST004|  Male| 37|        Clothing|       1|           500|         500|       30-39|
|             5|2023-05-06|    CUST005|  Male| 30|          Beauty|       2|            50|         100|       30-39|
|             6|2023-04-25|    CUST006|Female| 45|      

In [38]:
sales_by_age_category = df.groupBy("Age Category").agg(sum("Total Amount").alias("Total Sales"))

In [39]:
sales_by_age_category = sales_by_age_category.orderBy("Age Category")

In [40]:
sales_by_age_category.show()

+------------+-----------+
|Age Category|Total Sales|
+------------+-----------+
|       10-19|      26085|
|       20-29|      97070|
|       30-39|      96325|
|       40-49|      93365|
|       50-59|      98340|
|         60+|      33225|
|     Unknown|      11590|
+------------+-----------+

