In [1]:
# --- Environment bootstrap for running PySpark inside Colab ---------------
# Remove any previously extracted Spark distributions under /content so the
# extraction below starts from a clean directory.
!rm -rf /content/spark-3.4.1-bin-hadoop3
!rm -rf /content/spark-3.3.0-bin-hadoop3


# Install the headless OpenJDK 8 package; -qq plus the redirect silence apt.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Download the Spark 3.3.0 (Hadoop 3 build) archive and unpack it into the
# current working directory.
# NOTE(review): SPARK_HOME below assumes that directory is /content — confirm.
!wget -q https://archive.apache.org/dist/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz
!tar xf spark-3.3.0-bin-hadoop3.tgz

# findspark is used below to make the unpacked Spark importable as `pyspark`.
!pip install -q findspark

import os
# Point the tooling at the JDK and Spark locations prepared above.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.0-bin-hadoop3"

import findspark
# Must run before the pyspark imports below.
findspark.init()

from pyspark.sql import SparkSession
# `sum` is imported under the alias `_sum` so Python's built-in sum() stays usable.
from pyspark.sql.functions import col, count, avg, desc, sum as _sum

# Create (or reuse) the session that every later cell refers to as `spark`.
spark = SparkSession.builder.appName("Retail").getOrCreate()

In [2]:
# Open Colab's upload dialog. Later cells read "retail_data.csv" from the
# working directory, so that is the file expected to be uploaded here.
from google.colab import files
# Keeps whatever files.upload() returns; no later cell visible here reads it.
uploaded = files.upload()


Saving retail_data.csv to retail_data.csv


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date

# Read the uploaded CSV: first row is the header, column types are inferred.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("retail_data.csv")
)

# Convert TransactionDate to a date column.
df = df.withColumn("TransactionDate", to_date(df["TransactionDate"]))

# Preview the rows, then print the resulting schema.
df.show()
df.printSchema()

+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|     2024-01-15|       Card|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|     2024-01-20|        UPI|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|     2024-02-10|Net Banking|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|     2024-02-12|       Card|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|     2024-02-15|       Card|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|     1000|      3000|     2024-02

In [4]:

# 2. Load without inferring schema
# With inferSchema off, every column is read as a string, so the numeric
# columns and the date column are converted explicitly. Previously this cell
# only re-cast the frame that had already been loaded with inferSchema=True,
# so the schema-less load named in the heading never happened.
from pyspark.sql.functions import col, to_date

df = spark.read.csv("retail_data.csv", header=True, inferSchema=False)
df = df.withColumn("Quantity", col("Quantity").cast("int")) \
       .withColumn("UnitPrice", col("UnitPrice").cast("int")) \
       .withColumn("TotalPrice", col("TotalPrice").cast("int")) \
       .withColumn("TransactionDate", to_date("TransactionDate"))


In [5]:
# 3. TotalPrice > 40000
high_value = col("TotalPrice") > 40000
df.where(high_value).show()

# 4. Unique cities
df.select(col("City")).distinct().show()

# 5. Transactions from Delhi
# The same predicate goes through both filter() and where(), which are
# aliases, so the two tables are identical.
in_delhi = col("City") == "Delhi"
df.filter(in_delhi).show()
df.where(in_delhi).show()


+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|     2024-01-15|       Card|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|     2024-01-20|        UPI|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|     2024-02-15|       Card|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+

+---------+
|     City|
+---------+
|Bangalore|
|   Mumbai|
|    Delhi|
|Hyderabad|
+---------+

+-------------+--------+-----+-------+-----------+--------+---------+----------+---------------+-----------+
|T

In [6]:
# Tasks 6-8 applied as one chained transformation on `df`.
df = (
    df
    # 6. Add DiscountedPrice (TotalPrice multiplied by 0.9)
    .withColumn("DiscountedPrice", col("TotalPrice") * 0.9)
    # 7. Rename TransactionDate to TxnDate
    .withColumnRenamed("TransactionDate", "TxnDate")
    # 8. Drop UnitPrice
    .drop("UnitPrice")
)


In [7]:
# Reload the CSV fresh so `df` again carries the file's original columns,
# with TransactionDate converted to a date.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("retail_data.csv")
    .withColumn("TransactionDate", to_date(col("TransactionDate")))
)

df.show()
df.printSchema()

+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|     2024-01-15|       Card|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|     2024-01-20|        UPI|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|     2024-02-10|Net Banking|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|     2024-02-12|       Card|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|     2024-02-15|       Card|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|     1000|      3000|     2024-02

In [8]:
from pyspark.sql.functions import col

# Cast each numeric column to int in place. Replacing an existing column
# keeps its position, so the column order is unchanged.
for numeric_col in ("TotalPrice", "UnitPrice", "Quantity"):
    df = df.withColumn(numeric_col, col(numeric_col).cast("int"))


In [9]:
# Import Spark's `sum` under the alias `_sum`, matching the setup cell.
# A bare `from pyspark.sql.functions import sum` would rebind Python's
# built-in sum() for the rest of the kernel session.
from pyspark.sql.functions import avg, sum as _sum

# 9. Total sales by city
df.groupBy("City").agg(_sum("TotalPrice").alias("TotalSales")).show()

# 10. Avg unit price by category
df.groupBy("Category").agg(avg("UnitPrice").alias("AvgUnitPrice")).show()

# 11. Count of transactions by PaymentMode
df.groupBy("PaymentMode").count().show()


+---------+----------+
|     City|TotalSales|
+---------+----------+
|Bangalore|     60000|
|   Mumbai|    120000|
|    Delhi|     23000|
|Hyderabad|     15000|
+---------+----------+

+-----------+------------+
|   Category|AvgUnitPrice|
+-----------+------------+
|Electronics|     37750.0|
|  Furniture|     10000.0|
+-----------+------------+

+-----------+-----+
|PaymentMode|count|
+-----------+-----+
|Net Banking|    1|
|       Card|    3|
|       Cash|    1|
|        UPI|    1|
+-----------+-----+



In [10]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

# Window over each city's rows, highest TotalPrice first.
# `windowSpec` keeps its name because the lag() cell reuses it.
city_partition = Window.partitionBy("City")
windowSpec = city_partition.orderBy(desc("TotalPrice"))

# Rank every transaction inside its city by TotalPrice (1 = largest).
rank_in_city = rank().over(windowSpec)
df = df.withColumn("RankInCity", rank_in_city)

df.select("TransactionID", "City", "TotalPrice", "RankInCity").show()


+-------------+---------+----------+----------+
|TransactionID|     City|TotalPrice|RankInCity|
+-------------+---------+----------+----------+
|        T1002|Bangalore|     60000|         1|
|        T1004|    Delhi|     20000|         1|
|        T1006|    Delhi|      3000|         2|
|        T1003|Hyderabad|     15000|         1|
|        T1001|   Mumbai|     70000|         1|
|        T1005|   Mumbai|     50000|         2|
+-------------+---------+----------+----------+



In [11]:
from pyspark.sql.functions import lag

# TotalPrice of the preceding row inside `windowSpec`. That window orders
# each city by TotalPrice descending, so "previous" here is the next-larger
# sale in the same city, not the chronologically earlier one.
prev_total = lag(col("TotalPrice"), 1).over(windowSpec)
df = df.withColumn("PrevTransaction", prev_total)

df.select("TransactionID", "City", "TotalPrice", "PrevTransaction").show()


+-------------+---------+----------+---------------+
|TransactionID|     City|TotalPrice|PrevTransaction|
+-------------+---------+----------+---------------+
|        T1002|Bangalore|     60000|           null|
|        T1004|    Delhi|     20000|           null|
|        T1006|    Delhi|      3000|          20000|
|        T1003|Hyderabad|     15000|           null|
|        T1001|   Mumbai|     70000|           null|
|        T1005|   Mumbai|     50000|          70000|
+-------------+---------+----------+---------------+



In [12]:
from pyspark.sql.functions import when

# Build a "dirty" copy of `df` with two deliberately injected nulls:
# Quantity for T1002 and PaymentMode for T1003.
is_t1002 = col("TransactionID") == "T1002"
is_t1003 = col("TransactionID") == "T1003"

quantity_with_gap = when(is_t1002, None).otherwise(col("Quantity"))
payment_with_gap = when(is_t1003, None).otherwise(col("PaymentMode"))

df_dirty = (
    df
    .withColumn("Quantity", quantity_with_gap)
    .withColumn("PaymentMode", payment_with_gap)
)

df_dirty.show()


+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+----------+---------------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|RankInCity|PrevTransaction|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+----------+---------------+
|        T1002|    Neha|Bangalore| Tablet|Electronics|    null|    30000|     60000|     2024-01-20|        UPI|         1|           null|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|     2024-02-12|       Card|         1|           null|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|     1000|      3000|     2024-02-18|       Cash|         2|          20000|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|     2024-02-10|       null|         1|           null|
|        T1001|     

In [13]:
# Keep only the rows whose Quantity is present.
df_cleaned = df_dirty.na.drop(subset=["Quantity"])
df_cleaned.show()


+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+----------+---------------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|RankInCity|PrevTransaction|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+----------+---------------+
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|     2024-02-12|       Card|         1|           null|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|     1000|      3000|     2024-02-18|       Cash|         2|          20000|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|     2024-02-10|       null|         1|           null|
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|     2024-01-15|       Card|         1|           null|
|        T1005|   Ka

In [14]:
# Replace missing PaymentMode values with the placeholder "Unknown".
payment_default = {"PaymentMode": "Unknown"}
df_cleaned = df_cleaned.na.fill(payment_default)
df_cleaned.show()


+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+----------+---------------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|RankInCity|PrevTransaction|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+----------+---------------+
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|     2024-02-12|       Card|         1|           null|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|     1000|      3000|     2024-02-18|       Cash|         2|          20000|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|     2024-02-10|    Unknown|         1|           null|
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|     2024-01-15|       Card|         1|           null|
|        T1005|   Ka

In [15]:
from pyspark.sql.functions import year, month, dayofmonth

# Derive Year / Month / Day from TransactionDate. The columns are appended
# in this order.
date_parts = [("Year", year), ("Month", month), ("Day", dayofmonth)]
for part_name, extract in date_parts:
    df_cleaned = df_cleaned.withColumn(part_name, extract("TransactionDate"))

df_cleaned.select("TransactionID", "TransactionDate", "Year", "Month", "Day").show()


+-------------+---------------+----+-----+---+
|TransactionID|TransactionDate|Year|Month|Day|
+-------------+---------------+----+-----+---+
|        T1001|     2024-01-15|2024|    1| 15|
|        T1003|     2024-02-10|2024|    2| 10|
|        T1004|     2024-02-12|2024|    2| 12|
|        T1005|     2024-02-15|2024|    2| 15|
|        T1006|     2024-02-18|2024|    2| 18|
+-------------+---------------+----+-----+---+



In [17]:
# Show only the transactions dated in February (month number 2).
in_february = month(col("TransactionDate")) == 2
df_cleaned.where(in_february).show()


+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+----------+---------------+----+-----+---+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|RankInCity|PrevTransaction|Year|Month|Day|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+----------+---------------+----+-----+---+
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|     2024-02-12|       Card|         1|           null|2024|    2| 12|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|     1000|      3000|     2024-02-18|       Cash|         2|          20000|2024|    2| 18|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|     2024-02-10|    Unknown|         1|           null|2024|    2| 10|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    500

In [18]:
# Stack the cleaned frame on top of itself so every row appears twice, then
# compare the row count before and after de-duplication.
df_duped = df_cleaned.union(df_cleaned)

rows_with_dupes = df_duped.count()
rows_distinct = df_duped.distinct().count()

print("Count before removing duplicates:", rows_with_dupes)
print("Count after removing duplicates:", rows_distinct)


Count before removing duplicates: 10
Count after removing duplicates: 5
