In [1]:
from pyspark.sql import SparkSession
# Obtain the SparkSession shared by every cell below
# (getOrCreate returns an already-active session if there is one).
spark = (
    SparkSession.builder
    .appName("Day21")
    .getOrCreate()
)

In [3]:
# Raw order records, deliberately dirty: stray spaces, mixed date formats,
# empty / non-numeric / missing amounts and one fully duplicated row.
# Every value is kept as text (or None) so the cleaning happens in Spark.
orders_data = [
    ("O001", "Delhi ",    "Laptop",  "45000",   "2024-01-05", "Completed"),  # trailing space in city
    ("O002", "Mumbai",    "Mobile ", "32000",   "05/01/2024", "Completed"),  # trailing space in product, slash-separated date
    ("O003", "Bangalore", "Tablet",  "30000",   "2024/01/06", "Completed"),  # slash-separated date
    ("O004", "Delhi",     "Laptop",  "",        "2024-01-07", "Cancelled"),  # empty amount
    ("O005", "Mumbai",    "Mobile",  "invalid", "2024-01-08", "Completed"),  # non-numeric amount
    ("O006", "Chennai",   "Tablet",  None,      "2024-01-08", "Completed"),  # missing amount
    ("O007", "Delhi",     "Laptop",  "47000",   "09-01-2024", "Completed"),  # year-last date
    ("O008", "Bangalore", "Mobile",  "28000",   "2024-01-09", "Completed"),
    ("O009", "Mumbai",    "Laptop",  "55000",   "2024-01-10", "Completed"),
    ("O009", "Mumbai",    "Laptop",  "55000",   "2024-01-10", "Completed"),  # exact duplicate of the row above
]

# Column names, in the same order as the fields of each tuple above.
columns = [
    "order_id",
    "city",
    "product",
    "amount",
    "order_date",
    "status",
]


In [4]:
from pyspark.sql.types import StructType, StructField, StringType
# Explicit all-string, all-nullable schema: nothing is type-inferred at load
# time, so bad amounts and dates survive as text for the cleaning steps.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "order_id",
        "city",
        "product",
        "amount",
        "order_date",
        "status",
    )
])


In [7]:
# Load the in-memory records into Spark using the explicit string schema,
# then display them as-is (uncleaned) for reference.
df_raw = spark.createDataFrame(data=orders_data, schema=schema)
df_raw.show()

+--------+---------+-------+-------+----------+---------+
|order_id|     city|product| amount|order_date|   status|
+--------+---------+-------+-------+----------+---------+
|    O001|   Delhi | Laptop|  45000|2024-01-05|Completed|
|    O002|   Mumbai|Mobile |  32000|05/01/2024|Completed|
|    O003|Bangalore| Tablet|  30000|2024/01/06|Completed|
|    O004|    Delhi| Laptop|       |2024-01-07|Cancelled|
|    O005|   Mumbai| Mobile|invalid|2024-01-08|Completed|
|    O006|  Chennai| Tablet|   NULL|2024-01-08|Completed|
|    O007|    Delhi| Laptop|  47000|09-01-2024|Completed|
|    O008|Bangalore| Mobile|  28000|2024-01-09|Completed|
|    O009|   Mumbai| Laptop|  55000|2024-01-10|Completed|
|    O009|   Mumbai| Laptop|  55000|2024-01-10|Completed|
+--------+---------+-------+-------+----------+---------+



In [6]:
# Print the column names and types of the raw frame (all declared as nullable strings).
df_raw.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)



In [9]:
#Phase 2->Task 4
from pyspark.sql import functions as F
from pyspark.sql.types import StringType
def trim_all_string_cols(df):
    """Return ``df`` with ``F.trim`` applied to every string-typed column.

    Columns of any other type are selected unchanged. Column names and
    column order are the same as in the input frame.
    """
    def _trimmed_or_same(field):
        # Only StringType columns are trimmed; the alias keeps the original name.
        column = F.col(field.name)
        if isinstance(field.dataType, StringType):
            return F.trim(column).alias(field.name)
        return column

    return df.select(*map(_trimmed_or_same, df.schema.fields))


# Working frame for the cleaning steps below; df_raw itself is left untouched.
df = trim_all_string_cols(df_raw)


In [11]:
# Task 5: normalise capitalisation of the categorical text columns
# (first letter of each word upper-cased) so equal values group together.
for text_col in ("city", "product", "status"):
    df = df.withColumn(text_col, F.initcap(F.col(text_col)))

In [12]:
# Task 6: parse the text amount into an integer column.
# Only values made up entirely of digits are cast; anything else
# (empty string, "invalid", NULL) becomes NULL in amount_int.
amount_text = F.col("amount")
is_all_digits = amount_text.rlike(r"^[0-9]+$")
df = df.withColumn(
    "amount_int",
    F.when(is_all_digits, amount_text.cast("int")).otherwise(F.lit(None).cast("int")),
)

In [16]:
# Task 7: report how many rows have no usable amount, then keep only the
# rows whose amount parsed successfully.
amount_missing = F.col("amount_int").isNull()
amount_present = F.col("amount_int").isNotNull()

invalid_count = df.filter(amount_missing).count()
print(f"\nInvalid or null amount rows (will be excluded): {invalid_count}")

df_clean = df.filter(amount_present)


Invalid or null amount rows (will be excluded): 2


In [14]:
# Task 8: keep a single row per order_id.
# df_clean was derived from df in Task 7, i.e. before this step, and Spark
# DataFrames are immutable, so deduplicating df alone never reaches the
# Phase 3 aggregations, which read df_clean. On a top-to-bottom run the
# repeated O009 row would then be counted twice in the revenue figures.
# The same step is therefore applied to df_clean as well.
df = df.dropDuplicates(["order_id"])
df_clean = df_clean.dropDuplicates(["order_id"])

In [23]:
# Task 9: keep only completed orders.
# df_clean was split off from df in Task 7, so filtering df alone has no
# effect on the Phase 3 aggregations, which read df_clean. The filter is
# applied to both frames so the revenue figures are guaranteed to exclude
# non-completed orders (and not merely because the cancelled sample row
# happens to have an empty amount).
is_completed = F.col("status") == "Completed"
df = df.filter(is_completed)
df_clean = df_clean.filter(is_completed)

In [21]:
# Phase 3, Task 10: total revenue per city, largest first.
city_totals = df_clean.groupBy("city").agg(
    F.sum("amount_int").alias("total_revenue")
)
revenue_by_city = city_totals.orderBy(F.col("total_revenue").desc())
revenue_by_city.show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|    Delhi|        92000|
|   Mumbai|        87000|
|Bangalore|        58000|
+---------+-------------+



In [20]:
# Task 11: total revenue per product, largest first.
product_totals = df_clean.groupBy("product").agg(
    F.sum("amount_int").alias("total_revenue")
)
revenue_by_product = product_totals.orderBy(F.col("total_revenue").desc())
revenue_by_product.show()

+-------+-------------+
|product|total_revenue|
+-------+-------------+
| Laptop|       147000|
| Mobile|        60000|
| Tablet|        30000|
+-------+-------------+



In [19]:
# Task 12: average order value per city, largest first.
city_averages = df_clean.groupBy("city").agg(
    F.avg("amount_int").alias("avg_order_value")
)
aov_by_city = city_averages.orderBy(F.col("avg_order_value").desc())
aov_by_city.show()

+---------+---------------+
|     city|avg_order_value|
+---------+---------------+
|    Delhi|        46000.0|
|   Mumbai|        43500.0|
|Bangalore|        29000.0|
+---------+---------------+



In [26]:
#PHASE 4-Task->13
from pyspark.sql.window import Window
# Rank the cities by total revenue, highest revenue first.
# The window has no partitionBy, so all rows of revenue_by_city are ranked
# together as one group.
w = Window.orderBy(F.col("total_revenue").desc())
revenue_rank = F.rank().over(w)

city_ranked = revenue_by_city.withColumn("revenue_rank", revenue_rank)
city_ranked.show()

+---------+-------------+------------+
|     city|total_revenue|revenue_rank|
+---------+-------------+------------+
|    Delhi|        92000|           1|
|   Mumbai|        87000|           2|
|Bangalore|        58000|           3|
+---------+-------------+------------+



In [27]:
# Task 14: the top-revenue city, i.e. every row holding rank 1.
holds_first_place = F.col("revenue_rank") == 1
top_city = city_ranked.filter(holds_first_place)
top_city.show()

+-----+-------------+------------+
| city|total_revenue|revenue_rank|
+-----+-------------+------------+
|Delhi|        92000|           1|
+-----+-------------+------------+



In [28]:
# Phase 5, Task 15: ask Spark to cache df_clean so the repeated aggregations
# that follow can reuse it. cache() returns the DataFrame itself, which is why
# the cell output is the DataFrame repr rather than a table.
df_clean.cache()

DataFrame[order_id: string, city: string, product: string, amount: string, order_date: string, status: string, amount_int: int, order_date_parsed: date]

In [None]:
# Task 16: run the identical city-revenue aggregation twice on df_clean.
# NOTE(review): presumably meant to compare a first run against a second run
# after df_clean.cache() in the previous cell. Confirm the intent.
def _revenue_by_city(frame):
    """Total ``amount_int`` per city as ``total_revenue``, largest first."""
    totals = frame.groupBy("city").agg(F.sum("amount_int").alias("total_revenue"))
    return totals.orderBy(F.desc("total_revenue"))


revenue_by_city_1 = _revenue_by_city(df_clean)
revenue_by_city_1.show()

revenue_by_city_2 = _revenue_by_city(df_clean)
revenue_by_city_2.show()

In [None]:
# Print the extended query plans for the second aggregation
# (extended=True shows the logical plans as well as the physical plan).
revenue_by_city_2.explain(extended=True)