In [None]:
from pyspark.sql import SparkSession
# Obtain (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("Read CSV Example")
    .getOrCreate()
)

In [None]:
# Load the raw orders file: the first row supplies the column names and
# Spark infers each column's type from the data.
df = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("orders.csv")
)

In [None]:
from pyspark.sql.functions import trim,col
# Strip leading/trailing whitespace from the text dimension columns.
df = (
    df
    .withColumn("city", trim(col("city")))
    .withColumn("category", trim(col("category")))
    .withColumn("product", trim(col("product")))
)

In [None]:
from pyspark.sql.functions import initcap
# Normalise casing so that differently-cased spellings of the same value
# collapse into one (first letter of each word upper-case, the rest lower-case).
df = (
    df
    .withColumn("city", initcap(col("city")))
    .withColumn("category", initcap(col("category")))
    .withColumn("product", initcap(col("product")))
)

In [None]:
from pyspark.sql.functions import when,regexp_replace
# Remove comma separators from the raw amount, then cast to int only when
# what remains consists entirely of digits; every other value becomes null.
df = (
    df
    .withColumn("amount_clean", regexp_replace(col("amount"), ",", ""))
    .withColumn(
        "amount_int",
        when(
            col("amount_clean").rlike("^[0-9]+$"),
            col("amount_clean").cast("int"),
        ).otherwise(None),
    )
)

In [None]:
from pyspark.sql.functions import to_date,coalesce
# Try each supported date layout in order and keep the first successful parse.
# NOTE(review): this relies on to_date() yielding null for a non-matching
# layout -- confirm that holds for the session's ANSI / time-parser settings.
df = df.withColumn(
    "order_date_clean",
    coalesce(*[
        to_date(col("order_date"), fmt)
        for fmt in ("yyyy-MM-dd", "dd/MM/yyyy", "yyyy/MM/dd")
    ])
)

In [None]:
# Keep only completed orders, then collapse repeated order_ids.
# The filter runs first on purpose: dropDuplicates() keeps an arbitrary row
# per order_id, so de-duplicating before filtering could retain a
# non-completed copy of an order and then discard that order entirely.
df = df.filter(col("status") == "Completed")
df = df.dropDuplicates(["order_id"])

In [None]:
# Expose the cleaned orders under a descriptive name; the analysis cells
# below read from clean_orders_df.
clean_orders_df = df

In [None]:
from pyspark.sql.functions import count, sum, avg, min, max, countDistinct

# One row per customer: order volume, spend, purchase window, and how many
# distinct cities / categories the customer bought in.
customer_metrics = (
    clean_orders_df
    .groupBy("customer_id")
    .agg(
        count("*").alias("total_orders"),
        sum("amount_int").alias("total_spend"),
        avg("amount_int").alias("avg_order_value"),
        min("order_date_clean").alias("first_purchase"),
        max("order_date_clean").alias("last_purchase"),
        countDistinct("city").alias("cities_count"),
        countDistinct("category").alias("categories_count"),
    )
)

In [None]:
from pyspark.sql.functions import when
# Tier customers by total spend. VIP additionally requires at least 5 orders;
# anyone who matches neither rule is Regular.
customer_segmented = customer_metrics.withColumn(
    "customer_segment",
    when(
        (col("total_spend") >= 200000) & (col("total_orders") >= 5),
        "VIP",
    )
    .when(col("total_spend") >= 100000, "Premium")
    .otherwise("Regular"),
)

In [None]:
# Number of customers in each segment.
customer_segmented.groupby("customer_segment").count()

DataFrame[customer_segment: string, count: bigint]

In [None]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank
# Rank every customer by total spend, highest first; rank() gives tied
# customers the same rank. The window has no partitionBy, so the ranking is
# global across all customers.
w_overall = Window.orderBy(col("total_spend").desc())
customer_segmented = customer_segmented.withColumn(
    "overall_rank",
    rank().over(w_overall),
)

In [None]:
# Spend per (city, customer) pair.
customer_city_spend = (
    clean_orders_df
    .groupBy("city", "customer_id")
    .sum("amount_int")
    .withColumnRenamed("sum(amount_int)", "city_spend")
)

# Rank customers inside each city by that spend, highest first.
w_city = Window.partitionBy("city").orderBy(col("city_spend").desc())
customer_city_rank = customer_city_spend.withColumn("city_rank", rank().over(w_city))

In [None]:
# Top 3 customers by spend within each city (ties in rank can yield more than 3 rows).
customer_city_rank.where(col("city_rank") <= 3)

DataFrame[city: string, customer_id: string, city_spend: bigint, city_rank: int]

In [None]:
# Top 10 customers overall by total spend (ties in rank can yield more than 10 rows).
customer_segmented.where(col("overall_rank") <= 10)

DataFrame[customer_id: string, total_orders: bigint, total_spend: bigint, avg_order_value: double, first_purchase: date, last_purchase: date, cities_count: bigint, categories_count: bigint, customer_segment: string, overall_rank: int]

In [None]:
# Loyalty definition: a customer is loyal when they purchased on at least
# 3 distinct dates and across at least 2 distinct categories.
loyal_customers = (
    clean_orders_df
    .groupBy("customer_id")
    .agg(
        countDistinct("order_date_clean").alias("purchase_days"),
        countDistinct("category").alias("category_count"),
    )
    .where((col("purchase_days") >= 3) & (col("category_count") >= 2))
)

In [None]:
# Loyal customers per city.
# The join keeps one row per *order* placed by a loyal customer, so a plain
# count() would report orders rather than customers. Count each customer once
# per city instead; the result column is still named "count".
(
    clean_orders_df
    .join(loyal_customers, "customer_id")
    .groupBy("city")
    .agg(countDistinct("customer_id").alias("count"))
)

DataFrame[city: string, count: bigint]

In [None]:
# Total spend split into loyal vs. non-loyal customers.
# The flag tests purchase_days, a column that exists only on the
# loyal_customers side and is never null there (it is a distinct count), so it
# is null exactly for orders with no loyal match. Testing
# loyal_customers.customer_id instead is fragile: the join is on the shared
# "customer_id" name and loyal_customers is derived from clean_orders_df, so
# that reference can resolve to the left-hand key (non-null for every order)
# or be rejected as an ambiguous self-join reference.
(
    clean_orders_df
    .join(loyal_customers, "customer_id", "left")
    .withColumn(
        "loyal_flag",
        when(col("purchase_days").isNotNull(), "Loyal").otherwise("Non-Loyal"),
    )
    .groupBy("loyal_flag")
    .sum("amount_int")
)

DataFrame[loyal_flag: string, sum(amount_int): bigint]

In [None]:
# Time-based analysis: derive calendar year and month from the cleaned order date.
from pyspark.sql.functions import year, month, quarter
monthly_df = (
    clean_orders_df
    .withColumn("year", year(col("order_date_clean")))
    .withColumn("month", month(col("order_date_clean")))
)

In [None]:
# Monthly total of amount_int per city.
monthly_df.groupby("year", "month", "city").sum("amount_int")

DataFrame[year: int, month: int, city: string, sum(amount_int): bigint]

In [None]:
# Monthly number of orders per category.
monthly_df.groupby("year", "month", "category").count()

DataFrame[year: int, month: int, category: string, count: bigint]

In [None]:
# Performance engineering: mark the segmented frame for caching, then print
# its extended query plan.
customer_segmented.cache()
customer_segmented.explain(extended=True)

== Parsed Logical Plan ==
'Project [unresolvedstarwithcolumns(overall_rank, 'rank() windowspecdefinition('total_spend DESC NULLS LAST, unspecifiedframe$()), None)]
+- Project [customer_id#18, total_orders#500L, total_spend#501L, avg_order_value#502, first_purchase#503, last_purchase#504, cities_count#505L, categories_count#506L, CASE WHEN ((total_spend#501L >= cast(200000 as bigint)) AND (total_orders#500L >= cast(5 as bigint))) THEN VIP WHEN (total_spend#501L >= cast(100000 as bigint)) THEN Premium ELSE Regular END AS customer_segment#525]
   +- Aggregate [customer_id#18], [customer_id#18, count(1) AS total_orders#500L, sum(amount_int#33) AS total_spend#501L, avg(amount_int#33) AS avg_order_value#502, min(order_date_clean#35) AS first_purchase#503, max(order_date_clean#35) AS last_purchase#504, count(distinct city#29) AS cities_count#505L, count(distinct category#30) AS categories_count#506L]
      +- Filter (status#24 = Completed)
         +- Deduplicate [order_id#17]
            +- 

In [None]:
from pyspark.sql.functions import broadcast

# Small lookup table mapping each segment label to a numeric code.
segment_data = [(1, "VIP"), (2, "Premium"), (3, "Regular")]
segment_df = spark.createDataFrame(
    segment_data,
    ["segment_code", "customer_segment"],
)

# Attach the code with a broadcast join (the lookup has only three rows).
# This cell overwrites customer_segmented, so segment_code is dropped first:
# drop() is a no-op when the column is absent (first run) and prevents a
# duplicate segment_code column when the cell is re-run.
customer_segmented = customer_segmented.drop("segment_code").join(
    broadcast(segment_df),
    "customer_segment",
    "left",
)

In [None]:
# Print the extended query plan of the frame after the segment-code join.
customer_segmented.explain(extended=True)

== Parsed Logical Plan ==
'Join UsingJoin(LeftOuter, [customer_segment])
:- Project [customer_id#18, total_orders#500L, total_spend#501L, avg_order_value#502, first_purchase#503, last_purchase#504, cities_count#505L, categories_count#506L, customer_segment#525, overall_rank#537]
:  +- Project [customer_id#18, total_orders#500L, total_spend#501L, avg_order_value#502, first_purchase#503, last_purchase#504, cities_count#505L, categories_count#506L, customer_segment#525, overall_rank#537, overall_rank#537]
:     +- Window [rank(total_spend#501L) windowspecdefinition(total_spend#501L DESC NULLS LAST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS overall_rank#537], [total_spend#501L DESC NULLS LAST]
:        +- Project [customer_id#18, total_orders#500L, total_spend#501L, avg_order_value#502, first_purchase#503, last_purchase#504, cities_count#505L, categories_count#506L, customer_segment#525]
:           +- Project [customer_id#18, total_orders#500L, total_spend#5

In [None]:
# Sort customers by total spend, breaking ties by number of orders (both descending).
customer_segmented.sort(
    col("total_spend").desc(),
    col("total_orders").desc(),
)

DataFrame[customer_segment: string, customer_id: string, total_orders: bigint, total_spend: bigint, avg_order_value: double, first_purchase: date, last_purchase: date, cities_count: bigint, categories_count: bigint, overall_rank: int, segment_code: bigint]

In [None]:
# Set operations on the customer ids seen in two categories.
electronics = (
    clean_orders_df
    .filter(col("category") == "Electronics")
    .select("customer_id")
)
grocery = (
    clean_orders_df
    .filter(col("category") == "Grocery")
    .select("customer_id")
)

# Customers who bought in both categories.
both = electronics.intersect(grocery)

# Customers who bought in exactly one of the two categories.
# electronics.subtract(grocery) alone yields only the Electronics-only
# customers, so the Grocery-only customers are unioned in as well. The two
# halves are disjoint and subtract() already removes duplicates.
only_one = electronics.subtract(grocery).union(grocery.subtract(electronics))

In [None]:
# Storage strategy: the customer master is written as Parquet partitioned by
# segment, the monthly frame as ORC. Both targets are overwritten on each run.
(
    customer_segmented.write
    .partitionBy("customer_segment")
    .mode("overwrite")
    .parquet("customer_master")
)
(
    monthly_df.write
    .mode("overwrite")
    .orc("monthly_analytics")
)

In [None]:
# Debugging: total amount per customer.
# Aggregate the cleaned numeric amount_int column: GroupedData.sum() accepts
# only numeric columns, and the cleaning step treats the raw "amount" column
# as comma-formatted text. Reading from clean_orders_df (the same frame df
# refers to on first run) keeps the cell re-runnable after df is overwritten.
df = clean_orders_df.groupBy("customer_id").sum("amount_int")
df.show()
# Note: .show() returns None, not a DataFrame, so its result must not be assigned back to df.