In [0]:
from pyspark.sql import SparkSession 

In [0]:
spark = SparkSession.builder.appName('Challenges').getOrCreate()

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, LongType, StringType, DateType, IntegerType

### Challenge 1: Total Spend Per Customer

Calculate total amount spent by each customer.


In [0]:
from pyspark.sql import Row

data = [
    Row(customer_id=1, amount=250),
    Row(customer_id=2, amount=450),
    Row(customer_id=1, amount=100),
    Row(customer_id=3, amount=300),
    Row(customer_id=2, amount=150)
]
df = spark.createDataFrame(data)
df.show()

+-----------+------+
|customer_id|amount|
+-----------+------+
|          1|   250|
|          2|   450|
|          1|   100|
|          3|   300|
|          2|   150|
+-----------+------+



In [0]:
df.groupBy("customer_id").agg(F.sum("amount")).show()


+-----------+-----------+
|customer_id|sum(amount)|
+-----------+-----------+
|          1|        350|
|          2|        600|
|          3|        300|
+-----------+-----------+



### Challenge 2: Highest Transaction Per Day

Find the highest transaction amount for each day.


In [0]:
data = [
    Row(date='2023-01-01', amount=100),
    Row(date='2023-01-01', amount=300),
    Row(date='2023-01-02', amount=150),
    Row(date='2023-01-02', amount=200)
]
df = spark.createDataFrame(data)
df.show()

+----------+------+
|      date|amount|
+----------+------+
|2023-01-01|   100|
|2023-01-01|   300|
|2023-01-02|   150|
|2023-01-02|   200|
+----------+------+



In [0]:
df.groupBy('date').agg(F.max('amount')).show()

+----------+-----------+
|      date|max(amount)|
+----------+-----------+
|2023-01-01|        300|
|2023-01-02|        200|
+----------+-----------+



### Challenge 3: Fill Missing Cities With Default

Replace null city values with 'Unknown'.


In [0]:
data = [
    Row(customer_id=1, city='Dallas'),
    Row(customer_id=2, city=None),
    Row(customer_id=3, city='Austin'),
    Row(customer_id=4, city=None)
]
df = spark.createDataFrame(data)
df.show()

+-----------+------+
|customer_id|  city|
+-----------+------+
|          1|Dallas|
|          2|  NULL|
|          3|Austin|
|          4|  NULL|
+-----------+------+



In [0]:
df.fillna(value='Unknown', subset=['city']).show()

+-----------+-------+
|customer_id|   city|
+-----------+-------+
|          1| Dallas|
|          2|Unknown|
|          3| Austin|
|          4|Unknown|
+-----------+-------+



### Challenge 4: Compute Running Total by Customer

Use a window function to compute cumulative sum of purchases per customer.


In [0]:
data = [
    Row(customer_id=1, date='2023-01-01', amount=100),
    Row(customer_id=1, date='2023-01-02', amount=200),
    Row(customer_id=2, date='2023-01-01', amount=300),
    Row(customer_id=2, date='2023-01-02', amount=400)
]
df = spark.createDataFrame(data)
df.show()

+-----------+----------+------+
|customer_id|      date|amount|
+-----------+----------+------+
|          1|2023-01-01|   100|
|          1|2023-01-02|   200|
|          2|2023-01-01|   300|
|          2|2023-01-02|   400|
+-----------+----------+------+



In [0]:
display(df)

customer_id,date,amount
1,2023-01-01,100
1,2023-01-02,200
2,2023-01-01,300
2,2023-01-02,400


In [0]:
from pyspark.sql import Window
from pyspark.sql import functions as F
w = Window.partitionBy('customer_id')
cum_sum = df.withColumn('amount_cum', F.sum('amount').over(w))
cum_sum.show()
                                        

+-----------+----------+------+----------+
|customer_id|      date|amount|amount_cum|
+-----------+----------+------+----------+
|          1|2023-01-01|   100|       300|
|          1|2023-01-02|   200|       300|
|          2|2023-01-01|   300|       700|
|          2|2023-01-02|   400|       700|
+-----------+----------+------+----------+



+-----------+----------+------+----------+
|customer_id|      date|amount|amount_cum|
+-----------+----------+------+----------+
|          1|2023-01-01|   100|       300|
|          1|2023-01-02|   200|       300|
|          2|2023-01-01|   300|       700|
|          2|2023-01-02|   400|       700|
+-----------+----------+------+----------+



### Challenge 5: Average Sales Per Product

Find average amount per product.


In [0]:
data = [
    Row(product='A', amount=100),
    Row(product='B', amount=200),
    Row(product='A', amount=300),
    Row(product='B', amount=400)
]
df = spark.createDataFrame(data)
df.show()

+-------+------+
|product|amount|
+-------+------+
|      A|   100|
|      B|   200|
|      A|   300|
|      B|   400|
+-------+------+



In [0]:
df.groupBy('product').agg(F.avg('amount')).show()

+-------+-----------+
|product|avg(amount)|
+-------+-----------+
|      A|      200.0|
|      B|      300.0|
+-------+-----------+



### Challenge 6: Extract Year From Date

Add a column to extract year from given date.


In [0]:
data = [
    Row(customer='John', transaction_date='2022-11-01'),
    Row(customer='Alice', transaction_date='2023-01-01')
]
df = spark.createDataFrame(data)
df.show()


+--------+----------------+
|customer|transaction_date|
+--------+----------------+
|    John|      2022-11-01|
|   Alice|      2023-01-01|
+--------+----------------+



In [0]:
df_withYear = df.withColumn('year',year(df['transaction_date'])).show()

+--------+----------------+----+
|customer|transaction_date|year|
+--------+----------------+----+
|    John|      2022-11-01|2022|
|   Alice|      2023-01-01|2023|
+--------+----------------+----+



### Challenge 7: Join Product and Sales Data

Join two DataFrames on product_id to get product names with amounts.


In [0]:
products = [
    Row(product_id=1, product_name='Phone'),
    Row(product_id=2, product_name='Tablet')
]
sales = [
    Row(product_id=1, amount=500),
    Row(product_id=2, amount=800),
    Row(product_id=1, amount=200)
]
df_products = spark.createDataFrame(products)
df_sales = spark.createDataFrame(sales)
df_products.show()
df_sales.show()

+----------+------------+
|product_id|product_name|
+----------+------------+
|         1|       Phone|
|         2|      Tablet|
+----------+------------+

+----------+------+
|product_id|amount|
+----------+------+
|         1|   500|
|         2|   800|
|         1|   200|
+----------+------+



In [0]:
df_products.join(df_sales, on='product_id', how='inner').show()

+----------+------------+------+
|product_id|product_name|amount|
+----------+------------+------+
|         1|       Phone|   200|
|         1|       Phone|   500|
|         2|      Tablet|   800|
+----------+------------+------+



### Challenge 8: Split Tags Into Rows

Given a list of comma-separated tags, explode them into individual rows.


In [0]:
data = [
    Row(id=1, tags='tech,news'),
    Row(id=2, tags='sports,music'),
    Row(id=3, tags='food')
]
df = spark.createDataFrame(data)
df.show()


+---+------------+
| id|        tags|
+---+------------+
|  1|   tech,news|
|  2|sports,music|
|  3|        food|
+---+------------+



In [0]:

from pyspark.sql.functions import explode, split
df.withColumn('tags',explode(split(df['tags'],','))).show()

+---+------+
| id|  tags|
+---+------+
|  1|  tech|
|  1|  news|
|  2|sports|
|  2| music|
|  3|  food|
+---+------+



### Challenge 9: Top-N Records Per Group

For each category, return top 2 records based on score.


In [0]:
data = [
    Row(category='A', name='x', score=80),
    Row(category='A', name='y', score=90),
    Row(category='A', name='z', score=70),
    Row(category='B', name='p', score=60),
    Row(category='B', name='q', score=85)
]
df = spark.createDataFrame(data)
df.show()

+--------+----+-----+
|category|name|score|
+--------+----+-----+
|       A|   x|   80|
|       A|   y|   90|
|       A|   z|   70|
|       B|   p|   60|
|       B|   q|   85|
+--------+----+-----+



In [0]:
w = Window.partitionBy('category').orderBy(F.desc('score'))
df_ranked = df.withColumn("rank", F.row_number().over(w))
top_2 = df_ranked.filter(F.col('rank') <= 2)
top_2 = top_2.drop('rank')
top_2.show()


+--------+----+-----+
|category|name|score|
+--------+----+-----+
|       A|   y|   90|
|       A|   x|   80|
|       B|   q|   85|
|       B|   p|   60|
+--------+----+-----+



### Challenge 10: Null Safe Join

Join two datasets where join key might have nulls, handle using null-safe join.


In [0]:
data1 = [
    Row(id=1, name='John'),
    Row(id=None, name='Mike'),
    Row(id=2, name='Alice')
]
data2 = [
    Row(id=1, salary=5000),
    Row(id=None, salary=3000)
]
df1 = spark.createDataFrame(data1)
df2 = spark.createDataFrame(data2)
df1.show()
df2.show()


+----+-----+
|  id| name|
+----+-----+
|   1| John|
|NULL| Mike|
|   2|Alice|
+----+-----+

+----+------+
|  id|salary|
+----+------+
|   1|  5000|
|NULL|  3000|
+----+------+



In [0]:
df1.join(df2, df1['id'].eqNullSafe(df2['id']),'outer').show()

+----+-----+----+------+
|  id| name|  id|salary|
+----+-----+----+------+
|   1| John|   1|  5000|
|NULL| Mike|NULL|  3000|
|   2|Alice|NULL|  NULL|
+----+-----+----+------+

