You are given a history of credit card transaction data for people across cities in India. Write an SQL query to find the percentage contribution of spend by females in each city. Round the percentage to 2 decimal places. Display city, total spend, female spend, and female contribution in ascending order of city.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from pyspark.sql.functions import *
from pyspark.sql.window import *


# Start (or reuse) the Spark session used by this notebook.
spark = SparkSession.builder.appName("TransactionData").getOrCreate()

# Column layout of a transaction record. Every field is nullable, and
# transaction_date is loaded as a plain 'yyyy-MM-dd' string.
schema = (
    StructType()
    .add("transaction_id", IntegerType(), True)
    .add("city", StringType(), True)
    .add("transaction_date", StringType(), True)
    .add("card_type", StringType(), True)
    .add("gender", StringType(), True)
    .add("amount", IntegerType(), True)
)

# Sample rows, one tuple per transaction, in schema column order:
# (transaction_id, city, transaction_date, card_type, gender, amount)
data = [
    (1, "Delhi", "2024-01-13", "Gold", "F", 500),
    (2, "Bengaluru", "2024-01-13", "Silver", "M", 1000),
    (3, "Mumbai", "2024-01-14", "Silver", "F", 1200),
    (4, "Bengaluru", "2024-01-14", "Gold", "M", 900),
    (5, "Bengaluru", "2024-01-14", "Gold", "F", 300),
    (6, "Delhi", "2024-01-15", "Silver", "M", 200),
    (7, "Mumbai", "2024-01-15", "Gold", "F", 900),
    (8, "Delhi", "2024-01-15", "Gold", "F", 800),
    (9, "Mumbai", "2024-01-15", "Silver", "F", 150),
    (10, "Mumbai", "2024-01-16", "Platinum", "F", 1900),
    (11, "Bengaluru", "2024-01-16", "Platinum", "M", 1250),
    (12, "Delhi", "2024-01-16", "Platinum", "F", 130),
]

# Materialize the sample rows as a DataFrame with the explicit schema.
df = spark.createDataFrame(data, schema=schema)

# Percentage contribution of female spend per city.
#
# Both totals come from a single groupBy, replacing the earlier window sum
# that was collapsed again with max(). The transaction_date -> date cast
# was dropped because that column is neither used by nor kept after the
# aggregation.
#
# NOTE: sum and round below are the pyspark.sql.functions versions pulled in
# by the star import at the top of the file, not the Python builtins.

# Amount of the row if the spender is female, otherwise 0.
female_amount = when(col("gender") == "F", col("amount")).otherwise(0)

df = (
    df.groupBy(col("city"))
    .agg(
        sum(col("amount")).alias("total_spend"),
        sum(female_amount).alias("female_spend"),
    )
    # Female share of the city's total spend, rounded to 2 decimal places.
    .withColumn(
        "percentage",
        round(col("female_spend") / col("total_spend") * 100, 2),
    )
    # Render with exactly two decimals (e.g. 8.7 -> "8.70"); the column
    # becomes a string after this step.
    .withColumn("percentage", format_number(col("percentage"), 2))
    # The task asks for ascending city order; groupBy alone does not
    # guarantee any row order, so sort explicitly.
    .orderBy(col("city").asc())
)




In [0]:
# Print the per-city result (city, total_spend, female_spend, percentage)
# as a text table on the driver.
df.show()

+---------+-----------+------------+----------+
|     city|total_spend|female_spend|percentage|
+---------+-----------+------------+----------+
|Bengaluru|       3450|         300|      8.70|
|    Delhi|       1630|        1430|     87.73|
|   Mumbai|       4150|        4150|    100.00|
+---------+-----------+------------+----------+

