You're working for a financial analytics company that specializes in analyzing credit card expenditures. You have a dataset containing information about users' credit card expenditures across different card companies.

Write an SQL query to find the total expenditure from other cards (excluding Mastercard) for users who hold a Mastercard. Display only the users (along with their Mastercard expense and their other-card expense) whose combined expense from other cards is greater than their Mastercard expense.


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Start (or reuse) the Spark session used by this notebook
spark = (
    SparkSession.builder
    .appName("Expenditures Table")
    .getOrCreate()
)

# Schema of the expenditures table; every column is declared nullable
expenditures_schema = StructType(
    [
        StructField(field_name, field_type, True)
        for field_name, field_type in (
            ("user_name", StringType()),
            ("card_company", StringType()),
            ("expenditure", IntegerType()),
        )
    ]
)

# Sample rows as (user_name, card_company, expenditure)
expenditures_data = [
    ("user1", "Mastercard", 1000),
    ("user1", "Visa", 500),
    ("user1", "RuPay", 2000),
    ("user2", "Visa", 2000),
    ("user3", "Mastercard", 5000),
    ("user3", "Visa", 2000),
    ("user3", "Slice", 500),
    ("user3", "Amex", 1000),
    ("user4", "Mastercard", 2000),
]

# Build the expenditures DataFrame from the sample rows and the schema
expenditures_df = spark.createDataFrame(expenditures_data, expenditures_schema)

# Print the expenditures table
expenditures_df.show()


+---------+------------+-----------+
|user_name|card_company|expenditure|
+---------+------------+-----------+
|    user1|  Mastercard|       1000|
|    user1|        Visa|        500|
|    user1|       RuPay|       2000|
|    user2|        Visa|       2000|
|    user3|  Mastercard|       5000|
|    user3|        Visa|       2000|
|    user3|       Slice|        500|
|    user3|        Amex|       1000|
|    user4|  Mastercard|       2000|
+---------+------------+-----------+



In [0]:
from pyspark.sql.functions import col, when, sum

# NOTE: `sum` here is pyspark.sql.functions.sum (imported above), which
# shadows the Python builtin for the rest of this notebook.

# Row-level predicate: the charge was made on a Mastercard
is_mastercard = col("card_company") == "Mastercard"

# Step 1: Aggregate once per user instead of first building a distinct list of
# Mastercard holders and joining it back (which costs an extra distinct + join).
# `mastercard_rows` counts the user's Mastercard rows so that holders can be
# identified from the same aggregation.
per_user_totals = expenditures_df.groupBy("user_name").agg(
    sum(when(is_mastercard, col("expenditure")).otherwise(0)).alias("mastercard_expense"),
    sum(when(col("card_company") != "Mastercard", col("expenditure")).otherwise(0)).alias("other_expense"),
    sum(when(is_mastercard, 1).otherwise(0)).alias("mastercard_rows"),
)

# Step 2: Keep only users with at least one Mastercard row, then drop the
# helper column so the result has exactly the columns
# (user_name, mastercard_expense, other_expense).
cte_expenses_df = per_user_totals.filter(col("mastercard_rows") > 0).drop("mastercard_rows")

# Distinct Mastercard holders; kept under its original name in case other
# cells reference it (lazy, so it costs nothing unless it is actually used).
users_with_mastercard = cte_expenses_df.select("user_name")

# Step 3: Filter rows where `other_expense` is greater than `mastercard_expense`
result_df = cte_expenses_df.filter(col("other_expense") > col("mastercard_expense"))

# Display the result
result_df.show()


+---------+------------------+-------------+
|user_name|mastercard_expense|other_expense|
+---------+------------------+-------------+
|    user1|              1000|         2500|
+---------+------------------+-------------+



In [0]:
# Inspect the intermediate per-user totals before the final comparison filter;
# this includes users whose other-card spend does not exceed their Mastercard spend.
# NOTE(review): `.display()` is presumably the notebook platform's rich-table
# helper rather than a core PySpark method — confirm; the other cells use `.show()`.
cte_expenses_df.display()

user_name,mastercard_expense,other_expense
user1,1000,2500
user3,5000,3500
user4,2000,0
