In [0]:
from pyspark.sql import SparkSession

# Initializing SparkSession 
spark = SparkSession.builder \
    .appName("Loan Data Processing") \
    .getOrCreate()

# File path
file_path = "/FileStore/tables/loan-1.csv"

# Load the CSV file into a PySpark DataFrame
loans_df = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load(file_path)

# Display the DataFrame to verify
loans_df.show(5)

+-----------+---+------+------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|Customer_ID|Age|Gender|  Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|
+-----------+---+------+------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|    IB14001| 30|  MALE|BANK MANAGER|        SINGLE|          4| 50000|      22199|            6|      HOUSING| 10,00,000 |      5|      42,898|               6|                 9|
|    IB14008| 44|  MALE|   PROFESSOR|       MARRIED|          6| 51000|      19999|            4|     SHOPPING|     50,000|      3|      33,999|               1|                 5|
|    IB14012| 30|FEMALE|     DENTIST|        SINGLE|          3| 58450|      27675|            

In [0]:
loans_df.createOrReplaceTempView("challenge_data")


In [0]:
age_above_30 = spark.sql("""
    SELECT Customer_ID, Age
    FROM challenge_data
    WHERE Age > 30
""")
age_above_30.show()


+-----------+---+
|Customer_ID|Age|
+-----------+---+
|    IB14008| 44|
|    IB14022| 34|
|    IB14024| 55|
|    IB14025| 39|
|    IB14027| 51|
|    IB14031| 37|
|    IB14034| 32|
|    IB14037| 54|
|    IB14039| 45|
|    IB14041| 59|
|    IB14045| 31|
|    IB14049| 49|
|    IB14050| 56|
|    IB14054| 58|
|    IB14060| 36|
|    IB14070| 40|
|    IB14078| 45|
|    IB14082| 60|
|    IB14086| 51|
|    IB14092| 47|
+-----------+---+
only showing top 20 rows



In [0]:
income_family_filter = spark.sql("""
    SELECT Customer_ID, Income, `Family Size`
    FROM challenge_data
    WHERE Income > 30000 AND `Family Size` > 3
""")

income_family_filter.show()


+-----------+------+-----------+
|Customer_ID|Income|Family Size|
+-----------+------+-----------+
|    IB14001| 50000|          4|
|    IB14008| 51000|          6|
|    IB14018| 45767|          5|
|    IB14022| 43521|          4|
|    IB14024| 34999|          6|
|    IB14025| 46619|          6|
|    IB14031| 55999|          5|
|    IB14032| 60111|          4|
|    IB14037| 48099|          5|
|    IB14039| 45777|          7|
|    IB14041| 50999|          4|
|    IB14042| 60111|          4|
|    IB14045| 40999|          5|
|    IB14049| 45999|          4|
|    IB14054| 60000|          5|
|    IB14057| 40000|          4|
|    IB14060| 35000|          4|
|    IB14070| 38000|          4|
|    IB14078| 40000|          4|
|    IB14082| 70000|          5|
+-----------+------+-----------+
only showing top 20 rows



In [0]:
occupation_count_df = spark.sql("""
    SELECT COUNT(DISTINCT Occupation) AS unique_occupations
    FROM challenge_data
""")

occupation_count_df.show()


+------------------+
|unique_occupations|
+------------------+
|                39|
+------------------+



In [0]:
avg_income_df = spark.sql("""
    SELECT AVG(Income) AS average_income
    FROM challenge_data
""")

avg_income_df.show()


+-----------------+
|   average_income|
+-----------------+
|68339.49145299145|
+-----------------+



In [0]:
max_loan_df = spark.sql("""
    SELECT MAX(`Loan Amount`) AS max_loan
    FROM challenge_data
""")

max_loan_df.show()


+--------+
|max_loan|
+--------+
| 999,698|
+--------+



In [0]:
min_loan_df = spark.sql("""
    SELECT MIN(`Loan Amount`) AS min_loan
    FROM challenge_data
""")

min_loan_df.show()


+----------+
|  min_loan|
+----------+
| 1,00,000 |
+----------+



In [0]:
total_expenditure_df = spark.sql("""
    SELECT SUM(Expenditure) AS total_expenditure
    FROM challenge_data
""")

total_expenditure_df.show()


+-----------------+
|total_expenditure|
+-----------------+
|         13243460|
+-----------------+



In [0]:
loan_category_count_df = spark.sql("""
    SELECT `Loan Category`, COUNT(Customer_ID) AS num_customers
    FROM challenge_data
    GROUP BY `Loan Category`
""")

loan_category_count_df.show()


+------------------+-------------+
|     Loan Category|num_customers|
+------------------+-------------+
|           HOUSING|           67|
|        TRAVELLING|           53|
|       BOOK STORES|            7|
|       AGRICULTURE|           12|
|         GOLD LOAN|           77|
|  EDUCATIONAL LOAN|           20|
|        AUTOMOBILE|           60|
|          BUSINESS|           24|
|COMPUTER SOFTWARES|           35|
|           DINNING|           14|
|          SHOPPING|           35|
|       RESTAURANTS|           41|
|       ELECTRONICS|           14|
|          BUILDING|            7|
|        RESTAURANT|           20|
|   HOME APPLIANCES|           14|
+------------------+-------------+



In [0]:
avg_income_by_occupation_df = spark.sql("""
    SELECT Occupation, AVG(Income) AS avg_income
    FROM challenge_data
    GROUP BY Occupation
""")

avg_income_by_occupation_df.show()


+--------------------+------------------+
|          Occupation|        avg_income|
+--------------------+------------------+
|      CIVIL ENGINEER|60359.666666666664|
|     FIRE DEPARTMENT|55357.916666666664|
|          ACCOUNTANT| 56623.28571428572|
|        BANK MANAGER|           92191.0|
|      SYSTEM OFFICER|           56780.0|
|           NUTRITION|           55650.0|
|           DIETICIAN| 72599.16666666667|
|               CLERK|         76871.125|
|   SOFTWARE ENGINEER|           61107.8|
|AGRICULTURAL ENGI...|         82060.625|
|   ASSISTANT MANAGER|54866.166666666664|
|             TEACHER| 52812.73333333333|
| ASSISTANT PROFESSOR|53319.333333333336|
|     SYSTEM ENGINEER|60509.333333333336|
| CHARTERED APPRAISER| 76456.72727272728|
|                NAVY|        71190.9375|
|              POLICE| 49049.88888888889|
|            BUSINESS|        56682.5625|
|              FARMER| 74906.85714285714|
|              DRIVER|64450.833333333336|
+--------------------+------------

In [0]:
loan_amount_by_marital_status_df = spark.sql("""
    SELECT `Marital Status`, MAX(`Loan Amount`) AS max_loan, MIN(`Loan Amount`) AS min_loan
    FROM challenge_data
    GROUP BY `Marital Status`
""")

loan_amount_by_marital_status_df.show()


+--------------+--------+----------+
|Marital Status|max_loan|  min_loan|
+--------------+--------+----------+
|       MARRIED| 999,698| 1,02,256 |
|        SINGLE| 964,109| 1,00,000 |
+--------------+--------+----------+



In [0]:
family_size_aggregations_df = spark.sql("""
    SELECT `Family Size`, 
           AVG(Income) AS avg_income, 
           SUM(`Loan Amount`) AS total_loan_amount, 
           MAX(Expenditure) AS max_expenditure
    FROM challenge_data
    GROUP BY `Family Size`
""")

family_size_aggregations_df.show()


+-----------+------------------+-----------------+---------------+
|Family Size|        avg_income|total_loan_amount|max_expenditure|
+-----------+------------------+-----------------+---------------+
|          6|59968.545454545456|             NULL|          62541|
|          3| 71669.28787878787|             NULL|          53086|
|          5| 56102.14432989691|             NULL|          49225|
|          4| 89830.77570093458|             NULL|          48072|
|          7|58944.688524590165|             NULL|          49629|
|          2|           66428.4|             NULL|          48959|
+-----------+------------------+-----------------+---------------+



In [0]:
# Initialize SparkSession
from pyspark.sql import SparkSession
from pyspark.sql import Row

spark = SparkSession.builder.appName("Complex Join Operations").getOrCreate()

#customers DataFrame
customers_data = [
    Row(Customer_ID=1, Name="Alice", Age=34, Occupation="Engineer", Country="USA"),
    Row(Customer_ID=2, Name="Bob", Age=45, Occupation="Doctor", Country="Canada"),
    Row(Customer_ID=3, Name="Charlie", Age=29, Occupation="Teacher", Country="UK"),
    Row(Customer_ID=4, Name="David", Age=38, Occupation="Architect", Country="USA")
]

customers_df = spark.createDataFrame(customers_data)

#loans DataFrame
loans_data = [
    Row(Loan_ID=101, Customer_ID=1, Loan_Amount=50000, Loan_Type="Personal", Loan_Status="Approved", Loan_Start_Date="2023-01-10"),
    Row(Loan_ID=102, Customer_ID=2, Loan_Amount=30000, Loan_Type="Home", Loan_Status="Pending", Loan_Start_Date="2023-03-15"),
    Row(Loan_ID=103, Customer_ID=3, Loan_Amount=20000, Loan_Type="Education", Loan_Status="Rejected", Loan_Start_Date="2023-02-20"),
    Row(Loan_ID=104, Customer_ID=None, Loan_Amount=15000, Loan_Type="Car", Loan_Status="Approved", Loan_Start_Date="2023-05-01")
]

loans_df = spark.createDataFrame(loans_data)

# temporary views
customers_df.createOrReplaceTempView("customers")
loans_df.createOrReplaceTempView("loans")

In [0]:
# Perform INNER JOIN (only rows with matching Customer_ID)
inner_join_df = spark.sql("""
    SELECT c.Customer_ID, c.Name, l.Loan_ID, l.Loan_Amount, l.Loan_Status
    FROM customers c
    INNER JOIN loans l
    ON c.Customer_ID = l.Customer_ID
""")
inner_join_df.show()

+-----------+-------+-------+-----------+-----------+
|Customer_ID|   Name|Loan_ID|Loan_Amount|Loan_Status|
+-----------+-------+-------+-----------+-----------+
|          1|  Alice|    101|      50000|   Approved|
|          2|    Bob|    102|      30000|    Pending|
|          3|Charlie|    103|      20000|   Rejected|
+-----------+-------+-------+-----------+-----------+



In [0]:
# Perform LEFT JOIN (all customers, including those with no loan records)
left_join_df = spark.sql("""
    SELECT c.Customer_ID, c.Name, l.Loan_ID, l.Loan_Amount, l.Loan_Status
    FROM customers c
    LEFT JOIN loans l
    ON c.Customer_ID = l.Customer_ID
""")
left_join_df.show()

+-----------+-------+-------+-----------+-----------+
|Customer_ID|   Name|Loan_ID|Loan_Amount|Loan_Status|
+-----------+-------+-------+-----------+-----------+
|          1|  Alice|    101|      50000|   Approved|
|          2|    Bob|    102|      30000|    Pending|
|          3|Charlie|    103|      20000|   Rejected|
|          4|  David|   NULL|       NULL|       NULL|
+-----------+-------+-------+-----------+-----------+



In [0]:
# Perform RIGHT JOIN (all loans, including those without corresponding customer records)
right_join_df = spark.sql("""
    SELECT c.Customer_ID, c.Name, l.Loan_ID, l.Loan_Amount, l.Loan_Status
    FROM customers c
    RIGHT JOIN loans l
    ON c.Customer_ID = l.Customer_ID
""")
right_join_df.show()

+-----------+-------+-------+-----------+-----------+
|Customer_ID|   Name|Loan_ID|Loan_Amount|Loan_Status|
+-----------+-------+-------+-----------+-----------+
|          1|  Alice|    101|      50000|   Approved|
|          2|    Bob|    102|      30000|    Pending|
|          3|Charlie|    103|      20000|   Rejected|
|       NULL|   NULL|    104|      15000|   Approved|
+-----------+-------+-------+-----------+-----------+

