In [0]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (an already-active session is reused).
spark = (
    SparkSession.builder
    .appName("Loan Data Processing")
    .getOrCreate()
)

# Location of the loan dataset.
file_path = "/FileStore/tables/loan-1.csv"

# Read the CSV, treating the first row as the header and letting Spark infer
# each column's type from the data.
csv_reader = spark.read.format("csv").options(header="true", inferSchema="true")
loans_df = csv_reader.load(file_path)

# Preview the first five rows as a quick sanity check of the load.
loans_df.show(5)


+-----------+---+------+------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|Customer_ID|Age|Gender|  Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|
+-----------+---+------+------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|    IB14001| 30|  MALE|BANK MANAGER|        SINGLE|          4| 50000|      22199|            6|      HOUSING| 10,00,000 |      5|      42,898|               6|                 9|
|    IB14008| 44|  MALE|   PROFESSOR|       MARRIED|          6| 51000|      19999|            4|     SHOPPING|     50,000|      3|      33,999|               1|                 5|
|    IB14012| 30|FEMALE|     DENTIST|        SINGLE|          3| 58450|      27675|            

In [0]:
# Count the number of loans in each category.
# The column is referenced by its exact name, "Loan Category", so the lookup
# does not rely on Spark's case-insensitive column resolution.
# NOTE(review): the recorded output lists both "RESTAURANT" and "RESTAURANTS"
# as separate categories; confirm whether they should be merged before
# relying on these counts.
loans_per_category = loans_df.groupBy("Loan Category").count()

loans_per_category.show()

+------------------+-----+
|     loan category|count|
+------------------+-----+
|           HOUSING|   67|
|        TRAVELLING|   53|
|       BOOK STORES|    7|
|       AGRICULTURE|   12|
|         GOLD LOAN|   77|
|  EDUCATIONAL LOAN|   20|
|        AUTOMOBILE|   60|
|          BUSINESS|   24|
|COMPUTER SOFTWARES|   35|
|           DINNING|   14|
|          SHOPPING|   35|
|       RESTAURANTS|   41|
|       ELECTRONICS|   14|
|          BUILDING|    7|
|        RESTAURANT|   20|
|   HOME APPLIANCES|   14|
+------------------+-----+



In [0]:
from pyspark.sql.functions import regexp_replace, col

# "Loan Amount" holds formatted text such as "10,00,000 ": remove every
# character that is not a digit, then cast what remains to an integer.
loan_amount_digits = regexp_replace(col("Loan Amount"), "[^0-9]", "")
loans_df = loans_df.withColumn("Loan Amount", loan_amount_digits.cast("int"))

# Keep only loans strictly greater than 1 Lakh (1,00,000).
high_value_loans = loans_df.where(loans_df["Loan Amount"] > 100000)

# Count each customer once, however many qualifying rows they have.
high_value_customers = high_value_loans.select("Customer_ID").distinct()
num_people_high_loans = high_value_customers.count()

print(f"Number of people who have taken loans greater than 1 Lakh: {num_people_high_loans}")


Number of people who have taken loans greater than 1 Lakh: 449


In [0]:
# Rows whose Income is strictly greater than 60,000.
income_above_60k = loans_df["Income"] > 60000
high_income_people = loans_df.where(income_above_60k)

# Count each customer once.
high_income_customers = high_income_people.select("Customer_ID").distinct()
num_high_income_people = high_income_customers.count()

print(f"Number of people with Income greater than 60,000: {num_high_income_people}")


Number of people with Income greater than 60,000: 198


In [0]:
from pyspark.sql import SparkSession
# Register the current DataFrame under the name "loans" so it can be queried with SQL.
loans_df.createOrReplaceTempView("loans")

# SQL form of the income question: distinct customers with Income above 60,000.
high_income_sql = """
    SELECT COUNT(DISTINCT Customer_ID) AS num_people
    FROM loans
    WHERE Income > 60000
"""
result_3 = spark.sql(high_income_sql)
result_3.show()

+----------+
|num_people|
+----------+
|       198|
+----------+



In [0]:
# Customers with at least 2 returned cheques and an income below 50,000.
# The cheque column name carries a leading space at this point in the notebook.
two_plus_returned_cheques = loans_df[" Returned Cheque"] >= 2
income_below_50k = loans_df["Income"] < 50000
result_4 = loans_df.where(two_plus_returned_cheques & income_below_50k)

# Count each customer once.
num_people_4 = result_4.select("Customer_ID").distinct().count()

print(f"Number of people with 2 or more returned cheques and income less than 50,000: {num_people_4}")


Number of people with 2 or more returned cheques and income less than 50,000: 136


In [0]:
# SQL form of the same question. The cheque column name begins with a space,
# so it must be wrapped in backticks inside the query.
returned_cheque_low_income_sql = """
    SELECT COUNT(DISTINCT Customer_ID) AS num_people
    FROM loans
    WHERE ` Returned Cheque` >= 2 AND Income < 50000
"""
result_4 = spark.sql(returned_cheque_low_income_sql)
result_4.show()


+----------+
|num_people|
+----------+
|       136|
+----------+



In [0]:
# Rename two awkward columns: drop the leading space from " Returned Cheque"
# and replace the space in "Marital Status" with an underscore.
# Only these two columns are renamed; all other column names are left as loaded.
# NOTE: once this runs, code that still refers to " Returned Cheque" (with the
# leading space) will no longer resolve against loans_df or the "loans" view.
column_renames = {
    " Returned Cheque": "Returned Cheque",
    "Marital Status": "Marital_Status",
}
for old_name, new_name in column_renames.items():
    loans_df = loans_df.withColumnRenamed(old_name, new_name)

# Re-register the view so SQL queries see the renamed columns.
loans_df.createOrReplaceTempView("loans")


In [0]:
from pyspark.sql.functions import lower

# Lower-case `Marital_Status` so the comparison below does not depend on letter case.
# The "loans" temp view is not re-registered here, so it keeps the values as
# they were before this change.
marital_status_lowercase = lower(loans_df["Marital_Status"])
loans_df = loans_df.withColumn("Marital_Status", marital_status_lowercase)

# Customers with at least 2 returned cheques whose marital status is "single".
two_plus_returned_cheques = loans_df["Returned Cheque"] >= 2
is_single = loans_df["Marital_Status"] == "single"
result_5 = loans_df.where(two_plus_returned_cheques & is_single)

# Count each customer once.
num_people_5 = result_5.select("Customer_ID").distinct().count()

print(f"Number of people with 2 or more returned cheques and are single: {num_people_5}")




Number of people with 2 or more returned cheques and are single: 111


In [0]:
# SQL form of the same question. LOWER() is applied inside the query, so the
# match on 'single' is case-insensitive regardless of how the view stores it.
single_returned_cheque_sql = """
    SELECT COUNT(DISTINCT Customer_ID) AS num_people
    FROM loans
    WHERE `Returned Cheque` >= 2 AND LOWER(Marital_Status) = 'single'
"""
result_5 = spark.sql(single_returned_cheque_sql)
result_5.show()


+----------+
|num_people|
+----------+
|       111|
+----------+



In [0]:
# Distinct customers whose Expenditure is strictly greater than 50,000 (SQL form).
high_expenditure_sql = """
    SELECT COUNT(DISTINCT Customer_ID) AS num_people
    FROM loans
    WHERE Expenditure > 50000
"""
result_6_sql = spark.sql(high_expenditure_sql)
result_6_sql.show()


+----------+
|num_people|
+----------+
|         6|
+----------+



In [0]:
# Rows whose Expenditure is strictly greater than 50,000.
expenditure_above_50k = loans_df["Expenditure"] > 50000
result_6_df = loans_df.where(expenditure_above_50k)

# Count each customer once.
high_expenditure_customers = result_6_df.select("Customer_ID").distinct()
num_people_6 = high_expenditure_customers.count()

print(f"Number of people with expenditure over 50,000: {num_people_6}")


Number of people with expenditure over 50,000: 6


In [0]:
# Distinct customers with Income above 30,000 and an Overdue value of exactly 1 (SQL form).
income_overdue_sql = """
    SELECT COUNT(DISTINCT Customer_ID) AS num_people
    FROM loans
    WHERE Income > 30000 AND Overdue = 1
"""
result_7_sql = spark.sql(income_overdue_sql)
result_7_sql.show()


+----------+
|num_people|
+----------+
|        62|
+----------+



In [0]:
# Rows with Income strictly above 30,000 and an Overdue value of exactly 1;
# this notebook treats these customers as eligible for credit cards.
income_above_30k = loans_df["Income"] > 30000
overdue_is_one = loans_df["Overdue"] == 1
result_7_df = loans_df.where(income_above_30k & overdue_is_one)

# Count each customer once.
num_people_7 = result_7_df.select("Customer_ID").distinct().count()

print(f"Number of members eligible for credit cards: {num_people_7}")


Number of members eligible for credit cards: 62
