In [0]:
# Load the loan, credit card and transaction CSV files into Spark DataFrames.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Case Study").getOrCreate()

loan_file_path = "/FileStore/tables/loan.csv"
credit_card_file_path = "/FileStore/tables/credit_card.csv"
txn_file_path = "/FileStore/tables/txn.csv"

# All three files are read the same way: the first row is the header and
# Spark infers the column types from the data.
csv_options = {"header": "true", "inferSchema": "true"}

loan_df = spark.read.format("csv").options(**csv_options).load(loan_file_path)
credit_df = spark.read.format("csv").options(**csv_options).load(credit_card_file_path)
txn_df = spark.read.format("csv").options(**csv_options).load(txn_file_path)

# Preview the first 5 rows of each DataFrame: loans, credit cards, transactions.
for frame in (loan_df, credit_df, txn_df):
    frame.show(5)


+-----------+---+------+------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|Customer_ID|Age|Gender|  Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|
+-----------+---+------+------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|    IB14001| 30|  MALE|BANK MANAGER|        SINGLE|          4| 50000|      22199|            6|      HOUSING| 10,00,000 |      5|      42,898|               6|                 9|
|    IB14008| 44|  MALE|   PROFESSOR|       MARRIED|          6| 51000|      19999|            4|     SHOPPING|     50,000|      3|      33,999|               1|                 5|
|    IB14012| 30|FEMALE|     DENTIST|        SINGLE|          3| 58450|      27675|            

In [0]:
# Number of loans in each category.
# Rows are grouped on the raw "Loan Category" text, so differently spelled
# labels end up as separate groups.
loans_per_category = loan_df.groupBy("Loan Category").count()
loans_per_category.show()


+------------------+-----+
|     Loan Category|count|
+------------------+-----+
|           HOUSING|   67|
|        TRAVELLING|   53|
|       BOOK STORES|    7|
|       AGRICULTURE|   12|
|         GOLD LOAN|   77|
|  EDUCATIONAL LOAN|   20|
|        AUTOMOBILE|   60|
|          BUSINESS|   24|
|COMPUTER SOFTWARES|   35|
|           DINNING|   14|
|          SHOPPING|   35|
|       RESTAURANTS|   41|
|       ELECTRONICS|   14|
|          BUILDING|    7|
|        RESTAURANT|   20|
|   HOME APPLIANCES|   14|
+------------------+-----+



In [0]:
# Number of people who have taken a loan of more than 1 lakh (100000)

from pyspark.sql.functions import col, regexp_replace

# "Loan Amount" is comma-grouped text (e.g. "10,00,000"), so strip the commas
# before casting it to a number. Cast to double rather than float: a 32-bit
# float cannot represent every whole number above 2**24 (16,777,216), so
# large amounts could be silently rounded.
loan_df_cleaned = loan_df.withColumn(
    "Loan Amount",
    regexp_replace(col("Loan Amount"), ",", "").cast("double"),
)

# Keep only the rows whose loan amount exceeds 1 lakh and count them.
ans = loan_df_cleaned.filter(col("Loan Amount") > 100000).count()

print(f"Number of people who have taken more than 1 lakh loan: {ans}")


Number of people who have taken more than 1 lakh loan: 450


In [0]:
# Number of people with an income greater than 60000 rupees.

income_threshold = 60000
high_income_rows = loan_df.filter(col("Income") > income_threshold)
ans = high_income_rows.count()
print(f"The Number of people with income greater than 60000 rupees: {ans}")


The Number of people with income greater than 60000 rupees: 198


In [0]:
# Number of people with 2 or more returned cheques AND an income below 50000.

from pyspark.sql.functions import col

# Note the leading space in the " Returned Cheque" column name.
has_returned_cheques = col(" Returned Cheque") >= 2
has_low_income = col("Income") < 50000

ans = loan_df.filter(has_returned_cheques & has_low_income).count()
print(f"The Number of people with 2 or more returned cheques and income less than 50000: {ans}")


The Number of people with 2 or more returned cheques and income less than 50000: 137


In [0]:
from pyspark.sql.functions import col

# Number of people with 2 or more returned cheques whose marital status is SINGLE.
# Note the leading space in the " Returned Cheque" column name.
has_returned_cheques = col(" Returned Cheque") >= 2
is_single = col("Marital Status") == "SINGLE"

ans = loan_df.filter(has_returned_cheques & is_single).count()
print(f"The Number of people with 2 or more returned cheques and are single: {ans}")


The Number of people with 2 or more returned cheques and are single: 111


In [0]:
# Number of people whose expenditure is over 50000.

expenditure_threshold = 50000
high_spender_rows = loan_df.filter(col("Expenditure") > expenditure_threshold)
ans = high_spender_rows.count()
print(f"The number of people with expenditure over 50000 a month: {ans}")


The number of people with expenditure over 50000 a month: 6


In [0]:
# Number of credit card users whose geography is Spain.

in_spain = col("Geography") == "Spain"
ans = credit_df.filter(in_spain).count()
print(f"Total Credit card users in Spain: {ans}")



Total Credit card users in Spain: 2477


In [0]:
# Number of members who are eligible and active in the bank.
# NOTE(review): only the IsActiveMember flag is tested here; no separate
# eligibility condition is applied - confirm this is the intended definition.

is_active = col("IsActiveMember") == 1
ans = credit_df.filter(is_active).count()
print(f"The number of members who are elgible and active in the bank: {ans}")


The number of members who are elgible and active in the bank: 5151


In [0]:
# Maximum withdrawal amount across all transactions.
# This import shadows Python's built-in max for the rest of the notebook.
from pyspark.sql.functions import max

# The column name includes its surrounding spaces.
withdrawal_amt = col(" WITHDRAWAL AMT ")
max_withdrawal = txn_df.agg(max(withdrawal_amt)).first()[0]
print(f"Maximum withdrawal amount in transactions: {max_withdrawal}")


Maximum withdrawal amount in transactions: 459447546.4


In [0]:
# Minimum withdrawal amount across all transactions in txn.csv.
# This import shadows Python's built-in min for the rest of the notebook.
from pyspark.sql.functions import min

# The column name includes its surrounding spaces.
withdrawal_amt = col(" WITHDRAWAL AMT ")
min_withdrawal = txn_df.agg(min(withdrawal_amt)).first()[0]
print(f"Minimum withdrawal amount in transactions: {min_withdrawal}")



Minimum withdrawal amount in transactions: 0.01


In [0]:
# Maximum deposit amount across all transactions.

# Aggregate the whole DataFrame with the column-to-function mapping form of
# agg(); the result is a single row holding the maximum.
max_deposit = txn_df.agg({" DEPOSIT AMT ": "max"}).first()[0]
print(f"Maximum deposit amount in transactions: {max_deposit}")



Maximum deposit amount in transactions: 544800000.0


In [0]:
# Minimum deposit amount across all transactions.

# Aggregate the whole DataFrame with the column-to-function mapping form of
# agg(); the result is a single row holding the minimum.
min_deposit = txn_df.agg({" DEPOSIT AMT ": "min"}).first()[0]
print(f"Minimum deposit amount in transactions: {min_deposit}")



Minimum deposit amount in transactions: 0.01


In [0]:
# Number of transactions on each date

print("Number of transactions on each date:")
# show() only prints the rows and returns None, so keep the grouped
# DataFrame in the variable and display it as a separate step.
txn_count_by_date = txn_df.groupBy("VALUE DATE").count()
txn_count_by_date.show()


Number of transactions on each date:
+----------+-----+
|VALUE DATE|count|
+----------+-----+
| 23-Dec-16|  143|
|  7-Feb-19|   98|
| 21-Jul-15|   80|
|  9-Sep-15|   91|
| 17-Jan-15|   16|
| 18-Nov-17|   53|
| 21-Feb-18|   77|
| 20-Mar-18|   71|
| 19-Apr-18|   71|
| 21-Jun-16|   97|
| 17-Oct-17|  101|
|  3-Jan-18|   70|
|  8-Jun-18|  223|
| 15-Dec-18|   62|
|  8-Aug-16|   97|
| 17-Dec-16|   74|
|  3-Sep-15|   83|
| 21-Jan-16|   76|
|  4-May-18|   92|
|  7-Sep-17|   94|
+----------+-----+
only showing top 20 rows



In [0]:
# List of customers (distinct account numbers) with a withdrawal of more than 1 lakh

print("List of customers with withdrawal amount more than 1 lakh:")
# show() only prints the rows and returns None, so keep the resulting
# DataFrame in the variable and display it as a separate step.
customers_with_high_withdrawals = (
    txn_df.filter(col(" WITHDRAWAL AMT ") > 100000)
    .select("Account No")
    .distinct()
)
customers_with_high_withdrawals.show()




List of customers with withdrawal amount more than 1 lakh:
+-------------+
|   Account No|
+-------------+
|409000438611'|
|     1196711'|
|     1196428'|
|409000493210'|
|409000611074'|
|409000425051'|
|409000405747'|
|409000493201'|
|409000438620'|
|409000362497'|
+-------------+

