In [None]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("OnlineBankingCaseStudy").getOrCreate()

In [None]:
# Load loandata.csv
loan_df = spark.read.csv("loan.csv", header=True, inferSchema=True)
# Load credit.csv
credit_df = spark.read.csv("credit card.csv", header=True, inferSchema=True)

# Load txn.csv
txn_df = spark.read.csv("txn.csv", header=True, inferSchema=True)

In [None]:
loan_df.show(5)
credit_df.show(5)
txn_df.show(5)

+-----------+---+------+------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|Customer_ID|Age|Gender|  Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|
+-----------+---+------+------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|    IB14001| 30|  MALE|BANK MANAGER|        SINGLE|          4| 50000|      22199|            6|      HOUSING| 10,00,000 |      5|      42,898|               6|                 9|
|    IB14008| 44|  MALE|   PROFESSOR|       MARRIED|          6| 51000|      19999|            4|     SHOPPING|     50,000|      3|      33,999|               1|                 5|
|    IB14012| 30|FEMALE|     DENTIST|        SINGLE|          3| 58450|      27675|            

In [None]:
loan_df.printSchema()
credit_df.printSchema()
txn_df.printSchema()

root
 |-- Customer_ID: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Marital Status: string (nullable = true)
 |-- Family Size: integer (nullable = true)
 |-- Income: integer (nullable = true)
 |-- Expenditure: integer (nullable = true)
 |-- Use Frequency: integer (nullable = true)
 |-- Loan Category: string (nullable = true)
 |-- Loan Amount: string (nullable = true)
 |-- Overdue: integer (nullable = true)
 |--  Debt Record: string (nullable = true)
 |--  Returned Cheque: integer (nullable = true)
 |--  Dishonour of Bill: integer (nullable = true)

root
 |-- RowNumber: integer (nullable = true)
 |-- CustomerId: integer (nullable = true)
 |-- Surname: string (nullable = true)
 |-- CreditScore: integer (nullable = true)
 |-- Geography: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- Balance: dou

 QUERIES FOR loan.csv

In [None]:
from pyspark.sql.functions import col

# 1. Number of loans in each category
loan_df.groupBy("Loan Category").count().show()

+------------------+-----+
|     Loan Category|count|
+------------------+-----+
|           HOUSING|   67|
|        TRAVELLING|   53|
|       BOOK STORES|    7|
|       AGRICULTURE|   12|
|         GOLD LOAN|   77|
|  EDUCATIONAL LOAN|   20|
|        AUTOMOBILE|   60|
|          BUSINESS|   24|
|COMPUTER SOFTWARES|   35|
|           DINNING|   14|
|          SHOPPING|   35|
|       RESTAURANTS|   41|
|       ELECTRONICS|   14|
|          BUILDING|    7|
|        RESTAURANT|   20|
|   HOME APPLIANCES|   14|
+------------------+-----+



In [None]:
# 2. Number of people who have taken more than ₹1 lakh loan
loan_df.filter(col("Loan Amount") > 100000).count()

0

In [None]:
# 3. Number of people with income > ₹60,000
loan_df.filter(col("Income") > 60000).count()

198

In [None]:
# 4. People with 2+ returned cheques and income < ₹50,000
loan_df.filter((col(" Returned Cheque") >= 2) & (col("Income") < 50000)).count()

137

In [None]:
# 5. People with 2+ returned cheques and are single
loan_df.filter((col(" Returned Cheque") >= 2) & (col("Marital Status") == "Single")).count()

0

In [None]:
# 6. People with expenditure over ₹50,000 per month
loan_df.filter(col("Expenditure") > 50000).count()

6

In [None]:
# 7. Members eligible for credit card (assuming 'Credit_Eligible' or similar field, else skip)
loan_df.select("Customer_ID", " Returned Cheque", "Expenditure", "Income").show(5)

+-----------+----------------+-----------+------+
|Customer_ID| Returned Cheque|Expenditure|Income|
+-----------+----------------+-----------+------+
|    IB14001|               6|      22199| 50000|
|    IB14008|               1|      19999| 51000|
|    IB14012|               3|      27675| 58450|
|    IB14018|               0|      12787| 45767|
|    IB14022|               1|      11999| 43521|
+-----------+----------------+-----------+------+
only showing top 5 rows



QUERIES FOR credit.csv

In [None]:
# 1. Credit card users in Spain
credit_df.filter(col("Geography") == "Spain").count()

2477

In [None]:
# 2. Eligible (e.g., CreditScore > 600) and Active (IsActiveMember == 1)
credit_df.filter((col("CreditScore") > 600) & (col("IsActiveMember") == 1)).count()

3639

QUERIES FOR txn.csv

In [None]:
from pyspark.sql.functions import max, min, sum, count

# 1. Max withdrawal amount
txn_df.select(max(" WITHDRAWAL AMT ").alias("Max_Withdrawal")).show()

+--------------+
|Max_Withdrawal|
+--------------+
| 4.594475464E8|
+--------------+



In [None]:
# 2. Min withdrawal amount per account
txn_df.groupBy("Account No").agg(min(" WITHDRAWAL AMT ").alias("Min_Withdrawal")).show()

+-------------+--------------+
|   Account No|Min_Withdrawal|
+-------------+--------------+
|409000438611'|           0.2|
|     1196711'|          0.25|
|     1196428'|          0.25|
|409000493210'|          0.01|
|409000611074'|         120.0|
|409000425051'|          1.25|
|409000405747'|          21.0|
|409000362497'|          0.97|
|409000493201'|           2.1|
|409000438620'|          0.34|
+-------------+--------------+



In [None]:
# 3. Max deposit amount per account
txn_df.groupBy("Account No").agg(max(" DEPOSIT AMT ").alias("Max_Deposit")).show()

+-------------+-------------+
|   Account No|  Max_Deposit|
+-------------+-------------+
|409000438611'|     1.7025E8|
|     1196711'|        5.0E8|
|     1196428'|2.119594422E8|
|409000493210'|        1.5E7|
|409000611074'|    3000000.0|
|409000425051'|        1.5E7|
|409000405747'|      2.021E8|
|409000362497'|        2.0E8|
|409000493201'|    1000000.0|
|409000438620'|      5.448E8|
+-------------+-------------+



In [None]:
# 4. Min deposit amount per account
txn_df.groupBy("Account No").agg(min(" DEPOSIT AMT ").alias("Min_Deposit")).show()

+-------------+-----------+
|   Account No|Min_Deposit|
+-------------+-----------+
|409000438611'|       0.03|
|     1196711'|       1.01|
|     1196428'|        1.0|
|409000493210'|       0.01|
|409000611074'|     1320.0|
|409000425051'|        1.0|
|409000405747'|      500.0|
|409000362497'|       0.03|
|409000493201'|        0.9|
|409000438620'|       0.07|
+-------------+-----------+



In [None]:
# 5. Sum of balance in all bank accounts
txn_df.select(sum("BALANCE AMT").alias("Total_Balance")).show()

+--------------------+
|       Total_Balance|
+--------------------+
|-1.63245212011488...|
+--------------------+



In [None]:
# 6. Number of transactions on each date
txn_df.groupBy("VALUE DATE").agg(count("*").alias("Transaction_Count")).show()

+----------+-----------------+
|VALUE DATE|Transaction_Count|
+----------+-----------------+
| 23-Dec-16|              143|
|  7-Feb-19|               98|
| 21-Jul-15|               80|
|  9-Sep-15|               91|
| 17-Jan-15|               16|
| 18-Nov-17|               53|
| 21-Feb-18|               77|
| 20-Mar-18|               71|
| 19-Apr-18|               71|
| 21-Jun-16|               97|
| 17-Oct-17|              101|
|  3-Jan-18|               70|
|  8-Jun-18|              223|
| 15-Dec-18|               62|
|  8-Aug-16|               97|
| 17-Dec-16|               74|
|  3-Sep-15|               83|
| 21-Jan-16|               76|
|  4-May-18|               92|
|  7-Sep-17|               94|
+----------+-----------------+
only showing top 20 rows



In [None]:
# 7. Customers with withdrawal amount > ₹1 lakh
txn_df.filter(col(" WITHDRAWAL AMT ") > 100000).select("Account No", " WITHDRAWAL AMT ").show()

+-------------+----------------+
|   Account No| WITHDRAWAL AMT |
+-------------+----------------+
|409000611074'|        133900.0|
|409000611074'|        195800.0|
|409000611074'|        143800.0|
|409000611074'|        331650.0|
|409000611074'|        129000.0|
|409000611074'|        230013.0|
|409000611074'|        367900.0|
|409000611074'|        108000.0|
|409000611074'|        141000.0|
|409000611074'|        206000.0|
|409000611074'|        242300.0|
|409000611074'|        113250.0|
|409000611074'|        206900.0|
|409000611074'|        276000.0|
|409000611074'|        171000.0|
|409000611074'|        189800.0|
|409000611074'|        271323.0|
|409000611074'|        200600.0|
|409000611074'|        176900.0|
|409000611074'|        150050.0|
+-------------+----------------+
only showing top 20 rows

