In [None]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Online-Banking").getOrCreate()

In [None]:
df = spark.read.csv("/loan.csv",header=True, inferSchema=True)
df.show()

+-----------+---+------+-------------------+--------------+-----------+------+-----------+-------------+------------------+-----------+-------+------------+----------------+------------------+
|Customer_ID|Age|Gender|         Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|     Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|
+-----------+---+------+-------------------+--------------+-----------+------+-----------+-------------+------------------+-----------+-------+------------+----------------+------------------+
|    IB14001| 30|  MALE|       BANK MANAGER|        SINGLE|          4| 50000|      22199|            6|           HOUSING| 10,00,000 |      5|      42,898|               6|                 9|
|    IB14008| 44|  MALE|          PROFESSOR|       MARRIED|          6| 51000|      19999|            4|          SHOPPING|     50,000|      3|      33,999|               1|                 5|
|    IB14012| 30|FEMALE|           

In [None]:
loans_by_category = (
    df.groupBy("Loan Category").count()

)
print("Loans per category:")
loans_by_category.show(truncate=False)


Loans per category:
+------------------+-----+
|Loan Category     |count|
+------------------+-----+
|HOUSING           |67   |
|TRAVELLING        |53   |
|BOOK STORES       |7    |
|AGRICULTURE       |12   |
|GOLD LOAN         |77   |
|EDUCATIONAL LOAN  |20   |
|AUTOMOBILE        |60   |
|BUSINESS          |24   |
|COMPUTER SOFTWARES|35   |
|DINNING           |14   |
|SHOPPING          |35   |
|RESTAURANTS       |41   |
|ELECTRONICS       |14   |
|BUILDING          |7    |
|RESTAURANT        |20   |
|HOME APPLIANCES   |14   |
+------------------+-----+



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,regexp_replace
from pyspark.sql.types import IntegerType
spark = SparkSession.builder.appName("Online-Banking").getOrCreate()
df = spark.read.csv("/loan.csv",header=True, inferSchema=True)
cleaned_df = df.withColumn("cleaned_Loan Amount",regexp_replace(col("Loan Amount"),",","").cast(IntegerType()))
count_loans_over_1_lakh = (
    cleaned_df.filter(col("cleaned_Loan Amount") > 100000).count()
)
print(f"People with loan > ₹100,000: {count_loans_over_1_lakh}")



People with loan > ₹100,000: 450


In [None]:
count_income_gt_60000 = (
    df.filter(col("Income") > 60000)
      .count()
)
print(f"Income > ₹60,000: {count_income_gt_60000}")


Income > ₹60,000: 198


In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,regexp_replace
spark = SparkSession.builder.appName("Online-Banking").getOrCreate()
df = spark.read.csv("/loan.csv",header=True, inferSchema=True)
count_rc_ge2_income_lt_50k = (
    df.filter((col(" Returned Cheque") > 2) & (col("Income") < 50000))
      .count()
)
print(f"Returned cheques ≥2 & Income < ₹50k: {count_rc_ge2_income_lt_50k}")


Returned cheques ≥2 & Income < ₹50k: 117


In [11]:
count_rc_ge2_single = (
    df.filter((col(" Returned Cheque") >= 2) & (col("Marital Status") == "SINGLE"))
      .count()
)
print(f"Returned cheques ≥2 & Single: {count_rc_ge2_single}")


Returned cheques ≥2 & Single: 111


In [12]:
count_expense_over_50k = (
    df.filter(col("Expenditure") > 50000)
      .count()
)
print(f"Expenditure > ₹50,000: {count_expense_over_50k}")


Expenditure > ₹50,000: 6


In [15]:
credit_card_eligible_df = df.filter(
    (col("Income") >= 25000) &
    (col("Overdue") <= 1) &
    (col(" Returned Cheque") <= 1) &
    (col(" Dishonour of Bill") <= 1)
)

count_credit_card_eligible = credit_card_eligible_df.count()
print(f"Eligible for credit cards by policy (example criteria): {count_credit_card_eligible}")


Eligible for credit cards by policy (example criteria): 5


In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,regexp_replace
spark = SparkSession.builder.appName("Online-Banking").getOrCreate()
df = spark.read.csv("/credit card.csv",header=True, inferSchema=True)
df.show()

+---------+----------+---------+-----------+---------+------+---+------+---------+-------------+--------------+---------------+------+
|RowNumber|CustomerId|  Surname|CreditScore|Geography|Gender|Age|Tenure|  Balance|NumOfProducts|IsActiveMember|EstimatedSalary|Exited|
+---------+----------+---------+-----------+---------+------+---+------+---------+-------------+--------------+---------------+------+
|        1|  15634602| Hargrave|        619|   France|Female| 42|     2|      0.0|            1|             1|      101348.88|     1|
|        2|  15647311|     Hill|        608|    Spain|Female| 41|     1| 83807.86|            1|             1|      112542.58|     0|
|        3|  15619304|     Onio|        502|   France|Female| 42|     8| 159660.8|            3|             0|      113931.57|     1|
|        4|  15701354|     Boni|        699|   France|Female| 39|     1|      0.0|            2|             0|       93826.63|     0|
|        5|  15737888| Mitchell|        850|    Spain|F

In [18]:
credit_card_in_spain = (
    df.filter(
        (col("Geography") == "Spain") &
        (col("NumOfProducts") > 1)
    )
    .count()
)


eligible_and_active = (
    df.filter(
        (col("NumOfProducts") > 1) &
        (col("IsActiveMember") == 1)
    )
    .count()
)

print(f"Proxy — Credit‑card users in Spain: {credit_card_in_spain}")
print(f"Proxy — Eligible & active: {eligible_and_active}")


Proxy — Credit‑card users in Spain: 1256
Proxy — Eligible & active: 2588


In [34]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,to_date,sum
spark = SparkSession.builder.appName("Online-Banking").getOrCreate()
df = spark.read.csv("/txn.csv",header=True, inferSchema=True)
df.show()

+-------------+--------------------+----------+----------------+-------------+-----------+
|   Account No| TRANSACTION DETAILS|VALUE DATE| WITHDRAWAL AMT | DEPOSIT AMT |BALANCE AMT|
+-------------+--------------------+----------+----------------+-------------+-----------+
|409000611074'|TRF FROM  Indiafo...| 29-Jun-17|            NULL|    1000000.0|  1000000.0|
|409000611074'|TRF FROM  Indiafo...|  5-Jul-17|            NULL|    1000000.0|  2000000.0|
|409000611074'|FDRL/INTERNAL FUN...| 18-Jul-17|            NULL|     500000.0|  2500000.0|
|409000611074'|TRF FRM  Indiafor...|  1-Aug-17|            NULL|    3000000.0|  5500000.0|
|409000611074'|FDRL/INTERNAL FUN...| 16-Aug-17|            NULL|     500000.0|  6000000.0|
|409000611074'|FDRL/INTERNAL FUN...| 16-Aug-17|            NULL|     500000.0|  6500000.0|
|409000611074'|FDRL/INTERNAL FUN...| 16-Aug-17|            NULL|     500000.0|  7000000.0|
|409000611074'|FDRL/INTERNAL FUN...| 16-Aug-17|            NULL|     500000.0|  7500000.0|

In [36]:



txn = df.withColumn("VALUE DATE", to_date(col("VALUE DATE"), "dd-MMM-yy"))


from pyspark.sql.functions import max as spark_max, min as spark_min
withdrawal_stats =df.agg(
    spark_max(" WITHDRAWAL AMT ").alias("MaxWithdrawal"),
    spark_min(" WITHDRAWAL AMT ").alias("MinWithdrawal")
)
deposit_stats = txn.agg(
    spark_max(" DEPOSIT AMT ").alias("MaxDeposit"),
    spark_min(" DEPOSIT AMT ").alias("MinDeposit")
)


balance_per_account = df.groupBy("Account No").agg(
    sum("BALANCE AMT").alias("TotalBalance")
)


txn_per_date = df.groupBy("VALUE DATE").count().orderBy("VALUE DATE")


accounts_high_withdraw = df.filter(
    col(" WITHDRAWAL AMT ") > 100000
).select("Account No").distinct()

withdrawal_stats.show()
deposit_stats.show()
balance_per_account.show(truncate=False)
txn_per_date.show(truncate=False)
accounts_high_withdraw.show(truncate=False)

+-------------+-------------+
|MaxWithdrawal|MinWithdrawal|
+-------------+-------------+
|4.594475464E8|         0.01|
+-------------+-------------+

+----------+----------+
|MaxDeposit|MinDeposit|
+----------+----------+
|   5.448E8|      0.01|
+----------+----------+

+-------------+----------------------+
|Account No   |TotalBalance          |
+-------------+----------------------+
|409000438611'|-2.4948657706833955E12|
|1196711'     |-1.60476498101275E13  |
|1196428'     |-8.1418498130721E13   |
|409000493210'|-3.2758495213209575E12|
|409000611074'|1.615533622E9         |
|409000425051'|-3.7721184116499877E9 |
|409000405747'|-2.4310804706700016E10|
|409000362497'|-5.2860004792808E13   |
|409000493201'|1.0420831829499985E9  |
|409000438620'|-7.122918679513586E12 |
+-------------+----------------------+

+----------+-----+
|VALUE DATE|count|
+----------+-----+
|1-Apr-17  |1    |
|1-Aug-15  |75   |
|1-Aug-16  |85   |
|1-Aug-17  |65   |
|1-Aug-18  |144  |
|1-Dec-15  |96   |
|1-Dec-16 