In [0]:
# Create Spark session (already available in Databricks as `spark`)
df = spark.read.csv("/FileStore/tables/LoanData__1_-1.csv", header=True, inferSchema=True)

In [0]:
# Show first few rows
df.display(5)

Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


In [0]:
# Schema
df.printSchema()

root
 |-- Loan_ID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Married: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- Education: string (nullable = true)
 |-- Self_Employed: string (nullable = true)
 |-- ApplicantIncome: integer (nullable = true)
 |-- CoapplicantIncome: double (nullable = true)
 |-- LoanAmount: integer (nullable = true)
 |-- Loan_Amount_Term: integer (nullable = true)
 |-- Credit_History: integer (nullable = true)
 |-- Property_Area: string (nullable = true)
 |-- Loan_Status: string (nullable = true)



Basic EDA – Data Understanding



In [0]:
# 1. Shape of Data

print(f"Rows: {df.count()}, Columns: {len(df.columns)}")

Rows: 614, Columns: 13


In [0]:
# 2. Column Summary
df.describe().display()

summary,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
count,614,601,611,599,614,582,614.0,614.0,592.0,600.0,564.0,614,614
mean,,,,0.5547445255474452,,,5403.459283387622,1621.245798027101,146.41216216216216,342.0,0.8421985815602837,,
stddev,,,,0.7853289861674291,,,6109.041673387174,2926.248369224192,85.58732523570545,65.12040985461256,0.3648783192364049,,
min,LP001002,Female,No,0,Graduate,No,150.0,0.0,9.0,12.0,0.0,Rural,N
max,LP002990,Male,Yes,3+,Not Graduate,Yes,81000.0,41667.0,700.0,480.0,1.0,Urban,Y


In [0]:
# 3. Null Values
from pyspark.sql.functions import col, sum

df.select([sum(col(c).isNull().cast("int")).alias(c) for c in df.columns]).display()

Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,13,3,15,0,32,0,0,22,14,50,0,0


Data Cleaning



In [0]:
# Remove duplicates:
df = df.dropDuplicates()

In [0]:
# Fill or drop nulls:
df = df.na.drop()  # or df = df.na.fill("value")

Visualization in Databricks

In [0]:
# 1.  Loan Status Distribution 
df.groupBy("Loan_Status").count().display()

Loan_Status,count
Y,332
N,148


Databricks visualization. Run in Databricks to view.

In [0]:
# 2. Gender-wise Loan Status
df.groupBy("Gender", "Loan_Status").count().orderBy("Gender").display()

Gender,Loan_Status,count
Female,Y,54
Female,N,32
Male,Y,278
Male,N,116


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

In [0]:
# 3. Education vs Loan Status
df.groupBy("Education", "Loan_Status").count().display()

Education,Loan_Status,count
Not Graduate,N,36
Graduate,N,112
Graduate,Y,271
Not Graduate,Y,61


Databricks visualization. Run in Databricks to view.

In [0]:
# 4. Property Area vs Loan Status
df.groupBy("Property_Area", "Loan_Status").count().display()

Property_Area,Loan_Status,count
Rural,Y,85
Urban,N,52
Semiurban,Y,149
Urban,Y,98
Rural,N,54
Semiurban,N,42


Databricks visualization. Run in Databricks to view.

In [0]:
# 5. ApplicantIncome vs LoanAmount
df.select("ApplicantIncome", "LoanAmount").display()

ApplicantIncome,LoanAmount
2237,63
7583,187
1625,96
1820,100
2192,45
3600,80
11500,286
2221,60
7085,84
16525,150


Databricks visualization. Run in Databricks to view.

In [0]:
# 6. Income Distribution
df.select("ApplicantIncome").display()

ApplicantIncome
2237
7583
1625
1820
2192
3600
11500
2221
7085
16525


Databricks visualization. Run in Databricks to view.

In [0]:
# 7. LoanAmount by Education
df.select("Education", "LoanAmount").display()

Education,LoanAmount
Graduate,63
Graduate,187
Graduate,96
Graduate,100
Not Graduate,45
Graduate,80
Graduate,286
Graduate,60
Graduate,84
Graduate,150


Databricks visualization. Run in Databricks to view.