
# Home Loan EDA Notebook

This notebook performs a complete Exploratory Data Analysis (EDA) on the Home Loan **train** and **test** datasets.
It follows these project phases:
- Phase 1: Data collection & preparation
- Phase 2: Exploratory Data Analysis (EDA)
- Phase 3: Reporting & insights

**How to use:** Run all cells from top to bottom. The notebook saves the cleaned CSVs, PNG figures (under `figures/`), summary tables, and a text summary to `/mnt/data/home_loan_eda_output`.

**Notes:** The notebook pulls the datasets from the raw GitHub URLs. If your environment blocks internet access, download the CSVs locally and point `TRAIN_URL` and `TEST_URL` (in the data-loading cell) at the local file paths.


In [1]:

# Imports and configuration
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Output locations: every artifact the notebook writes lives under OUT_DIR,
# with figures in their own sub-folder.
OUT_DIR = Path("/mnt/data/home_loan_eda_output")
FIG_DIR = OUT_DIR / "figures"
for _directory in (OUT_DIR, FIG_DIR):
    # exist_ok=True keeps re-runs of this cell from failing
    _directory.mkdir(parents=True, exist_ok=True)

# Let pandas show wide frames without truncating columns
for _option in ('display.max_columns', 'display.width'):
    pd.set_option(_option, 200)


ModuleNotFoundError: No module named 'matplotlib'

In [None]:

# Data URLs (change to local paths if internet is unavailable)
# Both values are handed to load_csv(), which passes them straight to
# pd.read_csv.
TRAIN_URL = "https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/home_loan_train.csv"
TEST_URL  = "https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/home_loan_test.csv"

def load_csv(url):
    """Read a CSV into a DataFrame, echoing its source and resulting shape.

    Parameters
    ----------
    url
        Passed unchanged to ``pd.read_csv`` (the notebook supplies URL
        strings).

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    print("Loading:", url)
    frame = pd.read_csv(url)
    print("Shape:", frame.shape)
    return frame

# Fetch both splits with the shared loader
train = load_csv(TRAIN_URL)
test = load_csv(TEST_URL)

# Quick peek at the first rows of each split
for _frame in (train, test):
    display(_frame.head())


In [None]:

# Phase 1: Inspect datasets for missing values, duplicates, dtypes
def inspect(df, name="data"):
    """Print a quick data-quality overview of ``df``.

    Reports the shape, the number of fully duplicated rows, per-column
    missing counts and dtypes, then displays transposed ``describe()``
    summaries for the numeric and the object-dtype columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    name : str, default "data"
        Label used in the section header.

    Returns
    -------
    None
    """
    print(f"--- {name} ---")
    print("Shape:", df.shape)
    print("Duplicates:", df.duplicated().sum())
    print("Missing counts:")
    print(df.isna().sum())
    print("\nData types:")
    print(df.dtypes)

    for label, dtypes in (("numerical", [np.number]), ("categorical", ['object'])):
        print(f"\nSummary stats ({label}):")
        # describe(include=...) raises ValueError when no column matches the
        # requested dtypes, so skip the summary instead of aborting the cell.
        if df.select_dtypes(include=dtypes).shape[1] == 0:
            print(f"(no {label} columns)")
            continue
        display(df.describe(include=dtypes).T)

# Run the same inspection on both splits
for _label, _frame in (("train", train), ("test", test)):
    inspect(_frame, _label)


In [None]:

# Task 1.3: Cleaning function
def clean_data(df, target_col='loan_status'):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Steps, in order:

    1. Strip surrounding whitespace from column names.
    2. Strip surrounding whitespace from string values in object columns.
       Non-string values in those columns are kept as they are.
    3. Replace common missing-value markers ('', ' ', 'NA', 'N/A', 'na',
       'nan', 'None') with NaN. This runs after stripping so that padded
       markers such as ``' NA '`` or whitespace-only cells are caught too.
    4. Convert known numeric columns with ``pd.to_numeric(errors='coerce')``.
       Column names are matched case-insensitively and are not renamed.
       For the dependents column, ``'3+'`` is mapped to ``3`` first.
    5. Fill missing numeric values with the column median and missing
       object values with ``'Unknown'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input frame.
    target_col : str or None, default 'loan_status'
        Column (matched case-insensitively) that is excluded from step 5 so
        that no target labels are invented. Pass ``None`` to impute every
        column.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """
    missing_markers = ['', ' ', 'NA', 'N/A', 'na', 'nan', 'None']
    # Lower-cased names of columns that should hold numbers
    numeric_names = {
        'applicantincome', 'applicant_income',
        'coapplicantincome', 'coapplicant_income',
        'loanamount', 'loan_amount',
        'loan_amount_term', 'loan_amount_term_months',
        'credit_history',
    }
    target_key = None if target_col is None else target_col.lower()

    df = df.copy()
    df.columns = [c.strip() for c in df.columns]

    # Strip per element: the .str accessor would turn non-string values in a
    # mixed object column into NaN. dtype=object keeps the column's dtype.
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = pd.Series(
            [v.strip() if isinstance(v, str) else v for v in df[col]],
            index=df.index,
            dtype=object,
        )

    df = df.replace(missing_markers, np.nan)

    for col in list(df.columns):
        key = col.lower()
        if key == 'dependents':
            # '3+' means "three or more"; treat it as 3
            df[col] = pd.to_numeric(df[col].replace('3+', '3'), errors='coerce')
        elif key in numeric_names:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Assign the filled column back instead of fillna(inplace=True) on a
    # column selection, which does not update the frame under Copy-on-Write.
    for col in df.select_dtypes(include=[np.number]).columns:
        if col.lower() == target_key or not df[col].isna().any():
            continue
        df[col] = df[col].fillna(df[col].median())

    for col in df.select_dtypes(include=['object']).columns:
        if col.lower() == target_key or not df[col].isna().any():
            continue
        df[col] = df[col].fillna('Unknown')

    return df

# Apply the same cleaning rules to both splits
train_clean = clean_data(train)
test_clean = clean_data(test)

# Save cleaned copies
for _frame, _filename in (
    (train_clean, "train_cleaned.csv"),
    (test_clean, "test_cleaned.csv"),
):
    _frame.to_csv(OUT_DIR / _filename, index=False)

print("Saved cleaned CSVs to", OUT_DIR)
display(train_clean.head())


In [None]:

# Phase 2: Descriptive statistics and visualizations (matplotlib only)

def save_fig(fig, name):
    """Write ``fig`` to ``FIG_DIR / "<name>.png"`` and close it.

    The image is saved at 150 dpi with a tight bounding box. The figure is
    closed afterwards so figures created in loops do not pile up in memory.

    Returns
    -------
    pathlib.Path
        Location of the written PNG.
    """
    target = FIG_DIR / f"{name}.png"
    fig.savefig(target, bbox_inches='tight', dpi=150)
    plt.close(fig)
    print("Saved", target)
    return target

# 2.1 Descriptive stats (transposed so each feature is one row)
print("Train numerical description:")
numeric_summary = train_clean.describe().T
display(numeric_summary)

print("\nTrain categorical description:")
categorical_summary = train_clean.describe(include=['object']).T
display(categorical_summary)

# 2.2 Histograms and boxplots for key numeric features
# Keep only the candidate features that exist in this dataset; the resulting
# list is reused by the outlier analysis further down.
numeric_features = [
    feat
    for feat in (
        'applicant_income', 'coapplicant_income', 'loan_amount',
        'loan_amount_term', 'dependents', 'credit_history',
    )
    if feat in train_clean.columns
]

for feat in numeric_features:
    # Histogram of the feature on its own
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.hist(train_clean[feat].dropna(), bins=30)
    ax.set_title(f"Train: {feat} distribution")
    ax.set_xlabel(feat)
    ax.set_ylabel("count")
    save_fig(fig, f"train_hist_{feat}")

    # Boxplot by loan_status if available
    if 'loan_status' not in train_clean.columns:
        continue
    labels = []
    groups = []
    for status, rows in train_clean.groupby('loan_status'):
        labels.append(str(status))
        groups.append(rows[feat].dropna().to_numpy())
    if not groups:
        # groupby skips missing keys, so an all-missing target yields no boxes
        continue

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.boxplot(groups)
    # Label the ticks explicitly: the `labels=` keyword of boxplot is
    # deprecated in recent Matplotlib releases (renamed to `tick_labels`),
    # while set_xticklabels works on old and new versions alike.
    ax.set_xticklabels(labels)
    ax.set_title(f"Train: {feat} by loan_status")
    ax.set_xlabel('loan_status')
    ax.set_ylabel(feat)
    save_fig(fig, f"train_box_{feat}_by_status")

# 2.3 Categorical feature counts (bar charts)
categorical_features = [
    name
    for name in ('education', 'self_employed', 'property_area', 'married')
    if name in train_clean.columns
]
for cat in categorical_features:
    counts = train_clean[cat].value_counts()
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.bar(counts.index.astype(str), counts.values)
    ax.set_title(f"Train: {cat} value counts")
    # Slant the category labels so long names do not overlap
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    save_fig(fig, f"train_bar_{cat}")

# 2.4 Scatterplots for relationships (applicant_income vs loan_amount)
# Drawn only when both columns exist in the cleaned training frame.
if {'applicant_income', 'loan_amount'}.issubset(train_clean.columns):
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(train_clean['applicant_income'], train_clean['loan_amount'], alpha=0.6)
    ax.set_xlabel('applicant_income')
    ax.set_ylabel('loan_amount')
    ax.set_title('Applicant income vs Loan amount (train)')
    save_fig(fig, 'train_scatter_income_loan')


In [None]:

# 2.4 Correlation matrix (numeric only) and cross-tabulations
num = train_clean.select_dtypes(include=[np.number])
corr = num.corr()

# Save correlation heatmap using matplotlib
fig, ax = plt.subplots(figsize=(8, 6))
heatmap = ax.imshow(corr, interpolation='nearest')
fig.colorbar(heatmap, ax=ax)
tick_positions = list(range(len(corr.columns)))
# One tick per column, labelled with the column name on both axes
ax.set_xticks(tick_positions)
ax.set_xticklabels(corr.columns, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(corr.columns)
ax.set_title("Train: Correlation matrix (numeric)")
save_fig(fig, "train_corr_matrix")

# Cross-tab: credit_history vs loan_status (percentage by row)
if {'credit_history', 'loan_status'}.issubset(train_clean.columns):
    # normalize='index' makes each credit_history row sum to 1; scale to percent
    ct = pd.crosstab(
        train_clean['credit_history'],
        train_clean['loan_status'],
        normalize='index',
    ).mul(100)
    display(ct.round(2))
    ct.to_csv(OUT_DIR / "crosstab_credit_history_vs_status_pct.csv")


In [None]:

# 2.5 Identify outliers: Using IQR method for numeric features
import json

# A value is flagged when it lies more than 1.5 * IQR below the first
# quartile or above the third quartile of its column.
outlier_report = {}
outlier_masks = {}
for col in numeric_features:
    q1 = train_clean[col].quantile(0.25)
    q3 = train_clean[col].quantile(0.75)
    iqr = q3 - q1
    low = q1 - 1.5 * iqr
    high = q3 + 1.5 * iqr
    mask = (train_clean[col] < low) | (train_clean[col] > high)
    outlier_masks[col] = mask
    # Plain Python numbers keep the report JSON-serialisable
    outlier_report[col] = {
        'low_threshold': float(low),
        'high_threshold': float(high),
        'num_outliers': int(mask.sum()),
    }

print("Outlier summary:")
print(json.dumps(outlier_report, indent=2))

# Save a small sample of outliers for review: up to five flagged rows per
# feature, drawn from both tails so the sample matches `num_outliers`.
sample_frames = [
    train_clean[mask].head(5)
    for col, mask in outlier_masks.items()
    if outlier_report[col]['num_outliers'] > 0
]
if sample_frames:
    outlier_samples = pd.concat(sample_frames, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty frame that
    # still carries the training columns.
    outlier_samples = train_clean.iloc[0:0].copy()
outlier_samples.to_csv(OUT_DIR / "outlier_samples.csv", index=False)
display(outlier_samples.head())


In [None]:

# Phase 3: Reporting & Insights (automated summary)
summary = []

# Every section below is a breakdown of loan_status, so nothing is added
# when that column is absent.
if 'loan_status' in train_clean.columns:
    # Basic approval rates
    approval_counts = train_clean['loan_status'].value_counts(normalize=True) * 100
    summary.append("Loan approval distribution (train):\n" + approval_counts.round(2).to_string())

    # Row-normalised percentage tables: credit history effect, then education effect
    for _column, _heading in (
        ('credit_history', "\nCredit history vs Loan status (%)\n"),
        ('education', "\nEducation vs Loan status (%)\n"),
    ):
        if _column not in train_clean.columns:
            continue
        _pct_table = pd.crosstab(
            train_clean[_column], train_clean['loan_status'], normalize='index'
        ) * 100
        summary.append(_heading + _pct_table.round(2).to_string())

# Output summary to text file
summary_text = "\n\n".join(summary)
summary_path = OUT_DIR / "eda_summary.txt"
summary_path.write_text(summary_text)

print("Summary written to", summary_path)
print("\n--- Summary preview ---\n")
print(summary_text)


In [None]:

# List created files, shown relative to the directory that contains OUT_DIR.
# Deriving the base from OUT_DIR (instead of repeating the literal path)
# keeps this cell working when OUT_DIR is pointed somewhere else.
for p in sorted(OUT_DIR.rglob("*")):
    print(p.relative_to(OUT_DIR.parent))
