# In [2]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import timedelta

# Seed BOTH random sources so repeated runs generate the same dataset.
# np.random.seed alone does not cover Faker, which keeps its own generator;
# without Faker.seed the generated names and UUIDs differ on every run.
# NOTE: Faker dates requested relative to "today" (e.g. "-90d") still shift
# with the day the script is executed.
RANDOM_SEED = 42

fake = Faker("en_IN")
Faker.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# -----------------------------
# CONFIG
# -----------------------------
# Number of loan accounts to generate.
N_ACCOUNTS = 10_000
# Number of collection agents to generate.
N_AGENTS = 50
# Number of branches to generate.
N_BRANCHES = 20
# Base date that random disbursement offsets are added to.
START_DATE = pd.Timestamp("2022-01-01")

# -----------------------------
# BRANCH MASTER
# -----------------------------
# One row per branch. Region and state are each drawn independently at
# random, so a branch's state is not tied to its region.
branch_numbers = range(1, N_BRANCHES + 1)
region_options = ["South", "North", "East", "West"]
state_options = ["TN", "KA", "MH", "DL", "AP", "TG", "KL"]

# Regions are sampled before states to keep the seeded draw order stable.
branch_regions = np.random.choice(region_options, N_BRANCHES)
branch_states = np.random.choice(state_options, N_BRANCHES)

branch_master = pd.DataFrame({
    "branch_id": [f"BR{number:03}" for number in branch_numbers],
    "branch_name": [f"Branch_{number}" for number in branch_numbers],
    "region": branch_regions,
    "state": branch_states,
})

# -----------------------------
# COLLECTION AGENTS
# -----------------------------
# One row per agent: a Faker-generated name, a role (60% Telecaller,
# 40% Field Officer), a randomly assigned branch and a joining date between
# three years and six months before the run date. Every agent is active.
agent_codes = [f"AG{number:03}" for number in range(1, N_AGENTS + 1)]
agent_names = [fake.name() for _ in range(N_AGENTS)]
agent_roles = np.random.choice(
    ["Telecaller", "Field Officer"], N_AGENTS, p=[0.6, 0.4]
)
agent_branches = np.random.choice(branch_master["branch_id"], N_AGENTS)
agent_joining_dates = [
    fake.date_between("-3y", "-6m") for _ in range(N_AGENTS)
]

collection_agents = pd.DataFrame({
    "agent_id": agent_codes,
    "agent_name": agent_names,
    "role": agent_roles,
    "branch_id": agent_branches,
    "joining_date": agent_joining_dates,
    "active_flag": True,
})

# -----------------------------
# LOAN ACCOUNTS
# -----------------------------
# Each product: (name, (low, high) bounds passed to np.random.randint,
# share of accounts that get this product).
product_specs = [
    ("Gold Loan", (50000, 500000), 0.4),
    ("MSME Loan", (200000, 2500000), 0.3),
    ("Personal Loan", (50000, 1000000), 0.2),
    ("LAP", (500000, 5000000), 0.1),
]
product_map = {name: bounds for name, bounds, _ in product_specs}

# Product assigned to every account, weighted by the shares above.
products = np.random.choice(
    [name for name, _, _ in product_specs],
    size=N_ACCOUNTS,
    p=[share for _, _, share in product_specs],
)

# Loan amount per account, drawn inside the bounds of its product.
loan_amounts = []
for product_name in products:
    lower, upper = product_map[product_name]
    loan_amounts.append(np.random.randint(lower, upper))

# Current days-past-due per account; 55% of accounts are at 0.
dpd_levels = [0, 15, 30, 45, 60, 90]
dpd_shares = [0.55, 0.15, 0.12, 0.08, 0.06, 0.04]
current_dpd = np.random.choice(dpd_levels, size=N_ACCOUNTS, p=dpd_shares)

# Core loan account table: one row per account, one customer per account.
#
# Tenure is drawn ONCE and used for both `tenure_months` and `emi_amount`.
# Drawing it separately for each column made the EMI inconsistent with the
# tenure stored on the same row.
loan_principal = np.asarray(loan_amounts)
tenure_months = np.random.choice([12, 24, 36, 48], N_ACCOUNTS)

loan_accounts = pd.DataFrame({
    "account_id": [f"LN{i:07}" for i in range(1, N_ACCOUNTS + 1)],
    "customer_id": [f"CUST{i:07}" for i in range(1, N_ACCOUNTS + 1)],
    "product_type": products,
    "loan_amount": loan_amounts,
    # 30%-90% of the sanctioned amount is still outstanding.
    "outstanding_principal": (loan_principal * np.random.uniform(0.3, 0.9, N_ACCOUNTS)).astype(int),
    "interest_rate": np.random.uniform(10, 22, N_ACCOUNTS).round(2),
    # Flat instalment: principal divided by tenure, with no interest component.
    "emi_amount": (loan_principal / tenure_months).astype(int),
    "tenure_months": tenure_months,
    # Disbursed 0-699 days after START_DATE.
    "disbursement_date": START_DATE + pd.to_timedelta(np.random.randint(0, 700, N_ACCOUNTS), unit="D"),
    "branch_id": np.random.choice(branch_master["branch_id"], N_ACCOUNTS),
    "current_dpd": current_dpd,
})

# Derived loan-account columns.

# Worst delinquency on record: current DPD plus a random 0-29 day uplift.
loan_accounts["max_historical_dpd"] = loan_accounts["current_dpd"] + np.random.randint(0, 30, N_ACCOUNTS)

# Bins are right-inclusive: 0 -> Current, 1-30 -> Bucket1,
# 31-60 -> Bucket2, 61 and above -> Bucket3.
loan_accounts["risk_segment"] = pd.cut(
    loan_accounts["current_dpd"],
    bins=[-1, 0, 30, 60, 999],
    labels=["Current", "Bucket1", "Bucket2", "Bucket3"]
)

# Accounts at 90+ days past due are flagged as NPA; everything else is Active.
loan_accounts["account_status"] = np.where(
    loan_accounts["current_dpd"] >= 90, "NPA", "Active"
)

# Last payment falls 30-599 days after disbursement; next due is 30 days later.
loan_accounts["last_payment_date"] = loan_accounts["disbursement_date"] + pd.to_timedelta(
    np.random.randint(30, 600, N_ACCOUNTS), unit="D"
)

loan_accounts["next_due_date"] = loan_accounts["last_payment_date"] + pd.to_timedelta(30, unit="D")

# Only Gold Loan and LAP carry collateral, valued at 80%-120% of the loan
# amount. The other products get NaN rather than None so the column stays
# float64 instead of degrading to object dtype.
is_secured = loan_accounts["product_type"].isin(["Gold Loan", "LAP"])
loan_accounts["collateral_value"] = np.where(
    is_secured,
    loan_accounts["loan_amount"] * np.random.uniform(0.8, 1.2, N_ACCOUNTS),
    np.nan
)

# -----------------------------
# REPAYMENT SCHEDULE
# -----------------------------
# Twelve monthly instalments per account, starting one month after
# disbursement. Payment behaviour depends on the account's current DPD:
#   dpd == 0  -> always "Paid" on the due date
#   dpd <= 30 -> "Paid" (60%) or "Partial" (40%)
#   otherwise -> "Missed" (70%) or "Partial" (30%)
repayment_rows = []

# Offsets are loop-invariant, so build them once.
instalment_offsets = [pd.DateOffset(months=m) for m in range(1, 13)]

# Iterate plain columns instead of DataFrame.iterrows(), which builds a
# Series for every row.
schedule_inputs = zip(
    loan_accounts["account_id"],
    loan_accounts["disbursement_date"],
    loan_accounts["emi_amount"],
    loan_accounts["current_dpd"],
)

for account_id, disbursed_on, emi, dpd in schedule_inputs:
    # Fixed 70/30 principal/interest split of every instalment.
    principal_due = int(emi * 0.7)
    interest_due = int(emi * 0.3)

    for offset in instalment_offsets:
        due_date = disbursed_on + offset

        if dpd == 0:
            status = "Paid"
            delay = 0
        elif dpd <= 30:
            status = np.random.choice(["Paid", "Partial"], p=[0.6, 0.4])
            delay = np.random.randint(1, 20)
        else:
            status = np.random.choice(["Missed", "Partial"], p=[0.7, 0.3])
            delay = np.random.randint(15, 60)

        # "Paid" is reported with days_delayed == 0, so its payment_date must
        # be the due date as well. Previously the random delay still shifted
        # payment_date, contradicting days_delayed on the same row.
        if status == "Paid":
            delay = 0

        repayment_rows.append({
            "schedule_id": fake.uuid4(),
            "account_id": account_id,
            "due_date": due_date,
            "emi_due": emi,
            "principal_due": principal_due,
            "interest_due": interest_due,
            "payment_status": status,
            # Missed instalments have no payment date.
            "payment_date": due_date + timedelta(days=delay) if status != "Missed" else None,
            "days_delayed": delay
        })

repayment_schedule = pd.DataFrame(repayment_rows)

# -----------------------------
# DPD HISTORY
# -----------------------------
# Six monthly DPD snapshots per account (2023-01-01 onwards). Each snapshot
# jitters the account's current DPD by -15..+19 days, floored at 0.
dpd_history_rows = []

# Snapshot dates are identical for every account, so compute them once
# instead of re-parsing the base date twice per generated row.
history_start = pd.to_datetime("2023-01-01")
snapshot_dates = [history_start + pd.DateOffset(months=m) for m in range(6)]

for account_id, base_dpd in zip(loan_accounts["account_id"], loan_accounts["current_dpd"]):
    # Track the running bucket so bucket_entry_date marks the snapshot at
    # which the account ENTERED its bucket. Previously it always duplicated
    # as_of_date, even when the bucket was unchanged from the prior month.
    previous_bucket = None
    bucket_entry_date = None

    for as_of_date in snapshot_dates:
        dpd_val = max(0, base_dpd + np.random.randint(-15, 20))
        if dpd_val <= 0:
            bucket = "0"
        elif dpd_val <= 30:
            bucket = "1-30"
        elif dpd_val <= 60:
            bucket = "31-60"
        elif dpd_val <= 90:
            bucket = "61-90"
        else:
            bucket = "90+"

        if bucket != previous_bucket:
            previous_bucket = bucket
            bucket_entry_date = as_of_date

        dpd_history_rows.append({
            "dpd_record_id": fake.uuid4(),
            "account_id": account_id,
            "as_of_date": as_of_date,
            "dpd": dpd_val,
            "dpd_bucket": bucket,
            "bucket_entry_date": bucket_entry_date
        })

dpd_history = pd.DataFrame(dpd_history_rows)

# -----------------------------
# COLLECTION INTERACTIONS
# -----------------------------
# Up to 6000 randomly sampled accounts receive 1-5 collection contacts each,
# dated within the 90 days before the run date and assigned to a random agent.
interaction_rows = []

# Cap the sample at the number of accounts available: DataFrame.sample raises
# when asked for more rows than exist (e.g. when N_ACCOUNTS is lowered).
n_contacted = min(6000, len(loan_accounts))

# Convert once; np.random.choice would otherwise convert the Series per call.
agent_pool = collection_agents["agent_id"].to_numpy()

for acc in loan_accounts.sample(n_contacted)["account_id"]:
    for _ in range(np.random.randint(1, 6)):
        # Draw order is kept stable so seeded runs stay reproducible.
        agent_id = np.random.choice(agent_pool)
        interaction_date = fake.date_between("-90d", "today")
        contact_type = np.random.choice(["Call", "Visit", "WhatsApp"])
        contact_outcome = np.random.choice(["Connected", "No Answer"], p=[0.6, 0.4])
        discussion_result = np.random.choice(["PTP", "Paid", "Dispute", "No Response"])

        # An unanswered contact cannot produce a promise, payment or dispute.
        if contact_outcome == "No Answer":
            discussion_result = "No Response"

        interaction_rows.append({
            "interaction_id": fake.uuid4(),
            "account_id": acc,
            "agent_id": agent_id,
            "interaction_date": interaction_date,
            "contact_type": contact_type,
            "contact_outcome": contact_outcome,
            "discussion_result": discussion_result
        })

collection_interactions = pd.DataFrame(interaction_rows)

# -----------------------------
# PROMISE TO PAY
# -----------------------------
# One promise-to-pay record per interaction whose discussion result was "PTP".
# 55% of promises are kept, 45% are broken.
#
# Dates are anchored on the interaction that created the promise:
#   ptp_date            = creation date + 1..15 days
#   actual_payment_date = between the day after creation and ptp_date
#                         (kept promises only)
# Previously both dates were drawn relative to the run date, so they had no
# relation to ptp_created_date and kept promises were "paid" in the future.
ptp_columns = [
    "ptp_id", "account_id", "agent_id", "ptp_date", "ptp_amount",
    "ptp_created_date", "ptp_status", "actual_payment_date",
    "actual_payment_amount",
]
ptp_rows = []

ptp_interactions = collection_interactions[collection_interactions["discussion_result"] == "PTP"]

for account_id, agent_id, created_on in zip(
    ptp_interactions["account_id"],
    ptp_interactions["agent_id"],
    ptp_interactions["interaction_date"],
):
    kept = np.random.choice(["Kept", "Broken"], p=[0.55, 0.45])
    promise_days = int(np.random.randint(1, 16))
    ptp_date = created_on + timedelta(days=promise_days)
    ptp_amount = np.random.randint(1000, 50000)

    if kept == "Kept":
        # A kept promise is paid no later than the promised date.
        paid_after_days = int(np.random.randint(1, promise_days + 1))
        actual_payment_date = created_on + timedelta(days=paid_after_days)
        actual_payment_amount = np.random.randint(1000, 50000)
    else:
        actual_payment_date = None
        actual_payment_amount = 0

    ptp_rows.append({
        "ptp_id": fake.uuid4(),
        "account_id": account_id,
        "agent_id": agent_id,
        "ptp_date": ptp_date,
        "ptp_amount": ptp_amount,
        "ptp_created_date": created_on,
        "ptp_status": kept,
        "actual_payment_date": actual_payment_date,
        "actual_payment_amount": actual_payment_amount
    })

# Explicit columns keep the header intact even when no PTP rows exist.
promise_to_pay = pd.DataFrame(ptp_rows, columns=ptp_columns)

# -----------------------------
# ACCOUNT RISK SCORE (DERIVED)
# -----------------------------
# Starts from each account's current DPD and risk segment, then adds three
# randomly generated behavioural features (they are not computed from the
# repayment, PTP or interaction tables).
account_risk_score = loan_accounts[["account_id", "current_dpd", "risk_segment"]].copy()
n_scored = len(account_risk_score)

account_risk_score["missed_emi_count_90d"] = np.random.randint(0, 5, n_scored)
account_risk_score["ptp_break_rate"] = np.random.uniform(0, 1, n_scored).round(2)
account_risk_score["last_contact_gap_days"] = np.random.randint(1, 45, n_scored)

# Weighted sum truncated to an integer: 0.6 per DPD day, 10 per missed EMI,
# 20 scaled by the PTP break rate. The contact gap is not part of the score.
dpd_points = account_risk_score["current_dpd"] * 0.6
missed_emi_points = account_risk_score["missed_emi_count_90d"] * 10
ptp_points = account_risk_score["ptp_break_rate"] * 20
raw_score = dpd_points + missed_emi_points + ptp_points
account_risk_score["risk_score"] = raw_score.astype(int)

# Right-inclusive bands: up to 30 -> Low, 31-60 -> Medium, above 60 -> High.
priority_edges = [-1, 30, 60, 999]
priority_names = ["Low", "Medium", "High"]
account_risk_score["priority_flag"] = pd.cut(
    account_risk_score["risk_score"], bins=priority_edges, labels=priority_names
)

# -----------------------------
# COLLECTION KPI SUMMARY
# -----------------------------
# One row per agent that has at least one interaction.
#
# total_accounts_handled counts DISTINCT accounts per agent. The previous
# groupby(...).size() counted interactions, so an account contacted several
# times by the same agent was counted once per contact.
collection_kpi_summary = (
    collection_interactions
    .groupby("agent_id")["account_id"]
    .nunique()
    .reset_index(name="total_accounts_handled")
)

n_agents_with_activity = len(collection_kpi_summary)

# All rows are stamped with the same reporting month.
collection_kpi_summary["month"] = pd.to_datetime("2023-12-01")

# The remaining KPIs are random placeholders; they are not derived from the
# interaction, PTP or repayment tables.
collection_kpi_summary["contact_rate"] = np.random.uniform(0.4, 0.8, n_agents_with_activity)
collection_kpi_summary["recovery_amount"] = np.random.randint(200000, 5000000, n_agents_with_activity)
collection_kpi_summary["recovery_rate"] = np.random.uniform(0.2, 0.7, n_agents_with_activity)
collection_kpi_summary["ptp_success_rate"] = np.random.uniform(0.3, 0.8, n_agents_with_activity)
collection_kpi_summary["avg_resolution_days"] = np.random.randint(5, 40, n_agents_with_activity)

# -----------------------------
# EXPORT FILES
# -----------------------------
# Write every generated table to a CSV in the current working directory,
# without the DataFrame index, then print a completion message.
csv_exports = {
    "loan_accounts.csv": loan_accounts,
    "repayment_schedule.csv": repayment_schedule,
    "dpd_history.csv": dpd_history,
    "collection_interactions.csv": collection_interactions,
    "promise_to_pay.csv": promise_to_pay,
    "collection_agents.csv": collection_agents,
    "branch_master.csv": branch_master,
    "account_risk_score.csv": account_risk_score,
    "collection_kpi_summary.csv": collection_kpi_summary,
}

for csv_name, table in csv_exports.items():
    table.to_csv(csv_name, index=False)

print("✅ NBFC DATASET GENERATED SUCCESSFULLY")


# Out: ✅ NBFC DATASET GENERATED SUCCESSFULLY
