In [0]:
# Databricks notebook or Python cell
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

# Reuse the already-running Spark session if there is one, otherwise start one.
# It is needed at the end of the script to convert the pandas frame and save it.
spark = SparkSession.builder.getOrCreate()
# -----------------------------
# 1. Dataset size
# -----------------------------
n = 300_000  # number of rows to generate

# Seed NumPy's global generator before the first draw so that every run
# reproduces the same dataset.
np.random.seed(42)

# -----------------------------
# 2. Demographics
# -----------------------------
# NOTE: all draws in this script consume the seeded stream in statement
# order; reordering them changes every generated value.
age = np.random.randint(low=21, high=70, size=n)  # integers 21..69 (high is exclusive)
gender = np.random.choice(["Male", "Female"], size=n)  # no p given -> equal odds
marital_status = np.random.choice(
    ["Single", "Married", "Divorced", "Widowed"],
    size=n,
    p=[0.40, 0.45, 0.10, 0.05],
)
dependents = np.random.poisson(lam=1.2, size=n)  # Poisson count with mean 1.2

# -----------------------------
# 3. Employment & income
# -----------------------------
employment_type = np.random.choice(['Salaried', 'Self-Employed', 'Contract'], n, p=[0.7, 0.2, 0.1])
# Years employed: uniform integers 0..29, then capped at (age - 18) so that
# nobody has worked longer than their adult life (previously a 21-year-old
# could be assigned 29 years of employment). The cap is applied after the
# draw, so the seeded random stream seen by later sections is unchanged.
# Assumes working life starts at 18 — adjust the offset if that is not wanted.
employment_length = np.minimum(np.random.randint(0, 30, n), age - 18)
# Log-normal income: median exp(10.5) ≈ $36k, mean ≈ $41k; about 95% of
# draws fall between exp(9.5) ≈ $13k and exp(11.5) ≈ $99k.
annual_income = np.random.lognormal(mean=10.5, sigma=0.5, size=n)
occupation_risk = np.random.choice(['Low', 'Medium', 'High'], n, p=[0.6, 0.3, 0.1])

# -----------------------------
# 4. Credit profile
# -----------------------------
# Scores are normal around 680 (sd 60) and then bounded to 300..850.
# They stay floating point here.
credit_score = np.random.normal(loc=680, scale=60, size=n).clip(300, 850)
num_open_accounts = np.random.randint(low=1, high=15, size=n)  # integers 1..14
num_delinquencies = np.random.poisson(lam=0.3, size=n)
# Beta(2, 5) is confined to [0, 1] and skewed towards low utilisation.
avg_utilization_ratio = np.random.beta(a=2, b=5, size=n)
num_credit_inquiries = np.random.poisson(lam=1.5, size=n)

# -----------------------------
# 5. Loan details
# -----------------------------
loan_amount = np.random.randint(low=2_000, high=50_000, size=n)  # integers 2,000..49,999
loan_term_months = np.random.choice(
    [12, 24, 36, 48, 60],
    size=n,
    p=[0.10, 0.20, 0.30, 0.25, 0.15],
)
interest_rate = np.random.uniform(low=5, high=25, size=n)  # uniform on [5, 25)
purpose = np.random.choice(
    ["Home", "Car", "Education", "Business", "Personal"],
    size=n,
    p=[0.25, 0.20, 0.20, 0.15, 0.20],
)
existing_loans_count = np.random.poisson(lam=1.5, size=n)

# -----------------------------
# 6. Derived ratios
# -----------------------------
# Drawn independently (normal around 0.3, sd 0.1) and bounded to [0, 1];
# it is not computed from the income or loan columns.
debt_to_income = np.random.normal(loc=0.3, scale=0.1, size=n).clip(0, 1)

# One tenth of the loan amount relative to monthly income, bounded to [0, 1].
# Loan term and interest rate do not enter this figure.
monthly_income = annual_income / 12
payment_to_income = ((loan_amount / monthly_income) * 0.1).clip(0, 1)

# Average utilisation plus a small normal jitter, kept inside [0, 1].
utilization_noise = np.random.normal(loc=0, scale=0.05, size=n)
credit_limit_utilization = (avg_utilization_ratio + utilization_noise).clip(0, 1)

# Loan amount scaled by a uniform factor between 0.8 and 1.2.
balance_factor = np.random.uniform(low=0.8, high=1.2, size=n)
total_credit_balance = loan_amount * balance_factor

# -----------------------------
# 7. Behavioural / temporal
# -----------------------------
recent_missed_payments = np.random.poisson(lam=0.2, size=n)
# Drawn for every row, independently of the other columns.
time_since_last_default = np.random.randint(low=0, high=120, size=n)  # integers 0..119
months_with_bank = np.random.randint(low=6, high=240, size=n)  # integers 6..239
region_risk_tier = np.random.choice(
    ["Low", "Medium", "High"],
    size=n,
    p=[0.7, 0.2, 0.1],
)

# -----------------------------
# 8. Default probability logic
# -----------------------------
# Weighted risk factors. Every term is non-negative (credit_score is capped
# at 850, both ratios are clipped to [0, 1], the rest are 0/1 flags), so
# risk_score itself is always >= 0.
risk_score = (
    (850 - credit_score) * 0.002
    + debt_to_income * 2.5
    + payment_to_income * 1.2
    + (num_delinquencies > 0).astype(int) * 1.5
    + (avg_utilization_ratio > 0.6).astype(int) * 1.0
    + (occupation_risk == 'High').astype(int) * 1.0
)

# Intercept of the logistic model. Feeding the non-negative risk_score into
# the sigmoid on its own gives every borrower, however clean, a default
# probability of about 0.5 or more. The negative base log-odds makes a
# borrower with no risk factors default rarely, while risk_score still moves
# the probability upwards.
# Rough estimate, not measured: the overall default rate drops from around
# 90% to around 10%. Check `default_flag.mean()` and tune this constant if a
# different base rate is wanted.
base_log_odds = -5.0

# Per-row noise on the logit so identical profiles do not share one exact
# probability. Same draw, in the same position of the random stream, as before.
logit_noise = np.random.normal(0, 0.2, n)
prob_default = 1 / (1 + np.exp(-(base_log_odds + risk_score) + logit_noise))

# One Bernoulli trial per row turns the probability into the 0/1 label.
default_flag = np.random.binomial(1, prob_default)

# -----------------------------
# 9. Assemble dataframe
# -----------------------------
# Generated arrays keyed by output column name, listed in output column order
# and still at full precision.
_raw_columns = {
    "age": age,
    "gender": gender,
    "marital_status": marital_status,
    "dependents": dependents,
    "employment_type": employment_type,
    "employment_length": employment_length,
    "annual_income": annual_income,
    "occupation_risk": occupation_risk,
    "credit_score": credit_score,
    "num_open_accounts": num_open_accounts,
    "num_delinquencies": num_delinquencies,
    "avg_utilization_ratio": avg_utilization_ratio,
    "num_credit_inquiries": num_credit_inquiries,
    "loan_amount": loan_amount,
    "loan_term_months": loan_term_months,
    "interest_rate": interest_rate,
    "purpose": purpose,
    "existing_loans_count": existing_loans_count,
    "debt_to_income": debt_to_income,
    "payment_to_income": payment_to_income,
    "credit_limit_utilization": credit_limit_utilization,
    "total_credit_balance": total_credit_balance,
    "recent_missed_payments": recent_missed_payments,
    "time_since_last_default": time_since_last_default,
    "months_with_bank": months_with_bank,
    "region_risk_tier": region_risk_tier,
    "default_flag": default_flag,
}

# Decimal places for the columns that are rounded before storage; any column
# not listed here goes into the frame exactly as generated.
_decimals = {
    "annual_income": 2,
    "credit_score": 0,
    "avg_utilization_ratio": 2,
    "interest_rate": 2,
    "debt_to_income": 2,
    "payment_to_income": 2,
    "credit_limit_utilization": 2,
    "total_credit_balance": 2,
}

df = pd.DataFrame({
    name: values.round(_decimals[name]) if name in _decimals else values
    for name, values in _raw_columns.items()
})

# -----------------------------
# 10. Save to Databricks Delta
# -----------------------------
# Single source of truth for the table name, used by both the write and the
# confirmation message so the two cannot drift apart.
table_name = "raw_credit_data"

spark_df = spark.createDataFrame(df)
# The name is unqualified, so the table is created in (or replaces a table
# in) whatever schema is current for this Spark session.
spark_df.write.mode("overwrite").saveAsTable(table_name)

# Report the schema that was really used. The previous message hard-coded
# "banking.", which was wrong whenever the session's current schema was
# anything else.
target_schema = spark.catalog.currentDatabase()
print(f"✅ Synthetic credit-risk dataset created: {target_schema}.{table_name}")
print(f"Rows: {n}, Columns: {len(df.columns)}")