In [0]:
# Databricks notebook: Step 2 – Training Feature Engineering (Spark Connect Safe)
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.feature import VectorAssembler

spark = SparkSession.builder.getOrCreate()

# --------------------------------------------------
# 1. Load raw data (take 30 % sample for training)
# --------------------------------------------------
# Read the raw table and sample it in a single expression. The fixed seed is
# passed so that every run requests the sample with the same random seed.
df = spark.read.table("raw_credit_data").sample(fraction=0.3, seed=42)

# --------------------------------------------------
# 2. Basic cleaning
# --------------------------------------------------
# Missing values in these categorical columns become an explicit "Unknown" level.
unknown_fill = {
    name: "Unknown"
    for name in (
        "gender",
        "marital_status",
        "employment_type",
        "occupation_risk",
        "purpose",
        "region_risk_tier",
    )
}
df = df.fillna(unknown_fill)

# Cap each ratio at 1. Values that are not greater than 1 (including nulls)
# fall through to the otherwise() branch and keep their original value.
for ratio_name in ("debt_to_income", "payment_to_income", "credit_limit_utilization"):
    ratio = F.col(ratio_name)
    df = df.withColumn(ratio_name, F.when(ratio > 1, 1).otherwise(ratio))

# --------------------------------------------------
# 3. Derived ratios
# --------------------------------------------------
# Build the ratio expressions first, then attach them in the original order.
income = F.col("annual_income")

# NOTE(review): annual_income is not guarded against zero or NULL here. Depending
# on the session's ANSI setting a zero divisor presumably yields NULL or raises;
# confirm the raw table never contains such rows.
loan_to_income = F.col("loan_amount") / income
credit_to_income = F.col("total_credit_balance") / income

# The 1e-6 offset keeps the denominator non-zero when debt_to_income is 0.
installment_ratio = F.col("payment_to_income") / (F.col("debt_to_income") + F.lit(1e-6))

df = (
    df.withColumn("loan_to_income", loan_to_income)
      .withColumn("credit_to_income", credit_to_income)
      .withColumn("installment_ratio", installment_ratio)
)

# --------------------------------------------------
# 4. Manual ordinal mappings (no model fitting)
# --------------------------------------------------
def map_col(df, colname, mapping_dict, default_val=0):
    """Add an ordinal-encoded copy of ``colname`` named ``<colname>_idx``.

    Args:
        df: DataFrame that contains ``colname``.
        colname: Name of the column whose values are looked up.
        mapping_dict: Dict of raw value -> code, turned into a literal map column.
        default_val: Code used whenever the map lookup produces NULL.

    Returns:
        ``df`` with the extra ``<colname>_idx`` column.
    """
    # create_map expects a flat key, value, key, value, ... sequence of columns.
    flat_literals = []
    for raw_value, code in mapping_dict.items():
        flat_literals.append(F.lit(raw_value))
        flat_literals.append(F.lit(code))
    lookup = F.create_map(flat_literals)

    # coalesce substitutes default_val when the lookup result is NULL.
    encoded = F.coalesce(lookup[F.col(colname)], F.lit(default_val))
    return df.withColumn(f"{colname}_idx", encoded)

# Raw label -> ordinal code for every categorical column, applied in the order listed.
# NOTE(review): map_col is called with its default_val of 0, so any label missing
# from a table ends up sharing code 0 with that table's first entry — confirm
# this is intended rather than falling back to the "Unknown" code.
ordinal_maps = {
    "gender": {"Male": 0, "Female": 1, "Unknown": 2},
    "marital_status": {"Single": 0, "Married": 1, "Divorced": 2, "Widowed": 3, "Unknown": 4},
    "employment_type": {"Salaried": 0, "Self-Employed": 1, "Contract": 2, "Unknown": 3},
    "occupation_risk": {"Low": 0, "Medium": 1, "High": 2, "Unknown": 3},
    "purpose": {"Home": 0, "Car": 1, "Education": 2, "Business": 3, "Personal": 4, "Unknown": 5},
    "region_risk_tier": {"Low": 0, "Medium": 1, "High": 2, "Unknown": 3},
}

for cat_col, codes in ordinal_maps.items():
    df = map_col(df, cat_col, codes)

# --------------------------------------------------
# 5. Assemble numeric + encoded features
# --------------------------------------------------
# Numeric inputs: raw columns first, then the three derived ratios.
numeric_features = [
    "age",
    "dependents",
    "employment_length",
    "annual_income",
    "credit_score",
    "num_open_accounts",
    "num_delinquencies",
    "avg_utilization_ratio",
    "num_credit_inquiries",
    "loan_amount",
    "loan_term_months",
    "interest_rate",
    "existing_loans_count",
    "debt_to_income",
    "payment_to_income",
    "credit_limit_utilization",
    "total_credit_balance",
    "recent_missed_payments",
    "time_since_last_default",
    "months_with_bank",
    "loan_to_income",
    "credit_to_income",
    "installment_ratio",
]

# Ordinal-encoded columns carry the "_idx" suffix appended to the source column name.
encoded_features = [
    f"{cat_col}_idx"
    for cat_col in (
        "gender",
        "marital_status",
        "employment_type",
        "occupation_risk",
        "purpose",
        "region_risk_tier",
    )
]

assembler_inputs = [*numeric_features, *encoded_features]

# NOTE(review): handleInvalid is left at the VectorAssembler default; confirm how
# rows with NULL inputs are expected to be treated.
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")
assembled = assembler.transform(df)

# Keep only the feature vector and the label column.
train_features = assembled.select("features", "default_flag")

# --------------------------------------------------
# 6. Save training feature table
# --------------------------------------------------
# Persist the assembled features, replacing any previous version of the table.
train_features.write.mode("overwrite").saveAsTable("default.features_credit_train")

print("✅ Training feature table created: features_credit_train")

# Report the row count of the table that was just written. Calling
# train_features.count() here would trigger a second execution of the whole
# lazy plan (read, sample, transforms) and would describe that recomputation
# rather than the persisted data.
saved_rows = spark.read.table("default.features_credit_train").count()

# "Columns" is the number of input columns assembled into the feature vector.
print(f"Rows: {saved_rows}, Columns: {len(assembler_inputs)}")

In [0]:
%sql
-- Query the table under the same schema-qualified name it was written with,
-- so the result does not depend on the session's current schema.
select count(*) from default.features_credit_train