In [25]:
import numpy as np
import pandas as pd

In [26]:
# --- Tunable settings -------------------------------------------------------
# Number of synthetic rows to generate; raise or lower as needed.
NUM_SAMPLES = 20_000
# Mid-range FICO score intended for scaling (not referenced by the visible cells).
BASE_SCORE = 650
# Seed for NumPy's global RNG so repeated full runs produce the same dataset.
RANDOM_SEED = 42

np.random.seed(RANDOM_SEED)

In [27]:
# Survey-style feature catalogue. Every entry carries three parallel lists:
#   options      - the possible answers,
#   prob         - the sampling probability of each answer (sums to 1),
#   score_impact - the raw score points each answer contributes.
# The percentage in each group heading is a nominal label only: nothing in this
# dict applies it as a multiplier, so a group's real influence comes from the
# magnitude of its score_impact values.
feature_config = {
    # ---- Payment History (nominal weight: 40%) ----
    "utility_bill_payment": dict(
        options=["Always on time", "Occasionally late", "Frequently late", "Never paid"],
        prob=[0.6, 0.25, 0.1, 0.05],  # on-time payers are the majority
        score_impact=[40, -20, -60, -100],  # widest point spread of any feature
    ),
    "service_deposit_required": dict(
        options=["No", "Yes, but refunded", "Yes, not refunded"],
        prob=[0.7, 0.2, 0.1],
        score_impact=[30, -10, -50],
    ),

    # ---- Financial Management (nominal weight: 30%) ----
    "debt_collection_history": dict(
        options=["No", "Yes, once", "Yes, more than once"],
        prob=[0.75, 0.2, 0.05],
        score_impact=[30, -40, -80],
    ),
    "savings_account": dict(
        options=["Yes, sufficient", "Yes, but limited", "No"],
        prob=[0.3, 0.4, 0.3],
        score_impact=[30, 10, -20],
    ),

    # ---- Income Stability (nominal weight: 15%) ----
    "employment_tenure": dict(
        options=["Less than 6 months", "6 months to 1 year", "1 to 3 years", "Over 3 years"],
        prob=[0.2, 0.3, 0.3, 0.2],
        score_impact=[-20, 0, 20, 40],
    ),
    "additional_income": dict(
        options=["No", "Yes, occasionally", "Yes, regularly"],
        prob=[0.5, 0.3, 0.2],
        score_impact=[-10, 10, 20],
    ),

    # ---- Housing Stability (nominal weight: 10%) ----
    "housing_status": dict(
        options=["Own", "Rent", "Live with family/friends"],
        prob=[0.4, 0.5, 0.1],
        score_impact=[20, 0, -15],
    ),
    "eviction_history": dict(
        options=["No", "Yes, once", "Yes, more than once"],
        prob=[0.85, 0.1, 0.05],
        score_impact=[10, -30, -60],
    ),

    # ---- Social/Community (nominal weight: 5%) ----
    "community_savings_group": dict(
        options=["Yes, active member", "Yes, inactive member", "No"],
        prob=[0.2, 0.2, 0.6],
        score_impact=[15, 5, -5],
    ),
    "mobile_money_account": dict(
        options=["Yes, actively used", "Yes, rarely used", "No"],
        prob=[0.6, 0.3, 0.1],
        score_impact=[10, 5, -10],
    ),
}

In [28]:
# Column store for the synthetic dataset: maps column name -> array of values.
# It starts empty and is filled by the cells that follow.
data = dict()

In [29]:
# Draw each feature column independently: NUM_SAMPLES categorical picks per
# feature, weighted by that feature's "prob" list. Columns are generated in
# feature_config order, so the sequence of RNG calls is fixed.
data.update(
    (name, np.random.choice(spec["options"], size=NUM_SAMPLES, p=spec["prob"]))
    for name, spec in feature_config.items()
)

In [30]:
# Per-row accumulator for the summed score impacts; one float slot per sample,
# all starting at zero.
score_impacts = np.zeros(shape=NUM_SAMPLES, dtype=np.float64)

In [31]:
# Sum the score impact of every sampled answer, row by row.
# The total is built in a fresh array and then bound to `score_impacts`, so
# re-running this cell recomputes the same result instead of adding the
# impacts a second time on top of the previous run.
total_impacts = np.zeros(NUM_SAMPLES)
for feature, config in feature_config.items():
    # Lookup table: answer text -> score points for this feature.
    impact_map = dict(zip(config["options"], config["score_impact"]))
    total_impacts += np.array([impact_map[val] for val in data[feature]], dtype=float)
score_impacts = total_impacts

In [32]:
# Theoretical bounds of the summed impact: the worst (resp. best) answer for
# every feature at once. These anchor the later rescaling to the FICO-like
# range (300-850).
min_impact = sum(map(min, (spec["score_impact"] for spec in feature_config.values())))  # Theoretical min
max_impact = sum(map(max, (spec["score_impact"] for spec in feature_config.values())))  # Theoretical max

In [33]:
# Map the raw impact totals onto the FICO-like scale, add Gaussian noise for
# realism, and keep the final scores inside the valid range.
SCORE_MIN, SCORE_MAX = 300, 850  # FICO-like bounds of the output scale
NOISE_STD = 15                   # standard deviation of the noise, in score points

# Linear rescale: min_impact -> SCORE_MIN, max_impact -> SCORE_MAX.
scaled_scores = np.interp(score_impacts, [min_impact, max_impact], [SCORE_MIN, SCORE_MAX])
noisy_scores = scaled_scores + np.random.normal(0, NOISE_STD, NUM_SAMPLES)

# Round once on the float sum: truncating the noise separately pulls it toward
# zero and over-represents "no noise". Then clip, because noise added to rows
# near either end of the scale would otherwise push scores outside 300-850.
data["credit_score"] = np.clip(np.rint(noisy_scores), SCORE_MIN, SCORE_MAX).astype(int)

In [34]:
# Assemble the column dict into a tabular frame (one column per key of `data`).
df = pd.DataFrame(data=data)

In [35]:
# Persist the dataset to the working directory; the row index is not written.
df.to_csv(path_or_buf="credit_score_dataset.csv", index=False)