In [1]:
import pandas as pd
import random

In [2]:
# Number of synthetic records to generate for the dataset.
num_records = 10_000

In [3]:
def generate_credit_risk_dataset(num_records, seed=None):
    """Generate a synthetic credit-risk dataset with a weighted risk score.

    Every record is drawn uniformly at random (applicant details, guarantor
    details and financial behaviour). A risk score is then computed as a
    weighted sum of the features, each scaled to the 0-1 range, and mapped to
    a categorical risk level.

    Parameters
    ----------
    num_records : int
        Number of rows to generate. Zero (or a negative value) yields an
        empty DataFrame that still carries the full set of columns.
    seed : int or None, optional
        When given, a private ``random.Random(seed)`` generator is used so the
        output is reproducible and the global RNG state is left untouched.
        When ``None`` (default) values are drawn from the module-level
        ``random`` generator, so a prior ``random.seed(...)`` call still
        controls the output.

    Returns
    -------
    pandas.DataFrame
        One row per record with the raw feature columns plus ``risk_score``
        (float clamped to [0, 100]) and ``risk_level`` ('Low' if score <= 40,
        'Medium' if score <= 70, otherwise 'High').
    """
    # The `random` module exposes the same randint/choice API as a Random
    # instance, so either can serve as the source of randomness.
    rng = random if seed is None else random.Random(seed)

    # Output column order; passed explicitly so an empty result keeps its schema.
    columns = [
        'age', 'marital_status', 'no_of_dependants', 'education_level',
        'relationship_to_student', 'income_in_kes', 'additional_income',
        'employment_length', 'employment_status', 'guarantor_credit_score',
        'existing_loans', 'outstanding_loan_amount', 'monthly_repayment_amount',
        'monthly_expenses', 'missed_payments_last_year', 'financial_counseling',
        'risk_score', 'risk_level',
    ]

    # Weights assigned to each factor (loop-invariant, so built once).
    weights = {
        'age': 5,  # Lower age = higher risk
        'marital_status': 3,  # Married or divorced may lower risk slightly
        'no_of_dependants': 2,  # More dependents increase risk
        'education_level': 5,  # Higher education may lower risk
        'income_in_kes': 8,  # Higher income reduces risk
        'additional_income': 4,  # Additional income reduces risk
        'employment_length': 6,  # Longer employment reduces risk
        'employment_status': 7,  # Employment status is crucial
        'guarantor_credit_score': 15,  # Strong guarantor = lower risk
        'existing_loans': 10,  # Existing loans increase risk
        'outstanding_loan_amount': 7,  # Higher outstanding amount = higher risk
        'monthly_repayment_amount': 6,  # Higher repayments = higher risk
        'monthly_expenses': 8,  # Higher expenses = higher risk
        'missed_payments_last_year': 12,  # Missed payments greatly increase risk
        'financial_counseling': 4  # Counseling reduces risk slightly
    }

    # Risk factor (0-1) contributed by each categorical value.
    marital_risk = {'Single': 1, 'Married': 0.5, 'Divorced': 0.7}
    education_risk = {'High School': 1, 'Bachelor’s Degree': 0.7, 'Master’s Degree': 0.5, 'PhD': 0.3}
    employment_risk = {'Employed': 0.3, 'Self-Employed': 0.5, 'Unemployed': 1}

    data = []

    for _ in range(num_records):
        # NOTE: the order of the random draws below is significant; changing it
        # would change the dataset produced for a given seed.

        # Applicant details
        age = rng.randint(18, 65)
        marital_status = rng.choice(['Single', 'Married', 'Divorced'])
        no_of_dependants = rng.randint(0, 6)
        education_level = rng.choice(['High School', 'Bachelor’s Degree', 'Master’s Degree', 'PhD'])

        # Guarantor details
        relationship_to_student = rng.choice(['Parent', 'Sibling', 'Guardian', 'Other'])
        income_in_kes = rng.randint(10000, 150000)
        additional_income = rng.randint(0, 50000)
        employment_length = rng.randint(0, 47)
        employment_status = rng.choice(['Employed', 'Self-Employed', 'Unemployed'])
        guarantor_credit_score = rng.randint(300, 850)

        # Financial behavior (loan amounts are only drawn when a loan exists)
        existing_loans = rng.choice([True, False])
        outstanding_loan_amount = rng.randint(0, 100000) if existing_loans else 0
        monthly_repayment_amount = rng.randint(0, 50000) if existing_loans else 0
        monthly_expenses = rng.randint(10000, 50000)
        missed_payments_last_year = rng.randint(0, 5)
        financial_counseling = rng.choice([True, False])

        # Normalize features (scaling them between 0 and 1)
        normalized_age = 1 - ((age - 18) / (65 - 18))  # Younger = Higher Risk
        normalized_income = (income_in_kes - 10000) / (150000 - 10000)
        normalized_additional_income = additional_income / 50000
        normalized_expenses = (monthly_expenses - 10000) / (50000 - 10000)
        normalized_missed_payments = missed_payments_last_year / 5
        normalized_employment_length = employment_length / 47
        normalized_guarantor_credit_score = (guarantor_credit_score - 300) / (850 - 300)
        normalized_outstanding_loan_amount = outstanding_loan_amount / 100000
        normalized_monthly_repayment = monthly_repayment_amount / 50000

        # Compute risk score with weights. "Protective" features (income,
        # employment length, guarantor score) enter as (1 - normalized value).
        risk_score = 0
        risk_score += normalized_age * weights['age']
        risk_score += marital_risk[marital_status] * weights['marital_status']
        risk_score += (no_of_dependants / 6) * weights['no_of_dependants']
        risk_score += education_risk[education_level] * weights['education_level']
        risk_score += (1 - normalized_income) * weights['income_in_kes']
        risk_score += (1 - normalized_additional_income) * weights['additional_income']
        risk_score += (1 - normalized_employment_length) * weights['employment_length']
        risk_score += employment_risk[employment_status] * weights['employment_status']
        risk_score += (1 - normalized_guarantor_credit_score) * weights['guarantor_credit_score']
        risk_score += (1 if existing_loans else 0) * weights['existing_loans']
        risk_score += normalized_outstanding_loan_amount * weights['outstanding_loan_amount']
        risk_score += normalized_monthly_repayment * weights['monthly_repayment_amount']
        risk_score += normalized_expenses * weights['monthly_expenses']
        risk_score += normalized_missed_payments * weights['missed_payments_last_year']
        # Financial counseling is the only factor that subtracts from the score.
        risk_score -= (1 if financial_counseling else 0) * weights['financial_counseling']

        # Clamp (not rescale) the score to [0, 100]. With the weights above the
        # raw score already lies roughly within [1.1, 98], so this is a safety
        # net in case the weights are changed.
        risk_score = min(max(risk_score, 0), 100)

        # Determine risk level based on the risk score
        if risk_score <= 40:
            risk_level = 'Low'
        elif risk_score <= 70:
            risk_level = 'Medium'
        else:
            risk_level = 'High'

        # Append record to data
        data.append({
            'age': age,
            'marital_status': marital_status,
            'no_of_dependants': no_of_dependants,
            'education_level': education_level,
            'relationship_to_student': relationship_to_student,
            'income_in_kes': income_in_kes,
            'additional_income': additional_income,
            'employment_length': employment_length,
            'employment_status': employment_status,
            'guarantor_credit_score': guarantor_credit_score,
            'existing_loans': existing_loans,
            'outstanding_loan_amount': outstanding_loan_amount,
            'monthly_repayment_amount': monthly_repayment_amount,
            'monthly_expenses': monthly_expenses,
            'missed_payments_last_year': missed_payments_last_year,
            'financial_counseling': financial_counseling,
            'risk_score': risk_score,
            'risk_level': risk_level
        })

    # Convert to DataFrame; explicit columns keep the schema when data is empty.
    return pd.DataFrame(data, columns=columns)

In [4]:
# Seed the global RNG right before generating so this cell is idempotent:
# re-running it (or Restart & Run All) reproduces the same dataset.
random.seed(42)
credit_risk = generate_credit_risk_dataset(num_records)

In [5]:
# Write the generated records to a CSV file (relative path), leaving out the index column.
credit_risk.to_csv(path_or_buf="credit_risk_dataset.csv", index=False)