<a href="https://colab.research.google.com/github/BrindhaHema/CreditRisk_StressTesting/blob/main/synthdatagen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np
!pip install faker
from faker import Faker
import random
from datetime import date

# Size of the generated portfolio: one row is produced per synthetic borrower.
n = 5_000

# Category pools; the row generator samples from these with random.choice.
sectors = [
    'Consumer',
    'Mortgage',
    'SME',
    'Corporate',
]
regions = [
    'East',
    'West',
    'North',
    'South',
]
collateral_types = [
    'Property',
    'Unsecured',
    'Inventory',
    'Plant/Equipment',
]
product_types = [
    'Personal Loan',
    'Credit Card',
    'Home Loan',
    'Business Loan',
    'Term Loan',
]

# Macro values; each one is written unchanged into every generated row.
# NOTE(review): presumably percentages -- confirm units with downstream users.
interest_rate = 8.0
gdp_growth = 5.2
unemployment = 7.2

# Faker instance used to draw the loan start dates.
fake = Faker()

# Build one record per borrower. Every attribute is drawn at random and no
# seed is set in this block, so the output differs from run to run.

# Range of origination years; reference_year is also the base for loan_vintage.
first_loan_year = 2017
reference_year = 2025

# A loan that carries collateral must not be labelled 'Unsecured', so the
# label for collateralised loans is drawn from the secured types only.
secured_collateral_types = [t for t in collateral_types if t != 'Unsecured']

rows = []
for i in range(1, n + 1):
    # Borrower profile.
    income = np.round(np.random.uniform(30000, 150000), 2)
    dti = np.round(np.random.uniform(0.1, 0.8), 3)
    credit_score = int(np.random.normal(650, 30))  # int() drops the fractional part
    sector = random.choice(sectors)
    region = random.choice(regions)

    # Exposure and risk parameters.
    ead = int(np.random.uniform(10000, 1000000))
    lgd = np.round(np.random.uniform(0.15, 0.6), 2)
    # Named prob_default rather than pd so the pandas alias is not shadowed.
    prob_default = np.round(np.random.uniform(0.003, 0.08), 3)

    # Roughly one loan in four gets no collateral (value 0, type 'Unsecured').
    collateral_value = int(ead * np.random.uniform(0.3, 2.0)) if random.random() > 0.25 else 0
    collateral_type = random.choice(secured_collateral_types) if collateral_value > 0 else 'Unsecured'
    product_type = random.choice(product_types)

    # Pick an origination year, then a random day inside that calendar year.
    loan_year = random.randint(first_loan_year, reference_year)
    loan_start_date = fake.date_between(
        start_date=date(loan_year, 1, 1),
        end_date=date(loan_year, 12, 31),
    )
    loan_vintage = f"{reference_year - loan_year} yrs"

    # Bernoulli draw for default; recovery equals the full EAD when there is
    # no default, otherwise 40-90% of EAD.
    default_flag = np.random.binomial(1, prob_default)
    recovery = int(ead * np.random.uniform(0.4, 0.9)) if default_flag == 1 else ead

    # Arrears flags are independent Bernoulli draws scaled from prob_default
    # (at most 0.08 * 2 = 0.16, so always a valid probability).
    # NOTE(review): independence means a 90d flag can be set without a 30d flag.
    arrears_30d = np.random.binomial(1, prob_default * 2)
    arrears_60d = np.random.binomial(1, prob_default)
    arrears_90d = np.random.binomial(1, prob_default * 0.5)

    # Value order must match the `columns` list used to build the DataFrame.
    rows.append([
        i, income, dti, credit_score, sector, region, ead, lgd, prob_default, default_flag, recovery,
        collateral_value, collateral_type, product_type, str(loan_start_date), loan_vintage,
        interest_rate, gdp_growth, unemployment, arrears_30d, arrears_60d, arrears_90d,
    ])

# Column names, listed in the same order as the values stored in each row.
columns = [
    'borrower_id', 'income', 'dti', 'credit_score', 'sector', 'region', 'ead', 'lgd', 'pd', 'default_flag', 'recovery',
    'collateral_value', 'collateral_type', 'product_type', 'loan_start_date', 'loan_vintage',
    'interest_rate', 'gdp_growth', 'unemployment', 'arrears_30d', 'arrears_60d', 'arrears_90d'
]

# Assemble the table and persist it without the pandas index column.
synthetic_df = pd.DataFrame(rows, columns=columns)
synthetic_df.to_csv('synthetic_credit_risk_data.csv', index=False)

# Preview the first rows. A notebook only renders a bare expression when it is
# the last statement of the cell, so the preview comes after the CSV export.
synthetic_df.head(10)

