In [4]:
import numpy as np
import pandas as pd

In [5]:
Random_State = 42
np.random.seed(Random_State)

In [6]:
n_samples = 5000

In [7]:
age = np.random.randint(21, 65, n_samples)
annual_income = np.random.normal(800000, 300000, n_samples).clip(200000, None)
#.normal -> noraml(Gaussian) Distribution
# Mean -> Average Income = 8lakhs
# Standard Deviation -> Income will typically vary by about 3 lakhs (above/below)
employment_years = np.random.randint(0, 40, n_samples)
credit_score = np.random.normal(680, 60, n_samples).clip(300, 900)
existing_loans = np.random.poisson(2, n_samples)
debt_to_income = np.random.uniform(0.1, 0.8, n_samples)
loan_amount = np.random.normal(500000, 250000, n_samples).clip(50000, None)
interest_rate = np.random.uniform(8, 24, n_samples)
loan_tenure_months = np.random.choice([12, 24, 36, 48, 60], n_samples)

risk_score = (
    (900 - credit_score) * 0.4 +
    debt_to_income * 300 +
    existing_loans * 50 +
    (loan_amount / annual_income) * 200 +
    (interest_rate - 8) * 10
)

default_probability = 1 / (1 + np.exp(-risk_score / 200))
default = np.random.binomial(1, default_probability)
# Bernoulli Trial when n=1
    # Return 1 -> default occurred / 0 -> no default

data = pd.DataFrame({
    "age": age,
    "annual_income": annual_income,
    "employment_years": employment_years,
    "credit_score": credit_score,
    "existing_loans": existing_loans,
    "debt_to_income": debt_to_income,
    "loan_amount": loan_amount,
    "interest_rate": interest_rate,
    "loan_tenure_months": loan_tenure_months,
    "default": default
})

data.head()

Unnamed: 0,age,annual_income,employment_years,credit_score,existing_loans,debt_to_income,loan_amount,interest_rate,loan_tenure_months,default
0,59,369660.4,7,709.460237,4,0.74793,246709.974845,23.047913,60,1
1,49,482800.2,26,699.789384,4,0.460381,588472.735054,20.596041,36,1
2,35,670362.9,14,676.293436,4,0.43677,355519.892644,19.768616,60,1
3,63,1269386.0,12,689.805669,4,0.797273,911677.761702,22.214834,12,1
4,28,725974.8,15,644.163264,1,0.368751,779753.497258,20.316479,12,1


In [8]:
noisy_data=data.copy()

In [15]:
income_noise=np.random.normal(0,50000,size=len(noisy_data))
noisy_data["annual_income"] += income_noise
credit_score_noise = np.random.normal(0,25,size = len(noisy_data))
noisy_data["credit_score"] += credit_score_noise

In [17]:
missing_rate = 0.1
for col in ["employement_years","annual_income"]:
    missing_indices = noisy_data.sample(frac = missing_rate).index
    noisy_data.loc[missing_indices, col] = np.nan
    

In [24]:
outlier_rate = 0.02
outlier_count = int(outlier_rate * len(noisy_data))
outlier_indices = np.random.choice(noisy_data.index, outlier_count, replace = False)
noisy_data.loc[outlier_indices, "loan_amount"] *= 3

In [25]:
noisy_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   age                 5000 non-null   int32  
 1   annual_income       4500 non-null   float64
 2   employment_years    5000 non-null   int32  
 3   credit_score        5000 non-null   float64
 4   existing_loans      5000 non-null   int32  
 5   debt_to_income      5000 non-null   float64
 6   loan_amount         5000 non-null   float64
 7   interest_rate       5000 non-null   float64
 8   loan_tenure_months  5000 non-null   int64  
 9   default             5000 non-null   int32  
 10  employement_years   0 non-null      float64
dtypes: float64(6), int32(4), int64(1)
memory usage: 351.7 KB


In [26]:
noisy_data

Unnamed: 0,age,annual_income,employment_years,credit_score,existing_loans,debt_to_income,loan_amount,interest_rate,loan_tenure_months,default,employement_years
0,59,2.632675e+05,7,704.681540,4,0.747930,2.467100e+05,23.047913,60,1,
1,49,4.018144e+05,26,737.022174,4,0.460381,5.884727e+05,20.596041,36,1,
2,35,6.159279e+05,14,610.390292,4,0.436770,3.555199e+05,19.768616,60,1,
3,63,1.210814e+06,12,686.724862,4,0.797273,9.116778e+05,22.214834,12,1,
4,28,7.601178e+05,15,667.809923,1,0.368751,7.797535e+05,20.316479,12,1,
...,...,...,...,...,...,...,...,...,...,...,...
4995,53,8.530625e+05,24,736.475308,4,0.451083,5.386894e+05,18.420542,12,0,
4996,52,6.439126e+05,16,748.172880,0,0.519246,6.767036e+05,16.307141,24,1,
4997,49,6.650251e+05,6,688.531098,1,0.286024,1.244723e+06,14.612556,24,1,
4998,41,1.308588e+06,18,609.291813,2,0.242305,1.024998e+05,13.566304,24,1,
