In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

In [None]:
# Number of synthetic applicants (rows) to generate
n = 100_000

In [None]:
# Credit scores are drawn from a Beta(a, b) distribution and mapped onto the
# 300-850 score range.
# With a > b the Beta density is left-skewed (negative skew): most of the mass
# sits toward the high end, with a longer tail toward low scores.
a, b = 4, 2  # Shape parameters for the beta distribution (left-skewed)

# Seed NumPy's global random state so the np.random draws are reproducible
np.random.seed(42)
credit_scores = np.random.beta(a, b, size=n)  # n draws from Beta(a, b)
credit_scores = 300 + credit_scores * 550  # Scale beta values to the range [300, 850]
# Round to whole scores in one vectorized call instead of a Python-level loop,
# then convert back to a plain list of ints.
credit_scores = np.round(credit_scores).astype(int).tolist()

In [None]:
# Log-normal parameters for income distribution.
# np.random.lognormal takes the mean and sigma of the underlying normal, and
# exp(mean) is the median of the log-normal, so the median income is $50,000.
mean_log_income = np.log(50000)  # Median income around $50,000
std_dev_log_income = .8  # Standard deviation of log-income (controls the spread)

# Only incomes within [min_income, max_income] are kept
min_income = 10000
max_income = 500000
if min_income > max_income:
    raise ValueError("min_income must not exceed max_income")

# Rejection sampling: draw a batch, round to cents, drop out-of-range values,
# and repeat until at least n values have been accepted. The first batch has
# the same size as a single 3% oversample, so when it already yields n valid
# values no further draws are made; extra batches are only drawn when the
# first one falls short (which would otherwise leave fewer than n incomes).
batch_size = round(n * 1.03)
max_batches = 1000  # Safety cap so unsatisfiable bounds cannot loop forever
accepted_batches = [np.empty(0)]  # Seeded with an empty array so n == 0 works
accepted_count = 0
batches_drawn = 0
while accepted_count < n:
    if batches_drawn == max_batches:
        raise RuntimeError(
            "could not generate enough incomes within [min_income, max_income]"
        )
    batch = np.random.lognormal(mean=mean_log_income, sigma=std_dev_log_income, size=batch_size)
    batch = np.round(batch, 2)  # Round to cents before applying the bounds
    batch = batch[(batch >= min_income) & (batch <= max_income)]
    accepted_batches.append(batch)
    accepted_count += batch.size
    batches_drawn += 1

# Keep exactly n accepted values, in the order they were drawn
incomes = np.concatenate(accepted_batches)[:n]

In [None]:
# Requested loan amounts: uniform draws between min_loan and max_loan
min_loan = 1000
max_loan = 30000
loan_amounts = np.random.uniform(min_loan, max_loan, n)
# Round to cents in one vectorized call instead of a Python-level loop,
# then convert back to a plain list.
loan_amounts = np.round(loan_amounts, 2).tolist()

In [None]:
# Target share of each employment type
distribution = {
    "self": 0.20,
    "govt": 0.15,
    "private": 0.65
}

# Convert shares into whole-number counts. round() is used rather than int()
# because a float product can land just below the intended integer
# (e.g. 100 * 0.29 == 28.999999999999996) and int() would truncate it one short.
employment_labels = list(distribution)
employment_counts = [round(n * distribution[label]) for label in employment_labels]

# 'private' absorbs any rounding remainder so the counts sum to exactly n
employment_counts[employment_labels.index("private")] += n - sum(employment_counts)

# Build the labels in blocks, then shuffle them into random order.
# The shuffle uses NumPy's global generator, which this notebook seeds with
# np.random.seed, so the order is reproducible on a top-to-bottom run; the
# standard-library `random` module is never seeded here.
employment_types = np.repeat(employment_labels, employment_counts)
np.random.shuffle(employment_types)
employment_types = employment_types.tolist()

In [None]:
# Target share of each education level
distribution = {
    "advanced": 0.14,
    "bachelors": 0.43,
    "associate": 0.1,
    "high_school": 0.23,
    "less_than_high_school": 0.1
}

# Convert shares into whole-number counts. round() is used rather than int()
# because a float product can land just below the intended integer
# (e.g. 100 * 0.29 == 28.999999999999996) and int() would truncate it one short.
education_labels = list(distribution)
education_counts = [round(n * distribution[label]) for label in education_labels]

# 'bachelors' absorbs any rounding remainder so the counts sum to exactly n
education_counts[education_labels.index("bachelors")] += n - sum(education_counts)

# Build the labels in blocks, then shuffle them into random order.
# The shuffle uses NumPy's global generator, which this notebook seeds with
# np.random.seed, so the order is reproducible on a top-to-bottom run; the
# standard-library `random` module is never seeded here.
education_types = np.repeat(education_labels, education_counts)
np.random.shuffle(education_types)
education_types = education_types.tolist()

In [None]:
# Assemble the generated columns into a single table, one row per applicant
column_names = ['income', 'credit_score', 'amount_requested', 'employment_type', 'education_type']
column_values = [incomes, credit_scores, loan_amounts, employment_types, education_types]
df = pd.DataFrame(dict(zip(column_names, column_values)))

In [None]:
# Preview the first 20 rows of the generated dataset
df.head(20)

In [None]:
# Summary statistics of the dataset, using pandas' default describe() settings
df.describe()

In [None]:
# Plot 1: how the generated credit scores are distributed (20-bin histogram)
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['credit_score'], bins=20, color='lightblue', edgecolor='black')
ax.set_title('Distribution of Credit Scores (Beta Distribution)')
ax.set_xlabel('Credit Score')
ax.set_ylabel('Number of People')
plt.show()

In [None]:
# Plot 2: how the generated incomes are distributed (100-bin histogram)
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['income'], bins=100, color='lightgreen', edgecolor='black')
ax.set_title('Distribution of Incomes (Log-Normal Distribution)')
ax.set_xlabel('Annual Income ($)')
ax.set_ylabel('Number of People')
# Logarithmic x-axis so both low and very high incomes remain visible
ax.set_xscale('log')
plt.show()

In [None]:
# Plot 3: credit score against income, one small translucent point per row.
# Income is shown on a linear axis here.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['income'], df['credit_score'], alpha=0.5, s=1)
ax.set_title('Income vs Credit Score')
ax.set_xlabel('Annual Income ($)')
ax.set_ylabel('Credit Score')
plt.show()

In [None]:
# Write the dataset to disk as CSV, leaving out the DataFrame index column
output_path = 'loan_data.csv'
df.to_csv(output_path, index=False)