In [3]:
import random
import pandas as pd

# Housing categories a generated person can be assigned.
# Order matters: the generation loop slices the first five entries for under-30s.
housing_types = [
    "1&2-Room Flat",
    "3-Room Flat",
    "4-Room Flat",
    "5-Room",
    "Executive Flat",
    "Condominium",
    "Apartment",
    "Landed Property",
]

# Helper function to calculate CPF balance
def calculate_cpf_balance(age, yearly_income):
    """Estimate an accumulated CPF balance from age and a flat yearly income.

    The model accrues 0.37 of ``yearly_income`` for every year between age 20
    and age 55, and 0.22 of ``yearly_income`` for every year past 55.
    NOTE(review): 0.37 / 0.22 look like simplified contribution rates --
    confirm against the intended CPF model.

    Args:
        age: Current age in years.
        yearly_income: Yearly income, treated as constant across all years.

    Returns:
        The estimated balance. Ages of 20 or below yield a zero balance.
    """
    if age <= 55:
        # Clamp at zero so ages below 20 do not produce a negative balance.
        years_contributing = max(age - 20, 0)
        return yearly_income * 0.37 * years_contributing
    # 35 full years (ages 20-55) at the higher rate, the remainder at the lower.
    return yearly_income * 0.37 * 35 + yearly_income * 0.22 * (age - 55)

# Helper function to calculate savings
def calculate_savings(yearly_income, age):
    """Estimate accumulated savings from a flat yearly income and age.

    Computes ``yearly_income * 0.20 * years * 0.5`` where ``years`` is the
    number of years elapsed since age 20.
    NOTE(review): presumably 0.20 is a savings rate and 0.5 a retention
    factor -- confirm the intended meaning of the two constants.

    Args:
        yearly_income: Yearly income, treated as constant across all years.
        age: Current age in years.

    Returns:
        The estimated savings. Ages of 20 or below yield zero savings.
    """
    # Clamp at zero so ages below 20 do not produce negative savings.
    years_saving = max(age - 20, 0)
    return yearly_income * 0.20 * years_saving * 0.5

def generate_score(base, variance=3):
    """Return ``base`` shifted by a random integer offset, limited to 1..10.

    Args:
        base: Score to perturb.
        variance: Largest absolute offset; the offset is drawn with
            ``random.randint(-variance, variance)``.

    Returns:
        The perturbed score, no lower than 1 and no higher than 10.
    """
    offset = random.randint(-variance, variance)
    shifted = base + offset
    # Apply the upper bound first, then the lower bound.
    upper_limited = shifted if shifted < 10 else 10
    return upper_limited if upper_limited > 1 else 1

# Rows of generated records, one list per simulated person
data = []

for _ in range(50000):
    # Age is drawn uniformly from 20 to 65 inclusive
    age = random.randint(20, 65)

    # Under-30s choose only among the first five housing types;
    # everyone else chooses from the full list
    housing_pool = housing_types[:5] if age < 30 else housing_types
    housingtype = random.choice(housing_pool)

    # Yearly income band (in thousands) shifts upward with age
    if age <= 25:
        income_low, income_high = 30, 60
    elif age <= 40:
        income_low, income_high = 40, 80
    else:
        income_low, income_high = 50, 150
    yearly_income = random.randint(income_low, income_high) * 1000

    # Yearly expenditure: fixed monthly costs plus a random extra from age 30,
    # a lower flat random amount before that
    if age >= 30:
        fixed_monthly = 2500 + 500 + 1000  # Rent + groceries + car mortgage
        extra_monthly = random.randint(1, 5) * 1000  # Other monthly expenses
        yearly_expenditure = (fixed_monthly + extra_monthly) * 12
    else:
        yearly_expenditure = random.randint(5, 20) * 1000

    # Derived balances (no randomness involved)
    cpf_balance = calculate_cpf_balance(age, yearly_income)
    savings = calculate_savings(yearly_income, age)

    # Base score: income minus expenditure plus savings, each in units of
    # 10,000, averaged over three, truncated to an int and limited to 1..10
    raw_score = (yearly_income / 10000 - yearly_expenditure / 10000 + savings / 10000) / 3
    base_score = max(1, min(10, int(raw_score)))

    # Quality of life is the base score itself; the other two scores are
    # randomized around it
    quality_of_life = base_score
    disaster_preparedness = generate_score(base_score)
    retirement_readiness = generate_score(base_score)

    # Record this person's row, in the same order as the column names below
    data.append([
        age,
        housingtype,
        yearly_income,
        cpf_balance,
        yearly_expenditure,
        savings,
        quality_of_life,
        disaster_preparedness,
        retirement_readiness,
    ])

# Build the DataFrame from the collected rows
columns = [
    'age',
    'housingtype',
    'yearly_income',
    'cpf_balance',
    'yearly_expenditure',
    'savings',
    'quality_of_life',
    'disaster_preparedness',
    'retirement_readiness',
]
df = pd.DataFrame(data, columns=columns)

# Write the dataset to CSV, then leave the DataFrame as the cell's displayed value
df.to_csv("../data/mockdata.csv", index=False)
df


Unnamed: 0,age,housingtype,yearly_income,cpf_balance,yearly_expenditure,savings,quality_of_life,disaster_preparedness,retirement_readiness
0,23,4-Room Flat,45000,49950.0,6000,13500.0,1,3,2
1,28,1&2-Room Flat,75000,222000.0,20000,60000.0,3,4,3
2,54,3-Room Flat,102000,1283160.0,60000,346800.0,10,10,7
3,50,1&2-Room Flat,77000,854700.0,84000,231000.0,7,4,9
4,41,Executive Flat,109000,846930.0,96000,228900.0,8,10,8
...,...,...,...,...,...,...,...,...,...
49995,35,Apartment,71000,394050.0,84000,106500.0,3,5,4
49996,31,Executive Flat,67000,272690.0,72000,73700.0,2,5,2
49997,58,Landed Property,84000,1143240.0,60000,319200.0,10,9,10
49998,22,Executive Flat,50000,37000.0,5000,10000.0,1,1,4
