In [13]:
import numpy as np
import pandas as pd
import random

# Pin both global random streams used by this notebook: NumPy's legacy global
# generator and Python's built-in `random` module. Each keeps its own state,
# so each needs its own seed call.
np.random.seed(42)
random.seed(42)

print("Libraries loaded.")

Libraries loaded.


In [14]:
import numpy as np
import pandas as pd
import random

# Reset both random streams (Python's `random` and NumPy's legacy global
# generator) so the first draws made after this cell start from a known state.
random.seed(42)
np.random.seed(42)

# ---------------------------------------------------------------------------
# Sampling tables
# ---------------------------------------------------------------------------
# Every age-dependent table holds one entry per age bracket, indexed by the
# value returned from _age_bracket():
#   0 -> age <= 30,  1 -> age 31-50,  2 -> age over 50.
# Label order is part of the sampling definition: random.choices maps a draw
# onto the cumulative weights in list order, so reordering labels (even with
# their weights) changes the data produced for a given seed.

_LEVELS = ["Low", "Medium", "High"]

# Income band does not depend on age.
_INCOME_WEIGHTS = [0.3, 0.5, 0.2]

# Financial knowledge: the share of "High" falls and the share of "Low" rises
# from the youngest bracket to the oldest.
_KNOWLEDGE_WEIGHTS = (
    [0.25, 0.45, 0.30],
    [0.30, 0.45, 0.25],
    [0.40, 0.45, 0.15],
)

# Investment experience: the youngest bracket is the most likely to have none.
# NOTE(review): the over-50 weights give *more* "No experience" (0.30 vs 0.25)
# and less "Basic" (0.45 vs 0.50) than the 31-50 bracket, so experience does
# not strictly rise with age. Left as-is to keep the generated data unchanged;
# confirm whether this is intended.
_EXPERIENCE_LABELS = ["No experience", "Basic", "Advanced"]
_EXPERIENCE_WEIGHTS = (
    [0.45, 0.40, 0.15],
    [0.25, 0.50, 0.25],
    [0.30, 0.45, 0.25],
)

# Risk preference: younger brackets lean tolerant, older brackets lean cautious.
_RISK_PREFERENCE_LABELS = [
    "Avoid risk",
    "Comfortable with some risk",
    "Comfortable with high risk",
]
_RISK_PREFERENCE_WEIGHTS = (
    [0.20, 0.45, 0.35],
    [0.30, 0.45, 0.25],
    [0.45, 0.40, 0.15],
)

# Loss aversion: labels run from most to least tolerant (the reverse of the
# other tables); the oldest bracket is the most uncomfortable with losses.
_LOSS_AVERSION_LABELS = [
    "Can accept large swings",
    "Can accept small losses",
    "Very uncomfortable with losses",
]
_LOSS_AVERSION_WEIGHTS = (
    [0.25, 0.45, 0.30],
    [0.20, 0.50, 0.30],
    [0.15, 0.45, 0.40],
)

# Investment horizon in years: (options, weights) per bracket. Both the set of
# possible horizons and their weights shift shorter as age increases.
_HORIZON_CHOICES = (
    ([10, 12, 15, 20, 25], [0.25, 0.25, 0.20, 0.20, 0.10]),
    ([5, 7, 10, 12, 15], [0.25, 0.30, 0.25, 0.15, 0.05]),
    ([1, 3, 5, 7, 10], [0.30, 0.30, 0.25, 0.10, 0.05]),
)

# Primary goal: growth-oriented when young, preservation-oriented when older.
_GOAL_LABELS = ["Capital preservation", "Balanced growth", "Aggressive growth"]
_GOAL_WEIGHTS = (
    [0.15, 0.35, 0.50],
    [0.25, 0.45, 0.30],
    [0.50, 0.35, 0.15],
)

# ---------------------------------------------------------------------------
# Scoring tables
# ---------------------------------------------------------------------------
# Points (0-2) that each categorical answer adds to the risk score; a higher
# total means more capacity or appetite for risk.
_LABEL_POINTS = {
    "income_band": {"Low": 0, "Medium": 1, "High": 2},
    "financial_knowledge": {"Low": 0, "Medium": 1, "High": 2},
    "investment_experience": {"No experience": 0, "Basic": 1, "Advanced": 2},
    "risk_preference": {
        "Avoid risk": 0,
        "Comfortable with some risk": 1,
        "Comfortable with high risk": 2,
    },
    # Reversed scale: being less loss-averse earns more points.
    "loss_aversion": {
        "Very uncomfortable with losses": 0,
        "Can accept small losses": 1,
        "Can accept large swings": 2,
    },
    "primary_goal": {
        "Capital preservation": 0,
        "Balanced growth": 1,
        "Aggressive growth": 2,
    },
}

# Inclusive upper score bounds of the first two profiles; the total score
# ranges from 0 to 16 (eight inputs worth 0-2 points each).
_CONSERVATIVE_MAX_SCORE = 7
_MODERATE_MAX_SCORE = 11


def _age_bracket(age):
    """Return the age-bracket index: 0 for age <= 30, 1 for 31-50, 2 above 50."""
    if age <= 30:
        return 0
    if age <= 50:
        return 1
    return 2


def _pick(options, weights):
    """Draw a single weighted choice from `options` using the `random` module."""
    return random.choices(options, weights=weights, k=1)[0]


def _risk_score(profile):
    """Add up the 0-2 point contributions of all eight profile attributes."""
    # Age: the youngest bracket earns 2 points, the middle 1, the oldest 0.
    age_points = 2 - _age_bracket(profile["age"])

    # Horizon: under 3 years earns 0 points, 3-7 years 1, more than 7 years 2.
    horizon = profile["investment_horizon_years"]
    if horizon > 7:
        horizon_points = 2
    elif horizon >= 3:
        horizon_points = 1
    else:
        horizon_points = 0

    label_points = sum(
        points[profile[key]] for key, points in _LABEL_POINTS.items()
    )
    return age_points + horizon_points + label_points


def _risk_profile(score):
    """Map a total risk score onto Conservative / Moderate / Aggressive."""
    if score <= _CONSERVATIVE_MAX_SCORE:
        return "Conservative"
    if score <= _MODERATE_MAX_SCORE:
        return "Moderate"
    return "Aggressive"


def generate_single_investor():
    """Generate one synthetic investor record with a derived risk profile.

    Age is drawn from a normal distribution (mean 35, sd 10), truncated to an
    integer and clipped to 20-75. Income band is drawn independently of age;
    every other attribute is drawn with weights that depend on the age
    bracket. The `risk_profile` label is not sampled: it is computed
    deterministically from the other eight attributes via a 0-16 point score.

    Randomness comes from the global `np.random` state (age) and the global
    `random` state (everything else), so seed both for reproducible output.
    The draws happen in a fixed order (age, income, knowledge, experience,
    risk preference, loss aversion, horizon, goal); changing that order
    changes the data produced for a given seed.

    Returns:
        dict: keys `age`, `income_band`, `financial_knowledge`,
        `investment_experience`, `risk_preference`, `loss_aversion`,
        `investment_horizon_years`, `primary_goal`, `risk_profile`, in that
        order.
    """
    # int() truncates toward zero; the clip then folds every out-of-range
    # draw onto the nearest bound (20 or 75).
    age = int(np.random.normal(loc=35, scale=10))
    age = min(max(age, 20), 75)
    bracket = _age_bracket(age)

    income_band = _pick(_LEVELS, _INCOME_WEIGHTS)
    financial_knowledge = _pick(_LEVELS, _KNOWLEDGE_WEIGHTS[bracket])
    investment_experience = _pick(_EXPERIENCE_LABELS, _EXPERIENCE_WEIGHTS[bracket])
    risk_preference = _pick(_RISK_PREFERENCE_LABELS, _RISK_PREFERENCE_WEIGHTS[bracket])
    loss_aversion = _pick(_LOSS_AVERSION_LABELS, _LOSS_AVERSION_WEIGHTS[bracket])
    horizon_options, horizon_weights = _HORIZON_CHOICES[bracket]
    investment_horizon_years = _pick(horizon_options, horizon_weights)
    primary_goal = _pick(_GOAL_LABELS, _GOAL_WEIGHTS[bracket])

    profile = {
        "age": age,
        "income_band": income_band,
        "financial_knowledge": financial_knowledge,
        "investment_experience": investment_experience,
        "risk_preference": risk_preference,
        "loss_aversion": loss_aversion,
        "investment_horizon_years": investment_horizon_years,
        "primary_goal": primary_goal,
    }
    # Added last so the label stays the final key (and DataFrame column).
    profile["risk_profile"] = _risk_profile(_risk_score(profile))
    return profile

print("Realistic generator ready.")

Realistic generator ready.


In [15]:
n_samples = 5000

# Reseed inside the cell that actually draws the samples, so re-running this
# cell on its own rebuilds exactly the same dataset instead of continuing from
# wherever the global random state happens to be. generate_single_investor()
# draws from both `random` and `np.random`, so both are reset.
random.seed(42)
np.random.seed(42)

# Build the frame once from a list of record dicts (one dict per investor).
rows = [generate_single_investor() for _ in range(n_samples)]
df = pd.DataFrame(rows)

print(df.shape)
print(df["risk_profile"].value_counts())

(5000, 9)
risk_profile
Moderate        3075
Conservative    1459
Aggressive       466
Name: count, dtype: int64


In [16]:
# Sanity checks on the generated frame: count missing values per column and
# list the distinct labels in two of the categorical columns.
null_counts = df.isna().sum()
experience_values = df["investment_experience"].unique()
loss_aversion_values = df["loss_aversion"].unique()

print("Nulls per column:")
print(null_counts)

print("\nUnique values in investment_experience:")
print(experience_values)

print("\nUnique values in loss_aversion:")
print(loss_aversion_values)

Nulls per column:
age                         0
income_band                 0
financial_knowledge         0
investment_experience       0
risk_preference             0
loss_aversion               0
investment_horizon_years    0
primary_goal                0
risk_profile                0
dtype: int64

Unique values in investment_experience:
['Basic' 'No experience' 'Advanced']

Unique values in loss_aversion:
['Very uncomfortable with losses' 'Can accept small losses'
 'Can accept large swings']


In [17]:
# Write the dataset to a CSV in the working directory; index=False leaves the
# positional row index out of the file.
output_path = "investor_profiles.csv"
df.to_csv(output_path, index=False)
print("Clean file saved.")

Clean file saved.


In [18]:
# Preview the first five generated records (five is head()'s default).
df.head(n=5)

Unnamed: 0,age,income_band,financial_knowledge,investment_experience,risk_preference,loss_aversion,investment_horizon_years,primary_goal,risk_profile
0,39,Medium,Low,Basic,Avoid risk,Very uncomfortable with losses,10,Aggressive growth,Conservative
1,33,Low,Medium,No experience,Avoid risk,Can accept small losses,5,Capital preservation,Conservative
2,41,Medium,Medium,No experience,Comfortable with some risk,Very uncomfortable with losses,5,Aggressive growth,Conservative
3,50,Medium,Medium,No experience,Comfortable with high risk,Can accept small losses,5,Capital preservation,Conservative
4,32,High,Medium,Advanced,Comfortable with some risk,Can accept small losses,15,Balanced growth,Moderate
