In [1]:
import pandas as pd
import numpy as np

In [2]:
# Seed NumPy's global generator so repeated runs yield the same dataset.
np.random.seed(42)
n = 100000  # Number of records

# ------------------ Base Data ------------------
# Each column is drawn separately, in a fixed order: changing the order of
# the random calls would change the values obtained from the seed.
policy_ids = ["POL" + str(policy_number) for policy_number in range(100000, 100000 + n)]
customer_age = np.random.randint(18, 70, size=n)  # upper bound is exclusive -> 18..69
gender = np.random.choice(["Male", "Female"], size=n, p=[0.50, 0.50])
policy_type = np.random.choice(
    ["Health", "Auto", "Life", "Property"],
    size=n,
    p=[0.3, 0.3, 0.2, 0.2],
)
# Normally distributed income, bounded to [15000, 200000], kept to 2 decimals.
raw_income = np.random.normal(60000, 20000, size=n)
annual_income = np.round(np.clip(raw_income, 15000, 200000), 2)
property_age = np.random.randint(0, 20, size=n)  # 0..19
claim_history = np.random.poisson(lam=1.2, size=n)

df = pd.DataFrame({
    "Policy_ID": policy_ids,
    "Customer_Age": customer_age,
    "Gender": gender,
    "Policy_Type": policy_type,
    "Annual_Income": annual_income,
    "Property_Age": property_age,
    "Claim_History": claim_history,
})

In [3]:
# ------------------ Property Age Fix ------------------
# Auto and Property policies get an age drawn uniformly from 1..19
# (randint's upper bound is exclusive); every other policy type gets 0.
#
# Done with a boolean mask and a single vectorised draw instead of a
# row-wise df.apply, which would call a Python lambda once per record.
# Only the Auto/Property rows consume random draws.
has_property = df["Policy_Type"].isin(["Auto", "Property"]).to_numpy()
property_age = np.zeros(len(df), dtype=int)
property_age[has_property] = np.random.randint(1, 20, size=int(has_property.sum()))
df["Property_Age"] = property_age

In [4]:
# ------------------ Risk Score ------------------
# np.select assigns the label of the FIRST matching condition, so the order
# below is significant: the "High" rule takes precedence over "Medium".
claims = df["Claim_History"]
prop_age = df["Property_Age"]

is_high = claims.ge(3) | prop_age.gt(15)
is_medium = claims.eq(2)
is_low = claims.le(1) & prop_age.lt(10)

conditions = [is_high, is_medium, is_low]
choices = ["High", "Medium", "Low"]

# Rows matching none of the rules (Claim_History <= 1 with Property_Age in
# 10..15) receive the default label.
# NOTE(review): the default is "High" for those rows -- confirm this is intended.
df["Risk_Score"] = np.select(conditions, choices, default="High")

In [5]:
# ------------------ Premium Amount ------------------
# Premium = base rate for the policy type
#         + 2 for every year of customer age above 18
#         + risk loading (High: 100, Medium: 50, any other score: 0).
#
# Computed column-wise; a row-wise df.apply would run a Python lambda once
# per record for what is plain arithmetic on whole columns.
premium_base = {"Health": 300, "Auto": 500, "Life": 400, "Property": 450}

base_premium = df["Policy_Type"].map(premium_base)
if base_premium.isna().any():
    # Fail fast on an unknown policy type rather than emitting NaN premiums.
    raise KeyError("Policy_Type contains values missing from premium_base")

risk_loading = np.where(
    df["Risk_Score"] == "High",
    100,
    np.where(df["Risk_Score"] == "Medium", 50, 0),
)

df["Premium_Amount"] = (
    base_premium + (df["Customer_Age"] - 18) * 2 + risk_loading
).round(2)

In [6]:
# ------------------ Claim Amount ------------------
# Claim amounts are normally distributed around a mean that depends on the
# risk score (Low: 5000, Medium: 15000, anything else: 30000) with a standard
# deviation of 3000, then bounded to [0, 50000] and kept to 2 decimals.
#
# np.random.normal accepts an array for `loc`, so one call draws a value for
# every row (in row order) instead of invoking a Python lambda per record.
claim_mean = np.where(
    df["Risk_Score"] == "Low",
    5000,
    np.where(df["Risk_Score"] == "Medium", 15000, 30000),
)
claim_draws = np.random.normal(loc=claim_mean, scale=3000)
df["Claim_Amount"] = np.round(np.clip(claim_draws, 0, 50000), 2)

# Ensure Premium_Amount is never greater than Claim_Amount.
# Because Claim_Amount is clipped at 0, a row whose claim was clipped ends up
# with a Premium_Amount of 0 as well.
df["Premium_Amount"] = df[["Premium_Amount", "Claim_Amount"]].min(axis=1)

In [7]:
# ------------------ Fraudulent Claim ------------------
# Per-row fraud probability looked up from the risk score, then one
# Bernoulli trial (binomial with a single trial) per row yields a 0/1 flag.
risk_to_fraud_rate = {"Low": 0.05, "Medium": 0.15, "High": 0.4}
fraud_prob = df["Risk_Score"].map(risk_to_fraud_rate).clip(lower=0, upper=1)
df["Fraudulent_Claim"] = np.random.binomial(n=1, p=fraud_prob)

In [8]:
# ------------------ Save or Preview ------------------
# Write the generated dataset to disk; index=False keeps the DataFrame's
# row index out of the CSV.
output_path = "Realistic_Insurance_Dataset.csv"
df.to_csv(output_path, index=False)

# Show the first five rows as the cell's output.
df.head(5)


Unnamed: 0,Policy_ID,Customer_Age,Gender,Policy_Type,Annual_Income,Property_Age,Claim_History,Risk_Score,Premium_Amount,Claim_Amount,Fraudulent_Claim
0,POL100000,56,Female,Health,91465.94,0,2,Medium,426.0,14091.8,0
1,POL100001,69,Female,Property,99375.03,9,1,Low,552.0,5916.76,1
2,POL100002,46,Male,Health,79249.71,0,1,Low,356.0,5953.27,0
3,POL100003,32,Male,Auto,78744.75,7,1,Low,528.0,13882.75,0
4,POL100004,60,Male,Life,51210.11,0,2,Medium,534.0,10990.53,0
