In [5]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# -----------------------------
# Define parameters
# -----------------------------
num_rows_per_tier = 12500  # 4 tiers x 12,500 rows = 50k rows with these defaults
random_seed = 42
# NOTE: the file name mentions "50k" because of the default size above; it is
# kept fixed so anything that reads this path keeps working.
output_csv = 'synthetic_insurance_50k.csv'

tiers = ['Basic', 'Standard', 'Premium', 'Gold']
states = ['NSW', 'VIC', 'QLD', 'SA', 'WA', 'TAS', 'NT', 'ACT']
insurance_types = ['Life', 'Health', 'Home', 'Car', 'Travel']
# Not read by the generator (each tier draws from its own two-option subset
# below); kept as the list of every frequency that can appear in the data.
payment_freq = ['Monthly', 'Quarterly', 'Annually']
claim_status = ['Approved', 'Rejected', 'Pending']

columns = ['Age', 'State', 'Insurance Type', 'Annual Premium (AUD)', 'Claim Amount (AUD)',
           'Claim Status', 'Policy Start Date', 'Policy End Date', 'Product Tier',
           'Payment Frequency', 'Risk Score']

# -----------------------------
# Pattern tables
# -----------------------------
# (low, high) bounds handed to randint, whose upper bound is exclusive.
_AGE_RANGE_BY_TIER = {
    'Basic': (20, 40),
    'Standard': (20, 40),
    'Premium': (31, 60),
}
_FALLBACK_AGE_RANGE = (50, 80)  # Gold, and any tier not listed above

# (max_age, low, high): the first band whose max_age is >= age supplies the
# uniform range for the risk score.
_RISK_BANDS = [
    (30, 0.1, 0.3),
    (50, 0.3, 0.6),
    (70, 0.5, 0.8),
]
_RISK_RANGE_OVER_70 = (0.7, 0.95)

# Uniform range of the base annual premium per insurance type.
_PREMIUM_RANGE_BY_TYPE = {
    'Life': (1000, 5000),
    'Health': (1000, 5000),
    'Home': (500, 4000),
    'Car': (500, 4000),
}
_FALLBACK_PREMIUM_RANGE = (300, 2000)  # Travel, and any type not listed above

# Multiplier applied to the base premium; tiers not listed are left unscaled.
_TIER_PREMIUM_FACTOR = {'Standard': 1.1, 'Premium': 1.3, 'Gold': 1.5}

# Tiers whose payment frequency leans towards 'Annually' instead of 'Monthly'.
_ANNUAL_LEANING_TIERS = ('Premium', 'Gold')

_FIRST_START_DATE = datetime(2023, 1, 1)
_DATE_FORMAT = '%d-%m-%Y'


def _risk_range(age):
    """Return the (low, high) uniform bounds of the risk score for *age*."""
    for max_age, low, high in _RISK_BANDS:
        if age <= max_age:
            return low, high
    return _RISK_RANGE_OVER_70


def _generate_row(tier, rng):
    """Draw one synthetic policy row for *tier*, ordered like ``columns``.

    The random draws happen in a fixed order (age, risk score, insurance
    type, base premium, claim factor, start offset, payment frequency, claim
    status, state). Keep that order: changing it changes the data produced
    for a given seed.
    """
    age_low, age_high = _AGE_RANGE_BY_TIER.get(tier, _FALLBACK_AGE_RANGE)
    age = int(rng.randint(age_low, age_high))

    risk_low, risk_high = _risk_range(age)
    risk_score = round(rng.uniform(risk_low, risk_high), 2)

    ins_type = rng.choice(insurance_types)

    # Base premium depends on the insurance type, then is scaled by tier.
    prem_low, prem_high = _PREMIUM_RANGE_BY_TYPE.get(ins_type, _FALLBACK_PREMIUM_RANGE)
    base_premium = round(rng.uniform(prem_low, prem_high), 2)
    annual_premium = round(base_premium * _TIER_PREMIUM_FACTOR.get(tier, 1.0), 2)

    # Claim amount proportional to premium & risk
    claim_amount = round(annual_premium * rng.uniform(1, 5) * risk_score, 2)

    # Start date falls within 730 days of 2023-01-01; every policy runs 365 days.
    # int() keeps timedelta working even if rng hands back a NumPy integer.
    start_date = _FIRST_START_DATE + timedelta(days=int(rng.randint(0, 730)))
    end_date = start_date + timedelta(days=365)

    # Payment frequency biased by tier
    if tier in _ANNUAL_LEANING_TIERS:
        freq = rng.choice(['Annually', 'Quarterly'], p=[0.7, 0.3])
    else:
        freq = rng.choice(['Monthly', 'Quarterly'], p=[0.7, 0.3])

    status = rng.choice(claim_status, p=[0.7, 0.2, 0.1])
    state = rng.choice(states)

    return [
        age, state, ins_type, annual_premium, claim_amount, status,
        start_date.strftime(_DATE_FORMAT), end_date.strftime(_DATE_FORMAT),
        tier, freq, risk_score,
    ]


def generate_policy_rows(rows_per_tier=None, tier_names=None, rng=None):
    """Generate patterned synthetic insurance rows, tier by tier.

    Parameters
    ----------
    rows_per_tier : int, optional
        Number of rows to draw for each tier. Defaults to ``num_rows_per_tier``.
    tier_names : sequence of str, optional
        Tiers to generate, in output order. Defaults to ``tiers``.
    rng : optional
        Random source exposing the legacy NumPy methods ``randint``,
        ``uniform`` and ``choice`` (e.g. ``np.random.RandomState(seed)``).
        Defaults to the global ``np.random`` module, so ``np.random.seed``
        controls the result.

    Returns
    -------
    list of list
        ``rows_per_tier * len(tier_names)`` rows, each ordered like ``columns``.
    """
    if rows_per_tier is None:
        rows_per_tier = num_rows_per_tier
    if tier_names is None:
        tier_names = tiers
    if rng is None:
        rng = np.random
    return [
        _generate_row(tier, rng)
        for tier in tier_names
        for _ in range(rows_per_tier)
    ]


# Run the generation only when executed as a notebook cell / script, so the
# helpers above can be imported without writing a CSV.
if __name__ == "__main__":
    np.random.seed(random_seed)

    # -----------------------------
    # Generate patterned data
    # -----------------------------
    data_list = generate_policy_rows(num_rows_per_tier)

    # -----------------------------
    # Create DataFrame
    # -----------------------------
    df = pd.DataFrame(data_list, columns=columns)

    # Shuffle dataset (no random_state given, so the seed set above applies)
    df = df.sample(frac=1).reset_index(drop=True)

    # Save to CSV
    df.to_csv(output_csv, index=False)
    # Report the real row count; prints "50k" for the default 50,000 rows.
    print("Dataset generated with {:g}k rows!".format(len(df) / 1000))


Dataset generated with 50k rows!
