In [1]:
import random
import string
from typing import List, Dict, Any
from faker import Faker
import pandas as pd
from pydantic import BaseModel, Field, ValidationError
from langchain import PromptTemplate
from langchain.schema import BaseOutputParser
from langchain.chat_models import ChatOpenAI
from langchain.prompts.few_shot import FewShotPromptTemplate
from datetime import date
from pydantic import BaseModel

In [2]:
import os
from getpass import getpass

# SECURITY: never hardcode API keys in a notebook. A key that was previously
# committed in this cell must be treated as compromised and revoked/rotated.
# The key is taken from the environment when already set; otherwise the user is
# prompted without echo, so the secret never lands in the saved notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

In [3]:
# In[3]:
# -----------------------------
# 1. Define the Pydantic schema
# -----------------------------
class InsuranceClaim(BaseModel):
    """Schema for one synthetic motor-insurance claim record.

    The feature fields correspond to the keys returned by
    ``generate_base_features``; ``approval_flag`` is attached by
    ``generate_synthetic_claims``.

    NOTE(review): the generator visible in this notebook never fills
    ``claim_description`` or ``customer_background``, and it emits an extra
    ``coverage_amount`` key that is not declared here. Confirm how the two are
    meant to line up before validating generated rows against this model.
    """
    ic_number: str  # identity number string built as "YYMMDD-NN-NNNN" by generate_malaysian_ic
    plate_number: str  # three uppercase letters followed by four digits (see generate_plate_number)
    age: int  # customer age in years; sample_age clips it to 18..80
    months_as_customer: int  # tenure in months, capped by sample_months_as_customer at (age - 18) * 12
    vehicle_age_years: int  # current year minus the sampled model year
    vehicle_make: str  # one of the entries in VEHICLE_MAKES
    policy_expired_flag: int  # 0/1 flag; compute_approval penalises 1 and rewards 0
    deductible_amount: float  # drawn from DEDUCTIBLE_OPTIONS
    repair_amount: float  # rounded to 2 decimals and never above market_value in the generator
    market_value: float  # looked up per make in VEHICLE_MARKET_VALUES
    damage_severity_score: float  # generator samples this uniformly in [0.4, 0.8]
    at_fault_flag: int  # 0/1 flag; compute_approval rewards 0 and penalises 1
    time_to_report_days: int  # generator clips this to 0..30
    claim_reported_to_police_flag: int  # 0/1 flag; compute_approval rewards 1 and penalises 0
    license_type_missing_flag: int  # 0/1 flag; compute_approval rewards 0 and penalises 1
    num_third_parties: int  # non-negative count (Poisson draw in the generator)
    num_witnesses: int  # non-negative count (Poisson draw in the generator)
    approval_flag: int  # 1 = approved, 0 = declined, as returned by compute_approval
    claim_description: str  # free text; not populated by the generator code shown in this notebook
    customer_background: str  # free text; not populated by the generator code shown in this notebook

In [4]:
# ------------------------------------
# 2. Deterministic logic for approval
# ------------------------------------

def compute_approval(features: Dict[str, Any]) -> int:
    """
    Decide approval (1) or decline (0) for a claim from a fixed table of weighted rules.

    Each rule whose predicate holds contributes its signed weight to a running
    score that starts at 0.0. The claim is approved when the final score is at
    least 1.0.

    Args:
        features: Mapping holding the claim features read by the rules below
            (flags, counts, amounts, ``age``, ``vehicle_age_years`` ...). A
            missing key raises ``KeyError`` when the rule that needs it runs.

    Returns:
        1 if the accumulated score is >= 1.0, otherwise 0.
    """
    # (predicate, signed weight) pairs. The order is significant: weights are
    # accumulated one by one, so keeping the sequence fixed keeps the
    # floating-point result fixed as well.
    rules = (
        # --- favorable conditions ---
        (lambda: features['at_fault_flag'] == 0, 1.0),
        (lambda: features['time_to_report_days'] < 7, 0.8),
        (lambda: features['claim_reported_to_police_flag'] == 1, 0.7),
        (lambda: features['policy_expired_flag'] == 0, 1.0),
        (lambda: features['months_as_customer'] > 60, 0.5),
        (lambda: features['num_witnesses'] >= 1, 0.3),
        (lambda: features['num_third_parties'] >= 1, 0.2),
        (lambda: features['license_type_missing_flag'] == 0, 0.3),
        (lambda: features['damage_severity_score'] <= 0.5
            and features['repair_amount'] < features['market_value'], 0.4),
        # --- unfavorable conditions ---
        (lambda: features['at_fault_flag'] == 1, -0.5),
        # Exactly 7 days is neither rewarded above nor penalised here.
        (lambda: features['time_to_report_days'] > 7, -0.7),
        (lambda: features['claim_reported_to_police_flag'] == 0, -0.5),
        (lambda: features['policy_expired_flag'] == 1, -1.0),
        (lambda: features['months_as_customer'] < 36, -0.3),
        (lambda: features['age'] < 23 or features['age'] > 70, -0.5),
        (lambda: features['vehicle_age_years'] > 15, -0.4),
        (lambda: features['repair_amount'] > features['market_value'], -0.8),
        (lambda: features['damage_severity_score'] > 0.75
            and features['repair_amount'] > 0.5 * features['market_value'], -0.6),
        (lambda: features['license_type_missing_flag'] == 1, -0.4),
        (lambda: features['deductible_amount'] < 200
            and features['repair_amount'] > 5000, -0.5),
        (lambda: features['num_witnesses'] == 0
            and features['num_third_parties'] <= 2, -0.5),
    )

    total = 0.0
    for applies, weight in rules:
        if applies():
            total += weight

    # Threshold for the binary decision
    cutoff = 1.0
    return int(total >= cutoff)

In [5]:
from typing import Dict, Any
import random
import numpy as np
fake = Faker('en_US')  # NOTE: 'en_US' is Faker's US-English locale, not a Malaysian one; in the visible code it is only used for dates of birth
random.seed(42)  # seed the stdlib RNG used by the random.* draws below
np.random.seed(42)  # seed NumPy's global RNG used by the np.random.* draws below

# ---------------------------------------
# 3. Feature generator with controlled noise
# ---------------------------------------

# Vehicle models and approximate first release year for Malaysia
# Single source of truth, one row per model:
#   (make, sampling weight, approximate first release year, fixed market value in MYR)
_VEHICLE_CATALOG = (
    ("Perodua Myvi", 0.45, 2005, 50000.0),
    ("Proton X70", 0.25, 2018, 110000.0),
    ("Honda City", 0.10, 2008, 80000.0),
    ("Toyota Vios", 0.08, 2007, 75000.0),
    ("Nissan Almera", 0.06, 2012, 70000.0),
    ("Mazda 3", 0.04, 2003, 130000.0),
    ("BMW 3 Series", 0.02, 2000, 250000.0),
)

# Parallel lists consumed by random.choices (same order as the catalog)
VEHICLE_MAKES = [entry[0] for entry in _VEHICLE_CATALOG]
VEHICLE_WEIGHTS = [entry[1] for entry in _VEHICLE_CATALOG]

# Lookup: make -> approximate first release year
VEHICLE_RELEASE_YEAR = {entry[0]: entry[2] for entry in _VEHICLE_CATALOG}

# Lookup: make -> fixed market value (in MYR)
VEHICLE_MARKET_VALUES = {entry[0]: entry[3] for entry in _VEHICLE_CATALOG}

# Deductible tiers (in MYR) that a claim's deductible is drawn from
DEDUCTIBLE_OPTIONS = [float(tier) for tier in (200, 300, 500, 800, 1000)]

# Sample customer age (normal draw clipped to the range 18-80, not a true truncated normal)
def sample_age() -> int:
    """Draw a customer age in whole years.

    A value is drawn from a normal distribution (mean 40, standard deviation
    12) using NumPy's global RNG, clipped into [18, 80] and cut down to an int.

    Returns:
        An integer age between 18 and 80 inclusive.
    """
    drawn = np.random.normal(40, 12)
    bounded = np.clip(drawn, 18, 80)
    return int(bounded)

# Sample customer tenure
def sample_months_as_customer(age: int) -> int:
    """Draw a customer tenure in months, bounded by the customer's age.

    With probability 0.7 the tenure is an exponential draw with scale 12;
    otherwise it is 36 plus an exponential draw with scale 24. The result is
    capped at ``(age - 18) * 12`` months, and that cap is never below zero.

    Args:
        age: Customer age in years.

    Returns:
        Tenure in whole months.
    """
    # The stdlib RNG picks the branch, NumPy's RNG supplies the magnitude.
    short_tenure = random.random() < 0.7
    if short_tenure:
        months = int(np.random.exponential(scale=12))
    else:
        months = int(36 + np.random.exponential(scale=24))

    # Cap so the implied start of tenure is not before age 18.
    ceiling = max((age - 18) * 12, 0)
    return months if months <= ceiling else ceiling

# Sample time to report (Poisson)
def sample_time_to_report() -> int:
    """Draw the number of days between incident and report.

    The value is a Poisson draw (rate 5) from NumPy's global RNG, clipped into
    [0, 30].

    Returns:
        Whole days between 0 and 30 inclusive.
    """
    days = np.random.poisson(lam=5)
    limited = np.clip(days, 0, 30)
    return int(limited)

# Sample damage severity (uniform mid-range)
def sample_damage_severity() -> float:
    """Draw a damage severity score.

    The score is uniform on [0.4, 0.8] (stdlib RNG) and rounded to two decimal
    places with ``np.round``.

    Returns:
        The rounded score as a plain Python float.
    """
    raw_score = random.uniform(0.4, 0.8)
    two_decimals = np.round(raw_score, 2)
    return float(two_decimals)

# Generate Malaysian NRIC
def generate_malaysian_ic() -> str:
    """Build a synthetic identity-card number shaped like ``YYMMDD-NN-NNNN``.

    The first group is a Faker date of birth (age 18 to 80) formatted as
    ``%y%m%d``. The middle group is a random two-digit number from 01 to 99 and
    the last group a random four-digit number from 0000 to 9999. The middle
    group is purely random and is not checked against real region codes.

    Returns:
        The three groups joined with hyphens.
    """
    birth_date = fake.date_of_birth(minimum_age=18, maximum_age=80)
    # Tuple items are evaluated left to right, which keeps the RNG draw order.
    groups = (
        birth_date.strftime('%y%m%d'),
        f"{random.randint(1, 99):02d}",
        f"{random.randint(0, 9999):04d}",
    )
    return "-".join(groups)

# Generate Malaysian vehicle plate number
def generate_plate_number() -> str:
    """Build a synthetic plate number: three uppercase letters then four digits.

    Example shape: ``ABC1234``. Letters are drawn with replacement from A-Z;
    the numeric part is a random integer from 0 to 9999, zero-padded to four
    digits.

    Returns:
        The seven-character plate string.
    """
    prefix_chars = random.choices(string.ascii_uppercase, k=3)
    serial = random.randint(0, 9999)
    return "{}{:04d}".format(''.join(prefix_chars), serial)

# Derive vehicle age based on make
def sample_vehicle_age(make: str) -> int:
    """Draw a vehicle age in years for the given make.

    A model year is picked uniformly between the make's release year (from
    ``VEHICLE_RELEASE_YEAR``) and the current calendar year, both inclusive.
    A make that is not in the table falls back to the current year, which
    yields an age of 0.

    Args:
        make: Vehicle make/model name.

    Returns:
        Current year minus the sampled model year.
    """
    this_year = date.today().year
    earliest_year = VEHICLE_RELEASE_YEAR.get(make, this_year)
    return this_year - random.randint(earliest_year, this_year)

# Generate base features
def generate_base_features() -> Dict[str, Any]:
    """Sample one claim's feature record (without the approval decision).

    Draws come from the stdlib ``random`` module, NumPy's global RNG and the
    module-level Faker instance. The order of the draws below is deliberate:
    changing it changes the sequence produced for a given seed.

    Returns:
        A dict with the identifier fields first (``ic_number``,
        ``plate_number``) followed by the customer, vehicle, policy and
        incident features.
    """
    # --- customer ---
    customer_age = sample_age()
    tenure_months = sample_months_as_customer(customer_age)

    # --- vehicle ---
    (make,) = random.choices(VEHICLE_MAKES, weights=VEHICLE_WEIGHTS, k=1)
    car_age = sample_vehicle_age(make)

    # --- policy ---
    (expired,) = random.choices([0, 1], weights=[0.92, 0.08])
    deductible = random.choice(DEDUCTIBLE_OPTIONS)

    # --- damage: repair cost is a severity-scaled share of market value ---
    value = VEHICLE_MARKET_VALUES[make]
    severity = sample_damage_severity()
    estimate = value * severity * random.uniform(0.3, 0.5)
    repair = round(min(value, estimate), 2)  # never above the market value

    # --- incident circumstances ---
    (at_fault,) = random.choices([0, 1], weights=[0.75, 0.25])
    report_delay = sample_time_to_report()
    (police_report,) = random.choices([0, 1], weights=[0.35, 0.65])
    (license_missing,) = random.choices([0, 1], weights=[0.97, 0.03])
    third_parties = int(np.random.poisson(lam=0.5))
    witnesses = int(np.random.poisson(lam=1.0))

    # --- identifiers (generated last, listed first in the record) ---
    ic = generate_malaysian_ic()
    plate = generate_plate_number()

    return {
        'ic_number': ic,
        'plate_number': plate,
        'age': customer_age,
        'months_as_customer': tenure_months,
        'vehicle_age_years': car_age,
        'vehicle_make': make,
        'policy_expired_flag': expired,
        'deductible_amount': deductible,
        'market_value': value,
        'damage_severity_score': severity,
        'repair_amount': repair,
        'at_fault_flag': at_fault,
        'time_to_report_days': report_delay,
        'claim_reported_to_police_flag': police_report,
        'license_type_missing_flag': license_missing,
        'num_third_parties': third_parties,
        'num_witnesses': witnesses
    }

In [6]:
# ------------------------------------------------
# 4. LLM-based noise injection for descriptions
# ------------------------------------------------

class JSONOutputParser(BaseOutputParser):
    """Output parser that decodes the model's raw text as JSON."""

    def parse(self, text: str) -> Any:
        """Decode ``text`` with ``json.loads`` and return the resulting object.

        Malformed JSON propagates the decoder's exception unchanged.
        """
        from json import loads as decode_json

        decoded = decode_json(text)
        return decoded

# Example text pairs for description generation.
# NOTE(review): each tuple looks like (claim description, customer background) — confirm against the prompt that consumes it.
DESCRIPTION_EXAMPLES = [
    (
        "Minor rear-end collision at low speed. No injuries reported.",
        "Long-term customer, diligent in reporting and full documentation available.",
    ),
    (
        "Multi-vehicle pile-up on highway. Moderate damage. Police report filed.",
        "Customer has clean record, no prior claims in last 5 years.",
    ),
]

In [7]:
# ------------------------------------------------
# 5. Main generator with enforced ratio
# ------------------------------------------------

def _collect_claims_with_outcome(target_flag: int, count: int) -> List[Dict[str, Any]]:
    """Rejection-sample feature records until ``count`` of them score ``target_flag``."""
    collected: List[Dict[str, Any]] = []
    while len(collected) < count:
        record = generate_base_features()
        if compute_approval(record) != target_flag:
            continue  # wrong outcome: discard and draw again
        record['approval_flag'] = target_flag
        if target_flag == 1:
            # Approved claims pay the repair cost net of the deductible,
            # floored at zero and capped at the vehicle's market value.
            payout = max(0.0, record['repair_amount'] - record['deductible_amount'])
            record['coverage_amount'] = min(payout, record['market_value'])
        else:
            record['coverage_amount'] = 0.0
        # TODO: inject LLM-generated claim_description / customer_background here.
        collected.append(record)
    return collected


def generate_synthetic_claims(n: int, seed: int = 42) -> pd.DataFrame:
    """Generate ``n`` synthetic claims with a fixed 90% approved / 10% declined split.

    Feature records are drawn with ``generate_base_features`` and labelled by
    ``compute_approval``. Records with the wrong label for the bucket being
    filled are discarded, so the split is exact: ``int(n * 0.9)`` approved rows
    and the remainder declined. Approved rows are sampled first, then declined
    rows, and the combined frame is shuffled.

    Args:
        n: Total number of rows to generate.
        seed: Seed applied to the stdlib RNG, NumPy's global RNG, Faker, and
            the final shuffle, so repeated calls with the same seed produce
            the same sequence of draws.

    Returns:
        A shuffled DataFrame holding the feature columns plus
        ``approval_flag`` and ``coverage_amount``.
    """
    # Seed every RNG the feature samplers use. The NumPy seed matters: age,
    # tenure, report delay and the Poisson counts all come from np.random, so
    # without it the `seed` argument would not make the output repeatable.
    random.seed(seed)
    np.random.seed(seed)
    Faker.seed(seed)

    # Determine target counts
    n_approved = int(n * 0.9)
    n_declined = n - n_approved

    approved_records = _collect_claims_with_outcome(1, n_approved)
    declined_records = _collect_claims_with_outcome(0, n_declined)

    # Combine and shuffle
    df_balanced = pd.DataFrame(approved_records + declined_records)
    df_balanced = df_balanced.sample(frac=1, random_state=seed).reset_index(drop=True)
    return df_balanced

In [8]:
# ------------------------------------------------
# Generate
# ------------------------------------------------
# Build the dataset only when this code runs as the main program
# (the guard keeps an import of the exported script from triggering generation).
if __name__ == "__main__":
    df = generate_synthetic_claims(n=5000)

In [9]:
# Sanity check: (row count, column count) of the generated dataset
df.shape

(5000, 19)

In [10]:
# Column dtypes and non-null counts of the generated dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   ic_number                      5000 non-null   object 
 1   plate_number                   5000 non-null   object 
 2   age                            5000 non-null   int64  
 3   months_as_customer             5000 non-null   int64  
 4   vehicle_age_years              5000 non-null   int64  
 5   vehicle_make                   5000 non-null   object 
 6   policy_expired_flag            5000 non-null   int64  
 7   deductible_amount              5000 non-null   float64
 8   market_value                   5000 non-null   float64
 9   damage_severity_score          5000 non-null   float64
 10  repair_amount                  5000 non-null   float64
 11  at_fault_flag                  5000 non-null   int64  
 12  time_to_report_days            5000 non-null   i

In [11]:
# Inspect the approved claims (approval_flag == 1).
# NOTE(review): this renders the whole filtered frame; a .head() would keep the saved output small.
df[df['approval_flag'] == 1]

Unnamed: 0,ic_number,plate_number,age,months_as_customer,vehicle_age_years,vehicle_make,policy_expired_flag,deductible_amount,market_value,damage_severity_score,repair_amount,at_fault_flag,time_to_report_days,claim_reported_to_police_flag,license_type_missing_flag,num_third_parties,num_witnesses,approval_flag,coverage_amount
0,470119-43-2401,CJF7824,55,7,10,Perodua Myvi,0,300.0,50000.0,0.59,10435.92,0,5,0,0,0,0,1,10135.92
1,861126-24-0804,WGA4211,57,72,15,Mazda 3,0,500.0,130000.0,0.65,30914.07,0,2,0,0,0,0,1,30414.07
2,510806-66-3510,DIR2848,33,8,7,Perodua Myvi,0,300.0,50000.0,0.64,14265.18,1,4,1,0,2,0,1,13965.18
3,471112-27-1765,AGL9493,48,1,7,Perodua Myvi,0,300.0,50000.0,0.41,7169.51,0,3,0,0,1,2,1,6869.51
4,021203-43-7571,RNP4370,35,1,17,Perodua Myvi,0,500.0,50000.0,0.75,12039.21,0,6,0,0,0,0,1,11539.21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,050929-91-5447,JLR2108,36,34,15,Perodua Myvi,0,300.0,50000.0,0.65,11535.06,1,6,1,0,0,1,1,11235.06
4996,870714-98-2553,VUA6249,70,16,15,Honda City,1,500.0,80000.0,0.56,14943.18,0,4,1,0,0,1,1,14443.18
4997,840120-21-1878,RVE2411,47,18,18,Toyota Vios,0,800.0,75000.0,0.77,28003.83,0,5,1,0,0,1,1,27203.83
4998,510731-64-8421,WNN7920,46,151,5,Proton X70,0,800.0,110000.0,0.45,17942.64,0,5,1,0,0,0,1,17142.64


In [12]:
import os

# Output path of the exported dataset (relative to the current working directory)
filename = "Training_Dataset.csv"

# Optional: delete a previous export before writing. The original note says this
# is not strictly necessary — presumably because to_csv overwrites an existing
# file by default (TODO confirm).
if os.path.exists(filename):
    os.remove(filename)

# Save the new file, leaving out the DataFrame index column
df.to_csv(filename, index=False)