In [4]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import warnings
# Silence all warnings for the remainder of the session.
warnings.filterwarnings('ignore')

# One shared seed for both the NumPy and the stdlib generators, so that
# repeated runs from a fresh kernel produce the same random draws.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# ============================================================================
# STEP 1: DATA GENERATION
# ============================================================================

def generate_accident_dataset(n_samples=10000, seed=None):
    """
    Generate synthetic insurance accident description dataset.

    Every row is produced by (1) drawing a claim category from a fixed
    probability distribution, (2) drawing one of that category's sentence
    templates, and (3) replacing each ``{placeholder}`` in the template with a
    random entry from a shared vocabulary. Roughly 30% of the descriptions
    additionally get one boilerplate sentence appended as noise.

    Parameters
    ----------
    n_samples : int, default 10000
        Number of rows to generate. ``0`` (or a negative value) returns an
        empty frame that still carries all four columns.
    seed : int or None, default None
        ``None`` draws from the module-level ``random`` generator, so a prior
        ``random.seed(...)`` call controls the output. Any other value draws
        from a private ``random.Random(seed)``, which makes the call
        repeatable on its own and leaves the global generator untouched.

    Returns
    -------
    pandas.DataFrame
        One row per claim with the columns ``ClaimNo_AT``, ``AccidentDesc``,
        ``AccidentTypeDesc`` (the category the template was drawn from) and
        ``DriverNric``.
    """
    print("Generating synthetic accident dataset...")

    # Both the `random` module and a `random.Random` instance expose
    # randint/choice/random, so the rest of the function is agnostic.
    rng = random if seed is None else random.Random(seed)

    # Templates for different accident types
    vehicle_templates = [
        "Vehicle collided with {object} at {location}. Driver experienced {injury}. {weather} conditions.",
        "Rear-ended by another vehicle while {action}. Suffered {injury}. Police report filed.",
        "Car accident on {road} involving {vehicles} vehicles. {injury} reported. {damage} to vehicle.",
        "Hit and run incident in {location}. Vehicle {damage}. Driver had {injury}.",
        "Single vehicle accident due to {cause}. {injury} sustained. Vehicle {damage}."
    ]

    medical_templates = [
        "Slipped and fell at {location}. Resulted in {injury}. Required {treatment}.",
        "Food poisoning at {restaurant}. Symptoms included {symptoms}. Hospitalized for {days} days.",
        "Allergic reaction to {allergen}. Experienced {symptoms}. Emergency treatment required.",
        "Sudden {condition} while {activity}. Transported to hospital. {treatment} administered.",
        "Pre-existing {condition} aggravated during travel. Required {treatment}."
    ]

    travel_templates = [
        "Flight delayed for {hours} hours. Missed {connection}. Incurred {cost} in expenses.",
        "Luggage lost on flight from {origin} to {destination}. Contained {items}.",
        "Trip cancelled due to {reason}. Non-refundable bookings worth ${amount}.",
        "Fell ill with {illness} during trip to {country}. Medical expenses ${amount}.",
        "Passport stolen in {location}. Had to {action}. Additional costs incurred."
    ]

    property_templates = [
        "Water damage to {property} due to {cause}. Estimated repair cost ${amount}.",
        "Fire damage at {location}. {items} destroyed. Total loss estimated at ${amount}.",
        "Burglary at residence. {items} stolen. Property damage includes {damage}.",
        "Storm damage to {property}. {damage} observed. Temporary accommodation needed.",
        "Vandalism to {property}. {damage} requiring professional repair."
    ]

    sports_templates = [
        "Injured {bodypart} while playing {sport}. {injury} diagnosed. Treatment: {treatment}.",
        "Collision with {object} during {sport} activity. Sustained {injury}.",
        "Equipment failure during {sport} resulted in {injury}. Required {treatment}.",
        "Overexertion during {activity} caused {injury}. Doctor recommended {treatment}.",
        "Fall during {sport} practice. {injury} to {bodypart}. X-ray showed {diagnosis}."
    ]

    # Vocabulary for template filling (shared by all categories)
    vocab = {
        'object': ['another vehicle', 'tree', 'pole', 'barrier', 'pedestrian', 'cyclist', 'wall', 'divider'],
        'location': ['highway', 'parking lot', 'intersection', 'shopping mall', 'office building', 'residential area', 'downtown'],
        'injury': ['whiplash', 'back pain', 'concussion', 'bruises', 'fractured ribs', 'neck strain', 'knee injury', 'shoulder pain'],
        'weather': ['Rainy', 'Foggy', 'Clear', 'Snowy', 'Icy', 'Windy', 'Normal'],
        'action': ['stopped at traffic light', 'parking', 'changing lanes', 'turning left', 'slowing down'],
        'road': ['Highway 1', 'Main Street', 'Park Avenue', 'Interstate 95', 'Route 66'],
        'vehicles': ['2', '3', '4', 'multiple'],
        'damage': ['front bumper damaged', 'total loss', 'minor scratches', 'significant damage', 'rear damage'],
        'cause': ['tire blowout', 'brake failure', 'slippery road', 'animal crossing', 'mechanical failure'],
        'symptoms': ['nausea, vomiting, diarrhea', 'severe headache, fever', 'difficulty breathing', 'severe pain', 'dizziness'],
        'treatment': ['emergency surgery', 'medication', 'physical therapy', 'observation', 'specialized care'],
        'restaurant': ['local restaurant', 'hotel restaurant', 'street vendor', 'airport cafe', 'cruise ship dining'],
        'days': ['1', '2', '3', '4', '5'],
        'allergen': ['peanuts', 'shellfish', 'medication', 'insect sting', 'latex'],
        'condition': ['chest pain', 'asthma attack', 'diabetic emergency', 'seizure', 'heart palpitations'],
        'activity': ['exercising', 'traveling', 'working', 'dining', 'sightseeing'],
        'hours': ['3', '6', '12', '24', '48'],
        'connection': ['connecting flight', 'cruise departure', 'tour booking', 'hotel reservation'],
        'cost': ['$500', '$1000', '$2000', '$5000'],
        'origin': ['Singapore', 'New York', 'London', 'Tokyo', 'Sydney'],
        'destination': ['Paris', 'Bangkok', 'Dubai', 'Hong Kong', 'Los Angeles'],
        'items': ['electronics, clothing, documents', 'jewelry, camera equipment', 'business documents, laptop'],
        'reason': ['medical emergency', 'natural disaster', 'political unrest', 'family emergency', 'weather'],
        'amount': ['1000', '2500', '5000', '10000', '25000'],
        'illness': ['dengue fever', 'food poisoning', 'COVID-19', 'stomach flu', 'altitude sickness'],
        'country': ['Thailand', 'India', 'Mexico', 'Egypt', 'Brazil'],
        'property': ['home', 'apartment', 'office', 'vehicle', 'storage unit'],
        'bodypart': ['knee', 'ankle', 'shoulder', 'wrist', 'back', 'elbow'],
        'sport': ['football', 'basketball', 'tennis', 'skiing', 'cycling', 'swimming'],
        'diagnosis': ['minor fracture', 'ligament tear', 'muscle strain', 'no fracture', 'hairline fracture']
    }

    # Category -> (templates, probability of being drawn)
    categories = {
        'vehicle': (vehicle_templates, 0.35),
        'medical': (medical_templates, 0.25),
        'travel': (travel_templates, 0.20),
        'property': (property_templates, 0.10),
        'sports': (sports_templates, 0.10)
    }
    category_items = list(categories.items())
    fallback_category, (fallback_templates, _fallback_prob) = category_items[-1]

    # Fixed column list so that an empty result still has the full schema.
    columns = ['ClaimNo_AT', 'AccidentDesc', 'AccidentTypeDesc', 'DriverNric']

    # Generate data
    data = []

    for _ in range(n_samples):
        # NRIC-style identifier: 'S' + 7 digits + one letter from A-H
        nric = f"S{rng.randint(1000000, 9999999)}{rng.choice(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'])}"

        # Generate policy number
        year = rng.randint(2020, 2024)
        policy_num = f"POL-{year}-{rng.randint(100000, 999999)}"

        # Select category by walking the cumulative distribution. Start from
        # the last category so that float rounding in the running sum can
        # never leave the selection unset (or stale from the previous row).
        rand_val = rng.random()
        cumulative = 0
        selected_category = fallback_category
        selected_templates = fallback_templates

        for category, (templates, prob) in category_items:
            cumulative += prob
            if rand_val <= cumulative:
                selected_category = category
                selected_templates = templates
                break

        # Generate description
        description = rng.choice(selected_templates)

        # Fill in template. A random value is drawn only for placeholders that
        # are actually present; all occurrences of a placeholder get the same value.
        for key in vocab:
            placeholder = f"{{{key}}}"
            if placeholder in description:
                description = description.replace(placeholder, rng.choice(vocab[key]))

        # Add some noise/variations to ~30% of the rows
        if rng.random() < 0.3:
            description += rng.choice([
                " No witnesses present.",
                " Insurance claim to follow.",
                " Photos attached.",
                " Seeking full compensation.",
                " Previous claim history: none.",
                " Urgent processing requested."
            ])

        data.append({
            'ClaimNo_AT': policy_num,
            'AccidentDesc': description,
            'AccidentTypeDesc': selected_category,  # Hidden label for evaluation
            'DriverNric': nric,
        })

    df = pd.DataFrame(data, columns=columns)
    print(f"Generated {len(df)} accident descriptions")
    print(f"Category distribution:\n{df['AccidentTypeDesc'].value_counts()}")

    return df

In [5]:
# Build the synthetic sample that the following cells work with.
N_SAMPLES = 10000
df = generate_accident_dataset(n_samples=N_SAMPLES)

Generating synthetic accident dataset...
Generated 10000 accident descriptions
Category distribution:
vehicle     3506
medical     2502
travel      2056
property     980
sports       956
Name: AccidentTypeDesc, dtype: int64


In [6]:
# Write the generated sample to disk without the index column.
# NOTE(review): this is a machine-specific absolute path; consider moving it to
# a configurable data directory so the notebook runs on other machines.
raw_csv_path = '/Users/bervynwong/Downloads/INCOME Travel Insurance Portfolio Analysis Project/Profitability Model/Free Text classification/accident_description_raw.csv'
df.to_csv(raw_csv_path, index=False)

In [None]:
Analyse accident_description_raw.csv. This is a sample of the dataset I am working on: free-text accident descriptions from insurance claims. My actual dataset has over 300,000 accident descriptions, all written as free text (i.e. raw descriptions).

My goal: find a method or model that can pick up specific keywords and other recurring patterns in the language, and derive useful categories that describe the accident type of each claim.
For example, if the raw description contains words like 'cough, sore throat, flu' they would automatically fall under a certain category. Similarly, if the raw description contains words like 'knee sprain, sprained shoulder' they would be characterised as 'Sprains and strains'. These examples are not exhaustive.

1) Identify what kind of data I am working with, i.e. structured or unstructured, and labelled or unlabelled.

2) Identify what kind of data science or machine learning problem this is.

3) Recommend algorithms or models that can help to achieve my end goal.