In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from faker import Faker

# Setting up Faker for data generation
# NOTE(review): `fake` is not referenced anywhere in the visible part of this
# script; every generated field below is drawn from NumPy instead.
# NOTE(review): Faker.seed(0) seeds Faker's own generator only -- presumably
# it was meant to make the run reproducible, but it has no effect on NumPy's
# random functions. Confirm whether Faker is still needed here.
fake = Faker(locale='en_US')
Faker.seed(0)

# Number of records
num_records = 50000

# Number of rows per monetary column that are overwritten with extreme values
num_outliers = int(0.01 * num_records)  # 1% of the records


def generate_client_data(num_records, num_outliers=None, seed=0):
    """Generate synthetic medical malpractice / personal injury case data.

    All columns are drawn in vectorized form from a single seeded NumPy
    generator, so the same arguments always produce the same data.

    Args:
        num_records: Number of rows to generate (must be >= 0).
        num_outliers: Number of rows, per monetary column, that are replaced
            with an extreme high value. Defaults to 1% of ``num_records``.
            The rows are chosen independently for each of the two monetary
            columns and without replacement, so exactly this many distinct
            rows are overwritten in each column.
        seed: Seed for ``np.random.default_rng``. Pass ``None`` for a
            non-reproducible run.

    Returns:
        A dict mapping column name to a NumPy array of length
        ``num_records``, in the column order used for the output file.

    Raises:
        ValueError: If ``num_records`` is negative or ``num_outliers`` is
            outside ``[0, num_records]``.
    """
    if num_records < 0:
        raise ValueError("num_records must be non-negative")
    if num_outliers is None:
        num_outliers = int(0.01 * num_records)
    if not 0 <= num_outliers <= num_records:
        raise ValueError("num_outliers must be between 0 and num_records")

    # A local generator keeps the run reproducible without touching NumPy's
    # global random state.
    rng = np.random.default_rng(seed)

    def pick(options, weights):
        # One weighted categorical draw per record.
        return rng.choice(options, size=num_records, p=weights)

    def noisy_amount(mean, spread, noise_spread):
        # Normal distribution plus a second, narrower normal noise term;
        # the absolute value keeps amounts non-negative before truncating
        # to whole numbers.
        base = rng.normal(loc=mean, scale=spread, size=num_records)
        noise = rng.normal(loc=0, scale=noise_spread, size=num_records)
        return np.abs(base + noise).astype(np.int64)

    data = {
        'case_type': pick(['Medical Malpractice', 'Personal Injury'], [0.3, 0.7]),
        'case_status': pick(['Open', 'Closed', 'Settled', 'In Progress'], [0.4, 0.2, 0.3, 0.1]),
        'marketing_source': pick(['Billboard', 'TV', 'Online', 'Referral', 'Radio'], [0.1, 0.2, 0.4, 0.2, 0.1]),
        'injury_type': pick(
            ['Fracture', 'Burn', 'Head Trauma', 'Soft Tissue Injury', 'Spinal Injury', 'Laceration', 'Psychological Trauma'],
            [0.15, 0.05, 0.1, 0.3, 0.1, 0.2, 0.1],
        ),
        'medical_expense_amount': noisy_amount(15000, 5000, 1000),
        'potential_settlement_amount': noisy_amount(75000, 20000, 5000),
        'attorney_name': pick(
            ["William J. Thompson", "Elizabeth M. Harris", "Michael A. Stevens", "Sarah K. Connor", "Jonathan P. Reed"],
            [0.2, 0.2, 0.2, 0.2, 0.2],
        ),
        # The upper bound is exclusive: scores are 1..9 and ages are 18..89.
        'injury_severity_score': rng.integers(1, 10, size=num_records),
        'client_age': rng.integers(18, 90, size=num_records),
        'client_gender': pick(['Male', 'Female', 'Non-binary'], [0.45, 0.45, 0.1]),
        'case_priority': pick(['Low', 'Medium', 'High'], [0.3, 0.5, 0.2]),
        'insurance_claim_made': pick([True, False], [0.6, 0.4]),
        'insured': pick([0, 1], [0.7, 0.3]),
        'next_action': pick(['Follow up', 'Await Documents', 'Schedule Meeting', 'Await Court Hearing'], [0.25, 0.25, 0.25, 0.25]),
    }

    # Introduce outliers in 'medical_expense_amount' and
    # 'potential_settlement_amount'. Rows are sampled without replacement so
    # two outliers never land on the same row and silently overwrite each
    # other.
    if num_outliers:
        outlier_specs = (
            ('medical_expense_amount', 100000, 20000),
            ('potential_settlement_amount', 500000, 100000),
        )
        for column, mean, spread in outlier_specs:
            rows = rng.choice(num_records, size=num_outliers, replace=False)
            extremes = rng.normal(loc=mean, scale=spread, size=num_outliers)
            data[column][rows] = np.abs(extremes).astype(np.int64)

    return data


# Guarded so that importing this file does not write the workbook; running it
# as a script or as a notebook cell (where __name__ is "__main__") behaves as
# before and still leaves client_data / client_df defined at top level.
if __name__ == "__main__":
    client_data = generate_client_data(num_records, num_outliers, seed=0)

    # Create a DataFrame
    client_df = pd.DataFrame(client_data)

    # Save to an Excel file, creating the target directory if it is missing
    output_path = Path('./data/law_group_case_data_with_noise_and_outliers.xlsx')
    output_path.parent.mkdir(parents=True, exist_ok=True)
    client_df.to_excel(output_path, index=False)

    print("Dataset with noise, outliers, and unequal category counts has been created and saved as 'law_group_case_data_with_noise_and_outliers.xlsx'")

Dataset with noise, outliers, and unequal category counts has been created and saved as 'law_group_case_data_with_noise_and_outliers.xlsx'
