# Generating Fake Data

This script uses Faker, NumPy, and Pandas to generate a synthetic dataset of 100,000 records, representing client case information for a law group specializing in medical malpractice and personal injury.

It covers fields such as case details, doctor referral information, case status, marketing information, and more.

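The dataset is split into five tables (case info, doctor referrals, marketing info, legal details, and injury details), and each table is written to its own Parquet file using Apache Arrow.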

In [5]:
from faker import Faker
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
import pandas as pd
import uuid
import random

In [6]:
fake = Faker()

# Seed both random sources used by the field generators so reruns reproduce
# the same values: Faker's shared generator and the stdlib `random` module.
# NOTE: uuid.uuid4() is not affected by either seed, so the UUID columns
# still differ between runs.
Faker.seed(42)
random.seed(42)

# Define the number of records
num_records = 100_000

In [7]:

# Infinite generators, one per field; each next() call produces one synthetic value
def generate_case_id():
    """Yield an endless stream of random UUID4 strings for use as case ids."""
    while True:
        case_uuid = uuid.uuid4()
        yield str(case_uuid)

def generate_client_name():
    """Yield an endless stream of client names produced by `fake.name()`."""
    while True:
        client_name = fake.name()
        yield client_name

def generate_date_of_birth():
    """Yield birth dates from `fake.date_of_birth` for ages 18 through 80."""
    while True:
        birth_date = fake.date_of_birth(minimum_age=18, maximum_age=80)
        yield birth_date

def generate_gender():
    """Yield an endless stream of gender labels picked uniformly at random."""
    options = ("Male", "Female", "Other")
    while True:
        yield random.choice(options)

def generate_phone_number():
    """Yield an endless stream of phone numbers produced by `fake.phone_number()`."""
    while True:
        phone = fake.phone_number()
        yield phone

def generate_email():
    """Yield an endless stream of email addresses produced by `fake.email()`."""
    while True:
        address = fake.email()
        yield address

def generate_case_open_date():
    """Yield case-open dates drawn between five years ago and today."""
    while True:
        opened_on = fake.date_between(start_date="-5y", end_date="today")
        yield opened_on

def generate_case_type():
    """Yield an endless stream of case types picked uniformly at random."""
    options = ("Medical Malpractice", "Personal Injury")
    while True:
        yield random.choice(options)

def generate_case_status():
    """Yield an endless stream of case statuses picked uniformly at random."""
    options = (
        "Open",
        "Closed",
        "Pending Settlement",
        "In Litigation",
        "Referred Out",
    )
    while True:
        yield random.choice(options)

def generate_doctor_practitioner_name():
    """Yield an endless stream of practitioner names produced by `fake.name()`."""
    while True:
        practitioner = fake.name()
        yield practitioner

def generate_doctor_practitioner_uuid():
    """Yield an endless stream of random UUID4 strings for practitioner ids."""
    while True:
        practitioner_uuid = uuid.uuid4()
        yield str(practitioner_uuid)

def generate_medical_practice_name():
    """Yield an endless stream of practice names produced by `fake.company()`."""
    while True:
        practice = fake.company()
        yield practice

def generate_referral_location():
    """Yield an endless stream of referral locations produced by `fake.city()`."""
    while True:
        city = fake.city()
        yield city

def generate_referral_date():
    """Yield referral dates drawn between five years ago and today."""
    while True:
        referred_on = fake.date_between(start_date="-5y", end_date="today")
        yield referred_on

def generate_marketing_channel():
    """Yield an endless stream of marketing channels picked uniformly at random."""
    options = (
        "Online Search",
        "TV Ad",
        "Billboard",
        "Social Media",
        "Referral",
        "Radio Ad",
    )
    while True:
        yield random.choice(options)

def generate_referral_source():
    """Yield an endless stream of referral sources picked uniformly at random."""
    options = ("Doctor", "Former Client", "Online Review", "Friend/Family")
    while True:
        yield random.choice(options)

def generate_injury_severity():
    """Yield an endless stream of injury severities picked uniformly at random."""
    options = ("Minor", "Moderate", "Severe", "Catastrophic")
    while True:
        yield random.choice(options)

def generate_injury_type():
    """Yield an endless stream of injury types picked uniformly at random."""
    options = (
        "Head Injury",
        "Fracture",
        "Burn",
        "Internal Injury",
        "Spinal Injury",
        "Soft Tissue Injury",
    )
    while True:
        yield random.choice(options)

def generate_client_recovery_status():
    """Yield an endless stream of recovery statuses picked uniformly at random."""
    options = (
        "Recovered",
        "Ongoing Treatment",
        "Permanent Disability",
        "Deceased",
    )
    while True:
        yield random.choice(options)

def generate_settlement_amount():
    """Yield settlement amounts, or None for roughly 30% of draws.

    An amount is a uniform draw between 5,000 and 2,000,000 rounded to two
    decimal places.
    """
    while True:
        # The coin flip is drawn before the amount, so the random stream is
        # consumed in a fixed order.
        if random.random() > 0.3:
            raw_amount = random.uniform(5_000, 2_000_000)
            yield np.round(raw_amount, 2)
        else:
            yield None

def generate_trial_date():
    """Yield a trial date between today and one year ahead, or None.

    A date is produced only when a uniform draw exceeds 0.7 (roughly 30% of
    draws); every other draw yields None.
    """
    while True:
        if random.random() > 0.7:
            yield fake.date_between(start_date="today", end_date="+1y")
        else:
            yield None

def generate_court_name():
    """Yield a court name from `fake.company()`, or None.

    A name is produced only when a uniform draw exceeds 0.7 (roughly 30% of
    draws); every other draw yields None.
    """
    while True:
        if random.random() > 0.7:
            yield fake.company()
        else:
            yield None

def generate_legal_representation_type():
    """Yield an endless stream of fee arrangements picked uniformly at random."""
    options = ("Contingency", "Hourly", "Pro Bono")
    while True:
        yield random.choice(options)

def generate_client_satisfaction_score():
    """Yield an endless stream of integer scores from 1 to 10 inclusive."""
    while True:
        score = random.randint(1, 10)
        yield score

def generate_claim_amount():
    """Yield claim amounts: uniform draws in [10,000, 3,000,000] rounded to cents."""
    while True:
        raw_amount = random.uniform(10_000, 3_000_000)
        yield np.round(raw_amount, 2)

def generate_litigation_phase():
    """Yield an endless stream of litigation phases picked uniformly at random."""
    options = (
        "Discovery",
        "Mediation",
        "Trial",
        "Settlement Negotiations",
        "Appeal",
    )
    while True:
        yield random.choice(options)

def generate_lawyer_assigned():
    """Yield an endless stream of lawyer names produced by `fake.name()`."""
    while True:
        lawyer = fake.name()
        yield lawyer

def generate_practice_area():
    """Yield an endless stream of practice areas picked uniformly at random."""
    options = ("Personal Injury", "Medical Malpractice", "Product Liability")
    while True:
        yield random.choice(options)

def generate_case_priority():
    """Yield an endless stream of priority labels picked uniformly at random."""
    options = ("High", "Medium", "Low")
    while True:
        yield random.choice(options)

def generate_case_notes():
    """Yield free-text notes from `fake.text`, capped at 200 characters each."""
    while True:
        note = fake.text(max_nb_chars=200)
        yield note

In [8]:
# Map each column name to one live generator instance. Every factory is
# called exactly once, in the order listed, so each column draws from its own
# generator no matter which table requests it.
data_generators = {
    field_name: make_generator()
    for field_name, make_generator in (
        ("case_id", generate_case_id),
        ("client_name", generate_client_name),
        ("date_of_birth", generate_date_of_birth),
        ("gender", generate_gender),
        ("phone_number", generate_phone_number),
        ("email", generate_email),
        ("case_open_date", generate_case_open_date),
        ("case_type", generate_case_type),
        ("case_status", generate_case_status),
        ("doctor_practitioner_name", generate_doctor_practitioner_name),
        ("doctor_practitioner_uuid", generate_doctor_practitioner_uuid),
        ("medical_practice_name", generate_medical_practice_name),
        ("referral_location", generate_referral_location),
        ("referral_date", generate_referral_date),
        ("marketing_channel", generate_marketing_channel),
        ("referral_source", generate_referral_source),
        ("injury_severity", generate_injury_severity),
        ("injury_type", generate_injury_type),
        ("client_recovery_status", generate_client_recovery_status),
        ("settlement_amount", generate_settlement_amount),
        ("trial_date", generate_trial_date),
        ("court_name", generate_court_name),
        ("legal_representation_type", generate_legal_representation_type),
        ("client_satisfaction_score", generate_client_satisfaction_score),
        ("claim_amount", generate_claim_amount),
        ("litigation_phase", generate_litigation_phase),
        ("lawyer_assigned", generate_lawyer_assigned),
        ("practice_area", generate_practice_area),
        ("case_priority", generate_case_priority),
        ("case_notes", generate_case_notes),
    )
}

In [9]:
# Column groupings: each list names the fields that make up one output table.
# "case_id" leads every list.
case_info_columns = [
    "case_id",
    "client_name",
    "date_of_birth",
    "gender",
    "phone_number",
    "email",
    "case_open_date",
    "case_type",
    "case_status",
    "case_priority",
    "case_notes",
]
doctor_referral_columns = [
    "case_id",
    "doctor_practitioner_name",
    "doctor_practitioner_uuid",
    "medical_practice_name",
    "referral_location",
    "referral_date",
]
marketing_info_columns = [
    "case_id",
    "marketing_channel",
    "referral_source",
]
legal_details_columns = [
    "case_id",
    "settlement_amount",
    "trial_date",
    "court_name",
    "legal_representation_type",
    "client_satisfaction_score",
    "claim_amount",
    "litigation_phase",
    "lawyer_assigned",
    "practice_area",
]
injury_details_columns = [
    "case_id",
    "injury_severity",
    "injury_type",
    "client_recovery_status",
]

In [10]:
# Generate DataFrames for each category
def generate_dataframe(columns):
    """Build a DataFrame with `num_records` rows for the requested columns.

    Each name in `columns` is looked up in the module-level `data_generators`
    mapping, and its generator is advanced `num_records` times. Columns are
    filled one at a time, in the order given.
    """
    column_values = {}
    for column_name in columns:
        field_generator = data_generators[column_name]
        column_values[column_name] = [
            next(field_generator) for _ in range(num_records)
        ]
    return pd.DataFrame(column_values)

In [11]:
# Build the primary table first; its case_id column is the key for all tables.
case_info_df = generate_dataframe(case_info_columns)
shared_case_ids = case_info_df["case_id"].to_numpy()

# generate_dataframe draws brand-new UUIDs for "case_id" on every call, so the
# remaining tables would share no keys with case_info_df. Overwrite their
# case_id column with the primary table's ids so rows can be joined.
doctor_referral_df = generate_dataframe(doctor_referral_columns).assign(case_id=shared_case_ids)
marketing_info_df = generate_dataframe(marketing_info_columns).assign(case_id=shared_case_ids)
legal_details_df = generate_dataframe(legal_details_columns).assign(case_id=shared_case_ids)
injury_details_df = generate_dataframe(injury_details_columns).assign(case_id=shared_case_ids)

# Save each DataFrame as a separate Parquet file
pq.write_table(pa.Table.from_pandas(case_info_df), "../data/law_group_case_info.parquet")
pq.write_table(pa.Table.from_pandas(doctor_referral_df), "../data/law_group_doctor_referrals.parquet")
pq.write_table(pa.Table.from_pandas(marketing_info_df), "../data/law_group_marketing_info.parquet")
pq.write_table(pa.Table.from_pandas(legal_details_df), "../data/law_group_legal_details.parquet")
pq.write_table(pa.Table.from_pandas(injury_details_df), "../data/law_group_injury_details.parquet")

# Report the actual record count rather than a hard-coded figure.
print(f"Synthetic dataset of {num_records:,} records generated and saved as separate Parquet files.")

Synthetic dataset of 100,000 records generated and saved as separate Parquet files.
