# Dummy Data Generation Notebook

This notebook generates dummy data and saves it to both JSON and CSV files. 
If the files exist, it appends new data rather than overwriting.

## Steps:
1. Import required libraries and initialize constants.
2. Pre-generate random data pools for efficiency.
3. Define data generation, flattening, and prefix-removal functions.
4. Generate records, append them to existing files (if any), or create new files if none exist.
5. Run the code cell to produce new dummy data each time.

# How to Use

- Simply run each cell in order.
- The last cell generates `num_records` new records (500 by default) and appends them to `../data/people.json` and `../data/people.csv`.
- Re-running the last cell (or entire notebook) will add more records to these files without overwriting existing data.



In [1]:
# Cell 1: Imports and Constants
import random
import string
import csv
import json
import os
from datetime import datetime
from faker import Faker # type: ignore

# Single shared Faker instance; the pre-generated pools in the next cell draw
# their names, addresses, companies, etc. from it.
fake = Faker()

# Year embedded in every generated Youth_ID, kept as a string. It is taken from
# the clock when this cell runs so that IDs created in later years do not keep
# claiming 2024 while their date_added timestamp says otherwise.
current_year = str(datetime.now().year)

# --- Personal attributes -------------------------------------------------
genders = ["Male", "Female", "Not specified"]
# The leading None is a falsy placeholder; code that draws a disability filters
# falsy entries out first.
disabilities = [
    None,
    "Visual impairment", "Hearing impairment", "Physical disability",
    "Intellectual disability", "Mental health condition",
]
marital_statuses = ["Single", "Married", "Divorced", "Widowed"]
id_types = ["passport", "national id", "military id"]
languages = [
    "English", "Swahili", "French", "Arabic",
    "Hindi", "Chinese", "Spanish",
]
religions = [
    "Christianity", "Islam", "Hinduism",
    "Buddhism", "Atheism", "Others",
]

# --- Contact preferences -------------------------------------------------
# Note: "None" here is the literal string, not the None object.
social_media_choices = ["Facebook", "Twitter", "Instagram", "LinkedIn", "None"]
communication_modes = ["Phone", "Email", "SMS", "WhatsApp", "Physical Mail"]

# --- Work, income and education ------------------------------------------
employment_statuses = [
    "Employed", "Self-Employed", "Unemployed", "Student", "Retired",
]
industries = [
    "Agriculture", "IT", "Finance", "Healthcare",
    "Education", "Construction", "Retail",
]
income_brackets = ["<$100", "$100-$500", "$501-$1000", "$1001-$2000", ">$2000"]
education_levels = [
    "None", "Primary", "Secondary", "Diploma",
    "Bachelor's", "Master's", "PhD", "Other",
]

# --- Housing -------------------------------------------------------------
housing_statuses = ["Owned", "Rented", "Mortgaged", "Other"]
housing_types = ["Apartment", "Detached House", "Semi-Detached", "Other"]

# --- Yes/no checklists (each entry becomes a True/False flag per record) --
utilities = ["Electricity", "Water", "Internet"]
assets = ["House", "Car", "Livestock", "Land"]
technology = ["Smartphone", "Computer", "Internet"]

# Sub-county name -> list of ward names within it.
sub_counties_wards = {
    "MVITA": [
        "TUDOR", "TONONOKA", "GANJONI/SHIMANZI",
        "MAKADARA/MJI WA KALE", "MAJENGO",
    ],
    "NYALI": [
        "ZIWA LA NG'OMBE", "FRERE TOWN", "MKOMANI",
        "KONGOWEA", "KADZANDANI",
    ],
    "LIKONI": [
        "MTONGWE", "SHIKA ADABU", "BOFU", "LIKONI", "TIMBWANI",
    ],
    "JOMVU": [
        "JOMVU KUU", "MIKINDANI", "MIRITINI",
    ],
    "CHANGAMWE": [
        "PORT REITZ", "KIPEVU", "AIRPORT", "CHANGAMWE", "CHAANI",
    ],
    "KISAUNI": [
        "MJAMBERE", "JUNDA", "BAMBURI", "MWAKIRUNGE",
        "MTOPANGA", "MAGOGONI", "SHANZU",
    ],
}

counties = ["Mombasa"]

# A single sub-county is drawn once, when this cell runs; records generated
# afterwards all use it and take their ward from chosen_wards.
chosen_sub_county = random.choice(list(sub_counties_wards))
chosen_wards = sub_counties_wards[chosen_sub_county]


In [2]:
# Helper Functions
def random_phone_number():
    """Return "+254" followed by nine random decimal digits."""
    subscriber_digits = random.choices(string.digits, k=9)
    return f"+254{''.join(subscriber_digits)}"

def random_id_number():
    """Return a 10-character string of random uppercase letters and digits."""
    alphabet = string.ascii_uppercase + string.digits
    characters = random.choices(alphabet, k=10)
    return "".join(characters)

def random_boolean():
    """Return True or False, chosen uniformly at random."""
    outcomes = [True, False]
    return random.choice(outcomes)

def random_list_of_skills(num=3):
    """
    Return up to `num` distinct skills sampled from a fixed pool of ten.

    Requests larger than the pool are capped at the pool size.
    """
    available_skills = [
        "Computer Literacy", "Project Management", "Data Analysis",
        "Carpentry", "Welding", "Driving", "Accounting",
        "Programming", "Public Speaking", "Teaching",
    ]
    pool_size = len(available_skills)
    how_many = num if num < pool_size else pool_size
    return random.sample(available_skills, k=how_many)

def random_assets(assets_list):
    """Map every name in `assets_list` to an independently drawn True/False flag."""
    ownership = {}
    for asset_name in assets_list:
        ownership[asset_name] = random_boolean()
    return ownership

def random_technology_access(tech_list):
    """Map every name in `tech_list` to an independently drawn True/False flag."""
    access = {}
    for device in tech_list:
        access[device] = random_boolean()
    return access

def random_utilities_access(utilities_list):
    """Map every name in `utilities_list` to an independently drawn True/False flag."""
    access = {}
    for utility in utilities_list:
        access[utility] = random_boolean()
    return access

def random_memberships():
    """Return one or two distinct group names sampled from a fixed list of five."""
    candidate_groups = [
        "Community Welfare Group",
        "Church Choir",
        "Mosque Committee",
        "Sports Club",
        "Book Club",
    ]
    # The count is drawn first, then the sample, matching the argument
    # evaluation order of a single nested call.
    how_many = random.randint(1, 2)
    return random.sample(candidate_groups, k=how_many)

def generate_youth_id(first_name, last_name, year):
    """
    Build an ID of the form "<INITIALS>-<year>-<4 random chars>".

    The initials are the first characters of `first_name` and `last_name`,
    upper-cased; the suffix is four random uppercase letters or digits.
    """
    initials = (first_name[0] + last_name[0]).upper()
    suffix_alphabet = string.ascii_uppercase + string.digits
    suffix = "".join(random.choices(suffix_alphabet, k=4))
    return f"{initials}-{year}-{suffix}"

#############################
# Pre-generation of pools
#############################

NUM_NAMES = 100   # Size of every pool: 100 first names, last names, etc. to pick from.


def _build_pool(factory):
    """Call `factory()` NUM_NAMES times and return the results as a list."""
    return [factory() for _ in range(NUM_NAMES)]


# Pools are filled once here (in this order) so that record generation only has
# to choose from ready-made values instead of producing new ones per record.
first_names_pool = _build_pool(fake.first_name)
last_names_pool = _build_pool(fake.last_name)
middle_names_pool = _build_pool(fake.first_name)
streets_pool = _build_pool(fake.street_name)
companies_pool = _build_pool(fake.company)
emails_pool = _build_pool(fake.email)
addresses_pool = _build_pool(fake.address)
health_insurance_pool = _build_pool(fake.company)
health_conditions_pool = _build_pool(lambda: fake.sentence(nb_words=5))
allergies_pool = _build_pool(fake.word)
occupation_pool = _build_pool(fake.job)
emergency_contact_name_pool = _build_pool(fake.name)
# Reuse random_id_number() instead of repeating its alphabet and length inline,
# so the pooled IDs cannot drift from the helper's format.
identification_numbers_pool = _build_pool(random_id_number)

def pick(lst):
    """Return one element of `lst`, chosen uniformly at random."""
    chosen = random.choice(lst)
    return chosen

#############################
# Single-pass flatten & prefix removal
#############################
def flatten_and_remove_prefix(d, parent_key='', sep='_'):
    """
    Flatten a nested record into a single-level dict of strings, dropping the
    top-level category name from every nested field.

    Key rules:
      * Top-level non-dict fields keep their full name. "Youth_ID",
        "date_added" and "date_updated" therefore stay intact even though
        they contain the separator (previously they were cut down to "ID",
        "added" and "updated").
      * Fields inside a top-level category lose the whole category name:
        {"Miscellaneous": {"emergency_contact_name": ...}} yields
        "emergency_contact_name". The category is dropped as a unit, so a
        category name that itself contains `sep` is handled correctly.
      * Deeper nesting is joined with `sep`:
        {"Household Information": {"access_to_utilities": {"Water": True}}}
        yields "access_to_utilities_Water".

    Value rules: lists become ", "-joined strings, None becomes "", and
    everything else is passed through str().

    Args:
        d: The (possibly nested) dict to flatten.
        parent_key: Already-accumulated key path, as used by the former
            recursive implementation. When given, the text up to the first
            `sep` is treated as the category and dropped; the remainder is
            prefixed to every key in `d`.
        sep: Separator placed between nested key components.

    Returns:
        A new flat dict mapping string keys to string values. If two fields
        flatten to the same key, the later one wins.
    """
    def _as_text(value):
        # CSV-friendly rendering of a leaf value.
        if isinstance(value, list):
            return ", ".join(map(str, value))
        return "" if value is None else str(value)

    def _walk(mapping, path):
        # Yield (flat_key, text) pairs for everything below `mapping`;
        # `path` is the key path so far, with the category already removed.
        for key, value in mapping.items():
            full_key = f"{path}{sep}{key}" if path else str(key)
            if isinstance(value, dict):
                yield from _walk(value, full_key)
            else:
                yield full_key, _as_text(value)

    if parent_key:
        # Backward-compatible path for callers that pass an explicit prefix:
        # its first component is the category, the rest is kept.
        _category, _, inner_path = parent_key.partition(sep)
        return dict(_walk(d, inner_path))

    flat = {}
    for key, value in d.items():
        if isinstance(value, dict):
            # `key` is a category name: drop it and flatten what is inside.
            flat.update(_walk(value, ""))
        else:
            # Genuine top-level field: never strip anything from its name.
            flat[key] = _as_text(value)
    return flat


In [3]:

def generate_dummy_data():
    """
    Build one nested dummy person record.

    The record has three top-level fields (Youth_ID, date_added,
    date_updated) followed by nine category dicts. Values come from the
    module-level pools and choice lists. Each section below draws its random
    values in the same order as its keys are listed, and the Youth_ID suffix
    is drawn last.

    Returns:
        dict: the nested record, ready for JSON serialisation.
    """
    # --- PII ---------------------------------------------------------------
    personal = {
        "first_name": pick(first_names_pool),
        "middle_name": pick(middle_names_pool),
        "last_name": pick(last_names_pool),
        "year_of_birth": random.randint(1940, 2005),
        "month_of_birth": random.randint(1, 12),
        # Capped at 28 so the day exists in every month.
        "day_of_birth": random.randint(1, 28),
        "gender": random.choice(genders),
        "is_person_with_disability": random_boolean(),
    }
    # Details are only drawn for people flagged as disabled; the falsy
    # placeholder in `disabilities` is excluded from the draw.
    personal["disability_details"] = (
        random.choice([entry for entry in disabilities if entry])
        if personal["is_person_with_disability"]
        else None
    )
    personal["marital_status"] = random.choice(marital_statuses)
    personal["identification_document_type"] = random.choice(id_types)
    personal["identification_document_number"] = pick(identification_numbers_pool)
    personal["primary_language"] = random.choice(languages)
    personal["secondary_language"] = random.choice(languages)
    personal["religion"] = random.choice(religions)

    # --- Contact -----------------------------------------------------------
    contact = {
        "phone_number": random_phone_number(),
        "email_address": pick(emails_pool),
        "preferred_social_media": random.choice(social_media_choices),
        # Between one and three distinct modes, so there is always at least one.
        "modes_of_communication": random.sample(communication_modes, k=random.randint(1, 3)),
    }

    # --- Address -----------------------------------------------------------
    address = {
        "county": "Mombasa",
        "sub_county": chosen_sub_county,
        "ward": random.choice(chosen_wards),
        "street": pick(streets_pool),
    }

    # --- Employment --------------------------------------------------------
    # Every field starts as None; only the ones that apply to the drawn
    # employment status are filled in.
    employment = dict.fromkeys((
        "employment_status",
        "occupation",
        "industry",
        "employer_name",
        "work_address",
        "monthly_income_bracket",
        "years_of_experience",
    ))
    status = random.choice(employment_statuses)
    employment["employment_status"] = status
    if status in ("Employed", "Self-Employed"):
        employment["occupation"] = pick(occupation_pool)
        employment["industry"] = random.choice(industries)
        if status == "Employed":
            # Only employees have an employer and a work address.
            employment["employer_name"] = pick(companies_pool)
            employment["work_address"] = pick(addresses_pool)
        employment["monthly_income_bracket"] = random.choice(income_brackets)
        employment["years_of_experience"] = random.randint(0, 30)

    # --- Education ---------------------------------------------------------
    education = {
        "highest_education_level": random.choice(education_levels),
        "skills_certifications": random_list_of_skills(),
    }

    # --- Household ---------------------------------------------------------
    members = random.randint(1, 10)
    household = {
        "household_size": members,
        # Dependents never include the person themselves.
        "number_of_dependents": random.randint(0, members - 1),
        "is_head_of_household": random_boolean(),
        "housing_status": random.choice(housing_statuses),
        "housing_type": random.choice(housing_types),
        "access_to_utilities": random_utilities_access(utilities),
    }

    # --- Health ------------------------------------------------------------
    health = {
        "health_insurance_provider": pick(health_insurance_pool),
        "health_conditions": pick(health_conditions_pool),
        "allergies": pick(allergies_pool),
        "blood_group": random.choice(["A", "B", "AB", "O"]),
    }

    # --- Demographics ------------------------------------------------------
    demographics = {
        "income_bracket": random.choice(income_brackets),
        "ownership_of_assets": random_assets(assets),
        "access_to_technology": random_technology_access(technology),
    }

    # --- Miscellaneous -----------------------------------------------------
    miscellaneous = {
        "emergency_contact_name": pick(emergency_contact_name_pool),
        "emergency_contact_relationship": random.choice(
            ["Parent", "Sibling", "Spouse", "Friend", "Child", "Relative"]
        ),
        "emergency_contact_phone_number": random_phone_number(),
        "membership_in_groups": random_memberships(),
        "volunteer_community_work": random_boolean(),
        "voting_status": random_boolean(),
        "preferred_mode_of_engagement": random.choice(
            ["Online", "Physical Meetings", "Phone Calls"]
        ),
    }

    # --- Identity and timestamps ---------------------------------------------
    youth_id = generate_youth_id(
        personal["first_name"], personal["last_name"], current_year
    )
    # A new record is "added" and "updated" at the same instant.
    timestamp = datetime.now().isoformat()

    return {
        "Youth_ID": youth_id,
        "date_added": timestamp,
        "date_updated": timestamp,
        "PII": personal,
        "Contact Information": contact,
        "Address": address,
        "Employment/Occupation Information": employment,
        "Education Information": education,
        "Household Information": household,
        "Health Information": health,
        "Demographic Information": demographics,
        "Miscellaneous": miscellaneous,
    }


In [4]:
# Generate and append data to files
num_records = 500
new_data = [generate_dummy_data() for _ in range(num_records)]

json_filename = '../data/people.json'
csv_filename = '../data/people.csv'

# The output directory may not exist yet on a first run; opening a file for
# writing inside a missing directory would fail.
for output_path in (json_filename, csv_filename):
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)

# JSON Handling: load what is already stored so the new records extend it.
# A missing or zero-byte file simply means "nothing stored yet".
existing_data = []
if os.path.exists(json_filename) and os.path.getsize(json_filename) > 0:
    try:
        with open(json_filename, 'r', encoding='utf-8') as f:
            existing_data = json.load(f)
    except json.JSONDecodeError as err:
        # An unreadable file used to be silently overwritten, losing whatever
        # it held. Keep it under a timestamped name and start a fresh file.
        backup_name = f"{json_filename}.corrupt-{datetime.now():%Y%m%dT%H%M%S}"
        os.replace(json_filename, backup_name)
        print(f"Warning: {json_filename} is not valid JSON ({err}); "
              f"moved it to {backup_name} and starting a new file.")
        existing_data = []
    if not isinstance(existing_data, list):
        # Records can only be appended to a JSON array; fail before writing.
        raise TypeError(
            f"{json_filename} must contain a JSON array of records, "
            f"found {type(existing_data).__name__}"
        )

existing_data.extend(new_data)

# Save updated JSON
with open(json_filename, 'w', encoding='utf-8') as jsonfile:
    json.dump(existing_data, jsonfile, indent=4, ensure_ascii=False)

# Flatten and remove prefix
final_records = [flatten_and_remove_prefix(record) for record in new_data]

# Column names present in this batch, in a stable order.
new_fields = sorted({name for fr in final_records for name in fr})

# Header of the CSV already on disk: its first non-empty row, or [] when the
# file is missing or has no content.
existing_header = []
if os.path.exists(csv_filename):
    with open(csv_filename, 'r', newline='', encoding='utf-8') as csvfile:
        existing_header = next((row for row in csv.reader(csvfile) if row), [])

# Columns this batch has that the file on disk does not know about.
added_fields = [name for name in new_fields if name not in existing_header]

if not existing_header:
    # New (or empty) file: write the header followed by the batch.
    with open(csv_filename, mode='w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=new_fields, quoting=csv.QUOTE_ALL)
        writer.writeheader()
        writer.writerows(final_records)
elif not added_fields:
    # Plain append. The columns follow the header already in the file, so the
    # new rows line up with the old ones whatever order this batch sorts into.
    with open(csv_filename, mode='a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=existing_header, quoting=csv.QUOTE_ALL)
        writer.writerows(final_records)
else:
    # The batch introduces columns the file lacks. Appending would misalign
    # rows, so rewrite the file with a widened header (old columns first).
    # The rewrite goes through a temporary file that replaces the original
    # only once it is complete.
    widened_header = existing_header + added_fields
    temp_name = csv_filename + '.tmp'
    with open(csv_filename, 'r', newline='', encoding='utf-8') as source, \
            open(temp_name, 'w', newline='', encoding='utf-8') as target:
        old_rows = csv.reader(source)
        for row in old_rows:
            if row:
                break  # consumed the old header row; data rows follow
        writer = csv.DictWriter(target, fieldnames=widened_header, quoting=csv.QUOTE_ALL)
        writer.writeheader()
        # Old rows get empty strings for the columns they never had.
        writer.writerows(dict(zip(existing_header, row)) for row in old_rows if row)
        writer.writerows(final_records)
    os.replace(temp_name, csv_filename)

print("Data appended to ../data/people.csv and ../data/people.json (or created if they didn't exist).")


Data appended to ../data/people.csv and ../data/people.json (or created if they didn't exist).
