# Company Data Generation Notebook

This notebook generates dummy company data with the specified fields. The data is appended to existing JSON and CSV files if they already exist, otherwise it creates new files.

**Features:**

- Uses specified sub_counties and wards from a given dictionary.
- Randomly selects industries, company size, and various other attributes.
- Uses Faker for realistic random data (company names, emails, etc.).
- Flattens and removes the top-level category names before saving to CSV.
- Appends new data on each run instead of overwriting existing data.

# Instructions

- Run all cells in order.
- The last cell generates `num_records` new records each time you run it (currently 250).
- These records are appended to `../data/companies.json` and `../data/companies.csv` if these files exist; otherwise the files are created.
- Each subsequent run adds more data without overwriting the existing data.



In [2]:
# Cell 1: Imports and Constants
import random
import string
import csv
import json
import os
from datetime import datetime
from faker import Faker

# Shared Faker instance; the generator functions below call it for company
# names, e-mail addresses, domain names, postcodes and filler sentences.
fake = Faker()

# Predefined option pools. The generator functions draw from these lists,
# either one value at a time or as a small random sample.

# Single-choice attributes of a company.
industries = [
    "Agriculture",
    "Technology",
    "Healthcare",
    "Finance",
    "Construction",
    "Retail",
]
company_sizes = [
    "1-10 employees",
    "11-50 employees",
    "51-200 employees",
    "201-500 employees",
    "500+",
]
org_types = ["Public", "Private", "NGO"]
markets = ["Local", "Regional", "Global"]
line_of_business = [
    "Manufacturing",
    "Retail",
    "Software Development",
    "Consultancy",
    "Hospitality",
]
annual_revenue_brackets = [
    "<$1M",
    "$1M-$10M",
    "$10M-$50M",
    "$50M-$100M",
    ">$100M",
]
funding_sources = ["Self-Funded", "Investors", "Grants", "Loans"]
communication_channels = ["Email", "Phone", "Meetings"]
recruitment_policies = ["Remote", "On-site", "Hybrid"]

# Multi-choice attributes (sampled without replacement).
# "None" is a placeholder entry; the generator skips it when building handles.
social_media_platforms = [
    "LinkedIn",
    "Twitter",
    "Facebook",
    "Instagram",
    "None",
]
products_services_list = [
    "Software",
    "Hardware",
    "Organic Produce",
    "Textiles",
    "Consulting Services",
    "Financial Advisory",
]
skills_required_list = [
    "Programming",
    "Data Analysis",
    "Project Management",
    "Sales",
    "Marketing",
    "Design",
]

# Sub-county name -> list of ward names inside that sub-county.
sub_counties_wards = {
    "MVITA": [
        "TUDOR",
        "TONONOKA",
        "GANJONI/SHIMANZI",
        "MAKADARA/MJI WA KALE",
        "MAJENGO",
    ],
    "NYALI": [
        "ZIWA LA NG'OMBE",
        "FRERE TOWN",
        "MKOMANI",
        "KONGOWEA",
        "KADZANDANI",
    ],
    "LIKONI": [
        "MTONGWE",
        "SHIKA ADABU",
        "BOFU",
        "LIKONI",
        "TIMBWANI",
    ],
    "JOMVU": [
        "JOMVU KUU",
        "MIKINDANI",
        "MIRITINI",
    ],
    "CHANGAMWE": [
        "PORT REITZ",
        "KIPEVU",
        "AIRPORT",
        "CHANGAMWE",
        "CHAANI",
    ],
    "KISAUNI": [
        "MJAMBERE",
        "JUNDA",
        "BAMBURI",
        "MWAKIRUNGE",
        "MTOPANGA",
        "MAGOGONI",
        "SHANZU",
    ],
}
counties = ["Mombasa"]


In [3]:
# Cell 2: Helper functions
def pick(lst):
    """Return one element of ``lst`` chosen with ``random.choice``."""
    chosen = random.choice(lst)
    return chosen

def random_boolean():
    """Return ``True`` or ``False``, chosen with ``random.choice``."""
    outcomes = (True, False)
    return random.choice(outcomes)

def random_phone_number():
    """Return the string ``"+254"`` followed by nine random decimal digits."""
    subscriber_digits = random.choices(string.digits, k=9)
    return f"+254{''.join(subscriber_digits)}"

def generate_branch_locations(num=2):
    """Build a list of ``num`` randomly generated branch records.

    Each record is a dict with the keys ``branch_name``, ``sub_county``,
    ``ward``, ``postal_address``, ``latitude`` and ``longitude``. The
    sub-county is picked from ``sub_counties_wards`` and the ward from that
    sub-county's ward list; latitude and longitude are random floats rounded
    to five decimals and stored as strings.
    """
    def _make_branch():
        # Pick the sub-county first, then a ward belonging to it, so the pair
        # is always consistent with ``sub_counties_wards``.
        area = pick(list(sub_counties_wards.keys()))
        locality = pick(sub_counties_wards[area])
        return {
            "branch_name": fake.company_suffix(),
            "sub_county": area,
            "ward": locality,
            "postal_address": fake.postcode(),
            "latitude": str(round(random.uniform(-1.0, 1.0), 5)),
            "longitude": str(round(random.uniform(30.0, 40.0), 5)),
        }

    return [_make_branch() for _ in range(num)]

def flatten_and_remove_prefix(d, parent_key='', sep='_'):
    """Flatten a nested dict into one level of string values for CSV output.

    Nested dict keys are joined with ``sep``; the first ``sep``-delimited
    segment of that joined path (the top-level category name) is then dropped
    from every key that sits below a parent. Keys that have no parent are
    kept unchanged, even when they themselves contain ``sep``.

    Value conversion:
      * ``None`` becomes ``""``.
      * A non-empty list whose first element is a dict is serialised with
        ``json.dumps``.
      * Any other list is joined with ``", "`` (an empty list becomes ``""``).
      * Everything else goes through ``str()``.

    Args:
        d: The (possibly nested) mapping to flatten.
        parent_key: Key path accumulated so far; used by the recursion.
        sep: Separator placed between path segments.

    Returns:
        A flat ``dict`` mapping string keys to string values. If two leaves
        end up with the same key, the later one wins.
    """
    flat = {}
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k

        if isinstance(v, dict):
            flat.update(flatten_and_remove_prefix(v, new_key, sep))
            continue

        if isinstance(v, list):
            # Lists of dicts cannot be joined meaningfully, so keep them as a
            # JSON string inside the CSV cell.
            if v and isinstance(v[0], dict):
                val_str = json.dumps(v, ensure_ascii=False)
            else:
                val_str = ", ".join(map(str, v))
        else:
            val_str = str(v) if v is not None else ""

        # Only strip when a prefix was actually prepended; otherwise a
        # top-level key such as "a_b" would wrongly be truncated to "b".
        if parent_key:
            new_key = new_key.split(sep, 1)[1]
        flat[new_key] = val_str
    return flat


In [6]:
# Cell 3: The main data generation function
def generate_company_data():
    """Generate one random dummy company record.

    Values come from the module-level option lists, the ``random`` module and
    the shared ``fake`` Faker instance.

    Returns:
        A nested dict with six sections: "Basic Information",
        "Contact Information", "Business Activities",
        "Employment Information", "Legal and Financial Information" and
        "Miscellaneous". Optional fields are ``None`` or an empty list when
        they were not generated.
    """
    # Basic information
    company_name = fake.company()
    # Half of the time the trade name simply repeats the company name; of the
    # remainder, a quarter get a different made-up name and the rest have none.
    trade_name = company_name if random_boolean() else None
    if trade_name is None and random.random() < 0.25:
        trade_name = fake.company()

    registration_number = "".join(random.choices(string.digits, k=9))
    year_establishment = random.randint(1900, 2020)
    industry = pick(industries)
    size = pick(company_sizes)
    org_type = pick(org_types)

    # Headquarters: choose the sub-county first so the ward belongs to it.
    sub_county = pick(list(sub_counties_wards.keys()))
    ward = pick(sub_counties_wards[sub_county])
    county = "Mombasa"
    postal_address = fake.postcode()
    # NOTE(review): coordinates are drawn from fixed ranges and are not tied
    # to the chosen sub-county/ward; confirm whether realistic positions are
    # required.
    latitude = str(round(random.uniform(-1.0, 1.0), 5))
    longitude = str(round(random.uniform(30.0, 40.0), 5))

    branches = generate_branch_locations(num=random.randint(0, 3))

    # Contact details
    phone = random_phone_number()
    alt_phone = random_phone_number() if random_boolean() else None
    email = fake.company_email()
    website = f"http://{fake.domain_name()}"
    social_media_handles = []
    if random_boolean():
        platforms = random.sample(social_media_platforms, k=random.randint(1, 2))
        for p in platforms:
            # "None" is a placeholder option, not a real platform.
            if p != "None":
                social_media_handles.append(f"{p}:{company_name.replace(' ', '')}")

    # Business Activities
    primary_line = pick(line_of_business)
    secondary_activities = [pick(line_of_business)] if random_boolean() else []
    products_services = random.sample(products_services_list, k=random.randint(1, 3))
    target_market = pick(markets)
    key_clients_partners = (
        [fake.company() for _ in range(random.randint(0, 3))] if random_boolean() else []
    )
    operating_hours = "Mon-Fri 9am-5pm UTC+3" if random_boolean() else None

    # Employment Information
    # Draw the head-count from the range named by the size label so that
    # "company_size" and "number_of_employees" never contradict each other.
    # Labels missing from the table fall back to the full 1-1000 range.
    employee_ranges = {
        "1-10 employees": (1, 10),
        "11-50 employees": (11, 50),
        "51-200 employees": (51, 200),
        "201-500 employees": (201, 500),
        "500+": (501, 1000),
    }
    min_employees, max_employees = employee_ranges.get(size, (1, 1000))
    num_employees = random.randint(min_employees, max_employees)
    employee_demographics = None
    if random_boolean():
        employee_demographics = {
            "gender_ratio": "50% male, 50% female",
            "age_groups": "20-30:40%,30-40:30%,40+:30%"
        }
    recruitment_policy = pick(recruitment_policies)
    skills_required = random.sample(skills_required_list, k=random.randint(1, 4))
    training_dev = random_boolean()
    employee_benefits = []
    if random_boolean():
        possible_benefits = ["Health Insurance", "Paid Leave", "Pension", "Flexible Hours"]
        employee_benefits = random.sample(
            possible_benefits, k=random.randint(1, len(possible_benefits))
        )

    # Legal and Financial
    # The day is capped at 28 so the date is valid in every month.
    business_reg_date = datetime(
        year_establishment, random.randint(1, 12), random.randint(1, 28)
    ).isoformat()
    tin = "".join(random.choices(string.digits, k=10))
    license_num = (
        "".join(random.choices(string.ascii_uppercase + string.digits, k=7))
        if random_boolean()
        else None
    )
    annual_revenue = pick(annual_revenue_brackets)
    funding = random.sample(funding_sources, k=random.randint(1, 2))

    # Misc
    mission_vision = fake.sentence(nb_words=10)
    csr = fake.sentence(nb_words=8) if random_boolean() else None
    awards = (
        [fake.word().capitalize() + " Award" for _ in range(random.randint(1, 2))]
        if random_boolean()
        else []
    )
    compliance = random_boolean()
    # Details are only filled in for non-compliant companies.
    compliance_details = "Non-compliance in some labor regulations." if not compliance else None
    preferred_channel = pick(communication_channels)

    record = {
        "Basic Information": {
            "company_name": company_name,
            "trade_name_dba": trade_name,
            "registration_number": registration_number,
            "year_of_establishment": year_establishment,
            "industry_field": industry,
            "company_size": size,
            "organization_type": org_type
        },
        "Contact Information": {
            "headquarters_address": {
                "county": county,
                "sub_county": sub_county,
                "ward": ward,
                "postal_address": postal_address,
                "latitude": latitude,
                "longitude": longitude
            },
            "branch_locations": branches,
            "phone_number": phone,
            "alternate_phone_number": alt_phone,
            "email_address": email,
            "website_url": website,
            "social_media_handles": social_media_handles
        },
        "Business Activities": {
            "primary_line_of_business": primary_line,
            "secondary_activities": secondary_activities,
            "products_services_offered": products_services,
            "target_market": target_market,
            "key_clients_partners": key_clients_partners,
            "operating_hours": operating_hours
        },
        "Employment Information": {
            "number_of_employees": num_employees,
            "employee_demographics": employee_demographics,
            "recruitment_policy": recruitment_policy,
            "skills_qualifications_required": skills_required,
            "training_development_offered": training_dev,
            "employee_benefits": employee_benefits
        },
        "Legal and Financial Information": {
            "business_registration_date": business_reg_date,
            "tax_identification_number": tin,
            "license_number": license_num,
            "annual_revenue_bracket": annual_revenue,
            "funding_sources": funding
        },
        "Miscellaneous": {
            "company_mission_vision": mission_vision,
            "csr_initiatives": csr,
            "awards_certifications": awards,
            "compliance_with_labor_laws": compliance,
            "compliance_details": compliance_details,
            "preferred_communication_channel": preferred_channel
        }
    }

    return record

# Cell 4: Generate Data and Append to Files
# Generates ``num_records`` new company records, appends them to the JSON file
# (nested form) and to the CSV file (flattened form), creating either file if
# it does not exist yet.
num_records = 250
new_data = [generate_company_data() for _ in range(num_records)]

json_filename = '../data/companies.json'
csv_filename = '../data/companies.csv'

# Make sure the output directories exist so the writes below cannot fail with
# FileNotFoundError on a fresh checkout.
for output_path in (json_filename, csv_filename):
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)

# --- JSON: load what is there, extend it, write everything back -------------
existing_data = []
json_is_corrupt = False
if os.path.exists(json_filename):
    with open(json_filename, 'r', encoding='utf-8') as f:
        try:
            existing_data = json.load(f)
        except json.JSONDecodeError:
            existing_data = []
            # An empty file is simply "no data yet"; anything else is a
            # damaged file whose content must not be silently thrown away.
            json_is_corrupt = os.path.getsize(json_filename) > 0

if json_is_corrupt:
    backup_filename = json_filename + '.corrupt'
    os.replace(json_filename, backup_filename)
    print(f"Warning: {json_filename} was not valid JSON; moved it to "
          f"{backup_filename} and started a fresh file.")

existing_data.extend(new_data)

with open(json_filename, 'w', encoding='utf-8') as jsonfile:
    json.dump(existing_data, jsonfile, indent=4, ensure_ascii=False)

# --- CSV: flattened rows, kept aligned with the header already on disk ------
final_records = [flatten_and_remove_prefix(record) for record in new_data]

file_exists = os.path.exists(csv_filename)

# Header currently stored in the file (empty when the file is new or empty).
existing_header = []
if file_exists:
    with open(csv_filename, 'r', newline='', encoding='utf-8') as csvfile:
        existing_header = next(csv.reader(csvfile), [])

# The set of columns can differ between batches (optional nested sections
# flatten to different keys), so reuse the stored column order and put any
# column seen for the first time at the end.
batch_columns = sorted({key for fr in final_records for key in fr})
known_columns = set(existing_header)
added_columns = [col for col in batch_columns if col not in known_columns]
fieldnames = existing_header + added_columns

# Plain append is only safe when the header on disk already covers every
# column; otherwise the file is rewritten with the widened header.
mode = 'a' if existing_header and not added_columns else 'w'

old_rows = []
if mode == 'w' and existing_header:
    with open(csv_filename, 'r', newline='', encoding='utf-8') as csvfile:
        old_rows = list(csv.reader(csvfile))[1:]

with open(csv_filename, mode=mode, newline='', encoding='utf-8') as csvfile:
    if mode == 'w':
        plain_writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        plain_writer.writerow(fieldnames)
        for row in old_rows:
            # New columns sit at the end, so right-padding keeps old rows aligned.
            plain_writer.writerow(row + [""] * (len(fieldnames) - len(row)))
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
    for fr in final_records:
        writer.writerow(fr)

print(f"Company data appended to {json_filename} and {csv_filename} (or created if they didn't exist).")



Company data appended to company_data.json and company_data.csv (or created if they didn't exist).
