In [6]:
import json
import time
from datetime import datetime, timedelta
import random
from faker import Faker
import logging
import uuid
import pandas as pd
from pathlib import Path

# Logging: emit INFO and above as "<timestamp> - <LEVEL> - <message>" lines.
_LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger used by the rest of this script.
logger = logging.getLogger(__name__)

# Shared Faker instance; generate_record() uses it for the name, email and
# phone fields. No seed is set here (nor on the `random` module), so output
# presumably differs from run to run.
fake = Faker()

# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
# Number of records the run should produce (10,009,900,000).
TOTAL_RECORDS = 10_009_900_000
# Records generated and written per batch; bounds how much is held in memory.
BATCH_SIZE = 10_000
# Directory (created on demand) and file name of the CSV output.
OUTPUT_DIR = "data"
CSV_FILENAME = "employee_records.csv"

# Department catalogue.
# Maps department name -> {"salary_range": (min, max), "positions": [titles]}.
# Each row below is (department, salary range, job titles).
DEPARTMENTS = {
    dept_name: {"salary_range": pay_band, "positions": job_titles}
    for dept_name, pay_band, job_titles in (
        (
            "Engineering",
            (70000, 160000),
            [
                "Software Engineer",
                "DevOps Engineer",
                "QA Engineer",
                "Engineering Manager",
            ],
        ),
        (
            "Sales",
            (50000, 130000),
            [
                "Sales Representative",
                "Account Executive",
                "Sales Manager",
                "Sales Director",
            ],
        ),
        (
            "Marketing",
            (45000, 120000),
            [
                "Marketing Specialist",
                "Content Manager",
                "Marketing Director",
                "Brand Manager",
            ],
        ),
        (
            "HR",
            (40000, 110000),
            ["HR Coordinator", "HR Manager", "Recruiter", "HR Director"],
        ),
        (
            "Finance",
            (55000, 140000),
            [
                "Financial Analyst",
                "Accountant",
                "Finance Manager",
                "Controller",
            ],
        ),
    )
}

# Office catalogue.
# Maps office city -> {"timezone": tz name, "country": country label}.
# Each row below is (city, timezone, country).
OFFICE_LOCATIONS = {
    city: {"timezone": tz_name, "country": country_name}
    for city, tz_name, country_name in (
        ("New York", "America/New_York", "USA"),
        ("San Francisco", "America/Los_Angeles", "USA"),
        ("London", "Europe/London", "UK"),
        ("Singapore", "Asia/Singapore", "Singapore"),
        ("Sydney", "Australia/Sydney", "Australia"),
    )
}

# Values drawn for the "employment_status" field of a record.
EMPLOYMENT_STATUS = [
    "Full-time",
    "Part-time",
    "Contract",
    "Remote",
]

# Values drawn for the "last_rating" field of a record.
PERFORMANCE_RATINGS = [
    "Exceptional", "Exceeds Expectations",
    "Meets Expectations", "Needs Improvement",
]


def generate_record():
    """Generate a single synthetic employee record.

    All values are random: department, position and office are drawn from the
    module-level DEPARTMENTS / OFFICE_LOCATIONS tables, personal details come
    from the shared ``fake`` instance, and the salary is drawn from the
    chosen department's ``salary_range``.

    Returns:
        dict: A flat mapping of column name to value. Dates are formatted as
        ``YYYY-MM-DD`` strings. ``last_promotion_date`` is ``None`` when the
        employee joined fewer than 180 days ago (no promotion yet).
    """
    date_format = "%Y-%m-%d"

    # Take one snapshot of "now" so every date in the record is derived from
    # the same instant (a record generated around midnight stays consistent).
    now = datetime.now()

    tenure_days = random.randint(0, 3650)
    join_date = now - timedelta(days=tenure_days)

    department = random.choice(list(DEPARTMENTS))
    position = random.choice(DEPARTMENTS[department]["positions"])
    office_location = random.choice(list(OFFICE_LOCATIONS))
    office = OFFICE_LOCATIONS[office_location]

    # A promotion falls 180-1000 days after joining, but must never be later
    # than today. Employees with under 180 days of tenure have none.
    if tenure_days >= 180:
        promotion_offset = random.randint(180, min(1000, tenure_days))
        last_promotion_date = (
            join_date + timedelta(days=promotion_offset)
        ).strftime(date_format)
    else:
        last_promotion_date = None

    return {
        "record_id": str(uuid.uuid4()),
        # Creation time in epoch milliseconds, kept as a string.
        "timestamp": str(int(time.time() * 1000)),
        "version": "2.0",
        "data_center": random.choice(["dc-east", "dc-west", "dc-eu"]),
        # NOTE: only the first 8 hex digits of a UUID are kept, i.e. 16**8
        # (~4.3 billion) possible values, so IDs are not guaranteed unique
        # across very large runs.
        "employee_id": f"EMP-{str(uuid.uuid4())[:8].upper()}",
        "name": fake.name(),
        "email": fake.email(),
        "phone": fake.phone_number(),
        "department": department,
        "position": position,
        "experience_level": random.choice(["Junior", "Mid-level", "Senior", "Expert"]),
        "employment_status": random.choice(EMPLOYMENT_STATUS),
        "salary": random.randint(*DEPARTMENTS[department]["salary_range"]),
        "bonus_eligible": random.choice([True, False]),
        # Roughly half of the records get no stock options at all.
        "stock_options": random.randint(0, 10000) if random.random() > 0.5 else 0,
        "office": office_location,
        "timezone": office["timezone"],
        "country": office["country"],
        "remote_work_eligible": random.choice([True, False]),
        "join_date": join_date.strftime(date_format),
        "last_promotion_date": last_promotion_date,
        "last_review_date": (
            now - timedelta(days=random.randint(0, 365))
        ).strftime(date_format),
        "last_rating": random.choice(PERFORMANCE_RATINGS),
        "rating_date": (
            now - timedelta(days=random.randint(0, 180))
        ).strftime(date_format),
        "projects_completed": random.randint(1, 20),
    }


def main():
    """Generate TOTAL_RECORDS employee records and write them to a CSV file.

    Records are produced in batches of at most BATCH_SIZE so that only one
    batch is held in memory at a time. The output file is
    ``OUTPUT_DIR/CSV_FILENAME``; the directory is created if needed.

    The header row is written exactly once: when the file does not exist yet
    or is empty. If the file already has content, new rows are appended
    without a header, so re-running continues an existing file.
    """
    out_dir = Path(OUTPUT_DIR)
    out_dir.mkdir(parents=True, exist_ok=True)
    output_path = out_dir / CSV_FILENAME

    # Ceiling division: a trailing partial batch must not be dropped.
    total_batches = -(-TOTAL_RECORDS // BATCH_SIZE)

    logger.info(
        f"Starting data generation: {TOTAL_RECORDS} records in {total_batches} batches"
    )

    # Only a new or empty file needs the column header row.
    write_header = not output_path.exists() or output_path.stat().st_size == 0

    records_written = 0
    for batch_num in range(total_batches):
        # The last batch may be smaller than BATCH_SIZE.
        batch_len = min(BATCH_SIZE, TOTAL_RECORDS - records_written)
        df = pd.DataFrame([generate_record() for _ in range(batch_len)])
        df.to_csv(output_path, mode="a", header=write_header, index=False)
        records_written += batch_len

        if batch_num == 0:
            if write_header:
                logger.info(
                    f"Created CSV file and wrote first batch of {batch_len} records"
                )
            else:
                logger.info(
                    f"Appended first batch of {batch_len} records to existing CSV file"
                )
        else:
            logger.info(f"Completed batch {batch_num + 1}/{total_batches}")

        # Subsequent batches always append below the rows already written.
        write_header = False

    logger.info(f"Data generation complete. File saved to: {output_path}")
    logger.info(f"Total records generated: {records_written}")


# Run the generator only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

2024-11-08 12:24:10,566 - INFO - Starting data generation: 10009900000 records in 1000990 batches


2024-11-08 12:24:14,323 - INFO - Created CSV file and wrote first batch of 10000 records
2024-11-08 12:24:19,291 - INFO - Completed batch 2/1000990
2024-11-08 12:24:25,085 - INFO - Completed batch 3/1000990
2024-11-08 12:24:30,032 - INFO - Completed batch 4/1000990
2024-11-08 12:24:34,552 - INFO - Completed batch 5/1000990
2024-11-08 12:24:39,871 - INFO - Completed batch 6/1000990
2024-11-08 12:24:45,162 - INFO - Completed batch 7/1000990
2024-11-08 12:24:51,355 - INFO - Completed batch 8/1000990
2024-11-08 12:24:56,352 - INFO - Completed batch 9/1000990
2024-11-08 12:25:01,253 - INFO - Completed batch 10/1000990
2024-11-08 12:25:05,953 - INFO - Completed batch 11/1000990
2024-11-08 12:25:10,939 - INFO - Completed batch 12/1000990
2024-11-08 12:25:16,010 - INFO - Completed batch 13/1000990
2024-11-08 12:25:20,744 - INFO - Completed batch 14/1000990
2024-11-08 12:25:25,652 - INFO - Completed batch 15/1000990
2024-11-08 12:25:30,700 - INFO - Completed batch 16/1000990
2024-11-08 12:25:35

KeyboardInterrupt: 