In [25]:
from kafka import KafkaProducer
import json
import time

# Create a Kafka producer
producer = KafkaProducer(
    bootstrap_servers="localhost:9092",
    value_serializer=lambda v: json.dumps(v).encode("utf-8"),
)
   

In [1]:
from kafka import KafkaProducer
import json
import time
from datetime import datetime, timedelta
import random
from faker import Faker
import logging
import uuid

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Initialize Faker
fake = Faker()

# Configuration
BATCH_SIZE = 1000 # Number of records to generate in one batch
KAFKA_TOPIC = 'testing'
KAFKA_SERVER = 'localhost:9092'

# Rich data schema configuration
DEPARTMENTS = {
    'Engineering': {'salary_range': (70000, 160000), 'positions': ['Software Engineer', 'DevOps Engineer', 'QA Engineer', 'Engineering Manager']},
    'Sales': {'salary_range': (50000, 130000), 'positions': ['Sales Representative', 'Account Executive', 'Sales Manager', 'Sales Director']},
    'Marketing': {'salary_range': (45000, 120000), 'positions': ['Marketing Specialist', 'Content Manager', 'Marketing Director', 'Brand Manager']},
    'HR': {'salary_range': (40000, 110000), 'positions': ['HR Coordinator', 'HR Manager', 'Recruiter', 'HR Director']},
    'Finance': {'salary_range': (55000, 140000), 'positions': ['Financial Analyst', 'Accountant', 'Finance Manager', 'Controller']}
}

OFFICE_LOCATIONS = {
    'New York': {'timezone': 'America/New_York', 'country': 'USA'},
    'San Francisco': {'timezone': 'America/Los_Angeles', 'country': 'USA'},
    'London': {'timezone': 'Europe/London', 'country': 'UK'},
    'Singapore': {'timezone': 'Asia/Singapore', 'country': 'Singapore'},
    'Sydney': {'timezone': 'Australia/Sydney', 'country': 'Australia'}
}

EMPLOYMENT_STATUS = ['Full-time', 'Part-time', 'Contract', 'Remote']
PERFORMANCE_RATINGS = ['Exceptional', 'Exceeds Expectations', 'Meets Expectations', 'Needs Improvement']

def generate_record():
    """Generate a single employee record with detailed information"""
    join_date = datetime.now() - timedelta(days=random.randint(0, 3650))
    department = random.choice(list(DEPARTMENTS.keys()))
    position = random.choice(DEPARTMENTS[department]['positions'])
    office_location = random.choice(list(OFFICE_LOCATIONS.keys()))
    
    return {
        'metadata': {
            'record_id': str(uuid.uuid4()),
            'timestamp': str(int(time.time() * 1000)),  # Convert to string to match schema
            'version': '2.0',
            'data_center': random.choice(['dc-east', 'dc-west', 'dc-eu']),
        },
        'employee': {
            'id': f"EMP-{str(uuid.uuid4())[:8].upper()}",
            'name': fake.name(),
            'email': fake.email(),
            'phone': fake.phone_number(),
            'department': department,
            'position': position,
            'experience_level': random.choice(['Junior', 'Mid-level', 'Senior', 'Expert']),
            'employment_status': random.choice(EMPLOYMENT_STATUS),
        },
        'compensation': {
            'salary': random.randint(*DEPARTMENTS[department]['salary_range']),
            'bonus_eligible': random.choice([True, False]),
            'stock_options': random.randint(0, 10000) if random.random() > 0.5 else 0,
        },
        'location': {
            'office': office_location,
            'timezone': OFFICE_LOCATIONS[office_location]['timezone'],
            'country': OFFICE_LOCATIONS[office_location]['country'],
            'remote_work_eligible': random.choice([True, False]),
        },
        'dates': {
            'join_date': join_date.strftime('%Y-%m-%d'),
            'last_promotion_date': (join_date + timedelta(days=random.randint(180, 1000))).strftime('%Y-%m-%d'),
            'last_review_date': (datetime.now() - timedelta(days=random.randint(0, 365))).strftime('%Y-%m-%d'),
        },
        'performance': {
            'last_rating': random.choice(PERFORMANCE_RATINGS),
            'rating_date': (datetime.now() - timedelta(days=random.randint(0, 180))).strftime('%Y-%m-%d'),
            'projects_completed': random.randint(1, 20),
        }
    }

def main():
    producer = KafkaProducer(
        bootstrap_servers=KAFKA_SERVER,
        value_serializer=lambda x: json.dumps(x).encode('utf-8')
    )

    batch_count = 0
    try:
        while True:
            batch_count += 1
            logger.info(f"Generating batch {batch_count}")
            
            # Generate batch of records
            records = []
            for _ in range(BATCH_SIZE):
                records.append(generate_record())
            
            # Create a wrapper object with metadata and records array
            batch_data = {
                "batch_id": str(uuid.uuid4()),
                "batch_timestamp": int(time.time() * 1000),
                "record_count": len(records),
                "records": records
            }
            
            logger.info(f"Sending batch {batch_count} with {len(records)} records to Kafka...")
            producer.send(KAFKA_TOPIC, value=batch_data)
            producer.flush()
            logger.info(f"Completed sending batch {batch_count}")
            
            # Print sample of the data being sent
            # logger.info("Sample of batch data structure:")
            # sample_json = json.dumps(batch_data, indent=2)
            # logger.info(f"\n{sample_json}...")  # Show first 1000 chars as sample
            
            time.sleep(0)  # Wait 5 seconds between batches
            if(batch_count==100):
                break

    except KeyboardInterrupt:
        logger.info("Shutting down...")
    finally:
        producer.close()
        logger.info(f"Producer closed. Total batches sent: {batch_count}")

if __name__ == "__main__":
    main()

2024-11-08 15:17:18,154 - INFO - <BrokerConnection node_id=bootstrap-0 host=localhost:9092 <connecting> [IPv4 ('127.0.0.1', 9092)]>: connecting to localhost:9092 [('127.0.0.1', 9092) IPv4]
2024-11-08 15:17:18,164 - INFO - Probing node bootstrap-0 broker version
2024-11-08 15:17:18,168 - INFO - <BrokerConnection node_id=bootstrap-0 host=localhost:9092 <connecting> [IPv4 ('127.0.0.1', 9092)]>: Connection complete.
2024-11-08 15:17:18,303 - INFO - Broker version identified as 2.5.0
2024-11-08 15:17:18,306 - INFO - Set configuration api_version=(2, 5, 0) to skip auto check_version requests on startup
2024-11-08 15:17:18,329 - INFO - Generating batch 1
2024-11-08 15:17:20,671 - INFO - Sending batch 1 with 1000 records to Kafka...
2024-11-08 15:17:20,721 - INFO - <BrokerConnection node_id=1 host=kafka:9092 <connecting> [IPv4 ('127.0.0.1', 9092)]>: connecting to kafka:9092 [('127.0.0.1', 9092) IPv4]
2024-11-08 15:17:20,725 - INFO - <BrokerConnection node_id=1 host=kafka:9092 <connecting> [IPv

In [None]:
producer.close()