In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import random

# Seed both random sources used in this script -- the stdlib `random` module
# and NumPy's global generator -- so repeated runs make the same draws.
random.seed(42)
np.random.seed(42)

# Number of synthetic employee records to generate
n_employees = 1000

# Sequential employee IDs: 1, 2, ..., n_employees
employee_ids = np.arange(n_employees) + 1

# Name pools: a full name is one first name plus one last name
first_names = [
    'Alice', 'Bob', 'Charlie', 'David', 'Emma', 'Frank', 'Grace', 'Henry',
    'Isabella', 'Jack', 'Kate', 'Liam', 'Mia', 'Noah', 'Olivia', 'Peter',
    'Quinn', 'Rachel', 'Sam', 'Tina', 'Uma', 'Victor', 'Wendy', 'Xavier',
    'Yara', 'Zoe', 'Aaron', 'Bella', 'Chris', 'Diana', 'Ethan', 'Fiona',
]

last_names = [
    'Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Garcia', 'Miller',
    'Davis', 'Rodriguez', 'Martinez', 'Hernandez', 'Lopez', 'Gonzalez',
    'Wilson', 'Anderson', 'Thomas', 'Taylor', 'Moore', 'Jackson', 'Martin',
    'Lee', 'Perez', 'Thompson', 'White', 'Harris', 'Sanchez', 'Clark',
]

# One "<first> <last>" string per employee. Each part is picked independently
# (first name drawn before last name), so duplicate full names can occur.
names = [
    " ".join((random.choice(first_names), random.choice(last_names)))
    for _ in range(n_employees)
]

# Ages: uniform integers from 18 through 65 (randint's upper bound is exclusive)
ages = np.random.randint(low=18, high=66, size=n_employees)

# Candidate cities
cities = [
    'New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'Philadelphia',
    'San Antonio', 'San Diego', 'Dallas', 'San Jose', 'Austin', 'Jacksonville',
    'Fort Worth', 'Columbus', 'Charlotte', 'San Francisco', 'Indianapolis',
    'Seattle', 'Denver', 'Washington', 'Boston', 'El Paso', 'Nashville',
    'Detroit', 'Oklahoma City', 'Portland', 'Las Vegas', 'Memphis', 'Louisville',
    'Baltimore', 'Milwaukee', 'Albuquerque', 'Tucson', 'Fresno', 'Sacramento',
]

# One city per employee, drawn uniformly with replacement
city_list = np.random.choice(cities, size=n_employees)

# Departments, and one uniformly drawn department per employee
departments = [
    'HR', 'Engineering', 'Marketing', 'Finance', 'Sales', 'Operations',
    'Legal', 'IT', 'Customer Service', 'Research & Development',
]
department_list = np.random.choice(departments, size=n_employees)

# Titles available within each department (keys match `departments`)
job_titles_map = {
    'HR': [
        'HR Manager', 'Recruiter', 'HR Specialist', 'Payroll Specialist',
    ],
    'Engineering': [
        'Software Engineer', 'Senior Engineer', 'DevOps Engineer', 'QA Engineer',
    ],
    'Marketing': [
        'Marketing Manager', 'Digital Marketer', 'Content Creator', 'SEO Specialist',
    ],
    'Finance': [
        'Financial Analyst', 'Accountant', 'Finance Manager', 'Budget Analyst',
    ],
    'Sales': [
        'Sales Representative', 'Sales Manager', 'Account Executive', 'Business Development',
    ],
    'Operations': [
        'Operations Manager', 'Process Analyst', 'Operations Coordinator',
    ],
    'Legal': [
        'Legal Counsel', 'Paralegal', 'Compliance Officer',
    ],
    'IT': [
        'System Administrator', 'Network Engineer', 'IT Support', 'Database Administrator',
    ],
    'Customer Service': [
        'Customer Service Rep', 'Support Manager', 'Customer Success Manager',
    ],
    'Research & Development': [
        'Research Scientist', 'R&D Engineer', 'Product Developer',
    ],
}

# For every employee, pick one title from the list belonging to their department
job_titles = [
    random.choice(job_titles_map[department])
    for department in department_list
]

# Per-department base salary, before the age and noise adjustments
base_salaries = {
    'HR': 55000,
    'Engineering': 85000,
    'Marketing': 60000,
    'Finance': 70000,
    'Sales': 65000,
    'Operations': 58000,
    'Legal': 95000,
    'IT': 75000,
    'Customer Service': 45000,
    'Research & Development': 90000,
}

# Salary per employee =
#     department base
#   + 1000 for each year of age above 22 (a deduction for ages below 22)
#   + a uniform random integer in [-15000, 25000)
# with a floor of 30000. One random draw is made per employee, in order.
salaries = [
    max(
        30000,
        base_salaries[dept] + (age - 22) * 1000 + np.random.randint(-15000, 25000),
    )
    for dept, age in zip(department_list, ages)
]

# Years of experience: age minus a random offset of 18-24, floored at zero
years_experience = np.maximum(
    ages - np.random.randint(low=18, high=25, size=n_employees),
    0,
)

# Performance ratings on a 1-5 scale, weighted towards the middle
# (5% / 15% / 40% / 30% / 10% for ratings 1 through 5)
performance_ratings = np.random.choice(
    [1, 2, 3, 4, 5],
    size=n_employees,
    p=[0.05, 0.15, 0.4, 0.3, 0.1],
)

# Education levels: 20% High School, 50% Bachelor's, 25% Master's, 5% PhD
education_levels = np.random.choice(
    ['High School', "Bachelor's", "Master's", 'PhD'],
    size=n_employees,
    p=[0.2, 0.5, 0.25, 0.05],
)

# Employment status: 80% Full-time, 15% Part-time, 5% Contract
employment_status = np.random.choice(
    ['Full-time', 'Part-time', 'Contract'],
    size=n_employees,
    p=[0.8, 0.15, 0.05],
)

# Gender: 48% Male, 48% Female, 4% Other
gender = np.random.choice(
    ['Male', 'Female', 'Other'],
    size=n_employees,
    p=[0.48, 0.48, 0.04],
)

# Start dates (within the last 10 years, approximated as 10 * 365 days).
# The window is anchored at today's midnight instead of the raw
# `datetime.now()` timestamp. Anchoring at the raw timestamp stamped every
# start date with the wall-clock time of the run (down to the microsecond),
# so the generated CSV differed on every execution despite the fixed seeds
# and carried a meaningless time-of-day. With the midnight anchor, runs on
# the same calendar day produce identical start dates; the window still
# moves forward from one day to the next because it is relative to "today".
start_date = (
    datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
    - timedelta(days=10*365)
)
# One uniform whole-day offset in [0, 3650) per employee, added to the anchor
start_dates = [start_date + timedelta(days=np.random.randint(0, 10*365))
               for _ in range(n_employees)]

# Bonus rate: normal draw (mean 10%, std dev 5%) limited to the range 0%-30%
bonus_percentages = np.random.normal(loc=0.1, scale=0.05, size=n_employees).clip(0, 0.3)

# Bonus amount: salary times rate, with the fractional part dropped by int()
bonuses = [int(pay * rate) for pay, rate in zip(salaries, bonus_percentages)]

# Remote work days per week (0-5); fewer remote days are more likely
remote_days = np.random.choice(
    [0, 1, 2, 3, 4, 5],
    size=n_employees,
    p=[0.3, 0.2, 0.2, 0.15, 0.1, 0.05],
)

# Satisfaction scores: normal draw (mean 7, std dev 1.5), limited to 1-10
# and rounded to one decimal place
satisfaction_scores = np.round(
    np.clip(np.random.normal(loc=7, scale=1.5, size=n_employees), 1, 10),
    1,
)

# Collect every generated column under its output column name
# (column order in the DataFrame and CSV follows the order given here)
data = dict(
    EmployeeID=employee_ids,
    Name=names,
    Age=ages,
    Gender=gender,
    City=city_list,
    Department=department_list,
    JobTitle=job_titles,
    Salary=salaries,
    YearsExperience=years_experience,
    PerformanceRating=performance_ratings,
    EducationLevel=education_levels,
    EmploymentStatus=employment_status,
    StartDate=start_dates,
    Bonus=bonuses,
    RemoteDaysPerWeek=remote_days,
    SatisfactionScore=satisfaction_scores,
)

df = pd.DataFrame(data)

# Write the table to the current working directory, without the index column
df.to_csv('employee.csv', index=False)
print("✅ Dataset saved as 'employee.csv'")

✅ Dataset saved as 'employee.csv'
