In [4]:
!pip install faker



In [11]:
import pandas as pd
import random
from faker import Faker
from datetime import datetime, timedelta, date

# Faker instance using the Indonesian locale
fake = Faker("id_ID")

# Dataset parameters
num_records = 5_432  # number of employee rows to generate
umr_tertinggi = 5_000_000  # highest regional minimum wage (UMR), in Rupiah

# Gender proportions, in the order (Male, Female)
gender_prob = [0.55, 0.45]

# Locations and their coordinates (revised).
# Each row is (city, province, latitude, longitude, sampling weight).
# NOTE(review): the weights add up to 1.325 rather than 1, so they only act as
# relative weights; Surabaya's 0.35 stands out - confirm it is intended.
_location_rows = [
    ("Jakarta", "DKI Jakarta", -6.2088, 106.8456, 0.175),
    ("Tangerang", "Banten", -6.1702, 106.6319, 0.125),
    ("Bandung", "Jawa Barat", -6.9175, 107.6191, 0.125),
    ("Bekasi", "Jawa Barat", -6.2383, 106.9756, 0.075),
    ("Bogor", "Jawa Barat", -6.5950, 106.8167, 0.075),
    ("Surabaya", "Jawa Timur", -7.2575, 112.7521, 0.35),
    ("Bangkalan", "Jawa Timur", -7.0439, 112.9091, 0.075),
    ("Malang", "Jawa Timur", -7.9819, 112.6265, 0.075),
    ("Semarang", "Jawa Tengah", -6.9667, 110.4167, 0.075),
    ("Surakarta", "Jawa Tengah", -7.5560, 110.8317, 0.075),
    ("Denpasar", "Bali", -8.6705, 115.2126, 0.05),
    ("Ubud", "Bali", -8.5190, 115.2630, 0.05),
]

# city name -> {"Province", "Latitude", "Longitude", "Weight"}
locations = {
    city_name: {
        "Province": province,
        "Latitude": latitude,
        "Longitude": longitude,
        "Weight": weight,
    }
    for city_name, province, latitude, longitude, weight in _location_rows
}

# Departments and the share of employees drawn for each (shares sum to 1.0)
departments = dict([
    ("HR", 0.05),
    ("IT", 0.10),
    ("Sales & Marketing", 0.25),
    ("Business Development", 0.07),
    ("Finance", 0.08),
    ("Operations", 0.25),
    ("Customer Service", 0.20),
])

# Job titles available within each department
job_titles = {
    "HR": [
        "Payroll",
        "Recruiter",
        "HR Assistant",
    ],
    "IT": [
        "Software Developer",
        "Technical Support",
        "Network Engineer",
    ],
    "Sales & Marketing": [
        "Sales Executive",
        "Marketing Coordinator",
        "Sales Representative",
    ],
    "Business Development": [
        "Business Analyst",
        "Project Coordinator",
    ],
    "Finance": [
        "Accountant",
        "Finance Assistant",
    ],
    "Operations": [
        "Logistics Specialist",
        "Operations Coordinator",
    ],
    "Customer Service": [
        "Support Specialist",
        "Customer Care Representative",
    ],
}

# Education level labels
_SMA = "SMA/SMK"
_DIPLOMA = "Diploma"
_S1 = "Sarjana (S1)"
_S2 = "Magister (S2)"

# Education levels that may be assigned to each job title
education_mapping = {
    "Payroll": [_SMA, _DIPLOMA],
    "Recruiter": [_DIPLOMA],
    "HR Assistant": [_SMA],
    "Software Developer": [_DIPLOMA, _S1],
    "Technical Support": [_DIPLOMA],
    "Network Engineer": [_DIPLOMA, _S1],
    "Sales Executive": [_SMA, _DIPLOMA],
    "Marketing Coordinator": [_S1],
    "Sales Representative": [_SMA],
    "Business Analyst": [_S1, _S2],
    "Project Coordinator": [_S1, _S2],
    "Accountant": [_DIPLOMA, _S1],
    "Finance Assistant": [_DIPLOMA],
    "Logistics Specialist": [_DIPLOMA, _S1],
    "Operations Coordinator": [_DIPLOMA],
    "Support Specialist": [_SMA, _DIPLOMA],
    "Customer Care Representative": [_SMA],
}

# Base salary range per job title, written here in millions as (low, high)
_MILLION = 1_000_000
_salary_ranges_in_millions = {
    "Payroll": (5, 7),
    "Recruiter": (5, 7),
    "HR Assistant": (4, 5),
    "Software Developer": (8, 12),
    "Technical Support": (5, 8),
    "Network Engineer": (7, 9),
    "Sales Executive": (5, 8),
    "Marketing Coordinator": (8, 10),
    "Sales Representative": (4, 6),
    "Business Analyst": (12, 15),
    "Project Coordinator": (8, 10),
    "Accountant": (7, 10),
    "Finance Assistant": (5, 7),
    "Logistics Specialist": (8, 12),
    "Operations Coordinator": (7, 10),
    "Support Specialist": (4, 5),
    "Customer Care Representative": (4, 6),
}

# job title -> (minimum, maximum) base salary as whole numbers
# (presumably Rupiah, like umr_tertinggi - confirm)
base_salary_ranges = {
    title: (low * _MILLION, high * _MILLION)
    for title, (low, high) in _salary_ranges_in_millions.items()
}

# Registry of every employee ID issued so far; used to guarantee uniqueness
used_ids = set()


def generate_unique_employee_id(city, gender, counter):
    """Return an employee ID that has not been issued before and record it.

    The ID is the first two letters of ``city`` in upper case, followed by
    ``"M"`` when ``gender == "Male"`` (``"F"`` for anything else), followed by
    ``counter`` left-padded with zeros to at least four digits.

    If that ID is already present in the module-level ``used_ids`` set, the
    counter is incremented until a free ID is found. The returned ID is added
    to ``used_ids`` before returning. The caller's counter is not updated.
    """
    prefix = city[:2].upper() + ("M" if gender == "Male" else "F")
    candidate = f"{prefix}{str(counter).zfill(4)}"
    while candidate in used_ids:
        # Collision (e.g. two cities sharing the same two-letter prefix): try the next number
        counter += 1
        candidate = f"{prefix}{str(counter).zfill(4)}"
    used_ids.add(candidate)
    return candidate

# Build the dataset
#
# Seed both random sources so that a fresh "Restart & Run All" regenerates the
# same rows. Dates are still relative to the day the notebook is executed.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
fake.seed_instance(RANDOM_SEED)

# Generation rules
MIN_AGE, MAX_AGE = 21, 55                      # employee age today
MIN_HIRE_AGE = 18                              # youngest allowed age on the hire date
MIN_TENURE_DAYS, MAX_TENURE_DAYS = 180, 3650   # tenure of terminated employees
TERMINATION_RATE = 0.112                       # chance that an employee is drawn as terminated
ANNUAL_RAISE = 0.03                            # salary increase per completed year of service

today = date.today()

# Sampling tables do not change inside the loop, so build them once
location_items = list(locations.items())
location_weights = [info["Weight"] for info in locations.values()]
department_names = list(departments.keys())
department_weights = list(departments.values())

data = []
counter_per_city_gender = {}

for _ in range(num_records):
    city, details = random.choices(location_items, weights=location_weights, k=1)[0]
    gender = random.choices(["Male", "Female"], weights=gender_prob, k=1)[0]

    # Running counter per city + gender, used as the starting number of the ID
    key = f"{city}_{gender}"
    counter_per_city_gender[key] = counter_per_city_gender.get(key, 0) + 1

    # Generate a unique Employee ID
    emp_id = generate_unique_employee_id(city, gender, counter_per_city_gender[key])
    department = random.choices(department_names, weights=department_weights, k=1)[0]
    job_title = random.choice(job_titles[department])
    education_level = random.choice(education_mapping[job_title])

    hire_date = fake.date_between(start_date='-10y', end_date='today')
    days_since_hire = max((today - hire_date).days, 0)

    # Raise the minimum current age by the time already served, so nobody was
    # younger than MIN_HIRE_AGE on their hire date.
    youngest_age_today = max(MIN_AGE, MIN_HIRE_AGE + days_since_hire // 365 + 1)
    birth_date = fake.date_of_birth(minimum_age=youngest_age_today, maximum_age=MAX_AGE)

    # A termination can only have happened already: it needs at least
    # MIN_TENURE_DAYS of service and may not lie in the future. Employees hired
    # less than MIN_TENURE_DAYS ago are therefore never terminated.
    termination_date = None
    if random.random() < TERMINATION_RATE and days_since_hire >= MIN_TENURE_DAYS:
        tenure_days = random.randint(MIN_TENURE_DAYS, min(MAX_TENURE_DAYS, days_since_hire))
        termination_date = hire_date + timedelta(days=tenure_days)

    base_salary = random.randint(*base_salary_ranges[job_title])
    performance = random.choices(["Excellent", "Good", "Satisfactory", "Needs Improvement"], weights=[0.1, 0.5, 0.3, 0.1], k=1)[0]
    overtime = random.choices(["Yes", "No"], weights=[0.3, 0.7], k=1)[0]

    # Adjusted salary: raises accrue per completed year of service, which ends
    # at the termination date when there is one. Never below the highest UMR.
    service_end = termination_date if termination_date is not None else today
    years_of_service = max((service_end - hire_date).days, 0) // 365
    adjusted_salary = max(base_salary * (1 + years_of_service * ANNUAL_RAISE), umr_tertinggi)

    data.append({
        "employee_id": emp_id,
        "name": fake.name_male() if gender == "Male" else fake.name_female(),
        "gender": gender,
        "city": city,
        "province": details["Province"],
        "latitude": details["Latitude"],
        "longitude": details["Longitude"],
        "department": department,
        "job_title": job_title,
        "hire_date": hire_date,
        "birth_date": birth_date,
        "termination_date": termination_date,
        "education_level": education_level,
        "salary": int(adjusted_salary),
        "performance_rating": performance,
        "overtime": overtime,
    })

# Convert to a DataFrame
df = pd.DataFrame(data)

# Save to CSV
df.to_csv("full_employee_data.csv", index=False)
print("Dataset berhasil dibuat dan disimpan ke 'full_employee_data.csv'")

Dataset berhasil dibuat dan disimpan ke 'full_employee_data.csv'


In [12]:
# Check whether any Employee ID occurs more than once
is_repeated_id = df["employee_id"].duplicated(keep=False)
duplicates = df[is_repeated_id]  # every row whose ID is shared with another row
if not duplicates.empty:
    print("Terdapat Employee ID yang duplikat:")
    print(duplicates)
else:
    print("Tidak ada Employee ID yang duplikat.")


Tidak ada Employee ID yang duplikat.
