In [None]:
!pip install Faker
from faker import Faker
fake = Faker()



In [None]:
import pandas as pd
import random

This startup company of 320 employees intends to go public and become a 10,000-employee company. Your job is to produce an expanded 10K-record synthetic database to help the founders understand personnel-related issues that might occur with the expanded company.

Use the Faker python module to produce a 10K employee dataset. Follow these constraints:

1.   All columns in the current data set must be preserved. It is not necessary to preserve any of the actual data from the current database.
2.   The database needs to keep track of social security numbers.
3.   The database should keep track of the languages (other than English) spoken by each employee. Each employee speaks 0, 1 or 2 languages in addition to English.
4.  To grow, the company plans to sponsor visas and hire non-USA citizens. So your synthetic database should include 40% employees who are non-USA citizens and should include names of employees from India, Mainland China, Canada, South Korea, Philippines, Taiwan and Mexico. These names should be in proportion to the 2019 percentages of H1B petitions from each country.
5.   The expanded company will have additional departments, including “Legal” (approximately 5% of employees), “Marketing” (10%), “Administrative” (10%), “Operations” (20%), “Sales” (10%), “Finance” (5%) and “I/T” (10%), to go along with the current “Product” (20%) and “Human Resource” (10%) departments.
6.  Salaries in each department must mimic the typical salaries for professionals in each field. You can find appropriate data for each type of profession at salary.com. For example, see this page to find a model estimate for your synthetic marketing department: https://www.salary.com/research/salary/benchmark/marketing-specialist-salary.
7. The current startup company (as represented by the employees.csv data) is skewed toward male employees. Our goal for the new company is to make the numbers of men and women approximately equal.








First Name	Last Name	Email	Phone	Gender	Age	Job Title	Years Of Experience	Salary	Department

In [None]:
# How many synthetic employee rows to generate.
num_records = 10_000

# (department name, share of the workforce) pairs.
# The order matters: get_department() walks this list front to back while
# accumulating the shares, so reordering changes which draw maps to which
# department. The shares are intended to add up to 1.0 (100% of employees).
departments = [
    ("Legal", 0.05),
    ("Marketing", 0.1),
    ("Administrative", 0.1),
    ("Operations", 0.2),
    ("Sales", 0.1),
    ("Finance", 0.05),
    ("I/T", 0.1),
    ("Product", 0.2),
    ("Human Resource", 0.1),
]

# Weighted random pick of a department name
def get_department():
    """Return a department name drawn according to ``departments``.

    A single uniform draw in [0, 1) is compared against the running total of
    the department shares, in list order; the first department whose running
    total exceeds the draw is selected.

    Returns:
        The selected department name. If the draw is not below the final
        running total (possible when the shares sum to slightly less than 1
        through floating-point rounding), the last department is returned.
    """
    draw = random.random()
    running_total = 0
    for name, share in departments:
        running_total += share
        if draw < running_total:
            chosen = name
            break
    else:
        # No threshold exceeded the draw: fall back to the final entry.
        chosen = departments[-1][0]
    return chosen

# Random salary for a department, drawn from that department's pay band
def get_salary(department):
    """Return a random integer salary for ``department``.

    The value is drawn uniformly from the department's (minimum, maximum)
    band, with both ends inclusive.

    Args:
        department: One of the department names listed in the table below.

    Returns:
        An int between the band's minimum and maximum.

    Raises:
        KeyError: If ``department`` has no entry in the table.
    """
    pay_bands = {
        "Legal": (60000, 120000),
        "Marketing": (40000, 100000),
        "Administrative": (30000, 70000),
        "Operations": (40000, 90000),
        "Sales": (30000, 100000),
        "Finance": (50000, 110000),
        "I/T": (60000, 130000),
        "Product": (50000, 120000),
        "Human Resource": (40000, 90000),
    }
    lowest, highest = pay_bands[department]
    return random.randint(lowest, highest)


# Function to pick an employee's citizenship (returns the country only)
def get_citizenship_name():
    """Return a randomly chosen citizenship country name.

    A single uniform draw in [0, 1) is compared against the running total of
    the shares below, in insertion order; the first country whose running
    total exceeds the draw is returned. 60% of draws are "USA", so about 40%
    of employees are non-USA citizens.

    Despite its name, this function does not produce a person's name: it
    returns only the country. The previous version also built a locale-specific
    ``Faker`` on every non-USA draw and then discarded it; that unused (and
    comparatively expensive) construction has been removed without changing
    the returned values.

    Returns:
        str: One of "USA", "India", "Mainland China", "Canada",
        "South Korea", "Philippines", "Taiwan" or "Mexico".
    """
    # NOTE(review): the assignment asks for the non-USA shares to follow the
    # 2019 H-1B petition percentages per country -- confirm these figures
    # against the published data.
    citizenship_proportions = {
        "USA": 0.60,
        "India": 0.15,
        "Mainland China": 0.10,
        "Canada": 0.05,
        "South Korea": 0.03,
        "Philippines": 0.03,
        "Taiwan": 0.02,
        "Mexico": 0.02,
    }
    rand_val = random.random()
    cumulative = 0
    for country, proportion in citizenship_proportions.items():
        cumulative += proportion
        if rand_val < cumulative:
            return country
    # Reached only if floating-point rounding leaves the final running total
    # at or below the draw.
    return "USA"

# Generate the dataset
#
# Faker locale used to draw first/last names for each non-USA citizenship
# that get_citizenship_name() can return. USA employees (and any country not
# listed here) use the default-locale `fake` generator.
# Several of these locales produce names in the native script (for example
# Devanagari, Chinese characters, Hangul).
name_locales = {
    "India": "hi_IN",
    "Mainland China": "zh_CN",
    "Canada": "en_CA",
    "South Korea": "ko_KR",
    "Philippines": "fil_PH",
    "Taiwan": "zh_TW",
    "Mexico": "es_MX",
}
# Build each localized generator once up front; constructing a Faker per row
# would be needlessly slow.
name_fakers = {country: Faker(locale) for country, locale in name_locales.items()}

data = []
for _ in range(num_records):
    # Citizenship is drawn first because it decides which locale supplies the name.
    citizenship = get_citizenship_name()
    name_source = name_fakers.get(citizenship, fake)
    first_name = name_source.first_name()
    last_name = name_source.last_name()
    email = fake.email()
    phone = fake.phone_number()
    # A fair coin per employee keeps men and women approximately equal.
    gender = random.choice(["Male", "Female"])
    age = random.randint(22, 65)
    job_title = fake.job()
    years_of_experience = random.randint(1, 30)
    department = get_department()
    salary = get_salary(department)
    # `.unique` makes Faker reject values it has already returned, so no two
    # employees share a social security number.
    ssn = fake.unique.ssn()
    # Number of languages spoken in addition to English: a plain int 0, 1 or 2.
    # (random.choices() would return a one-element list such as [1], which
    # ends up in the CSV as the string "[1]".)
    languages = random.randint(0, 2)
    data.append([first_name, last_name, email, phone, gender, age, job_title, years_of_experience, salary, department, ssn, citizenship, languages])

# Create a DataFrame
columns = ["First Name", "Last Name", "Email", "Phone", "Gender", "Age", "Job Title", "Years Of Experience", "Salary", "Department", "SSN", "Citizenship", "Languages"]
df = pd.DataFrame(data, columns=columns)

# Save to CSV
df.to_csv('synthetic_employee_data.csv', index=False)


In [None]:
import pandas as pd

# Sanity checks on the generated CSV: reload it from disk and print summary
# figures that can be compared by eye with the assignment's constraints.
df = pd.read_csv('synthetic_employee_data.csv')

# Column inventory
print("Columns in the dataset:", df.columns)

# Number of distinct SSN values present
print("SSN column unique values count:", df['SSN'].nunique())

# Share of rows whose citizenship is anything other than "USA"
is_non_usa = df['Citizenship'].ne('USA')
non_usa_employees = df.loc[is_non_usa]
percentage_non_usa = (len(non_usa_employees) / len(df)) * 100
print("Percentage of Non-USA Employees:", percentage_non_usa)

# Percentage of employees per department
department_distribution = df['Department'].value_counts(normalize=True).mul(100)
print("Department distribution:\n", department_distribution)

# Percentage of employees per gender
gender_balance = df['Gender'].value_counts(normalize=True).mul(100)
print("Gender distribution:\n", gender_balance)