### Notebook to create Initial dataset sample for the project with employees data

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum, when, lit
import pandas as pd
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta

In [2]:
spark = SparkSession.builder \
    .appName("HR Dataset Analysis") \
    .getOrCreate()

In [3]:
df = spark.read.format("csv").option("header",True).load("/content/drive/MyDrive/Estudos/Douglas College - Pós/Applied Research/HR_Data_MNC_Data Science Lovers.csv")

In [4]:
df.printSchema()

root
 |-- Unnamed: 0: string (nullable = true)
 |-- Employee_ID: string (nullable = true)
 |-- Full_Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Job_Title: string (nullable = true)
 |-- Hire_Date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Performance_Rating: string (nullable = true)
 |-- Experience_Years: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Work_Mode: string (nullable = true)
 |-- Salary_INR: string (nullable = true)



In [5]:
df = df.withColumn("Experience_Years", col("Experience_Years").cast("double")) \
        .withColumn("Salary_INR", col("Salary_INR").cast("double")) \
        .withColumn("Performance_Rating", col("Performance_Rating").cast("double")) \
        .withColumn("Hire_Date", col("Hire_Date").cast("timestamp"))


In [6]:
df.show(5)

+----------+-----------+---------------+----------+--------------------+-------------------+--------------------+------------------+----------------+--------+---------+----------+
|Unnamed: 0|Employee_ID|      Full_Name|Department|           Job_Title|          Hire_Date|            Location|Performance_Rating|Experience_Years|  Status|Work_Mode|Salary_INR|
+----------+-----------+---------------+----------+--------------------+-------------------+--------------------+------------------+----------------+--------+---------+----------+
|         0| EMP0000001|  Joshua Nguyen|        IT|   Software Engineer|2011-08-10 00:00:00|  Isaacland, Denmark|               5.0|            14.0|Resigned|  On-site| 1585363.0|
|         1| EMP0000002| Julie Williams| Marketing|      SEO Specialist|2018-03-02 00:00:00|Anthonyside, Cost...|               2.0|             7.0|  Active|  On-site|  847686.0|
|         2| EMP0000003|Alyssa Martinez|        HR|          HR Manager|2023-03-20 00:00:00|Port Chr

In [7]:
df.describe().show()

+-------+----------------+-----------+-------------+----------+--------------------+-------------------+------------------+------------------+----------+---------+------------------+
|summary|      Unnamed: 0|Employee_ID|    Full_Name|Department|           Job_Title|           Location|Performance_Rating|  Experience_Years|    Status|Work_Mode|        Salary_INR|
+-------+----------------+-----------+-------------+----------+--------------------+-------------------+------------------+------------------+----------+---------+------------------+
|  count|         2000000|    2000000|      2000000|   2000000|             2000000|            2000000|           2000000|           2000000|   2000000|  2000000|           2000000|
|   mean|        999999.5|       NULL|         NULL|      NULL|                NULL|               NULL|         3.0001485|          5.010287|      NULL|     NULL|    896887.7556635|
| stddev|577350.413527175|       NULL|         NULL|      NULL|                NULL| 

In [8]:
sampled_df = df.sample(withReplacement=False, fraction=1.0, seed=4495).limit(5000)

In [9]:
sampled_df.describe().show()

+-------+------------------+-----------+---------------+----------+--------------------+-----------------+------------------+------------------+----------+---------+------------------+
|summary|        Unnamed: 0|Employee_ID|      Full_Name|Department|           Job_Title|         Location|Performance_Rating|  Experience_Years|    Status|Work_Mode|        Salary_INR|
+-------+------------------+-----------+---------------+----------+--------------------+-----------------+------------------+------------------+----------+---------+------------------+
|  count|              5000|       5000|           5000|      5000|                5000|             5000|              5000|              5000|      5000|     5000|              5000|
|   mean|            2499.5|       NULL|           NULL|      NULL|                NULL|             NULL|            2.9614|            5.0364|      NULL|     NULL|       896535.1738|
| stddev|1443.5200033252052|       NULL|           NULL|      NULL|        

In [10]:
sampled_df.groupBy("Job_Title").count().show(10000)

+--------------------+-----+
|           Job_Title|count|
+--------------------+-----+
|   Software Engineer|  762|
|      SEO Specialist|  182|
|          HR Manager|   62|
|Logistics Coordin...|  237|
|     Finance Manager|   87|
|     Account Manager|  269|
|   Financial Analyst|  153|
|     Sales Executive|  507|
|Operations Executive|  389|
|        HR Executive|  202|
|      Sales Director|   52|
|     DevOps Engineer|  234|
|  Content Strategist|   95|
| Marketing Executive|  314|
|  Research Scientist|  114|
|          Accountant|  211|
|          IT Manager|  136|
|Supply Chain Manager|  124|
|        Data Analyst|  275|
|Talent Acquisitio...|  131|
|   Product Developer|   75|
| Operations Director|   42|
|                 CTO|   76|
|Business Developm...|  138|
|      Lab Technician|   44|
|                 CFO|   31|
|       Brand Manager|   25|
|         HR Director|   21|
|  Innovation Manager|   12|
+--------------------+-----+



In [11]:
executive_titles = ["CTO", "CFO", "HR Director", "Operations Director", "Sales Director"]

p25 = 3
p50 = 8
p75 = 11

df = df.withColumn(
    "Job_Level",
    when(col("Job_Title").isin(executive_titles), "Executive")
    .when(col("Experience_Years") <= p25, "Specialist")
    .when((col("Experience_Years") > p25) & (col("Experience_Years") <= p50), "Analyst")
    .when((col("Experience_Years") > p50) & (col("Experience_Years") <= p75), "Manager")
    .otherwise("Principal")
)


In [12]:
percentiles = df.approxQuantile("Experience_Years", [0.25, 0.5, 0.75], 0.01)

p25 = 3
p50 = 8
p75 = 11

executive_titles = ["CTO", "CFO", "HR Director", "Operations Director", "Sales Director"]

sampled_df = sampled_df.withColumn(
    "Job_Level",
    when(col("Job_Title").isin(executive_titles), "Executive")
    .when(col("Experience_Years") <= p25, "Specialist")
    .when((col("Experience_Years") > p25) & (col("Experience_Years") <= p50), "Analyst")
    .when((col("Experience_Years") > p50) & (col("Experience_Years") <= p75), "Manager")
    .otherwise("Principal")
)

print(f"25th: {p25}, 50th: {p50}, 75th: {p75}")

25th: 3, 50th: 8, 75th: 11


In [13]:
sampled_df.select([spark_sum(col(c).isNull().cast("int")).alias(c) for c in df.columns]).show()

+----------+-----------+---------+----------+---------+---------+--------+------------------+----------------+------+---------+----------+---------+
|Unnamed: 0|Employee_ID|Full_Name|Department|Job_Title|Hire_Date|Location|Performance_Rating|Experience_Years|Status|Work_Mode|Salary_INR|Job_Level|
+----------+-----------+---------+----------+---------+---------+--------+------------------+----------------+------+---------+----------+---------+
|         0|          0|        0|         0|        0|        0|       0|                 0|               0|     0|        0|         0|        0|
+----------+-----------+---------+----------+---------+---------+--------+------------------+----------------+------+---------+----------+---------+



In [14]:
sampled_df.groupBy("Job_Level", "Status").count().orderBy("Job_Level").show(10000)

+----------+----------+-----+
| Job_Level|    Status|count|
+----------+----------+-----+
|   Analyst|    Active| 1387|
|   Analyst|  Resigned|  369|
|   Analyst|Terminated|   96|
|   Analyst|   Retired|  114|
| Executive|    Active|  149|
| Executive|   Retired|   12|
| Executive|  Resigned|   52|
| Executive|Terminated|    9|
|   Manager|    Active|  430|
|   Manager|  Resigned|  113|
|   Manager|   Retired|   32|
|   Manager|Terminated|   28|
| Principal|  Resigned|   59|
| Principal|    Active|  216|
| Principal|   Retired|   10|
| Principal|Terminated|   12|
|Specialist|    Active| 1326|
|Specialist|  Resigned|  389|
|Specialist|   Retired|  102|
|Specialist|Terminated|   95|
+----------+----------+-----+



In [15]:
sampled_df.write.option("header", True).mode("overwrite").csv("/content/drive/MyDrive/Estudos/Douglas College - Pós/Applied Research/initial_dataset.csv")

In [16]:
sampled_df.groupBy("Status").count().show(10000)

+----------+-----+
|    Status|count|
+----------+-----+
|  Resigned|  982|
|    Active| 3508|
|Terminated|  240|
|   Retired|  270|
+----------+-----+



In [17]:
sampled_df.select(["department"]).distinct().show(300)

+----------+
|department|
+----------+
|        IT|
| Marketing|
|        HR|
|Operations|
|   Finance|
|     Sales|
|       R&D|
+----------+



In [18]:
np.random.seed(4495)

In [19]:
start_date = datetime(2025, 10, 1)
months = 84
snapshot_dates = [start_date + relativedelta(months=i) for i in range(months)]

In [20]:
departments = ["IT", "Marketing", "HR", "Operations", "Finance", "Sales", "R&D"]
leave_reasons = ["Resigned", "Terminated", "Retired"]
promotion_milestones = [36, 60, 96]  # in months
monthly_hiring_rate = 0.05  # 5% of initial count every 4 months

In [21]:
sampled_df = sampled_df.toPandas()
df = df.toPandas()

In [22]:
sampled_df = sampled_df.copy()
df = df.copy()

In [23]:
sampled_df["Hire_Date"] = pd.to_datetime(sampled_df["Hire_Date"])
df["Hire_Date"] = pd.to_datetime(df["Hire_Date"])

In [24]:
sampled_df["promotion_count"] = 0
sampled_df["month"] = 0
sampled_df["Status"] = "Active"

In [25]:
used_ids = set(sampled_df["Employee_ID"])

In [26]:
snapshots = []

In [27]:
for i, snapshot_date in enumerate(snapshot_dates, start=1):
    current_emps = sampled_df[sampled_df["Hire_Date"] <= snapshot_date].copy()

    current_emps["time_in_company"] = (
        (snapshot_date.year - current_emps["Hire_Date"].dt.year) * 12 +
        (snapshot_date.month - current_emps["Hire_Date"].dt.month)
    )

    if i % 4 == 0:
        move_mask = (np.random.rand(len(current_emps)) < 0.05) & (current_emps["Status"] == "Active")
        current_emps.loc[move_mask, "Department"] = np.random.choice(departments, size=move_mask.sum())

    if i % 10 == 0:
        active_mask = current_emps["Status"] == "Active"
        leave_mask = (np.random.rand(len(current_emps)) < 0.10) & active_mask
        current_emps.loc[leave_mask, "Status"] = np.random.choice(leave_reasons, size=leave_mask.sum())

    for milestone in promotion_milestones:
        promo_mask = (current_emps["time_in_company"] == milestone) & (current_emps["Status"] == "Active")
        current_emps.loc[promo_mask, "promotion_count"] += 1

    if i % 4 == 0:
        available_pool = df[~df["Employee_ID"].isin(used_ids)]
        n_hires = int(len(sampled_df) * monthly_hiring_rate)

        if len(available_pool) >= n_hires:
            new_hires = available_pool.sample(n=n_hires, random_state=4495)
            new_hires = new_hires.copy()
            new_hires["promotion_count"] = 0
            new_hires["month"] = i
            new_hires["Status"] = "Active"
            new_hires["Hire_Date"] = snapshot_date
            sampled_df = pd.concat([sampled_df, new_hires], ignore_index=True)
            used_ids.update(new_hires["Employee_ID"])

    current_emps["month"] = i
    current_emps["snapshot_date"] = snapshot_date.strftime("%Y-%m-%d")
    snapshots.append(current_emps)

result_df = pd.concat(snapshots, ignore_index=True)

In [28]:
result_df.to_csv("employee_snapshots_final.csv", index=False)