<a href="https://colab.research.google.com/github/AMBOT-pixel96/hr-tech-portfolio/blob/main/faker_data/FakerGen_CB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# -------------------------------
# Step 0 — Clone hr-tech-portfolio (with PAT)
# -------------------------------
import os
from getpass import getpass

# Ask for your GitHub Personal Access Token
token = getpass("🔑 Enter your GitHub token: ")

# Clean any previous clone
os.system("rm -rf hr-tech-portfolio")

# Clone with token authentication
os.system(f"git clone https://{token}@github.com/AMBOT-pixel96/hr-tech-portfolio.git")

# Move inside repo
%cd hr-tech-portfolio

# Check if faker_data exists
!ls faker_data | head

🔑 Enter your GitHub token: ··········
/content/hr-tech-portfolio
employee_master.csv


In [3]:
# -------------------------------
# Step 1 — Refurbish Mammoth into Dashboard Feeds
# -------------------------------
import pandas as pd
import numpy as np
import os

# Read the complete faker employee master table
df = pd.read_csv("faker_data/employee_master.csv")

# -------------------------------
# Internal feed → data/employee_compensation.csv
# -------------------------------
# Columns taken from the master table, and the names they get in the feed
comp_columns = [
    "EmpID", "Gender", "Dept", "JobTitle",
    "JobLevel", "CTC", "Bonus", "PerformanceRating",
]
comp_renames = {"EmpID": "EmployeeID", "Dept": "Department", "JobTitle": "JobRole"}
employee_comp = df.loc[:, comp_columns].rename(columns=comp_renames)

# Make sure the output folder exists, then write the feed without the index
os.makedirs("data", exist_ok=True)
employee_comp.to_csv("data/employee_compensation.csv", index=False)

print("✅ employee_compensation.csv created:", employee_comp.shape)

# -------------------------------
# External feed → data/benchmarking_data.csv
# -------------------------------

# Median CTC per (JobTitle, JobLevel) pair acts as the "market base"
median_ctc = df.groupby(["JobTitle", "JobLevel"])["CTC"].median()
benchmark = median_ctc.reset_index().rename(
    columns={"JobTitle": "JobRole", "CTC": "MarketMedianCTC"}
)

# Scale every median by a seeded uniform factor in [0.85, 1.15)
# to mimic the variance of a real external survey
rng = np.random.default_rng(42)
noise = rng.uniform(-0.15, 0.15, size=len(benchmark))
benchmark["MarketMedianCTC"] = benchmark["MarketMedianCTC"] * (1 + noise)

benchmark.to_csv("data/benchmarking_data.csv", index=False)

print("✅ benchmarking_data.csv created:", benchmark.shape)

# Preview the first rows of both feeds
print("\nInternal Feed Sample:")
print(employee_comp.head(3))
print("\nExternal Benchmark Feed Sample:")
print(benchmark.head(3))

✅ employee_compensation.csv created: (35000, 8)
✅ benchmarking_data.csv created: (19, 3)

Internal Feed Sample:
  EmployeeID  Gender  Department           JobRole          JobLevel      CTC  \
0     O00001    Male  Operations  Senior Executive  Senior Executive   829780   
1     L00001  Female       Legal           Analyst           Analyst   470888   
2     M00001    Male   Marketing  Senior Executive  Senior Executive  1328177   

    Bonus  PerformanceRating  
0       0                  2  
1       0                  3  
2  129547                  3  

External Benchmark Feed Sample:
             JobRole           JobLevel  MarketMedianCTC
0            Analyst            Analyst     6.007111e+05
1  Assistant Manager  Assistant Manager     1.615395e+06
2  Associate Partner  Associate Partner     8.290190e+06


In [4]:
# -------------------------------
# Step 1 — Refurbish faker mammoth → Internal + Benchmarking CSVs
# -------------------------------
import pandas as pd
import numpy as np

# Load faker mammoth
df = pd.read_csv("faker_data/employee_master.csv")

# --- Internal Compensation File ---
# Subset of master columns exposed in the internal feed
internal_cols = [
    "EmpID","Name","Dept","JobLevel","JobTitle","Gender",
    "CTC","Bonus","BaseSalary","Allowances","BenefitsValue",
    "Location","PerformanceRating","EducationLevel","Experience","Age"
]
# rename() returns a new frame, so `df` itself is left untouched
df_internal = df[internal_cols].rename(columns={
    "Dept":"Department",
    "CTC":"AnnualCTC",
    "Bonus":"BonusAmount"
})

# Save internal file
internal_path = "faker_data/employee_compensation.csv"
df_internal.to_csv(internal_path, index=False)

# --- External Benchmarking File ---
# For benchmarking, aggregate by Dept + JobLevel with averages
df_benchmark = (
    df.groupby(["Dept","JobLevel"])
      .agg({
          "BaseSalary":"mean",
          "CTC":"mean",
          "Bonus":"mean"
      })
      .reset_index()
      .rename(columns={
          "Dept":"Department",
          "BaseSalary":"AvgBaseSalary",
          "CTC":"AvgCTC",
          "Bonus":"AvgBonus"
      })
)

# Add a "MarketSource" col to simulate source of benchmarks.
# A seeded generator keeps the column identical on every run, so re-running the
# notebook does not produce a spurious diff in the committed CSV.
source_rng = np.random.default_rng(42)
df_benchmark["MarketSource"] = source_rng.choice(
    ["Mercer","Aon Hewitt","Deloitte","Internal Survey"], size=len(df_benchmark)
)

# Save benchmark file
benchmark_path = "faker_data/benchmarking_data.csv"
df_benchmark.to_csv(benchmark_path, index=False)

print("✅ Files created:")
print("-", internal_path)
print("-", benchmark_path)


# -------------------------------
# Step 2 — Push back to GitHub (with PAT)
# -------------------------------
import os
import subprocess
from getpass import getpass


def _git(*args, secret=None):
    """Run ``git <args>`` without a shell and fail loudly on a non-zero exit.

    Args:
        *args: Arguments passed to git, one list element each (no shell parsing).
        secret: Optional string (e.g. the PAT) that is replaced with ``***`` in
            the error message so it cannot leak into the notebook output.

    Returns:
        The ``subprocess.CompletedProcess`` of the successful call.

    Raises:
        RuntimeError: If git exits with a non-zero status.
    """
    proc = subprocess.run(["git", *args], capture_output=True, text=True)
    if proc.returncode != 0:
        shown = " ".join(args)
        detail = (proc.stderr or proc.stdout).strip()
        if secret:
            shown = shown.replace(secret, "***")
            detail = detail.replace(secret, "***")
        raise RuntimeError(f"git {shown} failed (exit {proc.returncode}): {detail}")
    return proc


# Git config (set once in Colab session)
_git("config", "--global", "user.email", "amlanm4@gmail.com")
_git("config", "--global", "user.name", "Amlan Mishra")

# Add & commit
_git("add", "faker_data/employee_compensation.csv", "faker_data/benchmarking_data.csv")

# `git diff --cached --quiet` exits non-zero only when something is staged;
# committing with nothing staged would fail, so skip the commit in that case.
has_staged_changes = subprocess.run(["git", "diff", "--cached", "--quiet"]).returncode != 0
if has_staged_changes:
    _git("commit", "-m", "feat: add internal + benchmarking CSVs from faker mammoth")
else:
    print("ℹ️ Nothing new to commit — pushing existing commits only.")

# Push with PAT (the URL is passed explicitly, so the token is not written to .git/config)
token = getpass("🔑 Enter your GitHub token: ")
_git(
    "push",
    f"https://{token}@github.com/AMBOT-pixel96/hr-tech-portfolio.git",
    "main",
    secret=token,
)
print("✅ Pushed to main")

✅ Files created:
- faker_data/employee_compensation.csv
- faker_data/benchmarking_data.csv
🔑 Enter your GitHub token: ··········


0