In [2]:
import pandas as pd

df = pd.read_csv("Messy_HR_Dataset_Detailed.csv")

# Descriptive Statistic : Total Budget Training Loss and Benefit By Training Outcome
failed_training_df = df[df["Training Outcome"] == "Failed"]
total_loss_cost = failed_training_df['Training Cost'].sum().round(2)
total_person_failed = failed_training_df['Training Cost'].count()
average_failed = (total_loss_cost / total_person_failed).round(2)

success_training_df = df[df["Training Outcome"].isin(["Passed", "Completed"])]
total_benefit_cost = success_training_df['Training Cost'].sum().round(2)
total_person_success = success_training_df['Training Cost'].count()
average_success = (total_benefit_cost / total_person_success).round(2)

print(f"Total Company Money's Loss By Failed Training is ${total_loss_cost} from {total_person_failed} person") 
print(f"The average cost per-Failed person is ${average_failed}")
print(f"Total Budget Spending on Success Training is ${total_benefit_cost} from {total_person_success} person") 
print(f"The average cost per-Succeed person is ${average_success}")

Total Company Money's Loss By Failed Training is $421729.55 from 751 person
The average cost per-Failed person is $561.56
Total Budget Spending on Success Training is $893810.5 from 1587 person
The average cost per-Succeed person is $563.21


In [106]:
import pandas as pd

df = pd.read_csv("Messy_HR_Dataset_Detailed.csv")

# Descriptive Statistic : Top 3 Best Employee and Employee With Lowest Score
# Define scoring for each Training Outcome
score_mapping = {
    "Failed": -1,
    "Incomplete": 0,
    "Completed": 1,
    "Passed": 2
}
perf_mapping = {
    "PIP": -1,
    "Needs Improvement": 0,
    "Exceeds": 1,
    "Fully Meets": 2
}
df["Score"] = df["Training Outcome"].map(score_mapping) + df["Satisfaction Score"] + df["Engagement Score"] + df["Current Employee Rating"] + df["Performance Score"].map(perf_mapping)

# Calculate the total score for each employee
df["FullName"] = df["FirstName"] + " " + df["LastName"]
total_scores = df.groupby("FullName")["Score"].sum().sort_values(ascending=False)

top_3 = total_scores.head(3)
low_3 = total_scores.tail(3)
print("Top 3 Best Employee")
print(top_3)
print("\nEmployee With Lowest Score")
print(low_3)

Top 3 Best Employee
FullName
Isaias Pineda     36
Kane Black        34
Kaylyn Patrick    34
Name: Score, dtype: int64

Employee With Lowest Score
FullName
Chace Church       4
Samara Roberson    3
Tanya Leonard      3
Name: Score, dtype: int64


In [105]:
import pandas as pd
from datetime import datetime

df = pd.read_csv("Messy_HR_Dataset_Detailed.csv")

# Descriptive Statistic : Show The 7 Employee Who Have Served The Longest
df["FullName"] = df["FirstName"] + " " + df["LastName"]
df['StartDate'] = pd.to_datetime(df['StartDate'], format='%d-%b-%y')

# Convert ExitDate to datetime
df['CurrDate'] = pd.to_datetime(df['ExitDate'], format='%d-%b-%y', errors='coerce')

# Replace Empty Exit Date as today date
df['CurrDate'] = df['CurrDate'].fillna(pd.Timestamp.today())

# Calculate days worked
df['DaysWorked'] = (df['CurrDate'] - df['StartDate']).dt.days
df = df.sort_values(by='DaysWorked', ascending=False)
df[['FullName','Title','StartDate','DaysWorked']].head(7)

Unnamed: 0,FullName,Title,StartDate,DaysWorked
2863,Jaiden Middleton,Production Technician I,2018-08-07,2412
1465,Saniya Buck,Production Technician II,2018-08-07,2412
2484,Estrella Ho,Production Technician I,2018-08-07,2412
1741,Rashad Mayo,Production Technician II,2018-08-08,2411
124,Emery Roach,Area Sales Manager,2018-08-11,2408
533,Andreas Torres,CIO,2018-08-11,2408
1414,Remington Bullock,Production Technician I,2018-08-14,2405


In [155]:
import pandas as pd
from datetime import datetime

df = pd.read_csv("Messy_HR_Dataset_Detailed.csv")

df["FullName"] = df["FirstName"] + " " + df["LastName"]

# Assign Base Salary
salary_mapping = {
    "Production Technician I": 8500000, 
    "Production Technician II": 9500000, 
    "Area Sales Manager": 18000000,
    "CIO": 70000000,
    "Production Manager": 25000000,
    "Network Engineer": 15000000,
    "Sr. Accountant": 20000000,
    "Software Engineer": 18000000,
    "Senior BI Developer": 22000000,
    "Data Analyst": 16000000,
    "Director of Sales": 50000000,
    "Enterprise Architect": 30000000,
    "Administrative Assistant": 7000000,
    "IT Support": 10000000,
    "Sales Manager": 20000000,
    "BI Director": 40000000,
    "President & CEO": 100000000,
    "Sr. Network Engineer": 18000000,
    "BI Developer": 17000000,
    "IT Manager - Support": 25000000,
    "Shared Services Manager": 23000000,
    "IT Manager - DB": 26000000,
    "Accountant I": 12000000,
    "Principal Data Architect": 32000000,
    "Database Administrator": 17000000,
    "Sr. DBA": 22000000,
    "IT Manager - Infra": 28000000,
    "Software Engineering Manager": 30000000,
    "Director of Operations": 55000000,
    "IT Director": 60000000,
    "Data Architect": 25000000
}
df["Salary Base"] = df["Title"].str.strip().map(salary_mapping)

# Assign Base Salary Bonus Based on Employee Type
def assign_base_salary_employee_type(salary, etype):
    if etype == 'Contract':
        return 0
    elif etype == 'Full-Time':
        return (salary * 10.1 / 100) + (salary * 5 / 100)  
    elif etype == 'Part-Time':
        base = salary * 7.5 / 100
        return -1 * (base - (base * 25 / 100))
df['Subsidy (Employee Type)'] = df.apply(lambda row: assign_base_salary_employee_type(row['Salary Base'],row['EmployeeType']), axis=1)

# Assign Base Salary Bonus Based on Marital Status
def assign_base_salary_marital(salary,status):
    if status == 'Married':
        return salary * 8 / 100
    else:
        return 0
df['Subsidy (Married)'] = df.apply(lambda row: assign_base_salary_marital(row['Salary Base'],row['MaritalDesc']), axis=1)

# Assign Base Salary Bonus or Cut Based on Performance Score
def assign_base_salary_perf_score(salary,perf):
    if perf == 'Fully Meets':
        return salary * 4 / 100
    elif perf == 'Exceeds':
        return salary * 2 / 100    
    elif perf == 'Needs Improvement':
        return -1 * (salary * 1 / 100)
    elif perf == 'PIP':
        return -1 * (salary * 3 / 100)
df['Subsidy (Performance Score)'] = df.apply(lambda row: assign_base_salary_perf_score(row['Salary Base'],row['Performance Score']), axis=1)

# Assign Base Salary Bonus Based on Years Worked
df['StartDate'] = pd.to_datetime(df['StartDate'], format='%d-%b-%y')
df['CurrDate'] = pd.to_datetime(df['ExitDate'], format='%d-%b-%y', errors='coerce')
df['CurrDate'] = df['CurrDate'].fillna(pd.Timestamp.today())
# Calculate days worked
df['DaysWorked'] = (df['CurrDate'] - df['StartDate']).dt.days
df['YearsWorked'] = round((df['CurrDate'] - df['StartDate']).dt.days / 365, 2)

def assign_years_worked_bonus(salary,year):
    if year <= 1:
        return 0
    else:
        return round(salary * (year / 1.5) / 100,1)
        
df['Subsidy (Years Worked)'] = df.apply(lambda row: assign_years_worked_bonus(row['Salary Base'],row['YearsWorked']), axis=1)

# Total Salary
df["Final Salary"] = df["Salary Base"] + df["Subsidy (Employee Type)"] + df["Subsidy (Married)"] + df["Subsidy (Performance Score)"] + df["Subsidy (Years Worked)"]

# df[['FullName','Salary Base','Subsidy (Employee Type)','Subsidy (Married)','Subsidy (Performance Score)','Subsidy (Years Worked)','Final Salary']]
# Descriptive Statistic - The 7 Employee With Highest and Lowest Latest Salary
print("\n7 Employee with Last Highest Salary\n")
print(df[['FullName','Title','Salary Base','Final Salary']].sort_values(by='Final Salary',ascending=False).head(7))

print("\n7 Employee with Last Lowest Salary\n")
print(df[['FullName','Title','Salary Base','Final Salary']].sort_values(by='Final Salary',ascending=True).head(7))


7 Employee with Last Highest Salary

             FullName            Title  Salary Base  Final Salary
1920     Karley Novak  President & CEO    100000000   122693333.3
1877   Marlon Stanton  President & CEO    100000000   121720000.0
3119  Clinton Brennan  President & CEO    100000000   120800000.0
1941  Clinton Brennan  President & CEO    100000000   120800000.0
1978   Emily Davidson  President & CEO    100000000   120100000.0
2014   Camden Mcclure  President & CEO    100000000   114446666.7
1996      Yael Garcia  President & CEO    100000000   108286666.7

7 Employee with Last Lowest Salary

             FullName                     Title  Salary Base  Final Salary
957     Karla Farrell  Administrative Assistant      7000000     6396250.0
950   Larissa Douglas  Administrative Assistant      7000000     6746250.0
2207   Hezekiah Mcgee  Administrative Assistant      7000000     6980050.0
2206       Zain Nolan  Administrative Assistant      7000000     7007583.3
893    Estelle Howard 

In [167]:
# Descriptive Statistic - Salary Summary
average_salary = round(df['Final Salary'].mean(),2)
min_salary = df['Final Salary'].min()
max_salary = df['Final Salary'].max()
print(f"Average Salary : Rp. {average_salary:,.2f}")
print(f"Highest Salary : Rp. {max_salary:,.2f}")
print(f"Lowest Salary : Rp. {min_salary:,.2f}")

Average Salary : Rp. 14,926,110.31
Highest Salary : Rp. 122,693,333.30
Lowest Salary : Rp. 6,396,250.00


In [201]:
# Descriptive Statistic - Salary Summary per Title
salary_summary_title = df.groupby('Title')['Final Salary'].agg(['count', 'mean', 'sum', 'max', 'min']).round(2)

for title, row in salary_summary_title.iterrows():
    print(f"{title} ({round(row['count'])} employees)\n========================================\nAverage Salary: Rp. {row['mean']:,.2f}\nTotal Salary: Rp. {row['sum']:,.2f}\nHighest Salary: Rp. {row['max']:,.2f}\nLowest Salary: Rp. {row['min']:,.2f}\n")

Accountant I (35 employees)
Average Salary: Rp. 13,272,200.00
Total Salary: Rp. 464,527,000.00
Highest Salary: Rp. 15,499,200.00
Lowest Salary: Rp. 11,565,000.00

Administrative Assistant (35 employees)
Average Salary: Rp. 7,792,123.33
Total Salary: Rp. 272,724,316.60
Highest Salary: Rp. 9,039,800.00
Lowest Salary: Rp. 6,396,250.00

Area Sales Manager (311 employees)
Average Salary: Rp. 19,993,480.71
Total Salary: Rp. 6,217,972,500.00
Highest Salary: Rp. 23,667,600.00
Lowest Salary: Rp. 16,807,500.00

BI Developer (50 employees)
Average Salary: Rp. 19,269,959.00
Total Salary: Rp. 963,497,949.80
Highest Salary: Rp. 22,247,333.30
Lowest Salary: Rp. 16,723,750.00

BI Director (13 employees)
Average Salary: Rp. 42,975,692.31
Total Salary: Rp. 558,684,000.00
Highest Salary: Rp. 50,840,000.00
Lowest Salary: Rp. 39,723,333.30

CIO (11 employees)
Average Salary: Rp. 76,227,666.66
Total Salary: Rp. 838,504,333.30
Highest Salary: Rp. 90,846,000.00
Lowest Salary: Rp. 68,862,500.00

Data Analyst (

In [203]:
# Descriptive Statistic - Salary Summary per Division
salary_summary_div = df.groupby('Division')['Final Salary'].agg(['count', 'mean', 'sum', 'max', 'min']).round(2)

for div, row in salary_summary_div.iterrows():
    print(f"{div} ({round(row['count'])} employees)\n========================================\nAverage Salary: Rp. {row['mean']:,.2f}\nTotal Salary: Rp. {row['sum']:,.2f}\nHighest Salary: Rp. {row['max']:,.2f}\nLowest Salary: Rp. {row['min']:,.2f}\n")

Aerial (210 employees)
Average Salary: Rp. 15,410,497.94
Total Salary: Rp. 3,236,204,566.70
Highest Salary: Rp. 100,141,666.70
Lowest Salary: Rp. 7,766,875.00

Billable Consultants (24 employees)
Average Salary: Rp. 12,239,568.75
Total Salary: Rp. 293,749,650.10
Highest Salary: Rp. 22,878,000.00
Lowest Salary: Rp. 8,361,875.00

Catv (59 employees)
Average Salary: Rp. 14,840,686.72
Total Salary: Rp. 875,600,516.60
Highest Salary: Rp. 34,267,733.30
Lowest Salary: Rp. 8,434,408.30

Corp Operations (2 employees)
Average Salary: Rp. 22,463,333.35
Total Salary: Rp. 44,926,666.70
Highest Salary: Rp. 26,206,666.70
Lowest Salary: Rp. 18,720,000.00

Engineers (292 employees)
Average Salary: Rp. 14,452,912.16
Total Salary: Rp. 4,220,250,349.90
Highest Salary: Rp. 72,800,000.00
Lowest Salary: Rp. 7,766,875.00

Executive (46 employees)
Average Salary: Rp. 16,733,565.77
Total Salary: Rp. 769,744,025.20
Highest Salary: Rp. 57,200,000.00
Lowest Salary: Rp. 7,366,800.00

Field Operations (826 employees

In [207]:
# Descriptive Statistic - Salary Summary per Employee Type
salary_summary_etype = df.groupby('EmployeeType')['Final Salary'].agg(['count', 'mean', 'sum', 'max', 'min']).round(2)

for etype, row in salary_summary_etype.iterrows():
    print(f"{etype} ({round(row['count'])} employees)\n========================================\nAverage Salary: Rp. {row['mean']:,.2f}\nTotal Salary: Rp. {row['sum']:,.2f}\nHighest Salary: Rp. {row['max']:,.2f}\nLowest Salary: Rp. {row['min']:,.2f}\n")

Contract (1055 employees)
Average Salary: Rp. 14,599,755.23
Total Salary: Rp. 15,402,741,767.30
Highest Salary: Rp. 114,446,666.70
Lowest Salary: Rp. 7,280,000.00

Full-Time (1086 employees)
Average Salary: Rp. 16,553,583.30
Total Salary: Rp. 17,977,191,465.40
Highest Salary: Rp. 122,693,333.30
Lowest Salary: Rp. 8,070,533.30

Part-Time (1009 employees)
Average Salary: Rp. 13,515,673.19
Total Salary: Rp. 13,637,314,250.20
Highest Salary: Rp. 100,141,666.70
Lowest Salary: Rp. 6,396,250.00



In [211]:
# Descriptive Statistic - Salary Summary per Department Type
salary_summary_dept = df.groupby('DepartmentType')['Final Salary'].agg(['count', 'mean', 'sum', 'max', 'min']).round(2)

for dept, row in salary_summary_dept.iterrows():
    print(f"{dept} ({round(row['count'])} employees)\n========================================\nAverage Salary: Rp. {row['mean']:,.2f}\nTotal Salary: Rp. {row['sum']:,.2f}\nHighest Salary: Rp. {row['max']:,.2f}\nLowest Salary: Rp. {row['min']:,.2f}\n")

Admin Offices (85 employees)
Average Salary: Rp. 17,173,140.40
Total Salary: Rp. 1,459,716,933.60
Highest Salary: Rp. 28,242,466.70
Lowest Salary: Rp. 6,396,250.00

Executive Office (25 employees)
Average Salary: Rp. 22,392,153.34
Total Salary: Rp. 559,803,833.40
Highest Salary: Rp. 100,141,666.70
Lowest Salary: Rp. 13,706,250.00

IT/IS (459 employees)
Average Salary: Rp. 23,205,114.05
Total Salary: Rp. 10,651,147,350.30
Highest Salary: Rp. 90,846,000.00
Lowest Salary: Rp. 9,137,500.00

Production        (2115 employees)
Average Salary: Rp. 11,779,690.95
Total Salary: Rp. 24,914,046,357.30
Highest Salary: Rp. 122,693,333.30
Lowest Salary: Rp. 7,766,875.00

Sales (345 employees)
Average Salary: Rp. 19,785,531.04
Total Salary: Rp. 6,826,008,208.40
Highest Salary: Rp. 70,561,333.30
Lowest Salary: Rp. 6,980,050.00

Software Engineering (121 employees)
Average Salary: Rp. 21,541,527.27
Total Salary: Rp. 2,606,524,799.90
Highest Salary: Rp. 38,626,000.00
Lowest Salary: Rp. 17,707,500.00

