In [2]:
import pandas as pd

df = pd.read_csv("Messy_HR_Dataset_Detailed.csv")

# Descriptive Statistic : Total Budget Training Loss and Benefit By Training Outcome
failed_training_df = df[df["Training Outcome"] == "Failed"]
total_loss_cost = failed_training_df['Training Cost'].sum().round(2)
total_person_failed = failed_training_df['Training Cost'].count()
average_failed = (total_loss_cost / total_person_failed).round(2)

success_training_df = df[df["Training Outcome"].isin(["Passed", "Completed"])]
total_benefit_cost = success_training_df['Training Cost'].sum().round(2)
total_person_success = success_training_df['Training Cost'].count()
average_success = (total_benefit_cost / total_person_success).round(2)

print(f"Total Company Money's Loss By Failed Training is ${total_loss_cost} from {total_person_failed} person") 
print(f"The average cost per-Failed person is ${average_failed}")
print(f"Total Budget Spending on Success Training is ${total_benefit_cost} from {total_person_success} person") 
print(f"The average cost per-Succeed person is ${average_success}")

Total Company Money's Loss By Failed Training is $421729.55 from 751 person
The average cost per-Failed person is $561.56
Total Budget Spending on Success Training is $893810.5 from 1587 person
The average cost per-Succeed person is $563.21


In [106]:
import pandas as pd

df = pd.read_csv("Messy_HR_Dataset_Detailed.csv")

# Descriptive Statistic : Top 3 Best Employee and Employee With Lowest Score
# Define scoring for each Training Outcome
score_mapping = {
    "Failed": -1,
    "Incomplete": 0,
    "Completed": 1,
    "Passed": 2
}
perf_mapping = {
    "PIP": -1,
    "Needs Improvement": 0,
    "Exceeds": 1,
    "Fully Meets": 2
}
df["Score"] = df["Training Outcome"].map(score_mapping) + df["Satisfaction Score"] + df["Engagement Score"] + df["Current Employee Rating"] + df["Performance Score"].map(perf_mapping)

# Calculate the total score for each employee
df["FullName"] = df["FirstName"] + " " + df["LastName"]
total_scores = df.groupby("FullName")["Score"].sum().sort_values(ascending=False)

top_3 = total_scores.head(3)
low_3 = total_scores.tail(3)
print("Top 3 Best Employee")
print(top_3)
print("\nEmployee With Lowest Score")
print(low_3)

Top 3 Best Employee
FullName
Isaias Pineda     36
Kane Black        34
Kaylyn Patrick    34
Name: Score, dtype: int64

Employee With Lowest Score
FullName
Chace Church       4
Samara Roberson    3
Tanya Leonard      3
Name: Score, dtype: int64


In [50]:
import pandas as pd
from datetime import datetime

df = pd.read_csv("Messy_HR_Dataset_Detailed.csv")

# Descriptive Statistic : Show The 7 Employee Who Have Served The Longest
df["FullName"] = df["FirstName"] + " " + df["LastName"]
df['StartDate'] = pd.to_datetime(df['StartDate'], format='%d-%b-%y')

# Convert ExitDate to datetime
df['CurrDate'] = pd.to_datetime(df['ExitDate'], format='%d-%b-%y', errors='coerce')

# Replace Empty Exit Date as today date
df['CurrDate'] = df['CurrDate'].fillna(pd.Timestamp.today())

# Calculate days worked
df['DaysWorked'] = (df['CurrDate'] - df['StartDate']).dt.days
df = df.sort_values(by='DaysWorked', ascending=False)
df[['FullName','Title','StartDate','DaysWorked']].head(7)

Unnamed: 0,FullName,Title,StartDate,DaysWorked
2863,Jaiden Middleton,Production Technician I,2018-08-07,2412
1465,Saniya Buck,Production Technician II,2018-08-07,2412
2484,Estrella Ho,Production Technician I,2018-08-07,2412
1741,Rashad Mayo,Production Technician II,2018-08-08,2411
124,Emery Roach,Area Sales Manager,2018-08-11,2408
533,Andreas Torres,CIO,2018-08-11,2408
1414,Remington Bullock,Production Technician I,2018-08-14,2405
