In [None]:
import pandas as pd
import numpy as np

# Read the "likely to stay" table from disk; it is assumed to already contain
# the salary predictions produced by the earlier steps.
# (When continuing from a live session, the in-memory `df_likely_to_stay`
# and `df` can be used instead of re-reading the file.)
df = pd.read_csv(filepath_or_buffer="likely_to_stay_with_predicted_salaries.csv")

# Read the complete employee table (everyone, not only the subset above).
df_all = pd.read_csv(filepath_or_buffer="employee_dataset_with_future_salaries.csv")

✅ Total Expected Salary Loss: ₹1,262,643.56

🔝 Top 10 High-Risk High-Loss Employees:
      Age  JobRole  MonthlyIncome  P_leave  PredictedFutureSalary  \
595    58        5          19246     0.77             20241.0285   
913    45        3          18824     0.77             19755.0430   
749    52        3          19845     0.70             20861.0220   
45     41        5          19545     0.71             20515.0470   
568    55        3          19859     0.68             20868.5205   
975    55        7          13695     0.78             14420.5900   
1223   47        7          12936     0.80             13642.1905   
706    40        7          13194     0.78             13862.8075   
435    33        3          13610     0.74             14296.7830   
838    42        7          13758     0.65             14527.7405   

      ExpectedLoss  
595   15585.591945  
913   15211.383110  
749   14602.715400  
45    14565.683370  
568   14190.593940  
975   11248.060200  
1223  10

In [None]:
# Step 1: Ensure Attrition is binary (if not already done)
# Only map while the column still holds the raw "Yes"/"No" labels. Mapping an
# already-numeric column (e.g. when this cell is run a second time) would find
# no matching keys and replace every value with NaN.
if not pd.api.types.is_numeric_dtype(df_all["Attrition"]):
    df_all["Attrition"] = df_all["Attrition"].map({"Yes": 1, "No": 0})

In [None]:
# Step 2: Label-encode the remaining text columns (consistent with training)
from sklearn.preprocessing import LabelEncoder

# Every object-dtype column is treated as categorical; the target column is
# excluded so it is never re-encoded here.
categorical_cols = [
    col
    for col in df_all.select_dtypes(include="object").columns
    if col != "Attrition"
]

# One encoder instance is re-fitted on each column in turn, so after the loop
# `le` only holds the classes of the last column processed.
le = LabelEncoder()
for col in categorical_cols:
    df_all[col] = le.fit_transform(df_all[col])

In [None]:
# Step 3: Predict attrition probability again
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import math

# Features = everything except the target and the future-salary columns.
# The second drop removes columns that this cell and the later steps write back
# into `df_all`. On a first run they do not exist yet (hence errors="ignore");
# on a re-run they would otherwise be fed to the model as features and leak
# the previous predictions into the new fit.
X_cls = (
    df_all
    .drop(columns=["Attrition", "FutureSalary_PerformanceBased", "FutureSalary_Fixed"])
    .drop(columns=["P_leave", "P_stay", "PredictedFutureSalary", "ExpectedLoss"], errors="ignore")
)
y_cls = df_all["Attrition"]
cls_model = RandomForestClassifier(random_state=42)
cls_model.fit(X_cls, y_cls)

# Add attrition probabilities
# Column 1 of predict_proba is the second entry of `cls_model.classes_`
# (label 1 when the target contains both 0 and 1).
# NOTE(review): probabilities are predicted on the same rows the model was
# fitted on, so they are in-sample estimates - confirm this is intended.
probs = cls_model.predict_proba(X_cls)
df_all["P_leave"] = probs[:, 1]
df_all["P_stay"] = 1 - df_all["P_leave"]

In [None]:
# Step 4: Predict future salary again
# Features = everything except the targets and the attrition probabilities
# added in Step 3. The second drop removes columns written by this cell and by
# Step 5: they are absent on a first run (hence errors="ignore"), but on a
# re-run they are derived from the regression target and must not be features.
X_reg = (
    df_all
    .drop(columns=["Attrition", "FutureSalary_PerformanceBased", "FutureSalary_Fixed", "P_leave", "P_stay"])
    .drop(columns=["PredictedFutureSalary", "ExpectedLoss"], errors="ignore")
)
y_reg = df_all["FutureSalary_PerformanceBased"]
reg_model = RandomForestRegressor(random_state=42)
reg_model.fit(X_reg, y_reg)

# NOTE(review): predictions are made on the same rows used for fitting
# (in-sample) - confirm this is intended.
df_all["PredictedFutureSalary"] = reg_model.predict(X_reg)

In [None]:
# Step 5: Expected salary loss per row = probability of leaving x predicted future salary
df_all["ExpectedLoss"] = df_all["P_leave"].mul(df_all["PredictedFutureSalary"])

In [None]:
# Step 6: Add up the per-row expected loss into a single total
total_loss = df_all.loc[:, "ExpectedLoss"].sum()

In [None]:
# Step 7: Keep the reporting columns and rank rows from largest to smallest
# expected loss so the high-risk, high-loss individuals come first.
df_loss_sorted = (
    df_all
    .loc[:, ["Age", "JobRole", "MonthlyIncome", "P_leave", "PredictedFutureSalary", "ExpectedLoss"]]
    .sort_values("ExpectedLoss", ascending=False)
)

In [None]:
# Step 8: Write the full table (all columns, without the index) to CSV
df_all.to_csv(
    path_or_buf="employee_attrition_risk_with_expected_loss.csv",
    index=False,
)

In [None]:
# Step 9: Print the headline total, a heading, then the ten largest expected losses
print(
    f"✅ Total Expected Salary Loss: ₹{total_loss:,.2f}\n",
    "🔝 Top 10 High-Risk High-Loss Employees:",
    sep="\n",
)
print(df_loss_sorted.iloc[:10])