# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error
import math

# -------- Part 1: Load and Preprocess Data --------
df = pd.read_csv("employee_dataset_with_future_salaries.csv")

# Convert Attrition to binary ("Yes" -> 1, "No" -> 0).
# Series.map() turns every label that is missing from the mapping into NaN,
# so fail loudly here instead of handing NaN targets to the classifier.
attrition_raw = df["Attrition"]
df["Attrition"] = attrition_raw.map({"Yes": 1, "No": 0})
unmapped = df["Attrition"].isna()
if unmapped.any():
    bad_values = attrition_raw[unmapped].unique().tolist()
    raise ValueError(
        f"Unexpected Attrition values (expected 'Yes'/'No'): {bad_values}"
    )

# Encode categorical columns: every remaining object-dtype column is replaced
# by integer codes.
categorical_cols = [
    col
    for col in df.select_dtypes(include="object").columns
    if col != "Attrition"
]

# One fitted encoder per column, kept in `label_encoders`, so the integer
# codes can be mapped back to the original labels later with
# label_encoders[col].inverse_transform(...). Refitting a single shared
# encoder would discard the mapping of every column but the last one.
label_encoders = {}
le = LabelEncoder()  # stays defined even when there are no categorical columns
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# -------- Part 2: Train XGBoost Attrition Classification Model --------
# Features: every column of df except the Attrition label and the two
# FutureSalary_* columns.
X_cls = df.drop(
    ["Attrition", "FutureSalary_PerformanceBased", "FutureSalary_Fixed"],
    axis=1,
)
y_cls = df["Attrition"]

# Fixed-seed 80/20 split; only the training portion is used to fit the model.
(X_cls_train, X_cls_test,
 y_cls_train, y_cls_test) = train_test_split(
    X_cls,
    y_cls,
    test_size=0.2,
    random_state=42,
)

cls_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42,
)
cls_model.fit(X_cls_train, y_cls_train)

# Predict probabilities of attrition for every row of df.
# NOTE(review): this includes the rows the classifier was just fitted on, so
# their probabilities are in-sample estimates.
probs = cls_model.predict_proba(X_cls)
df["P_leave"] = probs[:, 1]  # second column -> probability of Attrition == 1
df["P_stay"] = 1 - df["P_leave"]

# -------- Part 3: Filter Likely to Stay Employees --------
# Keep only the rows whose predicted stay probability is strictly above the
# cutoff. The .copy() detaches the subset so columns added to it later do not
# touch df.
stay_threshold = 0.6
df_likely_to_stay = df.loc[df["P_stay"].gt(stay_threshold)].copy()

# -------- Part 4: Train XGBoost Salary Regression Model --------
# Features: df without the attrition label, both FutureSalary_* columns and
# the two probability columns derived from the classifier.
X_reg = df.drop(
    [
        "Attrition",
        "FutureSalary_PerformanceBased",
        "FutureSalary_Fixed",
        "P_leave",
        "P_stay",
    ],
    axis=1,
)
# Regression target: the performance-based future salary.
y_reg = df["FutureSalary_PerformanceBased"]

# Fixed-seed 80/20 split; the held-out part is used for evaluation in Part 6.
(X_reg_train, X_reg_test,
 y_reg_train, y_reg_test) = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training chain.
reg_model = XGBRegressor(random_state=42).fit(X_reg_train, y_reg_train)

# -------- Part 5: Predict Future Salary for Likely to Stay Employees --------
# Drop the same non-feature columns that were removed when building X_reg, so
# the regressor receives the feature layout it was trained on.
X_stay = df_likely_to_stay.drop(
    [
        "Attrition",
        "FutureSalary_PerformanceBased",
        "FutureSalary_Fixed",
        "P_leave",
        "P_stay",
    ],
    axis=1,
)
predicted_salary = reg_model.predict(X_stay)
df_likely_to_stay["PredictedFutureSalary"] = predicted_salary

# -------- Part 6: Evaluate the Regression Model --------
# Score the regressor on the held-out 20% split from Part 4.
y_pred_test = reg_model.predict(X_reg_test)

r2 = r2_score(y_reg_test, y_pred_test)
mse = mean_squared_error(y_reg_test, y_pred_test)
rmse = math.sqrt(mse)  # root of the MSE, i.e. error in the target's own units

print(f"R² Score: {r2:.4f}")
print(f"RMSE: {rmse:.2f}")

# -------- Part 7: Display Sample Output --------
# Show the ten employees with the highest predicted stay probability.
# NOTE(review): if JobRole was label-encoded in Part 1 it is printed here as
# an integer code rather than the original role name — confirm.
output_cols = ["Age", "JobRole", "MonthlyIncome", "P_stay", "PredictedFutureSalary"]
ranked_by_stay = df_likely_to_stay[output_cols].sort_values(
    by="P_stay",
    ascending=False,
)
output_sample = ranked_by_stay.iloc[:10]

print("\nSample of Predicted Salaries for Likely to Stay Employees:")
print(output_sample)

# -------- Part 8: Save to CSV --------
# Write the likely-to-stay subset, including its predicted salary column, to
# disk; the row index is not written.
df_likely_to_stay.to_csv(
    path_or_buf="likely_to_stay_with_predicted_salaries_xgb.csv",
    index=False,
)
print("\n✅ File saved as 'likely_to_stay_with_predicted_salaries_xgb.csv'")
