# Import Required Libraries
Import the libraries needed for data manipulation, machine learning, and model evaluation.

In [None]:
import math

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.preprocessing import LabelEncoder

R² Score: 0.9994
RMSE: 121.32

Sample of Predicted Salaries for Likely to Stay Employees:
     Age  JobRole  MonthlyIncome  P_stay  PredictedFutureSalary
574   34        6           3280     1.0              3450.8030
521   27        7           4647     1.0              5063.0020
814   40        5          19626     1.0             20637.4665
307   38        5          12061     1.0             12646.4680
855   37        4           6474     1.0              6800.7330
739   27        4           4227     1.0              4441.9935
392   54        5          19406     1.0             20364.9180
737   41        4           5003     1.0              5272.3860
738   39        4          12742     1.0             13466.8050
373   27        2           3816     1.0              4010.2310

✅ File saved as 'likely_to_stay_with_predicted_salaries.csv'


# Load and Preprocess the Dataset
Load the dataset, map the `Attrition` label to 0/1, and label-encode the remaining categorical columns.

In [None]:
# Load the dataset
df = pd.read_csv("employee_dataset_with_future_salaries.csv")

# Convert Attrition to binary (1 = "Yes", 0 = "No").
# Series.map turns every label that is missing from the dict into NaN, which
# would otherwise only surface later as an obscure error when the classifier
# is fitted - so validate the mapping here and fail with a clear message.
attrition_raw = df["Attrition"]
df["Attrition"] = attrition_raw.map({"Yes": 1, "No": 0})
unmapped_labels = attrition_raw[df["Attrition"].isna()].unique().tolist()
if unmapped_labels:
    raise ValueError(
        f"Unexpected Attrition values (expected 'Yes' or 'No'): {unmapped_labels}"
    )

# Encode categorical columns
categorical_cols = df.select_dtypes(include="object").columns.tolist()
categorical_cols = [col for col in categorical_cols if col not in ['Attrition']]  # already handled

# Fit a fresh encoder per column and keep it, so the integer codes can be
# mapped back to the original labels with
# label_encoders[col].inverse_transform(...). Reusing a single encoder would
# overwrite its classes_ on every iteration and lose all but the last mapping.
# `le` is left bound to the last column's encoder, as before.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Train Attrition Classification Model
Train a classification model to predict employee attrition.

In [None]:
# Define features and target. Both future-salary columns are excluded so the
# attrition classifier never sees the regression targets.
X_cls = df.drop(columns=["Attrition", "FutureSalary_PerformanceBased", "FutureSalary_Fixed"])
y_cls = df["Attrition"]

# Train-test split
X_cls_train, X_cls_test, y_cls_train, y_cls_test = train_test_split(X_cls, y_cls, test_size=0.2, random_state=42)

# Train the model
cls_model = RandomForestClassifier(random_state=42)
cls_model.fit(X_cls_train, y_cls_train)

# Predict probabilities of attrition for every employee.
# Scoring all of X_cls with cls_model would score 80% of the rows with a forest
# that was trained on those very rows, which pushes the probabilities towards
# 0/1 and makes P_stay over-confident. Out-of-fold predictions avoid that:
# each row is scored by a model that never saw it during training.
probs = cross_val_predict(
    RandomForestClassifier(random_state=42),
    X_cls,
    y_cls,
    cv=5,
    method="predict_proba",
)
# Columns follow the sorted class labels (0, 1), so column 1 is P(Attrition == 1).
df["P_leave"] = probs[:, 1]
df["P_stay"] = 1 - df["P_leave"]

# Filter Likely to Stay Employees
Filter employees who are likely to stay based on a threshold.

In [None]:
# Probability cut-off above which an employee counts as likely to stay.
stay_threshold = 0.6

# Strict comparison: a P_stay exactly equal to the threshold is excluded.
# .copy() detaches the subset so columns added to it later do not touch df.
df_likely_to_stay = df.loc[df["P_stay"].gt(stay_threshold)].copy()

# Train Salary Regression Model
Train a regression model to predict future salaries.

In [None]:
# Target: the performance-based future salary.
y_reg = df["FutureSalary_PerformanceBased"]

# Features: every column except the attrition label, the two future-salary
# columns and the probabilities derived from the classifier.
X_reg = df.drop(
    columns=[
        "Attrition",
        "FutureSalary_PerformanceBased",
        "FutureSalary_Fixed",
        "P_leave",
        "P_stay",
    ]
)

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

# Fit the random-forest regressor on the training portion only.
reg_model = RandomForestRegressor(random_state=42)
reg_model.fit(X=X_reg_train, y=y_reg_train)

# Predict Future Salary for Likely to Stay Employees
Use the regression model to predict future salaries for employees likely to stay.

In [None]:
# Predict future salaries for the likely-to-stay subset.
# Select exactly the columns (in the same order) that the regressor was trained
# on instead of repeating the drop list. This keeps training and prediction
# features in sync and makes the cell safe to re-run: on a second run
# df_likely_to_stay already contains "PredictedFutureSalary", which a plain
# drop would leave in the feature matrix and make predict() fail.
X_stay = df_likely_to_stay[X_reg.columns]

if X_stay.empty:
    # No employee cleared the stay threshold. predict() rejects zero-row input,
    # so add an empty float column to keep the output schema unchanged.
    df_likely_to_stay["PredictedFutureSalary"] = pd.Series(
        dtype="float64", index=df_likely_to_stay.index
    )
else:
    df_likely_to_stay["PredictedFutureSalary"] = reg_model.predict(X_stay)

# Evaluate the Regression Model
Evaluate the performance of the regression model using R² and RMSE metrics.

In [None]:
# Score the regressor on the held-out test split.
y_pred_test = reg_model.predict(X_reg_test)

# R² on the test split.
r2 = r2_score(y_true=y_reg_test, y_pred=y_pred_test)

# RMSE is the square root of the mean squared error, so it is expressed in the
# same units as the salary target.
rmse = math.sqrt(
    mean_squared_error(y_true=y_reg_test, y_pred=y_pred_test)
)

print(f"R² Score: {r2:.4f}")
print(f"RMSE: {rmse:.2f}")

# Display Sample Output
Display a sample of predicted salaries for employees likely to stay.

In [None]:
# Columns shown in the preview table.
output_cols = ["Age", "JobRole", "MonthlyIncome", "P_stay", "PredictedFutureSalary"]

# Ten employees with the highest probability of staying.
output_sample = (
    df_likely_to_stay.loc[:, output_cols]
    .sort_values("P_stay", ascending=False)
    .iloc[:10]
)

print("\nSample of Predicted Salaries for Likely to Stay Employees:")
print(output_sample)

# Save Results to CSV
Save the predictions for the likely-to-stay employees to a CSV file.

In [None]:
# Write the likely-to-stay subset, including the predicted salary column, to
# disk. index=False leaves the DataFrame index out of the file.
df_likely_to_stay.to_csv(
    path_or_buf="likely_to_stay_with_predicted_salaries.csv",
    index=False,
)
print("\n✅ File saved as 'likely_to_stay_with_predicted_salaries.csv'")