In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_absolute_error
import matplotlib.pyplot as plt
import numpy as np

# Mount Google Drive inside the Colab runtime so the workbook stored there
# can be opened through the local filesystem path below.
from google.colab import drive

drive.mount('/content/drive')

# Read the HR workbook into a DataFrame that the rest of the notebook uses.
hr_workbook_path = '/content/drive/MyDrive/Colab Notebooks/data/HrData.xlsx'
df = pd.read_excel(hr_workbook_path)

# Preview the leading rows to check which columns are available.
display(df.head())

# Columns used by the attrition model: the target ('Attrition') followed by
# the predictors.
attrition_columns = [
    'Attrition',
    'JobSatisfaction',
    'Education',
    'PerformanceRating',
    'Age',
    'JobRole',
    'YearsAtCompany',
    'MonthlyIncome',
    'OverTime',
]

# Work on the selected columns only, discarding any row with a missing value.
df_attrition = df[attrition_columns].dropna()

# Replace label-valued columns with integer codes. One encoder is refit for
# each column in turn, so after this loop it only remembers the 'OverTime'
# classes.
# NOTE(review): presumably 'Attrition' and 'OverTime' hold 'Yes'/'No', which
# LabelEncoder's sorted class order would map to 1/0 - confirm against data.
label_enc = LabelEncoder()
for _encoded_column in ('Attrition', 'JobRole', 'OverTime'):
    df_attrition[_encoded_column] = label_enc.fit_transform(df_attrition[_encoded_column])

# Separate the prediction target from the feature matrix.
y_attrition = df_attrition['Attrition']
X_attrition = df_attrition.drop(columns=['Attrition'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
(
    X_train_attrition,
    X_test_attrition,
    y_train_attrition,
    y_test_attrition,
) = train_test_split(X_attrition, y_attrition, test_size=0.2, random_state=42)

# Fit a 100-tree random forest classifier on the training portion
# (fit() returns the estimator itself, so the call can be chained).
model_attrition = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_attrition, y_train_attrition
)

# Score the held-out rows: hard class labels, plus the probability of the
# second class (column 1 of predict_proba).
y_pred_attrition = model_attrition.predict(X_test_attrition)
attrition_test_proba = model_attrition.predict_proba(X_test_attrition)
y_pred_prob_attrition = attrition_test_proba[:, 1]

# Report the share of held-out rows whose label was predicted correctly.
accuracy_attrition = accuracy_score(y_test_attrition, y_pred_attrition)
print(f"Attrition Model Accuracy: {accuracy_attrition:.2f}")

# Score every employee with the fitted classifier. Column 1 of predict_proba
# is the probability of the second class in model_attrition.classes_, i.e.
# the encoded Attrition value 1.
# NOTE(review): rows that were part of the training split are scored
# in-sample here, so their likelihoods are optimistic compared with the
# held-out accuracy; consider out-of-fold predictions if these scores drive
# decisions.
df_attrition['Attrition_Likelihood'] = model_attrition.predict_proba(X_attrition)[:, 1]

# Flag employees whose predicted likelihood exceeds the risk threshold.
high_risk_threshold = 0.6
df_high_risk = df_attrition[df_attrition['Attrition_Likelihood'] > high_risk_threshold]
display(df_high_risk[['Age', 'JobRole', 'YearsAtCompany', 'MonthlyIncome', 'OverTime', 'Attrition_Likelihood']])

# Compare the mean predicted likelihood between the encoded OverTime groups
# across ALL employees. Grouping only the high-risk subset would restrict
# every row to likelihoods above the threshold, so both group means would be
# forced above it and the comparison would say little about overtime itself.
overtime_impact = df_attrition.groupby('OverTime')['Attrition_Likelihood'].mean()
print("Impact of Overtime on Attrition:")
print(overtime_impact)

# Fixed list of retention recommendations; these are static text and are not
# derived from the model output.
retention_strategies = (
    "1. Improve job satisfaction through engagement programs.",
    "2. Offer career development opportunities.",
    "3. Review compensation and benefits.",
    "4. Provide flexible work options.",
    "5. Foster a positive work culture and recognition.",
)

print("Strategies to Reduce Attrition:")
# One recommendation per output line.
print(*retention_strategies, sep="\n")

# Columns used by the tenure model: the target ('Tenure') followed by the
# predictors.
tenure_columns = ['Tenure', 'Age', 'JobSatisfaction', 'Education', 'PerformanceRating']

# Restrict to those columns and keep complete rows only.
df_tenure = df[tenure_columns].dropna()

# Separate the regression target from the feature matrix.
y_tenure = df_tenure['Tenure']
X_tenure = df_tenure.drop(columns=['Tenure'])

# Hold out 20% of the rows for evaluation, with a fixed seed for
# reproducibility.
(
    X_train_tenure,
    X_test_tenure,
    y_train_tenure,
    y_test_tenure,
) = train_test_split(X_tenure, y_tenure, test_size=0.2, random_state=42)

# Fit a 100-tree random forest regressor on the training portion
# (fit() returns the estimator itself, so the call can be chained).
model_tenure = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_tenure, y_train_tenure
)

# Evaluate on the held-out rows using mean absolute error.
# NOTE(review): the message labels the error in years - assumes 'Tenure' is
# recorded in years; confirm against the dataset.
y_pred_tenure = model_tenure.predict(X_test_tenure)
mae_tenure = mean_absolute_error(y_test_tenure, y_pred_tenure)
print(f"Tenure Prediction Mean Absolute Error: {mae_tenure:.2f} years")

# Attach a prediction for every row (training rows included) and show it
# next to the recorded tenure.
df_tenure['Predicted_Tenure'] = model_tenure.predict(X_tenure)
display(df_tenure[['Tenure', 'Predicted_Tenure']])

# Columns used by the salary model: the target ('MonthlyIncome') followed by
# the predictors (department, years at the company, job role).
salary_columns = ['MonthlyIncome', 'Department', 'YearsAtCompany', 'JobRole']

# Restrict to those columns and keep complete rows only.
df_salary = df[salary_columns].dropna()

# Replace the label-valued predictors with integer codes, reusing the
# notebook's shared encoder (refit per column, so it ends up fitted on
# 'JobRole').
for _encoded_column in ('Department', 'JobRole'):
    df_salary[_encoded_column] = label_enc.fit_transform(df_salary[_encoded_column])

# Separate the regression target from the feature matrix.
y_salary = df_salary['MonthlyIncome']
X_salary = df_salary.drop(columns=['MonthlyIncome'])

# Hold out 20% of the rows for evaluation, with a fixed seed for
# reproducibility.
(
    X_train_salary,
    X_test_salary,
    y_train_salary,
    y_test_salary,
) = train_test_split(X_salary, y_salary, test_size=0.2, random_state=42)

# Fit a 100-tree random forest regressor on the training portion
# (fit() returns the estimator itself, so the call can be chained).
model_salary = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_salary, y_train_salary
)

# Evaluate on the held-out rows using mean absolute error.
y_pred_salary = model_salary.predict(X_test_salary)
mae_salary = mean_absolute_error(y_test_salary, y_pred_salary)
print(f"Salary Prediction Mean Absolute Error: ${mae_salary:.2f}")

# Attach a prediction for every row (training rows included) and show it
# next to the recorded income.
df_salary['Predicted_Salary'] = model_salary.predict(X_salary)
display(df_salary[['MonthlyIncome', 'Predicted_Salary']])
