In [1]:
# a)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_absolute_error
import matplotlib.pyplot as plt
import numpy as np

# Colab-only step: mount Google Drive so the HR workbook stored there is readable.
from google.colab import drive
drive.mount('/content/drive')

# Read the workbook into `df`, the frame every later section selects columns from.
hr_workbook_path = '/content/drive/MyDrive/Colab Notebooks/data/HrData.xlsx'
df = pd.read_excel(hr_workbook_path)

# Preview the leading rows to inspect the structure of the loaded data.
display(df.head())

# ---------------------------------------------------------------------------
# Attrition model: classify 'Attrition' from the remaining selected columns
# and attach a per-employee likelihood of leaving.
# ---------------------------------------------------------------------------

# Columns used for attrition prediction (target first, then features).
attrition_columns = [
    'Attrition',
    'JobSatisfaction', 'Education', 'PerformanceRating', 'Age',
    'JobRole', 'YearsAtCompany', 'MonthlyIncome',
]
# Drop incomplete rows and take an explicit copy, so the column assignments
# below operate on an independent frame rather than on a selection of `df`.
df_attrition = df[attrition_columns].dropna().copy()

# Encode the two text columns as integers.
# The same encoder object is refit for each column, so after this section it
# only holds the JobRole mapping.
# NOTE(review): assumes 'Attrition' holds 'Yes'/'No' so that 'Yes' becomes 1 -- confirm.
label_enc = LabelEncoder()
df_attrition['Attrition'] = label_enc.fit_transform(df_attrition['Attrition'])  # Convert 'Yes'/'No' to 1/0
df_attrition['JobRole'] = label_enc.fit_transform(df_attrition['JobRole'])  # Encode job roles

# Split data into features and target for attrition prediction
X_attrition = df_attrition.drop(columns=['Attrition'])
y_attrition = df_attrition['Attrition']

# Hold out 20% of the rows for evaluation; fixed seed keeps the split repeatable.
X_train_attrition, X_test_attrition, y_train_attrition, y_test_attrition = train_test_split(
    X_attrition, y_attrition, test_size=0.2, random_state=42
)

# Train a Random Forest classifier for attrition.
# RandomForestClassifier has no `probability` argument (that option belongs to
# SVC); passing it raises TypeError. predict_proba is available without it.
model_attrition = RandomForestClassifier(n_estimators=100, random_state=42)
model_attrition.fit(X_train_attrition, y_train_attrition)

# Predict on the held-out rows: hard labels plus the probability column for
# the class encoded as 1 (leaving, under the 'Yes' -> 1 assumption above).
y_pred_attrition = model_attrition.predict(X_test_attrition)
y_pred_prob_attrition = model_attrition.predict_proba(X_test_attrition)[:, 1]  # Probability of leaving

# Check accuracy for attrition prediction on the held-out rows
accuracy_attrition = accuracy_score(y_test_attrition, y_pred_attrition)
print(f"Attrition Model Accuracy: {accuracy_attrition:.2f}")

# Predict attrition likelihood for all employees.
# NOTE: X_attrition includes the 80% of rows the model was trained on, so
# most of these likelihoods are in-sample estimates.
df_attrition['Attrition_Likelihood'] = model_attrition.predict_proba(X_attrition)[:, 1]

# Display employees with their likelihood of leaving
display(df_attrition[['Age', 'JobRole', 'YearsAtCompany', 'MonthlyIncome', 'Attrition_Likelihood']])

# ---------------------------------------------------------------------------
# Tenure model: regress 'Tenure' on age, satisfaction, education and rating.
# ---------------------------------------------------------------------------

# Select the tenure columns and discard rows with any missing value.
tenure_columns = ['Tenure', 'Age', 'JobSatisfaction', 'Education', 'PerformanceRating']
df_tenure = df[tenure_columns].dropna()

# Target is 'Tenure'; every other selected column is a feature.
y_tenure = df_tenure['Tenure']
X_tenure = df_tenure.drop(columns=['Tenure'])

# 80/20 train/test split with a fixed seed.
tenure_split = train_test_split(X_tenure, y_tenure, test_size=0.2, random_state=42)
X_train_tenure, X_test_tenure, y_train_tenure, y_test_tenure = tenure_split

# Fit a 100-tree Random Forest regressor on the training portion.
model_tenure = RandomForestRegressor(n_estimators=100, random_state=42)
model_tenure.fit(X_train_tenure, y_train_tenure)

# Score the held-out rows with mean absolute error.
y_pred_tenure = model_tenure.predict(X_test_tenure)
mae_tenure = mean_absolute_error(y_test_tenure, y_pred_tenure)
print(f"Tenure Prediction Mean Absolute Error: {mae_tenure:.2f} years")

# Attach the model's prediction for every row (training rows included),
# then show actual and predicted tenure side by side.
df_tenure['Predicted_Tenure'] = model_tenure.predict(X_tenure)
display(df_tenure[['Tenure', 'Predicted_Tenure']])

# ---------------------------------------------------------------------------
# Performance model: classify 'PerformanceRating' and rank the features the
# forest relied on.
# ---------------------------------------------------------------------------

# Select the performance columns and discard rows with any missing value.
performance_columns = ['PerformanceRating', 'Age', 'Education', 'JobSatisfaction', 'YearsAtCompany', 'MonthlyIncome']
df_performance = df[performance_columns].dropna()

# Target is 'PerformanceRating'; every other selected column is a feature.
y_performance = df_performance['PerformanceRating']
X_performance = df_performance.drop(columns=['PerformanceRating'])

# 80/20 train/test split with a fixed seed (only the training part is used below).
perf_split = train_test_split(X_performance, y_performance, test_size=0.2, random_state=42)
X_train_perf, X_test_perf, y_train_perf, y_test_perf = perf_split

# Fit a 100-tree Random Forest classifier on the training portion.
model_performance = RandomForestClassifier(n_estimators=100, random_state=42)
model_performance.fit(X_train_perf, y_train_perf)

# Pair each feature name with the importance the fitted forest reports for it.
feature_names = X_performance.columns
feature_importances = model_performance.feature_importances_

# Horizontal bar chart of the importances.
perf_fig, perf_ax = plt.subplots(figsize=(10, 6))
perf_ax.barh(feature_names, feature_importances, color='skyblue')
perf_ax.set_xlabel("Feature Importance")
perf_ax.set_ylabel("Feature")
perf_ax.set_title("Key Factors Influencing Employee Performance Ratings")
plt.show()

# Same numbers as text, largest importance first.
importance_dict = dict(zip(feature_names, feature_importances))
sorted_importance = sorted(
    importance_dict.items(),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)
print("Key Factors Predicting Employee Performance Ratings:")
for feature, importance in sorted_importance:
    print(f"{feature}: {importance:.4f}")

# ---------------------------------------------------------------------------
# Satisfaction model: regress 'JobSatisfaction' and rank the features the
# forest relied on. The model is fitted on all complete rows (no hold-out).
# ---------------------------------------------------------------------------

# Select the satisfaction columns and discard rows with any missing value.
satisfaction_columns = ['JobSatisfaction', 'Education', 'PerformanceRating', 'Tenure', 'MonthlyIncome']
df_satisfaction = df[satisfaction_columns].dropna()

# Target is 'JobSatisfaction'; every other selected column is a feature.
y_satisfaction = df_satisfaction['JobSatisfaction']
X_satisfaction = df_satisfaction.drop(columns=['JobSatisfaction'])

# Fit a 100-tree Random Forest regressor with a fixed seed.
model_satisfaction = RandomForestRegressor(n_estimators=100, random_state=42)
model_satisfaction.fit(X_satisfaction, y_satisfaction)

# Pair each feature name with the importance the fitted forest reports for it.
feature_names_satisfaction = X_satisfaction.columns
feature_importances_satisfaction = model_satisfaction.feature_importances_

# Horizontal bar chart of the importances.
satisfaction_fig, satisfaction_ax = plt.subplots(figsize=(10, 6))
satisfaction_ax.barh(feature_names_satisfaction, feature_importances_satisfaction, color='green')
satisfaction_ax.set_xlabel("Feature Importance")
satisfaction_ax.set_ylabel("Feature")
satisfaction_ax.set_title("Key Factors Influencing Employee Satisfaction Levels")
plt.show()

# Same numbers as text, largest importance first.
importance_dict_satisfaction = dict(zip(feature_names_satisfaction, feature_importances_satisfaction))
sorted_importance_satisfaction = sorted(
    importance_dict_satisfaction.items(),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)
print("Key Factors Predicting Employee Satisfaction Levels:")
for feature, importance in sorted_importance_satisfaction:
    print(f"{feature}: {importance:.4f}")



20


In [None]:
# b)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_absolute_error
import matplotlib.pyplot as plt
import numpy as np

# Colab-only step: mount Google Drive so the HR workbook stored there is readable.
from google.colab import drive
drive.mount('/content/drive')

# Read the workbook into `df`, the frame every later section selects columns from.
hr_workbook_path = '/content/drive/MyDrive/Colab Notebooks/data/HrData.xlsx'
df = pd.read_excel(hr_workbook_path)

# Preview the leading rows to inspect the structure of the loaded data.
display(df.head())

# ---------------------------------------------------------------------------
# Attrition model: classify 'Attrition' from the remaining selected columns
# and attach a per-employee likelihood of leaving.
# ---------------------------------------------------------------------------

# Columns used for attrition prediction (target first, then features).
attrition_columns = [
    'Attrition',
    'JobSatisfaction', 'Education', 'PerformanceRating', 'Age',
    'JobRole', 'YearsAtCompany', 'MonthlyIncome',
]
# Drop incomplete rows and take an explicit copy, so the column assignments
# below operate on an independent frame rather than on a selection of `df`.
df_attrition = df[attrition_columns].dropna().copy()

# Encode the two text columns as integers.
# The same encoder object is refit for each column (and again later in this
# cell), so it never retains the 'Attrition' mapping.
# NOTE(review): assumes 'Attrition' holds 'Yes'/'No' so that 'Yes' becomes 1 -- confirm.
label_enc = LabelEncoder()
df_attrition['Attrition'] = label_enc.fit_transform(df_attrition['Attrition'])  # Convert 'Yes'/'No' to 1/0
df_attrition['JobRole'] = label_enc.fit_transform(df_attrition['JobRole'])  # Encode job roles

# Split data into features and target for attrition prediction
X_attrition = df_attrition.drop(columns=['Attrition'])
y_attrition = df_attrition['Attrition']

# Hold out 20% of the rows for evaluation; fixed seed keeps the split repeatable.
X_train_attrition, X_test_attrition, y_train_attrition, y_test_attrition = train_test_split(
    X_attrition, y_attrition, test_size=0.2, random_state=42
)

# Train a Random Forest classifier for attrition.
# RandomForestClassifier has no `probability` argument (that option belongs to
# SVC); passing it raises TypeError. predict_proba is available without it.
model_attrition = RandomForestClassifier(n_estimators=100, random_state=42)
model_attrition.fit(X_train_attrition, y_train_attrition)

# Predict on the held-out rows: hard labels plus the probability column for
# the class encoded as 1 (leaving, under the 'Yes' -> 1 assumption above).
y_pred_attrition = model_attrition.predict(X_test_attrition)
y_pred_prob_attrition = model_attrition.predict_proba(X_test_attrition)[:, 1]  # Probability of leaving

# Check accuracy for attrition prediction on the held-out rows
accuracy_attrition = accuracy_score(y_test_attrition, y_pred_attrition)
print(f"Attrition Model Accuracy: {accuracy_attrition:.2f}")

# Predict attrition likelihood for all employees.
# NOTE: X_attrition includes the 80% of rows the model was trained on, so
# most of these likelihoods are in-sample estimates.
df_attrition['Attrition_Likelihood'] = model_attrition.predict_proba(X_attrition)[:, 1]

# Display employees with their likelihood of leaving
display(df_attrition[['Age', 'JobRole', 'YearsAtCompany', 'MonthlyIncome', 'Attrition_Likelihood']])

# ---------------------------------------------------------------------------
# Tenure model: regress 'Tenure' on age, satisfaction, education and rating.
# ---------------------------------------------------------------------------

# Select the tenure columns and discard rows with any missing value.
tenure_columns = ['Tenure', 'Age', 'JobSatisfaction', 'Education', 'PerformanceRating']
df_tenure = df[tenure_columns].dropna()

# Target is 'Tenure'; every other selected column is a feature.
y_tenure = df_tenure['Tenure']
X_tenure = df_tenure.drop(columns=['Tenure'])

# 80/20 train/test split with a fixed seed.
tenure_split = train_test_split(X_tenure, y_tenure, test_size=0.2, random_state=42)
X_train_tenure, X_test_tenure, y_train_tenure, y_test_tenure = tenure_split

# Fit a 100-tree Random Forest regressor on the training portion.
model_tenure = RandomForestRegressor(n_estimators=100, random_state=42)
model_tenure.fit(X_train_tenure, y_train_tenure)

# Score the held-out rows with mean absolute error.
y_pred_tenure = model_tenure.predict(X_test_tenure)
mae_tenure = mean_absolute_error(y_test_tenure, y_pred_tenure)
print(f"Tenure Prediction Mean Absolute Error: {mae_tenure:.2f} years")

# Attach the model's prediction for every row (training rows included),
# then show actual and predicted tenure side by side.
df_tenure['Predicted_Tenure'] = model_tenure.predict(X_tenure)
display(df_tenure[['Tenure', 'Predicted_Tenure']])

# ---------------------------------------------------------------------------
# Satisfaction model (with BusinessTravel): regress 'JobSatisfaction' and
# rank the features the forest relied on. Fitted on all complete rows.
# ---------------------------------------------------------------------------

# Select the satisfaction columns and discard rows with any missing value.
satisfaction_columns = ['JobSatisfaction', 'Education', 'PerformanceRating', 'Tenure', 'MonthlyIncome', 'BusinessTravel']
df_satisfaction = df[satisfaction_columns].dropna()

# Turn the BusinessTravel categories into integer codes. This refits the
# `label_enc` encoder created for the attrition section earlier in this cell.
df_satisfaction['BusinessTravel'] = label_enc.fit_transform(df_satisfaction['BusinessTravel'])

# Target is 'JobSatisfaction'; every other selected column is a feature.
y_satisfaction = df_satisfaction['JobSatisfaction']
X_satisfaction = df_satisfaction.drop(columns=['JobSatisfaction'])

# Fit a 100-tree Random Forest regressor with a fixed seed.
model_satisfaction = RandomForestRegressor(n_estimators=100, random_state=42)
model_satisfaction.fit(X_satisfaction, y_satisfaction)

# Pair each feature name with the importance the fitted forest reports for it.
feature_names_satisfaction = X_satisfaction.columns
feature_importances_satisfaction = model_satisfaction.feature_importances_

# Horizontal bar chart of the importances.
satisfaction_fig, satisfaction_ax = plt.subplots(figsize=(10, 6))
satisfaction_ax.barh(feature_names_satisfaction, feature_importances_satisfaction, color='purple')
satisfaction_ax.set_xlabel("Feature Importance")
satisfaction_ax.set_ylabel("Feature")
satisfaction_ax.set_title("Key Factors Influencing Employee Job Satisfaction")
plt.show()

# Same numbers as text, largest importance first.
importance_dict_satisfaction = dict(zip(feature_names_satisfaction, feature_importances_satisfaction))
sorted_importance_satisfaction = sorted(
    importance_dict_satisfaction.items(),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)
print("Key Factors Predicting Employee Job Satisfaction:")
for feature, importance in sorted_importance_satisfaction:
    print(f"{feature}: {importance:.4f}")