<a href="https://colab.research.google.com/github/1vanl0pez/DataScienceSantiagoIvan/blob/main/DSPRO1_HRAnalytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
# Import libraries for data management
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the HR dataset from the Excel workbook into a DataFrame
df = pd.read_excel(io='general_data.xlsx')
# Show the column headers and the first five rows to confirm the file was read correctly
df.head(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,Gender,JobLevel,JobRole,...,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,JobInvolvement,PerformanceRating
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,Female,1,Healthcare Representative,...,1.0,6,1,0,0,3.0,4.0,2.0,3,3
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,Female,1,Research Scientist,...,6.0,3,5,1,4,3.0,2.0,4.0,2,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,Male,4,Sales Executive,...,5.0,2,5,0,3,2.0,2.0,1.0,3,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,Male,3,Human Resources,...,13.0,5,8,7,5,4.0,4.0,3.0,2,3
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,Male,1,Sales Executive,...,9.0,2,6,0,4,4.0,1.0,3.0,3,3


In [3]:
# Compute the main summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns of the dataset
df.describe()

Unnamed: 0,Age,DistanceFromHome,Education,JobLevel,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,JobInvolvement,PerformanceRating
count,4410.0,4410.0,4410.0,4410.0,4410.0,4391.0,4410.0,4410.0,4401.0,4410.0,4410.0,4410.0,4410.0,4385.0,4390.0,4372.0,4410.0,4410.0
mean,36.92381,9.192517,2.912925,2.063946,65029.312925,2.69483,15.209524,0.793878,11.279936,2.79932,7.008163,2.187755,4.123129,2.723603,2.728246,2.761436,2.729932,3.153741
std,9.133301,8.105026,1.023933,1.106689,47068.888559,2.498887,3.659108,0.851883,7.782222,1.288978,6.125135,3.221699,3.567327,1.092756,1.101253,0.706245,0.7114,0.360742
min,18.0,1.0,1.0,1.0,10090.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,3.0
25%,30.0,2.0,2.0,1.0,29110.0,1.0,12.0,0.0,6.0,2.0,3.0,0.0,2.0,2.0,2.0,2.0,2.0,3.0
50%,36.0,7.0,3.0,2.0,49190.0,2.0,14.0,1.0,10.0,3.0,5.0,1.0,3.0,3.0,3.0,3.0,3.0,3.0
75%,43.0,14.0,4.0,3.0,83800.0,4.0,18.0,1.0,15.0,3.0,9.0,3.0,7.0,4.0,4.0,3.0,3.0,3.0
max,60.0,29.0,5.0,5.0,199990.0,9.0,25.0,3.0,40.0,6.0,40.0,15.0,17.0,4.0,4.0,4.0,4.0,4.0


In [4]:
# Display the data type and the non-null count for each variable;
# columns whose non-null count is below the number of rows contain missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4410 entries, 0 to 4409
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      4410 non-null   int64  
 1   Attrition                4410 non-null   object 
 2   BusinessTravel           4410 non-null   object 
 3   Department               4410 non-null   object 
 4   DistanceFromHome         4410 non-null   int64  
 5   Education                4410 non-null   int64  
 6   EducationField           4410 non-null   object 
 7   Gender                   4410 non-null   object 
 8   JobLevel                 4410 non-null   int64  
 9   JobRole                  4410 non-null   object 
 10  MaritalStatus            4410 non-null   object 
 11  MonthlyIncome            4410 non-null   int64  
 12  NumCompaniesWorked       4391 non-null   float64
 13  PercentSalaryHike        4410 non-null   int64  
 14  StockOptionLevel        

In [6]:
# DATA CLEANING
# Step 1: Handle missing values
# Replace missing numeric values with the median of the respective column.
# Median is more resilient to outliers than the mean.

# Collect the names of the columns that contain at least one null value
columns_with_na = df.columns[df.isnull().sum() > 0].tolist()

# Compute one median per affected column. `numeric_only=True` skips any
# non-numeric column, for which a median is undefined.
# (The previous code called `.median()` on the column *name*, a string,
# which raises AttributeError and never filled anything.)
column_medians = df[columns_with_na].median(numeric_only=True)

# Replacement: a Series indexed by column name fills each column with its own median
df = df.fillna(column_medians)

In [9]:
# Step 2: One-hot encode the categorical variables so every column is numeric.
# drop_first=True drops one indicator per category, so the remaining indicator
# columns are not perfectly collinear (the dropped level is implied by the others).
df = pd.get_dummies(data=df, drop_first=True)

# Inspect the encoded dataset: first rows plus the total count of remaining missing values
(df.head(), df.isna().sum().sum())

(   Age  DistanceFromHome  Education  JobLevel  MonthlyIncome  \
 0   51                 6          2         1         131160   
 1   31                10          1         1          41890   
 2   32                17          4         4         193280   
 3   38                 2          5         3          83210   
 4   32                10          1         1          23420   
 
    NumCompaniesWorked  PercentSalaryHike  StockOptionLevel  TotalWorkingYears  \
 0                 1.0                 11                 0                1.0   
 1                 0.0                 23                 1                6.0   
 2                 1.0                 15                 3                5.0   
 3                 3.0                 11                 3               13.0   
 4                 4.0                 12                 2                9.0   
 
    TrainingTimesLastYear  ...  JobRole_Human Resources  \
 0                      6  ...                    False

In [14]:
# MODEL CONSTRUCTION
# Split the encoded data by attrition outcome:
#   data_left   -> employees who left   (Attrition_Yes == 1)
#   data_stayed -> employees who stayed (Attrition_Yes == 0)
left_rows = df['Attrition_Yes'] == 1
stayed_rows = df['Attrition_Yes'] == 0
data_left = df[left_rows]
data_stayed = df[stayed_rows]

# For each group the output (target) variable is JobSatisfaction and the
# input variables are every other column.
# NOTE(review): Attrition_Yes remains among the inputs although it is constant
# within each group, so it carries no information for these models.

# Employees who left
y_left = data_left["JobSatisfaction"]
X_left = data_left.drop("JobSatisfaction", axis=1)

# Employees who stayed
y_stayed = data_stayed["JobSatisfaction"]
X_stayed = data_stayed.drop("JobSatisfaction", axis=1)

In [18]:
# 5-fold cross-validation splitter; rows are shuffled with a fixed seed so the
# fold assignment is reproducible between runs
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# Function to perform K-Fold Cross-Validation and capture feature importance
def calculate_feature_importances(X, y):
    """Cross-validate a random forest and average its feature importances.

    For every fold produced by the module-level ``kf`` splitter, a fresh
    ``RandomForestRegressor`` (100 trees, fixed seed) is fitted on the
    training rows and scored on the held-out rows.

    Parameters
    ----------
    X : DataFrame of input variables (rows are selected positionally with ``.iloc``).
    y : Series with the output variable, aligned row-by-row with ``X``.

    Returns
    -------
    tuple
        ``(avg_importances, avg_error)`` where ``avg_importances`` is the
        per-feature importance averaged over the folds (same order as the
        columns of ``X``) and ``avg_error`` is the mean of the per-fold
        mean squared errors.
    """
    per_fold_importances, per_fold_mse = [], []

    for train_idx, test_idx in kf.split(X):
        # A new model per fold, so no fitted state leaks between folds
        forest = RandomForestRegressor(n_estimators=100, random_state=42)
        forest.fit(X.iloc[train_idx], y.iloc[train_idx])

        # Score on the rows held out for this fold
        predictions = forest.predict(X.iloc[test_idx])
        per_fold_mse.append(mean_squared_error(y.iloc[test_idx], predictions))

        # Keep this fold's importances for averaging afterwards
        per_fold_importances.append(forest.feature_importances_)

    # axis=0 averages across folds, leaving one value per feature
    return np.mean(per_fold_importances, axis=0), np.mean(per_fold_mse)

# Run the cross-validated importance estimation once per group
importances_left, error_left = calculate_feature_importances(X_left, y_left)
importances_stayed, error_stayed = calculate_feature_importances(X_stayed, y_stayed)

# Pair every feature name with its averaged importance and rank them,
# most important first
feature_importances_left = (
    pd.DataFrame({"Feature": X_left.columns, "Importance": importances_left})
    .sort_values(by="Importance", ascending=False)
)
feature_importances_stayed = (
    pd.DataFrame({"Feature": X_stayed.columns, "Importance": importances_stayed})
    .sort_values(by="Importance", ascending=False)
)

# Report the top 10 features and the cross-validated error for each group
print("Top 10 most relevant features for job satisfaction")
for heading, ranking, error_label, error_value in (
    ("\nFor employees who left", feature_importances_left,
     "\nMean Squared Error for Left Group:", error_left),
    ("\nFor employees who stayed:", feature_importances_stayed,
     "\nMean Squared Error for Stayed Group:", error_stayed),
):
    print(heading)
    print(ranking.head(10))
    print(error_label, error_value)

Top 10 most relevant features for job satisfaction

For employees who left
                    Feature  Importance
4             MonthlyIncome    0.119644
6         PercentSalaryHike    0.091472
1          DistanceFromHome    0.071871
8         TotalWorkingYears    0.062593
0                       Age    0.060396
10           YearsAtCompany    0.048236
5        NumCompaniesWorked    0.046164
12     YearsWithCurrManager    0.044945
11  YearsSinceLastPromotion    0.041712
2                 Education    0.038433

Mean Squared Error for Left Group: 0.13347001948520965

For employees who stayed:
                    Feature  Importance
4             MonthlyIncome    0.119164
0                       Age    0.080044
1          DistanceFromHome    0.078463
8         TotalWorkingYears    0.064581
6         PercentSalaryHike    0.062372
10           YearsAtCompany    0.052213
12     YearsWithCurrManager    0.048373
5        NumCompaniesWorked    0.045673
11  YearsSinceLastPromotion    0.044616
9 

In [15]:
# Fit one random forest per group on ALL rows of that group (no hold-out split).
# fit() returns the fitted estimator, so creation and training are chained.
# NOTE: this cell rebinds importances_* and feature_importances_*, the same
# names the cross-validation cell assigns.
model_left = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_left, y_left)
model_stayed = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_stayed, y_stayed)

# Feature importances learned by each fitted model
importances_left = model_left.feature_importances_
importances_stayed = model_stayed.feature_importances_

# Rank the features of each group by importance, highest first
feature_importances_left = (
    pd.DataFrame({"Feature": X_left.columns, "Importance": importances_left})
    .sort_values(by="Importance", ascending=False)
)
feature_importances_stayed = (
    pd.DataFrame({"Feature": X_stayed.columns, "Importance": importances_stayed})
    .sort_values(by="Importance", ascending=False)
)

# Display the top 10 most important features for each group
print("Top 10 most relevant features for job satisfaction")
for heading, ranking in (
    ("\nFor employees who left", feature_importances_left),
    ("\nFor employees who stayed:", feature_importances_stayed),
):
    print(heading)
    print(ranking.head(10))

Top 10 most relevant features for job satisfaction

For employees who left
                    Feature  Importance
4             MonthlyIncome    0.107233
6         PercentSalaryHike    0.098404
1          DistanceFromHome    0.067651
8         TotalWorkingYears    0.066893
0                       Age    0.061240
5        NumCompaniesWorked    0.050799
10           YearsAtCompany    0.047961
12     YearsWithCurrManager    0.047521
15           JobInvolvement    0.040667
11  YearsSinceLastPromotion    0.038623

For employees who stayed:
                    Feature  Importance
4             MonthlyIncome    0.115587
1          DistanceFromHome    0.078086
0                       Age    0.076367
8         TotalWorkingYears    0.064706
6         PercentSalaryHike    0.062677
10           YearsAtCompany    0.053093
12     YearsWithCurrManager    0.049068
5        NumCompaniesWorked    0.046159
11  YearsSinceLastPromotion    0.041483
9     TrainingTimesLastYear    0.040945
