In [26]:
# import libs
import numpy as np
import pandas as pd
import os

import category_encoders as ce
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

## **Business Understanding**

The HR department at Salifort Motors wants to take some initiatives to improve employee satisfaction levels at the company. If we can predict which employees are likely to quit, it might be possible to identify the factors that contribute to their leaving. Because it is time-consuming and expensive to find, interview, and hire new employees, increasing employee retention will be beneficial to the company.

## **Data Understanding**

In [27]:
# Locate the project root: two directory levels above the current working directory
actual_folder = os.path.abspath(os.getcwd())
parent_folder = os.path.dirname(actual_folder)
data_folder = os.path.dirname(parent_folder)

# Load the HR dataset from the project's data folder and preview the first rows
csv_path = f'{data_folder}/04. Data/02. HR_dataset.csv'
df = pd.read_csv(csv_path, sep = ',')
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [28]:
# Report the dataset dimensions: number of rows (lines) and number of columns
lines, columns = df.shape
print(f'-- Lines >>> {lines} \n-- Columns >>> {columns}\n')

print('-'*30)

# List the name of every column in the dataset
columns_names = df.columns
print(f'\n-- Columns names >> {list(columns_names)}')

-- Lines >>> 14999 
-- Columns >>> 10

------------------------------

-- Columns names >> ['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company', 'Work_accident', 'left', 'promotion_last_5years', 'Department', 'salary']


In [29]:
# Show each column's dtype and non-null count to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   Department             14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [30]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric features
df.describe()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years
count,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0
mean,0.612834,0.716102,3.803054,201.050337,3.498233,0.14461,0.238083,0.021268
std,0.248631,0.171169,1.232592,49.943099,1.460136,0.351719,0.425924,0.144281
min,0.09,0.36,2.0,96.0,2.0,0.0,0.0,0.0
25%,0.44,0.56,3.0,156.0,3.0,0.0,0.0,0.0
50%,0.64,0.72,4.0,200.0,3.0,0.0,0.0,0.0
75%,0.82,0.87,5.0,245.0,4.0,0.0,0.0,0.0
max,1.0,1.0,7.0,310.0,10.0,1.0,1.0,1.0


In [31]:
# Summary (count, unique values, most frequent value and its frequency)
# for the object-dtype, i.e. categorical, features
df.describe(include = 'O')

Unnamed: 0,Department,salary
count,14999,14999
unique,10,3
top,sales,low
freq,4140,7316


In [32]:
# Count the rows for each value of the target `left` to check the class balance
# (presumably 1 = employee left the company, 0 = stayed -- confirm against the data dictionary)
df['left'].value_counts()

left
0    11428
1     3571
Name: count, dtype: int64

## **Data Preparation**

In [33]:
# Separate the target column `left` from the predictor columns
y = df['left'] # target
X = df.drop(columns = ['left']) # features

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 42,
)

In [34]:
# Confirm the dimensions of each piece of the train/test split
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((10499, 9), (4500, 9), (10499,), (4500,))

In [35]:
# Encode the categorical features `Department` and `salary` with category_encoders'
# OrdinalEncoder (ordinal encoding, not sklearn's LabelEncoder).
# The encoder is fitted on the training split only and then applied to the test split,
# so the test rows do not influence the fitted encoding.
# NOTE(review): no explicit mapping is passed, so the code assigned to each `salary`
# level presumably follows the encoder's default ordering rather than
# low < medium < high -- confirm if the order matters.
encoder_X = ce.OrdinalEncoder(cols = ['Department', 'salary'])
X_train = encoder_X.fit_transform(X_train)
X_test = encoder_X.transform(X_test)

## **Modeling**

In [36]:
# Fit a random forest with default hyperparameters on the training split.
# A fixed random_state makes the forest's internal randomness repeatable, so a
# Restart & Run All reproduces the same model and the same scores; without it
# every run reports slightly different metrics.
model_rf = RandomForestClassifier(random_state = 42)
model_rf.fit(X_train, y_train)

# Predict the target for the held-out test rows
y_rf_pred = model_rf.predict(X_test)

## **Validation**

In [37]:
# Overall accuracy of the random forest on the test split, rounded to four decimals
test_accuracy = accuracy_score(y_test, y_rf_pred)
print(f'Model acurracacy score: {round(test_accuracy, 4)}')

Model acurracacy score: 0.9876


In [38]:
# Per-class precision, recall, F1 and support of the random forest on the test split.
# The report is a multi-line table, so it is printed under its own heading: starting
# it on the same line as a label shifts the column headers out of alignment, and the
# previous label wrongly described the report as an accuracy score.
print(f'Classification report:\n{classification_report(y_test, y_rf_pred)}')

Model acurracacy score:               precision    recall  f1-score   support

           0       0.99      1.00      0.99      3428
           1       0.99      0.96      0.97      1072

    accuracy                           0.99      4500
   macro avg       0.99      0.98      0.98      4500
weighted avg       0.99      0.99      0.99      4500

