In [10]:
# importing necessary libraries
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor


In [11]:
# Load the HR employee CSV from the Kaggle input directory.
file_path = '/kaggle/input/ibm-hr/HR-Em.csv'
data = pd.read_csv(file_path)

print(data.shape)

# Summary of the number of rows, columns, column data types and memory usage.
# The columns with object dtype are the possible categorical features.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
data.info()

# Every categorical feature will be one-hot encoded, except the target
# 'JobRole'. The target is categorical too, but it is label encoded instead.
# drop() returns a new frame here, so `data` keeps its 'JobRole' column for
# the label-encoding step while `data_cpy` holds only the features.
column_to_be_deleted = ['JobRole']
data_cpy = data.drop(column_to_be_deleted, axis=1)

columns = data_cpy.columns

data_cpy.info()

(1470, 35)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel     

In [12]:
# One-hot encode the feature frame and report its shape before and after.
shape_before_encoding = data_cpy.shape
print('\ndata shape before one-hot encoding: ', shape_before_encoding)

# get_dummies is applied to the whole frame with its default arguments.
data_cpy = pd.get_dummies(data_cpy)

shape_after_encoding = data_cpy.shape
print('\ndata shape after encoding: ', shape_after_encoding)


data shape before one-hot encoding:  (1470, 34)

data shape after encoding:  (1470, 48)


In [13]:
# Label-encode the target column 'JobRole' into integer codes stored in a new
# 'JobRoleCategory' column of `data`.
# LabelEncoder comes from the notebook's import cell at the top.

print('\ndata shape: ', data.shape)

# How many employees hold each job role.
job_roles_number = data['JobRole'].value_counts()
print('\n ---- The job roles and their numbers: \n', job_roles_number)

# Distinct role names as they appear in the raw column.
job_roles_before_encoding = data['JobRole'].unique()
print('\n--- Job roles before label encoding: \n', job_roles_before_encoding)

# Fit the encoder on the role names and store the resulting integer codes.
# The fitted encoder is kept so codes can be mapped back to names later
# (label_encoder.inverse_transform).
label_encoder = LabelEncoder()
data['JobRoleCategory'] = label_encoder.fit_transform(data['JobRole'])

# Distinct integer codes, listed in the same row order as the names above.
job_roles_after_encoding = data['JobRoleCategory'].unique()
print('\n--- Job roles after label encoding: \n', job_roles_after_encoding)



data shape:  (1470, 35)

 ---- The job roles and their numbers: 
 Sales Executive              326
Research Scientist           292
Laboratory Technician        259
Manufacturing Director       145
Healthcare Representative    131
Manager                      102
Sales Representative          83
Research Director             80
Human Resources               52
Name: JobRole, dtype: int64

--- Job roles before label encoding: 
 ['Sales Executive' 'Research Scientist' 'Laboratory Technician'
 'Manufacturing Director' 'Healthcare Representative' 'Manager'
 'Sales Representative' 'Research Director' 'Human Resources']

--- Job roles after label encoding: 
 [7 6 2 4 0 3 8 5 1]


In [14]:
# Feature matrix X: every column of the one-hot encoded frame data_cpy.
# Target vector y: the label-encoded JobRoleCategory column of data.

features = data_cpy.columns
X = data_cpy.loc[:, features]
y = data.loc[:, 'JobRoleCategory']

# Preview the first rows of the features, then of the target.
print(X.head(), y.head(), sep='\n')

   Age  DailyRate  DistanceFromHome  Education  EmployeeCount  EmployeeNumber  \
0   41       1102                 1          2              1               1   
1   49        279                 8          1              1               2   
2   37       1373                 2          2              1               4   
3   33       1392                 3          4              1               5   
4   27        591                 2          1              1               7   

   EnvironmentSatisfaction  HourlyRate  JobInvolvement  JobLevel  ...  \
0                        2          94               3         2  ...   
1                        3          61               2         2  ...   
2                        4          92               2         1  ...   
3                        4          56               3         1  ...   
4                        1          40               3         1  ...   

   EducationField_Other  EducationField_Technical Degree  Gender_Female  \

In [15]:
# Hold out part of the data for validation.
# test_size is the proportion of the dataset placed in the test split;
# random_state pins the shuffle so the split is the same on every run.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Report how many rows ended up in each part.
n_total, n_train, n_test = len(X), len(train_X), len(test_X)
print('Initial size of the dataset:', n_total)
print('The size of the dataset used for training the model: ', n_train)
print('The size of the dataset used for validating the model: ', n_test)


Initial size of the dataset: 1470
The size of the dataset used for training the model:  1176
The size of the dataset used for validating the model:  294


In [16]:
## DECISION TREE REGRESSOR

print('\n\n*** Decision tree regressor ***')

# Fit a single regression tree on the training split; random_state is fixed
# so the fitted tree is the same on every run.
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(train_X, train_y)

# Predict once on the held-out split; the same array is reused for the
# printout and for the error metric below instead of predicting a second time.
regressor_predictions = regressor.predict(test_X)

print('\n--- Job roles before label encoding: \n', job_roles_before_encoding)
print('\n--- Job roles after label encoding: \n', job_roles_after_encoding)
print('\n --- Predictions: \n', regressor_predictions)
print('\n --- Actual: \n', test_y)

# NOTE(review): the target is the label-encoded JobRoleCategory column, so
# this MAE measures a distance between integer codes of job roles. Confirm
# that a regression metric on those codes is really what is wanted here.
regressor_value_mae = mean_absolute_error(test_y, regressor_predictions)



*** Decision tree regressor ***

--- Job roles before label encoding: 
 ['Sales Executive' 'Research Scientist' 'Laboratory Technician'
 'Manufacturing Director' 'Healthcare Representative' 'Manager'
 'Sales Representative' 'Research Director' 'Human Resources']

--- Job roles after label encoding: 
 [7 6 2 4 0 3 8 5 1]

 --- Predictions: 
 [6. 8. 6. 8. 4. 4. 3. 2. 7. 7. 5. 5. 8. 2. 6. 7. 1. 1. 6. 7. 0. 2. 6. 0.
 7. 2. 7. 8. 2. 6. 6. 7. 2. 0. 7. 2. 1. 1. 6. 2. 0. 2. 5. 8. 7. 0. 6. 0.
 0. 1. 5. 4. 7. 2. 7. 7. 2. 2. 7. 6. 3. 0. 3. 2. 0. 6. 6. 7. 7. 6. 4. 6.
 8. 2. 8. 3. 6. 0. 8. 1. 7. 6. 5. 1. 8. 2. 4. 5. 7. 7. 6. 7. 4. 6. 3. 4.
 7. 4. 7. 4. 7. 4. 7. 3. 7. 7. 6. 2. 2. 4. 2. 0. 2. 0. 0. 7. 7. 6. 2. 3.
 7. 2. 6. 2. 3. 6. 0. 0. 0. 3. 2. 0. 5. 7. 6. 0. 6. 7. 4. 1. 2. 3. 7. 6.
 7. 6. 7. 7. 6. 6. 6. 2. 2. 5. 6. 2. 6. 7. 3. 2. 1. 2. 1. 3. 2. 6. 2. 6.
 0. 6. 4. 7. 2. 6. 6. 7. 6. 6. 4. 8. 7. 7. 7. 6. 7. 7. 6. 3. 7. 4. 3. 3.
 3. 8. 7. 7. 0. 7. 4. 6. 8. 5. 6. 7. 7. 4. 2. 8. 2. 2. 7. 6. 7. 0. 6. 4

In [17]:
# Evaluate the decision tree's predictions as if they were class labels.
# confusion_matrix and classification_report come from the notebook's import
# cell at the top, so nothing is imported here.
# NOTE(review): regressor_predictions is the float output of a regressor;
# presumably these classification metrics only accept it while every
# prediction is a whole number matching a label code. Confirm against the
# scikit-learn docs.
print('\nConfusion matrix\n', confusion_matrix(test_y, regressor_predictions))
print('\n\n classification report\n', classification_report(test_y, regressor_predictions))


Confusion matrix
 [[12  0  4  0  6  0  3  0  0]
 [ 0 11  0  0  0  0  0  0  0]
 [ 5  0 19  0  0  0 25  0  0]
 [ 2  0  0 19  1  8  0  0  0]
 [ 9  0  1  0 15  0  3  0  0]
 [ 0  0  0  2  0  4  0  0  0]
 [ 4  0 23  0  4  0 28  0  0]
 [ 0  0  0  0  0  0  0 68  0]
 [ 0  0  0  0  0  0  0  0 18]]


 classification report
               precision    recall  f1-score   support

           0       0.38      0.48      0.42        25
           1       1.00      1.00      1.00        11
           2       0.40      0.39      0.40        49
           3       0.90      0.63      0.75        30
           4       0.58      0.54      0.56        28
           5       0.33      0.67      0.44         6
           6       0.47      0.47      0.47        59
           7       1.00      1.00      1.00        68
           8       1.00      1.00      1.00        18

    accuracy                           0.66       294
   macro avg       0.67      0.69      0.67       294
weighted avg       0.68      0.66 

In [18]:
# RANDOM FOREST REGRESSOR

print('\n\n*** Random forest regressor ***')

# Build the forest with a fixed seed and train it on the training split
# (fit returns the fitted estimator itself).
random_forest_regressor = RandomForestRegressor(random_state=1).fit(train_X, train_y)

# Predict on the held-out split and show predictions next to the true values.
random_forest_regressor_predictions = random_forest_regressor.predict(test_X)
print('\n --- Predictions: \n', random_forest_regressor_predictions)
print('\n --- Actual: \n', test_y)

# Mean absolute error of the Random Forest model on the validation data.
random_forest_value_mae = mean_absolute_error(
    y_true=test_y,
    y_pred=random_forest_regressor_predictions,
)




*** Random forest regressor ***

 --- Predictions: 
 [3.08 8.   3.8  8.   3.71 3.26 3.32 4.16 7.   7.   4.26 4.18 8.   4.
 4.62 7.01 1.   1.   3.8  7.   2.88 4.28 2.54 2.47 7.   3.77 6.96 8.
 3.16 4.58 3.31 7.   3.24 2.59 7.   3.86 1.06 1.   4.   3.52 3.58 2.8
 4.1  8.   6.65 2.88 4.08 2.02 3.15 1.   4.63 3.08 7.   4.38 7.   7.22
 3.1  3.66 7.   3.71 3.22 2.84 3.08 3.98 2.28 4.78 4.04 6.88 7.   4.18
 2.32 4.06 8.   4.34 8.   3.7  3.62 3.17 8.   1.11 7.   4.1  3.65 1.
 8.   3.32 2.22 3.56 7.01 7.01 3.59 7.   3.28 3.02 3.76 1.75 7.   2.75
 7.   2.14 7.   2.87 6.94 3.16 7.   7.   3.72 3.94 3.54 3.55 3.46 3.39
 4.1  3.8  2.56 6.8  6.24 3.3  3.12 3.49 7.12 4.46 4.04 3.86 3.08 3.62
 2.96 2.53 2.03 3.12 3.78 2.9  4.42 6.26 3.04 2.86 4.56 7.02 2.11 1.
 3.72 3.43 7.   4.42 7.03 3.57 7.12 7.   4.32 3.94 3.62 3.72 3.33 4.21
 4.56 3.78 2.96 7.1  3.62 4.32 1.66 3.68 1.   3.21 3.68 3.92 3.71 4.14
 1.25 4.98 2.97 7.   2.58 5.36 5.02 7.   3.76 4.32 3.83 8.   7.   7.
 7.   3.37 7.   7.05 4.88 3.08 7.

In [None]:
# Report the validation MAE of both models, random forest first.
print(
    "\n--- Validation MAE for Random Forest Model: {}".format(random_forest_value_mae),
    "\n--- Validation MAE for Decision Tree Regressor Model: {}".format(regressor_value_mae),
    sep="\n",
)