## Import the Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)

## Import the Dataset

In [2]:
hr_train = pd.read_csv(r'train_hr.csv',header=0)
hr_test = pd.read_csv(r'test_hr.csv',header=0)

### Working on Training Data

In [3]:
hr_train.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [4]:
for i in hr_train:
    print({i: hr_train[i].unique()})

{'employee_id': array([65438, 65141,  7513, ..., 13918, 13614, 51526], dtype=int64)}
{'department': array(['Sales & Marketing', 'Operations', 'Technology', 'Analytics',
       'R&D', 'Procurement', 'Finance', 'HR', 'Legal'], dtype=object)}
{'region': array(['region_7', 'region_22', 'region_19', 'region_23', 'region_26',
       'region_2', 'region_20', 'region_34', 'region_1', 'region_4',
       'region_29', 'region_31', 'region_15', 'region_14', 'region_11',
       'region_5', 'region_28', 'region_17', 'region_13', 'region_16',
       'region_25', 'region_10', 'region_27', 'region_30', 'region_12',
       'region_21', 'region_8', 'region_32', 'region_6', 'region_33',
       'region_24', 'region_3', 'region_9', 'region_18'], dtype=object)}
{'education': array(["Master's & above", "Bachelor's", nan, 'Below Secondary'],
      dtype=object)}
{'gender': array(['f', 'm'], dtype=object)}
{'recruitment_channel': array(['sourcing', 'other', 'referred'], dtype=object)}
{'no_of_trainings': array(

### Creating a Duplicate Dataset

In [5]:
hr_train_rev = pd.DataFrame.copy(hr_train)

In [6]:
hr_train_rev.shape

(54808, 14)

## Feature Selection

In [7]:
hr_train_rev.drop(['employee_id','region'],axis=1,inplace=True)

In [8]:
hr_train_rev.shape

(54808, 12)

In [9]:
hr_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54808 entries, 0 to 54807
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   employee_id           54808 non-null  int64  
 1   department            54808 non-null  object 
 2   region                54808 non-null  object 
 3   education             52399 non-null  object 
 4   gender                54808 non-null  object 
 5   recruitment_channel   54808 non-null  object 
 6   no_of_trainings       54808 non-null  int64  
 7   age                   54808 non-null  int64  
 8   previous_year_rating  50684 non-null  float64
 9   length_of_service     54808 non-null  int64  
 10  KPIs_met >80%         54808 non-null  int64  
 11  awards_won?           54808 non-null  int64  
 12  avg_training_score    54808 non-null  int64  
 13  is_promoted           54808 non-null  int64  
dtypes: float64(1), int64(8), object(5)
memory usage: 5.9+ MB


In [10]:
hr_train_rev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54808 entries, 0 to 54807
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   department            54808 non-null  object 
 1   education             52399 non-null  object 
 2   gender                54808 non-null  object 
 3   recruitment_channel   54808 non-null  object 
 4   no_of_trainings       54808 non-null  int64  
 5   age                   54808 non-null  int64  
 6   previous_year_rating  50684 non-null  float64
 7   length_of_service     54808 non-null  int64  
 8   KPIs_met >80%         54808 non-null  int64  
 9   awards_won?           54808 non-null  int64  
 10  avg_training_score    54808 non-null  int64  
 11  is_promoted           54808 non-null  int64  
dtypes: float64(1), int64(7), object(4)
memory usage: 5.0+ MB


## Handling The Missing Values

In [11]:
hr_train_rev.isna().sum()

department                 0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

In [12]:
hr_train_rev.columns

Index(['department', 'education', 'gender', 'recruitment_channel',
       'no_of_trainings', 'age', 'previous_year_rating', 'length_of_service',
       'KPIs_met >80%', 'awards_won?', 'avg_training_score', 'is_promoted'],
      dtype='object')

In [13]:
for value in['education','previous_year_rating']:
    hr_train_rev[value].fillna(hr_train_rev[value].mode()[0],inplace=True)

In [14]:
hr_train_rev.education.mode()[0]

"Bachelor's"

In [15]:
hr_train_rev.previous_year_rating.mode()[0]

3.0

In [16]:
hr_train_rev.isna().sum()

department              0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
is_promoted             0
dtype: int64

In [17]:
hr_train_rev.head()

Unnamed: 0,department,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,Sales & Marketing,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,Operations,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,Sales & Marketing,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,Sales & Marketing,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,Technology,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


### Calling the columns having dtype as object

In [18]:
colname=[]
for x in hr_train_rev.columns:
    if hr_train_rev[x].dtype=='object':
        colname.append(x)
colname

['department', 'education', 'gender', 'recruitment_channel']

### Converting Categorical Variables into Numerical Variables by using LabelEncoder()

In [19]:
from sklearn.preprocessing import LabelEncoder
 
le=LabelEncoder()
 
for x in colname:
    hr_train_rev[x]=le.fit_transform(hr_train_rev[x])

In [20]:
hr_train_rev.head()

Unnamed: 0,department,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,7,2,0,2,1,35,5.0,8,1,0,49,0
1,4,0,1,0,1,30,5.0,4,0,0,60,0
2,7,0,1,2,1,34,3.0,7,0,0,50,0
3,7,0,1,0,2,39,1.0,10,0,0,50,0
4,8,0,1,0,1,45,3.0,2,0,0,73,0


### Create X and Y

In [21]:
X = hr_train_rev.values[:,0:-1]
Y = hr_train_rev.values[:,-1]

In [22]:
print(X.shape)
print(Y.shape)
Y=Y.astype(int)

(54808, 11)
(54808,)


In [23]:
from sklearn.preprocessing import StandardScaler
 
scaler = StandardScaler()
 
scaler.fit(X)
X = scaler.transform(X)
#X=scaler.fit_transform(X)
#print(X)

In [24]:
print(X)

[[ 0.80893285  1.61975831 -1.53622276 ...  1.35687789 -0.15401776
  -1.07593145]
 [-0.38818322 -0.62883817  0.65094726 ... -0.73698599 -0.15401776
  -0.25328242]
 [ 0.80893285 -0.62883817  0.65094726 ... -0.73698599 -0.15401776
  -1.00114517]
 ...
 [-1.98433798 -0.62883817  0.65094726 ...  1.35687789 -0.15401776
   1.1676568 ]
 [ 0.80893285 -0.62883817  0.65094726 ... -0.73698599 -0.15401776
  -1.37507655]
 [-1.1862606  -0.62883817  0.65094726 ... -0.73698599 -0.15401776
  -1.07593145]]


### Splitting the data into train and test

In [25]:
from sklearn.model_selection import train_test_split
 
#Split the data into test and train
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, 
                                                    random_state=10)

In [26]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(38365, 11)
(16443, 11)
(38365,)
(16443,)


## SMOTE

In [27]:
print("Before OverSampling, counts of label '1': ", (sum(Y_train == 1)))
print("Before OverSampling, counts of label '0': ", (sum(Y_train == 0)))
  
# import SMOTE from imblearn library
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 10,k_neighbors=5)
X_train_res, Y_train_res = sm.fit_resample(X_train, Y_train)
  
print('After OverSampling, the shape of train_X: ', (X_train_res.shape))
print('After OverSampling, the shape of train_y: ', (Y_train_res.shape))
  
print("After OverSampling, counts of label '1': ", (sum(Y_train_res == 1)))
print("After OverSampling, counts of label '0': ", (sum(Y_train_res == 0)))

Before OverSampling, counts of label '1':  3289
Before OverSampling, counts of label '0':  35076
After OverSampling, the shape of train_X:  (70152, 11)
After OverSampling, the shape of train_y:  (70152,)
After OverSampling, counts of label '1':  35076
After OverSampling, counts of label '0':  35076


### By using SMOTE will can train our data well because it will increase the minority to majority and help to train the data well.

## Runing the Logistic Regression Model

In [28]:
from sklearn.linear_model import LogisticRegression
# create a model object
logreg = LogisticRegression()
# train the model object
logreg.fit(X_train_res,Y_train_res)

Y_pred=logreg.predict(X_test)
print(Y_pred)

[0 0 0 ... 0 0 0]


In [29]:
Y_pred_prob=logreg.predict_proba(X_test)
Y_pred_prob

array([[0.91956461, 0.08043539],
       [0.95018116, 0.04981884],
       [0.78639303, 0.21360697],
       ...,
       [0.92998745, 0.07001255],
       [0.55333606, 0.44666394],
       [0.75676329, 0.24323671]])

In [30]:
print(list(zip(Y_test,Y_pred)))


[(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (1, 0), (0, 0), (0, 1), (0, 0), (0, 0), (0, 0), (0, 1), (0, 1), (0, 1), (0, 0), (0, 0), (0, 0), (0, 0), (1, 0), (0, 0), (0, 0), (0, 0), (1, 1), (0, 1), (0, 1), (0, 0), (0, 0), (0, 0), (0, 0), (0, 1), (0, 0), (1, 1), (0, 1), (0, 0), (0, 0), (0, 0), (0, 0), (0, 1), (0, 0), (0, 0), (0, 0), (0, 1), (0, 1), (0, 1), (0, 0), (0, 0), (0, 0), (0, 1), (0, 0), (0, 0), (1, 1), (0, 0), (1, 1), (0, 1), (0, 1), (0, 1), (0, 0), (1, 0), (0, 0), (0, 0), (0, 0), (0, 0), (1, 1), (0, 0), (0, 1), (1, 1), (0, 1), (0, 1), (0, 0), (1, 1), (0, 1), (1, 1), (0, 0), (0, 1), (0, 1), (0, 0), (0, 0), (0, 1), (0, 0), (0, 0), (1, 1), (0, 0), (1, 1), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 1), (0, 0), (0, 0), (0, 0), (0, 1), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 1), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 1), (1, 0), (0, 1), (0, 0), (0, 1), (0, 0), (0, 1), (0, 0), (0, 1), (0, 0), (0, 1), (0, 0), (0, 0),

In [31]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
 
cfm=confusion_matrix(Y_test,Y_pred)
print(cfm)
print()
 
print("Classification report: ")

 
print(classification_report(Y_test,Y_pred))
print()

 
acc=accuracy_score(Y_test,Y_pred)
print("Accuracy of the model: ",acc)

[[10804  4260]
 [  425   954]]

Classification report: 
              precision    recall  f1-score   support

           0       0.96      0.72      0.82     15064
           1       0.18      0.69      0.29      1379

    accuracy                           0.72     16443
   macro avg       0.57      0.70      0.56     16443
weighted avg       0.90      0.72      0.78     16443


Accuracy of the model:  0.7150763242717265


## Running the Decision Tree Model

In [32]:
from sklearn.tree import DecisionTreeClassifier

# create a model object
model_DT = DecisionTreeClassifier(random_state=10,criterion='entropy')

#min_samples_leaf, min_samples_split, max_depth, max_features, max_leaf_nodes

# train the model object
model_DT.fit(X_train_res,Y_train_res)

Y_pred=model_DT.predict(X_test)
print(Y_pred)

[0 0 0 ... 0 0 0]


In [33]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
 
cfm=confusion_matrix(Y_test,Y_pred)
print(cfm)
print()
 
print("Classification report: ")

 
print(classification_report(Y_test,Y_pred))
print()

 
acc=accuracy_score(Y_test, Y_pred)
print("Accuracy of the model: ",acc)

[[14099   965]
 [  783   596]]

Classification report: 
              precision    recall  f1-score   support

           0       0.95      0.94      0.94     15064
           1       0.38      0.43      0.41      1379

    accuracy                           0.89     16443
   macro avg       0.66      0.68      0.67     16443
weighted avg       0.90      0.89      0.90     16443


Accuracy of the model:  0.8936933649577328


In [34]:
model_DT.score(X_train,Y_train)

0.9969764107910857

## Pruning

### Attempt 1

In [35]:
from sklearn.tree import DecisionTreeClassifier

# create a model object
model_DT = DecisionTreeClassifier(random_state=10,criterion='gini',
                                 splitter='best',
                                  min_samples_leaf=3,
                                 min_samples_split=5,
                                 max_depth=10)

#min_samples_leaf, min_samples_split, max_depth, max_features, max_leaf_nodes

# train the model object
model_DT.fit(X_train_res,Y_train_res)

Y_pred=model_DT.predict(X_test)
print(Y_pred)

[0 0 0 ... 0 0 0]


In [36]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
 
cfm=confusion_matrix(Y_test,Y_pred)
print(cfm)
print()
 
print("Classification report: ")

 
print(classification_report(Y_test,Y_pred))
print()

 
acc=accuracy_score(Y_test, Y_pred)
print("Accuracy of the model: ",acc)

[[11529  3535]
 [  362  1017]]

Classification report: 
              precision    recall  f1-score   support

           0       0.97      0.77      0.86     15064
           1       0.22      0.74      0.34      1379

    accuracy                           0.76     16443
   macro avg       0.60      0.75      0.60     16443
weighted avg       0.91      0.76      0.81     16443


Accuracy of the model:  0.7629994526546251


### Attempt 2

In [37]:
from sklearn.tree import DecisionTreeClassifier

# create a model object
model_DT = DecisionTreeClassifier(random_state=10,criterion='gini',
                                 splitter='best',
                                  min_samples_leaf=2,
                                 min_samples_split=3,
                                 max_depth=12)

#min_samples_leaf, min_samples_split, max_depth, max_features, max_leaf_nodes

# train the model object
model_DT.fit(X_train_res,Y_train_res)

Y_pred=model_DT.predict(X_test)
print(Y_pred)

[0 0 0 ... 0 0 0]


In [38]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
 
cfm=confusion_matrix(Y_test,Y_pred)
print(cfm)
print()
 
print("Classification report: ")

 
print(classification_report(Y_test,Y_pred))
print()

 
acc=accuracy_score(Y_test, Y_pred)
print("Accuracy of the model: ",acc)

[[11818  3246]
 [  370  1009]]

Classification report: 
              precision    recall  f1-score   support

           0       0.97      0.78      0.87     15064
           1       0.24      0.73      0.36      1379

    accuracy                           0.78     16443
   macro avg       0.60      0.76      0.61     16443
weighted avg       0.91      0.78      0.82     16443


Accuracy of the model:  0.7800887915830444


### Attempt 3

In [39]:
from sklearn.tree import DecisionTreeClassifier

# create a model object
model_DT = DecisionTreeClassifier(random_state=10,criterion='gini',
                                 splitter='best',
                                  min_samples_leaf=2,
                                 min_samples_split=2,
                                 max_depth=15)

#min_samples_leaf, min_samples_split, max_depth, max_features, max_leaf_nodes

# train the model object
model_DT.fit(X_train_res,Y_train_res)

Y_pred=model_DT.predict(X_test)
print(Y_pred)

[0 0 0 ... 0 0 0]


In [40]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
 
cfm=confusion_matrix(Y_test,Y_pred)
print(cfm)
print()
 
print("Classification report: ")

 
print(classification_report(Y_test,Y_pred))
print()

 
acc=accuracy_score(Y_test, Y_pred)
print("Accuracy of the model: ",acc)

[[12636  2428]
 [  488   891]]

Classification report: 
              precision    recall  f1-score   support

           0       0.96      0.84      0.90     15064
           1       0.27      0.65      0.38      1379

    accuracy                           0.82     16443
   macro avg       0.62      0.74      0.64     16443
weighted avg       0.90      0.82      0.85     16443


Accuracy of the model:  0.8226600985221675


### In Training Data, the best model is Base Decision Tree where we get the maximum accuracy of 89.36%. So for feather evaluation on Test data will be used this Model.

## Working on Test Data

In [41]:
hr_test.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,8724,Technology,region_26,Bachelor's,m,sourcing,1,24,,1,1,0,77
1,74430,HR,region_4,Bachelor's,f,other,1,31,3.0,5,0,0,51
2,72255,Sales & Marketing,region_13,Bachelor's,m,other,1,31,1.0,4,0,0,47
3,38562,Procurement,region_2,Bachelor's,f,other,3,31,2.0,9,0,0,65
4,64486,Finance,region_29,Bachelor's,m,sourcing,1,30,4.0,7,0,0,61


In [42]:
for i in hr_test:
    print({i: hr_test[i].unique()})

{'employee_id': array([ 8724, 74430, 72255, ..., 45409,  1186,  5973], dtype=int64)}
{'department': array(['Technology', 'HR', 'Sales & Marketing', 'Procurement', 'Finance',
       'Analytics', 'Operations', 'Legal', 'R&D'], dtype=object)}
{'region': array(['region_26', 'region_4', 'region_13', 'region_2', 'region_29',
       'region_7', 'region_22', 'region_16', 'region_17', 'region_24',
       'region_11', 'region_27', 'region_9', 'region_20', 'region_34',
       'region_23', 'region_8', 'region_14', 'region_31', 'region_19',
       'region_5', 'region_28', 'region_15', 'region_3', 'region_25',
       'region_12', 'region_21', 'region_30', 'region_10', 'region_33',
       'region_32', 'region_6', 'region_1', 'region_18'], dtype=object)}
{'education': array(["Bachelor's", "Master's & above", nan, 'Below Secondary'],
      dtype=object)}
{'gender': array(['m', 'f'], dtype=object)}
{'recruitment_channel': array(['sourcing', 'other', 'referred'], dtype=object)}
{'no_of_trainings': array(

### Creating a Duplicated Test Data

In [43]:
hr_test.shape

(23490, 13)

In [44]:
hr_test_rev = pd.DataFrame.copy(hr_test)
hr_test_rev.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,8724,Technology,region_26,Bachelor's,m,sourcing,1,24,,1,1,0,77
1,74430,HR,region_4,Bachelor's,f,other,1,31,3.0,5,0,0,51
2,72255,Sales & Marketing,region_13,Bachelor's,m,other,1,31,1.0,4,0,0,47
3,38562,Procurement,region_2,Bachelor's,f,other,3,31,2.0,9,0,0,65
4,64486,Finance,region_29,Bachelor's,m,sourcing,1,30,4.0,7,0,0,61


## Feature Selection

In [45]:
hr_test_rev.drop(['employee_id','region'], axis=1, inplace=True)

In [46]:
hr_test_rev.head()

Unnamed: 0,department,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,Technology,Bachelor's,m,sourcing,1,24,,1,1,0,77
1,HR,Bachelor's,f,other,1,31,3.0,5,0,0,51
2,Sales & Marketing,Bachelor's,m,other,1,31,1.0,4,0,0,47
3,Procurement,Bachelor's,f,other,3,31,2.0,9,0,0,65
4,Finance,Bachelor's,m,sourcing,1,30,4.0,7,0,0,61


In [47]:
hr_test_rev.shape

(23490, 11)

In [48]:
hr_test_rev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23490 entries, 0 to 23489
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   department            23490 non-null  object 
 1   education             22456 non-null  object 
 2   gender                23490 non-null  object 
 3   recruitment_channel   23490 non-null  object 
 4   no_of_trainings       23490 non-null  int64  
 5   age                   23490 non-null  int64  
 6   previous_year_rating  21678 non-null  float64
 7   length_of_service     23490 non-null  int64  
 8   KPIs_met >80%         23490 non-null  int64  
 9   awards_won?           23490 non-null  int64  
 10  avg_training_score    23490 non-null  int64  
dtypes: float64(1), int64(6), object(4)
memory usage: 2.0+ MB


## Handling Missing Values

In [49]:
hr_test_rev.isna().sum()

department                 0
education               1034
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    1812
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
dtype: int64

In [50]:
hr_test_rev.columns

Index(['department', 'education', 'gender', 'recruitment_channel',
       'no_of_trainings', 'age', 'previous_year_rating', 'length_of_service',
       'KPIs_met >80%', 'awards_won?', 'avg_training_score'],
      dtype='object')

In [51]:
for value in['education','previous_year_rating']:
    hr_test_rev[value].fillna(hr_test_rev[value].mode()[0],inplace=True)

In [52]:
hr_test_rev.education.mode()[0]

"Bachelor's"

In [53]:
hr_test_rev.previous_year_rating.mode()[0]

3.0

In [54]:
hr_test_rev.isna().sum()

department              0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
dtype: int64

In [55]:
hr_test_rev.head()

Unnamed: 0,department,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,Technology,Bachelor's,m,sourcing,1,24,3.0,1,1,0,77
1,HR,Bachelor's,f,other,1,31,3.0,5,0,0,51
2,Sales & Marketing,Bachelor's,m,other,1,31,1.0,4,0,0,47
3,Procurement,Bachelor's,f,other,3,31,2.0,9,0,0,65
4,Finance,Bachelor's,m,sourcing,1,30,4.0,7,0,0,61


### Calling the columns having dtype as object

In [56]:
colname=[]
for x in hr_test_rev.columns:
    if hr_test_rev[x].dtype=='object':
        colname.append(x)
colname

['department', 'education', 'gender', 'recruitment_channel']

### Converting Categorical Variables into Numerical Variables by using LabelEncoder()

In [57]:
from sklearn.preprocessing import LabelEncoder
 
le=LabelEncoder()
 
for x in colname:
    hr_test_rev[x]=le.fit_transform(hr_test_rev[x])

In [58]:
hr_test_rev.head()

Unnamed: 0,department,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,8,0,1,2,1,24,3.0,1,1,0,77
1,2,0,0,0,1,31,3.0,5,0,0,51
2,7,0,1,0,1,31,1.0,4,0,0,47
3,5,0,0,0,3,31,2.0,9,0,0,65
4,1,0,1,2,1,30,4.0,7,0,0,61


## Create X and Y

In [59]:
X_test_new = hr_test_rev.values[:,:]

In [60]:
print(X_test_new.shape)

(23490, 11)


In [61]:
X_test_new = scaler.transform(X_test_new)
#X=scaler.fit_transform(X)
#print(X)


In [62]:
print(X_test_new)

[[ 1.20797154 -0.62883817  0.65094726 ...  1.35687789 -0.15401776
   1.01808425]
 [-1.1862606  -0.62883817 -1.53622276 ... -0.73698599 -0.15401776
  -0.9263589 ]
 [ 0.80893285 -0.62883817  0.65094726 ... -0.73698599 -0.15401776
  -1.225504  ]
 ...
 [-1.1862606  -0.62883817 -1.53622276 ... -0.73698599 -0.15401776
  -1.00114517]
 [ 0.01085547 -0.62883817  0.65094726 ... -0.73698599 -0.15401776
   0.49458032]
 [ 1.20797154  1.61975831  0.65094726 ...  1.35687789 -0.15401776
   1.91551954]]


### Predicting on Test Data

In [63]:
Y_pred_new=model_DT.predict(X_test_new)
print(Y_pred_new)

[1 0 0 ... 0 0 0]


In [64]:
hr_test = pd.read_csv(r'test_hr.csv',header=0)
hr_test["Pred"]=Y_pred_new
hr_test.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,Pred
0,8724,Technology,region_26,Bachelor's,m,sourcing,1,24,,1,1,0,77,1
1,74430,HR,region_4,Bachelor's,f,other,1,31,3.0,5,0,0,51,0
2,72255,Sales & Marketing,region_13,Bachelor's,m,other,1,31,1.0,4,0,0,47,0
3,38562,Procurement,region_2,Bachelor's,f,other,3,31,2.0,9,0,0,65,0
4,64486,Finance,region_29,Bachelor's,m,sourcing,1,30,4.0,7,0,0,61,0


In [65]:
hr_test.Pred.value_counts()

0    18686
1     4804
Name: Pred, dtype: int64

In [67]:
hr_test.to_excel('HR Analytics Decision Test Output.xlsx',header=True)

### The Prediction File of the HR Analytics Test Data is save to excel 