In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns

pd.set_option('display.max_columns', 100)

In [2]:
train = pd.read_csv('train_LZdllcl.csv')
test = pd.read_csv('test_2umaH9m.csv')

train.head()
print(test.head())
print(train.isnull().sum())
print(test.isnull().sum())

   employee_id         department     region   education gender  \
0         8724         Technology  region_26  Bachelor's      m   
1        74430                 HR   region_4  Bachelor's      f   
2        72255  Sales & Marketing  region_13  Bachelor's      m   
3        38562        Procurement   region_2  Bachelor's      f   
4        64486            Finance  region_29  Bachelor's      m   

  recruitment_channel  no_of_trainings  age  previous_year_rating  \
0            sourcing                1   24                   NaN   
1               other                1   31                   3.0   
2               other                1   31                   1.0   
3               other                3   31                   2.0   
4            sourcing                1   30                   4.0   

   length_of_service  KPIs_met >80%  awards_won?  avg_training_score  
0                  1              1            0                  77  
1                  5              0     

# Cleaning Data

In [3]:
train.columns = ['id','dept','reg','edu','gen','rec','nof','age','rat','los','kpi','won','ats','pro']
test.columns = ['id','dept','reg','edu','gen','rec','nof','age','rat','los','kpi','won','ats']

train.head()

Unnamed: 0,id,dept,reg,edu,gen,rec,nof,age,rat,los,kpi,won,ats,pro
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


## id

In [4]:
print(train.id.value_counts().sum())
print(train.shape)

54808
(54808, 14)


In [5]:
print(test.id.value_counts().sum())
print(test.shape)

23490
(23490, 13)


No edits in id

## dept

In [6]:
train.dept.value_counts()

Sales & Marketing    16840
Operations           11348
Technology            7138
Procurement           7138
Analytics             5352
Finance               2536
HR                    2418
Legal                 1039
R&D                    999
Name: dept, dtype: int64

In [7]:
test.dept.value_counts()

Sales & Marketing    7315
Operations           4764
Procurement          3020
Technology           3011
Analytics            2319
Finance              1091
HR                   1085
Legal                 445
R&D                   440
Name: dept, dtype: int64

Let's apply some labels using sklearn.preprocessing.LabelEncoder

In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
def train_label(a):
    train[a] = LabelEncoder().fit_transform(train[a])
    
def test_label(a):
    test[a] = LabelEncoder().fit_transform(test[a])

In [10]:
train_label('dept')

train.dept.value_counts()

7    16840
4    11348
8     7138
5     7138
0     5352
1     2536
2     2418
3     1039
6      999
Name: dept, dtype: int64

In [11]:
test_label('dept')

test.dept.value_counts()

7    7315
4    4764
5    3020
8    3011
0    2319
1    1091
2    1085
3     445
6     440
Name: dept, dtype: int64

Analytics  --> 0

Finance --> 1

HR --> 2
 
Legal --> 3
 
Operations --> 4
 
Technology --> 5
 
R&D --> 6
 
Sales & Marketing --> 7
 
Technology --> 8
 
No extra edits in dept

In [12]:
train = pd.get_dummies(train, columns = ['dept'])
test = pd.get_dummies(test, columns = ['dept'])

train.head()

Unnamed: 0,id,reg,edu,gen,rec,nof,age,rat,los,kpi,won,ats,pro,dept_0,dept_1,dept_2,dept_3,dept_4,dept_5,dept_6,dept_7,dept_8
0,65438,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0,0,0,0,0,0,0,0,1,0
1,65141,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0,0,0,0,0,1,0,0,0,0
2,7513,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0,0,0,0,0,0,0,0,1,0
3,2542,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0,0,0,0,0,0,0,0,1,0
4,48945,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0,0,0,0,0,0,0,0,0,1


## reg

In [13]:
print(train.reg.value_counts())
print(test.reg.value_counts())

region_2     12343
region_22     6428
region_7      4843
region_15     2808
region_13     2648
region_26     2260
region_31     1935
region_4      1703
region_27     1659
region_16     1465
region_28     1318
region_11     1315
region_23     1175
region_29      994
region_32      945
region_19      874
region_20      850
region_14      827
region_25      819
region_17      796
region_5       766
region_6       690
region_30      657
region_8       655
region_10      648
region_1       610
region_24      508
region_12      500
region_9       420
region_21      411
region_3       346
region_34      292
region_33      269
region_18       31
Name: reg, dtype: int64
region_2     5299
region_22    2739
region_7     1982
region_13    1167
region_15    1130
region_26    1011
region_31     844
region_4      775
region_27     710
region_28     595
region_16     590
region_11     571
region_23     516
region_32     433
region_29     414
region_19     410
region_17     361
region_14     350
region

In [14]:
print(train.reg.value_counts().shape,
     test.reg.value_counts().shape)

(34,) (34,)


Hence no. of regions are same in train and test

In [15]:
train_label('reg')
test_label('reg')

print(train.reg.value_counts())
print(test.reg.value_counts())

11    12343
14     6428
31     4843
6      2808
4      2648
18     2260
24     1935
28     1703
19     1659
7      1465
20     1318
2      1315
15     1175
21      994
25      945
10      874
12      850
5       827
17      819
8       796
29      766
30      690
23      657
32      655
1       648
0       610
16      508
3       500
33      420
13      411
22      346
27      292
26      269
9        31
Name: reg, dtype: int64
11    5299
14    2739
31    1982
4     1167
6     1130
18    1011
24     844
28     775
19     710
20     595
7      590
2      571
15     516
25     433
21     414
10     410
8      361
5      350
29     342
17     337
12     326
30     298
23     273
1      269
32     269
0      238
16     219
3      215
33     180
13     179
27     155
22     147
26     126
9       20
Name: reg, dtype: int64


In [16]:
train = pd.get_dummies(train, columns = ['reg'])
test = pd.get_dummies(test, columns = ['reg'])


train.head()

Unnamed: 0,id,edu,gen,rec,nof,age,rat,los,kpi,won,ats,pro,dept_0,dept_1,dept_2,dept_3,dept_4,dept_5,dept_6,dept_7,dept_8,reg_0,reg_1,reg_2,reg_3,reg_4,reg_5,reg_6,reg_7,reg_8,reg_9,reg_10,reg_11,reg_12,reg_13,reg_14,reg_15,reg_16,reg_17,reg_18,reg_19,reg_20,reg_21,reg_22,reg_23,reg_24,reg_25,reg_26,reg_27,reg_28,reg_29,reg_30,reg_31,reg_32,reg_33
0,65438,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,65141,Bachelor's,m,other,1,30,5.0,4,0,0,60,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,7513,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2542,Bachelor's,m,other,2,39,1.0,10,0,0,50,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,48945,Bachelor's,m,other,1,45,3.0,2,0,0,73,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## gen --> mal

In [17]:
print(train.gen.value_counts())
print(test.gen.value_counts())

m    38496
f    16312
Name: gen, dtype: int64
m    16596
f     6894
Name: gen, dtype: int64


In [18]:
train.gen.replace({'m' : 1, 'f' : 0}, inplace = True)
train.rename(columns = {'gen' : 'mal'}, inplace = True) #Changed column from gender to male

test.gen.replace({'m' : 1, 'f' : 0}, inplace = True)
test.rename(columns = {'gen' : 'mal'}, inplace = True) #Changed column from gender to male

train.head()

Unnamed: 0,id,edu,mal,rec,nof,age,rat,los,kpi,won,ats,pro,dept_0,dept_1,dept_2,dept_3,dept_4,dept_5,dept_6,dept_7,dept_8,reg_0,reg_1,reg_2,reg_3,reg_4,reg_5,reg_6,reg_7,reg_8,reg_9,reg_10,reg_11,reg_12,reg_13,reg_14,reg_15,reg_16,reg_17,reg_18,reg_19,reg_20,reg_21,reg_22,reg_23,reg_24,reg_25,reg_26,reg_27,reg_28,reg_29,reg_30,reg_31,reg_32,reg_33
0,65438,Master's & above,0,sourcing,1,35,5.0,8,1,0,49,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,65141,Bachelor's,1,other,1,30,5.0,4,0,0,60,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,7513,Bachelor's,1,sourcing,1,34,3.0,7,0,0,50,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2542,Bachelor's,1,other,2,39,1.0,10,0,0,50,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,48945,Bachelor's,1,other,1,45,3.0,2,0,0,73,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## rec

In [19]:
print(train.rec.value_counts())
print(test.rec.value_counts())

other       30446
sourcing    23220
referred     1142
Name: rec, dtype: int64
other       13078
sourcing     9961
referred      451
Name: rec, dtype: int64


In [20]:
train_label('rec')
test_label('rec')

print(train.rec.value_counts())
print(test.rec.value_counts())

0    30446
2    23220
1     1142
Name: rec, dtype: int64
0    13078
2     9961
1      451
Name: rec, dtype: int64


In [21]:
train = pd.get_dummies(train, columns = ['rec'])
test = pd.get_dummies(test, columns = ['rec'])


train.head()

Unnamed: 0,id,edu,mal,nof,age,rat,los,kpi,won,ats,pro,dept_0,dept_1,dept_2,dept_3,dept_4,dept_5,dept_6,dept_7,dept_8,reg_0,reg_1,reg_2,reg_3,reg_4,reg_5,reg_6,reg_7,reg_8,reg_9,reg_10,reg_11,reg_12,reg_13,reg_14,reg_15,reg_16,reg_17,reg_18,reg_19,reg_20,reg_21,reg_22,reg_23,reg_24,reg_25,reg_26,reg_27,reg_28,reg_29,reg_30,reg_31,reg_32,reg_33,rec_0,rec_1,rec_2
0,65438,Master's & above,0,1,35,5.0,8,1,0,49,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
1,65141,Bachelor's,1,1,30,5.0,4,0,0,60,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,7513,Bachelor's,1,1,34,3.0,7,0,0,50,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,2542,Bachelor's,1,2,39,1.0,10,0,0,50,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,48945,Bachelor's,1,1,45,3.0,2,0,0,73,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


## edu

In [22]:
print(train.edu.value_counts())
print(test.edu.value_counts())

Bachelor's          36669
Master's & above    14925
Below Secondary       805
Name: edu, dtype: int64
Bachelor's          15578
Master's & above     6504
Below Secondary       374
Name: edu, dtype: int64


In [28]:
train.edu.replace({"Bachelor's" : 1, "Master's & above" : 0, 'Below Secondary' : 2}, inplace = True)
test.edu.replace({"Bachelor's" : 1, "Master's & above" : 0, 'Below Secondary' : 2}, inplace = True)

train.head()

Unnamed: 0,id,edu,mal,nof,age,rat,los,kpi,won,ats,pro,dept_0,dept_1,dept_2,dept_3,dept_4,dept_5,dept_6,dept_7,dept_8,reg_0,reg_1,reg_2,reg_3,reg_4,reg_5,reg_6,reg_7,reg_8,reg_9,reg_10,reg_11,reg_12,reg_13,reg_14,reg_15,reg_16,reg_17,reg_18,reg_19,reg_20,reg_21,reg_22,reg_23,reg_24,reg_25,reg_26,reg_27,reg_28,reg_29,reg_30,reg_31,reg_32,reg_33,rec_0,rec_1,rec_2
0,65438,0.0,0,1,35,5.0,8,1,0,49,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
1,65141,1.0,1,1,30,5.0,4,0,0,60,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,7513,1.0,1,1,34,3.0,7,0,0,50,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,2542,1.0,1,2,39,1.0,10,0,0,50,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,48945,1.0,1,1,45,3.0,2,0,0,73,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [29]:
train = pd.get_dummies(train, columns = ['edu'])
test = pd.get_dummies(test, columns = ['edu'])

train.head()

Unnamed: 0,id,mal,nof,age,rat,los,kpi,won,ats,pro,dept_0,dept_1,dept_2,dept_3,dept_4,dept_5,dept_6,dept_7,dept_8,reg_0,reg_1,reg_2,reg_3,reg_4,reg_5,reg_6,reg_7,reg_8,reg_9,reg_10,reg_11,reg_12,reg_13,reg_14,reg_15,reg_16,reg_17,reg_18,reg_19,reg_20,reg_21,reg_22,reg_23,reg_24,reg_25,reg_26,reg_27,reg_28,reg_29,reg_30,reg_31,reg_32,reg_33,rec_0,rec_1,rec_2,edu_0.0,edu_1.0,edu_2.0
0,65438,0,1,35,5.0,8,1,0,49,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0
1,65141,1,1,30,5.0,4,0,0,60,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
2,7513,1,1,34,3.0,7,0,0,50,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
3,2542,1,2,39,1.0,10,0,0,50,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4,48945,1,1,45,3.0,2,0,0,73,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
