In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn import model_selection

import seaborn as sns

%matplotlib inline

In [5]:
# Load the HR dataset (expected in the notebook's working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='HR_comma_sep.csv')

In [46]:
# List the column names to see which fields the dataset provides.
# Note that 'sales' holds the department name and 'average_montly_hours'
# is misspelled in the source file, so later cells must use these exact names.

df.columns

Index([u'satisfaction_level', u'last_evaluation', u'number_project',
       u'average_montly_hours', u'time_spend_company', u'Work_accident',
       u'left', u'promotion_last_5years', u'sales', u'salary'],
      dtype='object')

In [48]:
# Peek at the first five rows to get a feel for the raw values.
df.head(n=5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [6]:
# Summary statistics for the numeric columns; the string-valued columns
# ('sales' and 'salary') are left out by describe()'s default behaviour.
df.describe()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years
count,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0
mean,0.612834,0.716102,3.803054,201.050337,3.498233,0.14461,0.238083,0.021268
std,0.248631,0.171169,1.232592,49.943099,1.460136,0.351719,0.425924,0.144281
min,0.09,0.36,2.0,96.0,2.0,0.0,0.0,0.0
25%,0.44,0.56,3.0,156.0,3.0,0.0,0.0,0.0
50%,0.64,0.72,4.0,200.0,3.0,0.0,0.0,0.0
75%,0.82,0.87,5.0,245.0,4.0,0.0,0.0,0.0
max,1.0,1.0,7.0,310.0,10.0,1.0,1.0,1.0


#### Let's see how the retained employees and the employees who left differ across the variables.

In [8]:
# Summary statistics restricted to the employees who have left (left == 1)
df.loc[df['left'].eq(1)].describe()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years
count,3571.0,3571.0,3571.0,3571.0,3571.0,3571.0,3571.0,3571.0
mean,0.440098,0.718113,3.855503,207.41921,3.876505,0.047326,1.0,0.005321
std,0.263933,0.197673,1.818165,61.202825,0.977698,0.212364,0.0,0.072759
min,0.09,0.45,2.0,126.0,2.0,0.0,1.0,0.0
25%,0.13,0.52,2.0,146.0,3.0,0.0,1.0,0.0
50%,0.41,0.79,4.0,224.0,4.0,0.0,1.0,0.0
75%,0.73,0.9,6.0,262.0,5.0,0.0,1.0,0.0
max,0.92,1.0,7.0,310.0,6.0,1.0,1.0,1.0


In [9]:
# Summary statistics restricted to the employees who have been retained (left == 0)
df.loc[df['left'].eq(0)].describe()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years
count,11428.0,11428.0,11428.0,11428.0,11428.0,11428.0,11428.0,11428.0
mean,0.66681,0.715473,3.786664,199.060203,3.380032,0.175009,0.0,0.026251
std,0.217104,0.162005,0.979884,45.682731,1.562348,0.379991,0.0,0.159889
min,0.12,0.36,2.0,96.0,2.0,0.0,0.0,0.0
25%,0.54,0.58,3.0,162.0,2.0,0.0,0.0,0.0
50%,0.69,0.71,4.0,198.0,3.0,0.0,0.0,0.0
75%,0.84,0.85,4.0,238.0,4.0,0.0,0.0,0.0
max,1.0,1.0,6.0,287.0,10.0,1.0,0.0,1.0


 - So, as expected, the mean satisfaction level of the retained employees is higher than that of the ones who left [0.667 vs 0.440]
 
 - Not sure about what the 'last evaluation' column really means. There is no explanation for the same.
 
 - The mean number of projects does not seem to differ much [3.86 vs 3.79], although the spread is much wider for the employees who left [std 1.82 vs 0.98]. A good plot might help us know better.
 
 - The average monthly hours are somewhat higher for the employees who left [207 vs 199], so burnout might be a contributing reason.
 
 - The burnout hypothesis is also consistent with the 'time_spend_company' column: the employees who left had, on average, spent longer at the company [3.88 vs 3.38]
 
 - Promotions in the last 5 years are far more common among the retained employees [2.6% vs 0.5%]. [So, the employees who left may not have had much encouragement or incentive to stay]
 
 - I don't understand what the Work_accident column represents. [Again, lack of documentation]

#### Now, let's look at the categorical variables

In [41]:
# Department mix (the 'sales' column holds the department name) among the
# employees who left, as fractions summing to 1.
# A bare `df['sales'].describe()` placed before the last expression of a cell is
# evaluated but never displayed, so it is not included here; run it in its own
# cell if the summary is wanted.
df.loc[df['left'] == 1, 'sales'].value_counts(normalize=True)

sales          0.283954
technical      0.195183
support        0.155419
IT             0.076449
hr             0.060207
accounting     0.057127
marketing      0.056847
product_mng    0.055447
RandD          0.033884
management     0.025483
Name: sales, dtype: float64

In [42]:
# Department mix among the retained employees, as fractions summing to 1.
df.loc[df['left'] == 0, 'sales'].value_counts(normalize=True)

sales          0.273539
technical      0.177021
support        0.146482
IT             0.083479
product_mng    0.061603
RandD          0.058278
marketing      0.057315
accounting     0.049265
management     0.047165
hr             0.045852
Name: sales, dtype: float64

In [44]:
# Salary-band mix among the retained employees, as fractions summing to 1.
df.loc[df['left'] == 0, 'salary'].value_counts(normalize=True)

low       0.450123
medium    0.448810
high      0.101068
Name: salary, dtype: float64

In [47]:
# Salary-band mix among the employees who left, as fractions summing to 1.
df.loc[df['left'] == 1, 'salary'].value_counts(normalize=True)

low       0.608233
medium    0.368804
high      0.022963
Name: salary, dtype: float64

 - Amongst the categorical variables, the distribution of team names barely differs between the two groups. So, it might not be a helpful predictor variable
 - However, the salary seems like a good variable for prediction, as we can see that the lower-salaried people leave more often.

Let's fit tree models to the data, as most of the columns show a clear distinction between the two classes,
which should make the trees' impurity-based splits easier (note that scikit-learn trees use Gini impurity by default, not entropy).

In [50]:
# Ordinal-encode the salary band so the tree models receive a numeric feature.
# Values that are not keys of the mapping pass through unchanged, so re-running
# this cell leaves the already-encoded 0/1/2 intact instead of turning the whole
# column into NaN (which is what a plain .map(dict) does on a second run).
salary_levels = {'low': 0, 'medium': 1, 'high': 2}
df['salary'] = df['salary'].map(lambda level: salary_levels.get(level, level))

In [51]:
# Re-check the column list after encoding 'salary'; the assignment overwrote an
# existing column, so the set of columns is expected to be unchanged.
df.columns

Index([u'satisfaction_level', u'last_evaluation', u'number_project',
       u'average_montly_hours', u'time_spend_company', u'Work_accident',
       u'left', u'promotion_last_5years', u'sales', u'salary'],
      dtype='object')

In [61]:
# Feature matrix: every column except the target ('left') and the department
# column ('sales'), which still holds strings.
# `axis` is passed by keyword because pandas 2.0 removed the positional form;
# the keyword form works on older pandas versions as well.
# The original trailing note named number_project and last_evaluation --
# presumably further columns considered for dropping; TODO confirm.
train = df.drop(['left', 'sales'], axis=1)

# Target vector (1 = left, 0 = retained).
# NOTE: despite their names, `train` and `test` are the features and the labels,
# not a train/test split -- the split itself is made in a later cell.
test = df['left']

In [62]:
# Hold out 30% of the rows for evaluation.
# A fixed seed makes the split -- and therefore the reported accuracies --
# reproducible on Restart & Run All. Stratifying on the target keeps the ~24%
# share of leavers the same in both partitions.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    train, test, test_size=0.3, random_state=0, stratify=test)

In [63]:
# Decision tree classifier: a single, fully grown tree as the baseline model.
# random_state pins the tie-breaking between equally good splits.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)

# Mean accuracy on the held-out rows.
accuracy = clf.score(X_test, y_test)
# print() with parentheses runs on both Python 2 and Python 3; the bare
# `print accuracy` statement is a SyntaxError on Python 3.
print(accuracy)

0.981333333333


In [68]:
# Random forest classifier with library-default hyperparameters; random_state
# pins the bootstrap samples and feature sub-sampling so the score is repeatable.
clf2 = RandomForestClassifier(random_state=0)
clf2.fit(X_train, y_train)

# Mean accuracy on the same held-out rows used for the decision tree.
accuracy2 = clf2.score(X_test, y_test)
# print() with parentheses runs on both Python 2 and Python 3; the bare
# `print accuracy2` statement is a SyntaxError on Python 3.
print(accuracy2)

0.990888888889
