In [1]:
import pandas as pd
from sklearn import tree
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [2]:
dff = pd.read_csv('general_data.csv')

In [3]:
dff.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeCount', 'EmployeeID', 'Gender',
       'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'Over18', 'PercentSalaryHike', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [4]:
le = preprocessing.LabelEncoder()
dff['Attrition'] = le.fit_transform(dff['Attrition'])
dff['BusinessTravel'] = le.fit_transform(dff['BusinessTravel'])
dff['EducationField'] = le.fit_transform(dff['EducationField'])
dff['Gender'] = le.fit_transform(dff['Gender'])
dff['Department'] = le.fit_transform(dff['Department'])
dff['MaritalStatus'] = le.fit_transform(dff['MaritalStatus'])
dff['JobRole'] = le.fit_transform(dff['JobRole'])


In [5]:
df1 = dff.drop(['EmployeeCount', 'EmployeeID', 'Over18', 'StandardHours'], axis = 1)

In [6]:
df1.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'Gender', 'JobLevel', 'JobRole',
       'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked',
       'PercentSalaryHike', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [7]:
df2 = df1.dropna()

In [8]:
df3 = df2.drop_duplicates()

## Random Forest

In [9]:
rf_model = RandomForestClassifier(n_estimators=1000, max_features=2, oob_score=True)

In [10]:
features = ['Age', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'Gender', 'JobLevel', 'JobRole',
       'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked',
       'PercentSalaryHike', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsSinceLastPromotion',
       'YearsWithCurrManager']

In [11]:
rf_model.fit(X=df3[features], y = df3['Attrition'])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=2,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [12]:
print('OOB Accuracy:')
print(rf_model.oob_score_)

OOB Accuracy:
0.8428571428571429


In [13]:
for feature, imp in zip(features, rf_model.feature_importances_):
    print(feature, imp)

Age 0.09800637384344427
BusinessTravel 0.027372953628544243
Department 0.025834387620329326
DistanceFromHome 0.06908872817862248
Education 0.04062281168053368
EducationField 0.04116764095532268
Gender 0.019176953724272295
JobLevel 0.036953647792355285
JobRole 0.054442739938381605
MaritalStatus 0.03992811936875547
MonthlyIncome 0.09362674705201009
NumCompaniesWorked 0.0555053607187841
PercentSalaryHike 0.06611489541726455
StockOptionLevel 0.03384335373514676
TotalWorkingYears 0.08664548595311193
TrainingTimesLastYear 0.04491404240196894
YearsAtCompany 0.06806627376498835
YearsSinceLastPromotion 0.04394611901906143
YearsWithCurrManager 0.054743365207102525


## Decision Tree

In [14]:
tree_model = tree.DecisionTreeClassifier(max_depth=6, max_leaf_nodes=12)

In [15]:
pred = pd.DataFrame([df3['Age'], df3['MonthlyIncome'], df3['TotalWorkingYears']]).T
tree_model.fit(X = pred, y = df3['Attrition'])

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6, max_features=None, max_leaf_nodes=12,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [16]:
with open ('DtreeATT.dot', 'w') as f:
    f = tree.export_graphviz(tree_model, feature_names=['Age', 'MonthlyIncome', 'TotalWorkingYears'], out_file=f);