In [1]:
import pandas as pd
from sklearn import tree
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
import numpy as np

dff = pd.read_csv('general_data.csv')

dff.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeCount', 'EmployeeID', 'Gender',
       'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'Over18', 'PercentSalaryHike', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [3]:
le = preprocessing.LabelEncoder()
dff['Attrition'] = le.fit_transform(dff['Attrition'])
dff['BusinessTravel'] = le.fit_transform(dff['BusinessTravel'])
dff['EducationField'] = le.fit_transform(dff['EducationField'])
dff['Gender'] = le.fit_transform(dff['Gender'])
dff['Department'] = le.fit_transform(dff['Department'])
dff['MaritalStatus'] = le.fit_transform(dff['MaritalStatus'])
dff['JobRole'] = le.fit_transform(dff['JobRole'])

df1 = dff.drop(['EmployeeCount', 'EmployeeID', 'Over18', 'StandardHours'], axis = 1)

df1.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'Gender', 'JobLevel', 'JobRole',
       'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked',
       'PercentSalaryHike', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [4]:
df2 = df1.dropna()
df3 = df2.drop_duplicates()

# Random Forest

In [7]:
rf_model = RandomForestClassifier(n_estimators=1000, max_features=2, oob_score=True)

features = ['Age', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'Gender', 'JobLevel', 'JobRole',
       'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked',
       'PercentSalaryHike', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsSinceLastPromotion',
       'YearsWithCurrManager']

rf_model.fit(X=df3[features], y = df3['Attrition'])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=2,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [8]:
rf_model.fit(X=df3[features], y = df3['Attrition'])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=2,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [9]:
print('OOB Accuracy:')
print(rf_model.oob_score_)

OOB Accuracy:
0.8428571428571429


In [10]:
for feature, imp in zip(features, rf_model.feature_importances_):
    print(feature, imp)

Age 0.09663491680073151
BusinessTravel 0.027749990274815024
Department 0.025832298704847344
DistanceFromHome 0.06952395757416664
Education 0.04107543302694804
EducationField 0.041530525328969864
Gender 0.018909386127250204
JobLevel 0.037939337885970724
JobRole 0.05491364296308485
MaritalStatus 0.04060558533121955
MonthlyIncome 0.09287603450708964
NumCompaniesWorked 0.05677708950338047
PercentSalaryHike 0.06586851904328704
StockOptionLevel 0.033772853243178816
TotalWorkingYears 0.08519004856679625
TrainingTimesLastYear 0.044156100418397175
YearsAtCompany 0.06890070896889539
YearsSinceLastPromotion 0.043379792298969905
YearsWithCurrManager 0.05436377943200153


# Decision Tree

In [12]:
tree_model = tree.DecisionTreeClassifier(max_depth=6, max_leaf_nodes=12)

pred = pd.DataFrame([df3['Age'], df3['MonthlyIncome'], df3['TotalWorkingYears']]).T
tree_model.fit(X = pred, y = df3['Attrition'])

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6, max_features=None, max_leaf_nodes=12,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [13]:
with open ('DtreeATT.dot', 'w') as f:
    f = tree.export_graphviz(tree_model, feature_names=['Age', 'MonthlyIncome', 'TotalWorkingYears'], out_file=f)