In [1]:
import pandas as pd
from sklearn import tree
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [2]:
dff = pd.read_csv('general_data.csv')

In [11]:
dff.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeCount', 'EmployeeID', 'Gender',
       'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'Over18', 'PercentSalaryHike', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [12]:
le = preprocessing.LabelEncoder()
dff['Attrition'] = le.fit_transform(dff['Attrition'])
dff['BusinessTravel'] = le.fit_transform(dff['BusinessTravel'])
dff['EducationField'] = le.fit_transform(dff['EducationField'])
dff['Gender'] = le.fit_transform(dff['Gender'])
dff['Department'] = le.fit_transform(dff['Department'])
dff['MaritalStatus'] = le.fit_transform(dff['MaritalStatus'])
dff['JobRole'] = le.fit_transform(dff['JobRole'])


In [14]:
df1 = dff.drop(['EmployeeCount', 'EmployeeID', 'Over18', 'StandardHours'], axis = 1)

In [15]:
df1.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'Gender', 'JobLevel', 'JobRole',
       'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked',
       'PercentSalaryHike', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [20]:
df2 = df1.dropna()

In [21]:
df3 = df2.drop_duplicates()

# Random Forest

In [23]:
rf_model = RandomForestClassifier(n_estimators=1000, max_features=2, oob_score=True)

In [22]:
features = ['Age', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'Gender', 'JobLevel', 'JobRole',
       'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked',
       'PercentSalaryHike', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsSinceLastPromotion',
       'YearsWithCurrManager']

In [24]:
rf_model.fit(X=df3[features], y = df3['Attrition'])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=2,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [25]:
print('OOB Accuracy:')
print(rf_model.oob_score_)

OOB Accuracy:
0.8435374149659864


In [26]:
for feature, imp in zip(features, rf_model.feature_importances_):
    print(feature, imp)

Age 0.09822861529750367
BusinessTravel 0.027851909179650827
Department 0.025861680224511487
DistanceFromHome 0.06920109716389829
Education 0.04061308140624962
EducationField 0.04130090361777618
Gender 0.018527266864806945
JobLevel 0.038102683592929246
JobRole 0.05489561999497393
MaritalStatus 0.03970809423278251
MonthlyIncome 0.09297091170254097
NumCompaniesWorked 0.05553422789570587
PercentSalaryHike 0.06541077321146957
StockOptionLevel 0.03386076266138414
TotalWorkingYears 0.08725888141051351
TrainingTimesLastYear 0.04453865881296472
YearsAtCompany 0.06916755992875943
YearsSinceLastPromotion 0.04324572454866928
YearsWithCurrManager 0.05372154825290976


Now taking DV - Attrition and IDV - Age, Monthly Income, Total Working Years 

# Decision Tree

In [27]:
tree_model = tree.DecisionTreeClassifier(max_depth=6, max_leaf_nodes=12)

In [28]:
pred = pd.DataFrame([df3['Age'], df3['MonthlyIncome'], df3['TotalWorkingYears']]).T
tree_model.fit(X = pred, y = df3['Attrition'])

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6, max_features=None, max_leaf_nodes=12,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [29]:
with open ('DtreeATT.dot', 'w') as f:
    f = tree.export_graphviz(tree_model, feature_names=['Age', 'MonthlyIncome', 'TotalWorkingYears'], out_file=f);