# Project 02 is about building a Decision Tree basis "Attrition Dataset" as an input to classify whether the Attrition is dependent on what factors from the given dataset. 
# The Independent Factors is obtained through Random Forest Methodology.

In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

In [2]:
attrition_dataset = pd.read_csv("general_data.csv")

In [3]:
attrition_dataset.head(2)

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4


In [4]:
attrition_dataset.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeCount', 'EmployeeID', 'Gender',
       'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'Over18', 'PercentSalaryHike', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [6]:
attrition_dataset.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeID                  0
Gender                      0
JobLevel                    0
JobRole                     0
MaritalStatus               0
MonthlyIncome               0
NumCompaniesWorked         19
Over18                      0
PercentSalaryHike           0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           9
TrainingTimesLastYear       0
YearsAtCompany              0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

In [7]:
# Here we fill Nan values of no. of companies Worked with mode
attrition_dataset.NumCompaniesWorked = np.where(attrition_dataset.NumCompaniesWorked.isna(),np.floor(attrition_dataset.NumCompaniesWorked.mode()),attrition_dataset.NumCompaniesWorked)

In [8]:
# Nan values of total Working years is replaced by mean
attrition_dataset.TotalWorkingYears = np.where(attrition_dataset.TotalWorkingYears.isna(),np.floor(attrition_dataset.TotalWorkingYears.mean()),attrition_dataset.TotalWorkingYears)

In [10]:
attrition_dataset.isna().sum()

Age                        0
Attrition                  0
BusinessTravel             0
Department                 0
DistanceFromHome           0
Education                  0
EducationField             0
EmployeeCount              0
EmployeeID                 0
Gender                     0
JobLevel                   0
JobRole                    0
MaritalStatus              0
MonthlyIncome              0
NumCompaniesWorked         0
Over18                     0
PercentSalaryHike          0
StandardHours              0
StockOptionLevel           0
TotalWorkingYears          0
TrainingTimesLastYear      0
YearsAtCompany             0
YearsSinceLastPromotion    0
YearsWithCurrManager       0
dtype: int64

In [12]:
# Now we try to remove duplicated records
attrition_dataset = attrition_dataset.drop_duplicates()

In [13]:
attrition_dataset.dtypes #To identify the caltegorical data in the given dataset for encoding as required.

Age                          int64
Attrition                   object
BusinessTravel              object
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeCount                int64
EmployeeID                   int64
Gender                      object
JobLevel                     int64
JobRole                     object
MaritalStatus               object
MonthlyIncome                int64
NumCompaniesWorked         float64
Over18                      object
PercentSalaryHike            int64
StandardHours                int64
StockOptionLevel             int64
TotalWorkingYears          float64
TrainingTimesLastYear        int64
YearsAtCompany               int64
YearsSinceLastPromotion      int64
YearsWithCurrManager         int64
dtype: object

**Encoding following Categorical variables for simplification -**

1. Attrition
2. BusinessTravel
3. Department
4. EducationField
5. Gender
6. JobRole
7. MaritialStatus
8. Over18

In [21]:
label_encoder = LabelEncoder()
attrition_dataset["Attrition"] = label_encoder.fit_transform(attrition_dataset["Attrition"])
attrition_dataset["BusinessTravel"] = label_encoder.fit_transform(attrition_dataset["BusinessTravel"])
attrition_dataset["Department"] = label_encoder.fit_transform(attrition_dataset["Department"])
attrition_dataset["EducationField"] = label_encoder.fit_transform(attrition_dataset["EducationField"])
attrition_dataset["Gender"] = label_encoder.fit_transform(attrition_dataset["Gender"])
attrition_dataset["JobRole"] = label_encoder.fit_transform(attrition_dataset["JobRole"])
attrition_dataset["MaritalStatus"] = label_encoder.fit_transform(attrition_dataset["MaritalStatus"])
attrition_dataset["Over18"] = label_encoder.fit_transform(attrition_dataset["Over18"])


In [22]:
attrition_dataset.dtypes

Age                          int64
Attrition                    int64
BusinessTravel               int32
Department                   int32
DistanceFromHome             int64
Education                    int64
EducationField               int32
EmployeeCount                int64
EmployeeID                   int64
Gender                       int32
JobLevel                     int64
JobRole                      int32
MaritalStatus                int32
MonthlyIncome                int64
NumCompaniesWorked         float64
Over18                       int32
PercentSalaryHike            int64
StandardHours                int64
StockOptionLevel             int64
TotalWorkingYears          float64
TrainingTimesLastYear        int64
YearsAtCompany               int64
YearsSinceLastPromotion      int64
YearsWithCurrManager         int64
dtype: object

# Random Forest Method Implementation - 

In [23]:
rf_model = RandomForestClassifier(n_estimators = 1000, max_features = 2, oob_score = True)

In [24]:
features = ['Age', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeCount', 'EmployeeID', 'Gender',
       'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'Over18', 'PercentSalaryHike', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

In [27]:
rf_model.fit(X = attrition_dataset[features], y = attrition_dataset["Attrition"])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=2,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [28]:
print("OOB Accuracy:")
print(rf_model.oob_score_)

OOB Accuracy:
0.9997732426303855


In [30]:
for feature,imp in zip(features, rf_model.feature_importances_):
    print(feature,imp)

Age 0.09250185153856451
BusinessTravel 0.027150697267129882
Department 0.025643476434468685
DistanceFromHome 0.06734692656746362
Education 0.0392929918857209
EducationField 0.03977451774245958
EmployeeCount 0.0
EmployeeID 0.03833181582025859
Gender 0.017036419111314916
JobLevel 0.0366548683913343
JobRole 0.05356818105234133
MaritalStatus 0.038715162644011814
MonthlyIncome 0.08861008332240664
NumCompaniesWorked 0.053298607022867206
Over18 0.0
PercentSalaryHike 0.06251812105954281
StandardHours 0.0
StockOptionLevel 0.03187849187531683
TotalWorkingYears 0.08108136717223106
TrainingTimesLastYear 0.04326274537815918
YearsAtCompany 0.06760157193166082
YearsSinceLastPromotion 0.04210082773692452
YearsWithCurrManager 0.053631276045822865


# Conclusion of above RF analysis - 

1. Age has high significance in Attrition determination
2. MonthlyIncome and TotalWorkingYears has next significance after Age
3. DistanceFromHome, PercentSalaryHike and YearsAtCompany has next significance
4. JobRole, NumCompaniesWorked, YearsWithCurrManager has next significance
5. YearsSinceLastPromotion, StockOptionLevel, MaritialStatus, JObLevel, EducationField, Education, has next significance
6. Remaning all other variables have least significance
7. Over18, StandardWorkingHours, EmployeeCount has no significance at all.

# Thus, we will consider only ['Age','MonthlyIncome','TotalWorkingYears', 'DistanceFromHome', 'PercentSalaryHike', 'YearsAtCompany', 'JobRole', 'NumCompaniesWorked', 'YearsWithCurrManager', 'YearsSinceLastPromotion'] as our prime Independent Variables for Dependent Variable(Attrition).

# Decision Tree Building(Classification).

In [31]:
tree_model = tree.DecisionTreeClassifier(max_depth=20)

In [34]:
predictors = pd.DataFrame([attrition_dataset["Age"],attrition_dataset["MonthlyIncome"],attrition_dataset["TotalWorkingYears"],attrition_dataset["DistanceFromHome"],attrition_dataset["PercentSalaryHike"],attrition_dataset["JobRole"],attrition_dataset["NumCompaniesWorked"],attrition_dataset["YearsWithCurrManager"],attrition_dataset["YearsSinceLastPromotion"],attrition_dataset["YearsAtCompany"]]).T

In [35]:
tree_model.fit(X=predictors,y=attrition_dataset["Attrition"])

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=20, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [36]:
with open("AttritionRFDTree.dot", 'w') as f:
    f = tree.export_graphviz(tree_model,feature_names=['Age','MonthlyIncome','TotalWorkingYears', 'DistanceFromHome', 'PercentSalaryHike', 'YearsAtCompany', 'JobRole', 'NumCompaniesWorked', 'YearsWithCurrManager', 'YearsSinceLastPromotion'], out_file=f);

# Inference -

1. If PercentSalaryHike >= 12.5 and MonthlyIncome <= 23140.0 and Age <= 33.5 and TotalWorkingYears <= 1.5 then Attrition is Yes.
2. If DistanceFromHome <= 13.0 and YearsSinceLastPromotion <= 0.5 and MonthlyIncome >= 112610.0 and Age >= 33.5 and TotalWorkingYears <= 1.5 then Attrition is Yes.
3. If YearsSinceLastPromotion >= 0.5 and MonthlyIncome >= 112610.0 and Age >= 33.5 and TotalWorkingYears <= 1.5 then Attrition is No.
4. If PercentSalaryHike <= 12.0 and MonthlyIncome <= 25425.0 and Age >= 33.5 and TotalWorkingYears <= 1.5 then Attrition is yes.
5. If PercentSalaryHike >= 21.0 and YearsAtCompany <= 1.5 and NumCompaniesWorked <= 4.5 and Age >= 33.5 then Attrition is No.