*Model 1 - Decision Tree

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression



In [3]:
file = 'Data\WA_Fn-UseC_-HR-Employee-Attrition.csv'
df = pd.read_csv(file)

In [4]:
num_col = list(df.describe().columns)
col_categorical = list(set(df.columns).difference(num_col))
remove_list = ['EmployeeCount', 'EmployeeNumber', 'StandardHours']
col_numerical = [e for e in num_col if e not in remove_list]
attrition_to_num = {'Yes': 0,
                    'No': 1}
df['Attrition_num'] = df['Attrition'].map(attrition_to_num)
col_categorical.remove('Attrition')
df_cat = pd.get_dummies(df[col_categorical])
X = pd.concat([df[col_numerical], df_cat], axis=1)
y = df['Attrition_num']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [10]:
def print_score(clf, X_train, X_test, y_train, y_test, train=True):
    '''
    v0.1 Follow the scikit learn library format in terms of input
    print the accuracy score, classification report and confusion matrix of classifier
    '''
    lb = preprocessing.LabelBinarizer()
    lb.fit(y_train)
    if train:
        '''
        training performance
        '''
        print("Train Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_train, clf.predict(X_train))))
        print("Classification Report: \n {}\n".format(classification_report(y_train, clf.predict(X_train))))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_train, clf.predict(X_train))))
        print("ROC AUC: {0:.4f}\n".format(roc_auc_score(lb.transform(y_train),
                                                        lb.transform(clf.predict(X_train)))))
 
        #cv_res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        #print("Average Accuracy: \t {0:.4f}".format(np.mean(cv_res)))
        #print("Accuracy SD: \t\t {0:.4f}".format(np.std(cv_res)))
 
    elif train==False:
        '''
        test performance
        '''
        res_test = clf.predict(X_test)
        print("Test Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_test, clf.predict(X_test))))
        print("Classification Report: \n {}\n".format(classification_report(y_test, clf.predict(X_test))))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_test, clf.predict(X_test))))
        print("ROC AUC: {0:.4f}\n".format(roc_auc_score(lb.transform(y_test), lb.transform(res_test))))

In [11]:
#Desicion Tree
tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_train, y_train)

In [12]:
print_score(tree_clf, X_train, X_test, y_train, y_test, train=True)
print("************************")
print_score(tree_clf, X_train, X_test, y_train, y_test, train=False)

Train Result:

accuracy score: 1.0000

Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       177
           1       1.00      1.00      1.00       925

    accuracy                           1.00      1102
   macro avg       1.00      1.00      1.00      1102
weighted avg       1.00      1.00      1.00      1102


Confusion Matrix: 
 [[177   0]
 [  0 925]]

ROC AUC: 1.0000

************************
Test Result:

accuracy score: 0.7853

Classification Report: 
               precision    recall  f1-score   support

           0       0.36      0.42      0.39        60
           1       0.88      0.86      0.87       308

    accuracy                           0.79       368
   macro avg       0.62      0.64      0.63       368
weighted avg       0.80      0.79      0.79       368


Confusion Matrix: 
 [[ 25  35]
 [ 44 264]]

ROC AUC: 0.6369



Model 2 : Random Forest

In [14]:
rf_clf = RandomForestClassifier(n_estimators=100)
rf_clf.fit(X_train, y_train.ravel())

  rf_clf.fit(X_train, y_train.ravel())


In [15]:
print_score(rf_clf, X_train, X_test, y_train, y_test, train=True)
print_score(rf_clf, X_train, X_test, y_train, y_test, train=False)

Train Result:

accuracy score: 1.0000

Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       177
           1       1.00      1.00      1.00       925

    accuracy                           1.00      1102
   macro avg       1.00      1.00      1.00      1102
weighted avg       1.00      1.00      1.00      1102


Confusion Matrix: 
 [[177   0]
 [  0 925]]

ROC AUC: 1.0000

Test Result:

accuracy score: 0.8641

Classification Report: 
               precision    recall  f1-score   support

           0       0.86      0.20      0.32        60
           1       0.86      0.99      0.92       308

    accuracy                           0.86       368
   macro avg       0.86      0.60      0.62       368
weighted avg       0.86      0.86      0.83       368


Confusion Matrix: 
 [[ 12  48]
 [  2 306]]

ROC AUC: 0.5968



Combination of the two Methods

In [29]:
# Create a DataFrame to store the predicted probabilities for X_train
en_en = pd.DataFrame()

# Get the predicted probabilities from tree_clf and rf_clf for X_train
# Use [:, 1] to select the probabilities for class 1
en_en['tree_clf'] = tree_clf.predict_proba(X_train)[:, 1]  # Probabilities for class 1 from tree_clf
en_en['rf_clf'] = rf_clf.predict_proba(X_train)[:, 1]  # Probabilities for class 1 from rf_clf

# Concatenate with y_train (assuming y_train is your target variable)
en_en = pd.concat([en_en, pd.DataFrame(y_train).reset_index(drop=True)], axis=1)
en_en.columns = ['tree_clf', 'rf_clf', 'ind']  # Rename the columns properly

# Train a logistic regression model using the predicted probabilities
from sklearn.linear_model import LogisticRegression
m_clf = LogisticRegression(fit_intercept=False, solver='lbfgs')
m_clf.fit(en_en[['tree_clf', 'rf_clf']], en_en['ind'])

# Create a DataFrame for the test data
en_test = pd.DataFrame()

# Get the predicted probabilities from tree_clf and rf_clf for X_test
en_test['tree_clf'] = tree_clf.predict_proba(X_test)[:, 1]  # Probabilities for class 1 from tree_clf
en_test['rf_clf'] = rf_clf.predict_proba(X_test)[:, 1]  # Probabilities for class 1 from rf_clf

# Use the logistic regression model to predict the final combined prediction
en_test['combined'] = m_clf.predict(en_test[['tree_clf', 'rf_clf']])

# Concatenate y_test (true labels) to en_test DataFrame
en_test = pd.concat([en_test, pd.DataFrame(y_test).reset_index(drop=True)], axis=1)
en_test.columns = ['tree_clf', 'rf_clf', 'combined', 'ind']  # Rename the columns properly

# Print a crosstab to compare actual vs predicted values
print(pd.crosstab(en_test['ind'], en_test['combined']))



combined   0    1
ind              
0         25   35
1         44  264
