# Model Stacking

In [1]:
## ignoring FutureChange Warning to avoid screen taken up by warning
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
df = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')
df.pop('EmployeeCount') 
df.pop('EmployeeNumber')
df.pop('Over18') 
df.pop('StandardHours')
y = df['Attrition']
X = df
X.pop('Attrition')
from sklearn import preprocessing
le = preprocessing.LabelBinarizer()
y = le.fit_transform(y)
ind_BusinessTravel = pd.get_dummies(df['BusinessTravel'], prefix='BusinessTravel')
ind_Department = pd.get_dummies(df['Department'], prefix='Department')
ind_EducationField = pd.get_dummies(df['EducationField'], prefix='EducationField')
ind_Gender = pd.get_dummies(df['Gender'], prefix='Gender')
ind_JobRole = pd.get_dummies(df['JobRole'], prefix='JobRole')
ind_MaritalStatus = pd.get_dummies(df['MaritalStatus'], prefix='MaritalStatus')
ind_OverTime = pd.get_dummies(df['OverTime'], prefix='OverTime')
df1 = pd.concat([ind_BusinessTravel, ind_Department, ind_EducationField, ind_Gender, ind_JobRole, ind_MaritalStatus, ind_OverTime], axis=1)
df1 = pd.concat([ind_BusinessTravel, ind_Department, ind_EducationField, ind_Gender, 
                 ind_JobRole, ind_MaritalStatus, ind_OverTime, df.select_dtypes(['int64'])], axis=1)
df1.dropna(inplace=True)
df1.shape

(1470, 51)

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df1, y)

In [5]:
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [6]:
def print_score(clf, X_train, X_test, y_train, y_test, train = True):
    '''
    print accuracy score and confusion matrix
    '''
    
    if train:
        '''
        train performance
        '''
        print('Train Result: \n')
        print('Accuracy Score: {0:.4f}\n'.format(accuracy_score(y_train, clf.predict(X_train))))
        print('Classification Report: \n {} \n'.format(classification_report(y_train, clf.predict(X_train))))
        print('Confusion Matrix: \n {} \n'.format(confusion_matrix(y_train, clf.predict(X_train))))
        
        res = cross_val_score(clf, X_train, y_train, cv = 10, scoring='accuracy')
        print('Average Accuracy: \t {0:.4f}'.format(np.mean(res)))
        print('Average SD: \t\t {0:.4f}'.format(np.std(res)))
        
    elif train == False:
        '''
        test performance
        '''
        print('Test Result \n')
        print('Accuracy Score: {0:.4f}\n'.format(accuracy_score(y_test, clf.predict(X_test))))
        print('Classification Report: \n {} \n'.format(classification_report(y_test, clf.predict(X_test))))
        print('Confusion Matrix: \n {} \n'.format(confusion_matrix(y_test, clf.predict(X_test))))

## Model 1: Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier

In [8]:
tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [9]:
print_score(tree_clf, X_train, X_test, y_train, y_test, train = True)

Train Result: 

Accuracy Score: 1.0000

Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       927
           1       1.00      1.00      1.00       175

   micro avg       1.00      1.00      1.00      1102
   macro avg       1.00      1.00      1.00      1102
weighted avg       1.00      1.00      1.00      1102
 

Confusion Matrix: 
 [[927   0]
 [  0 175]] 

Average Accuracy: 	 0.7913
Average SD: 		 0.0226


In [10]:
print_score(tree_clf, X_train, X_test, y_train, y_test, train = False)

Test Result 

Accuracy Score: 0.7772

Classification Report: 
               precision    recall  f1-score   support

           0       0.88      0.85      0.86       306
           1       0.36      0.40      0.38        62

   micro avg       0.78      0.78      0.78       368
   macro avg       0.62      0.63      0.62       368
weighted avg       0.79      0.78      0.78       368
 

Confusion Matrix: 
 [[261  45]
 [ 37  25]] 



## Model 2: Random Forest

In [11]:
from sklearn.ensemble import RandomForestClassifier

In [12]:
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train.ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [13]:
print_score(rf_clf, X_train, X_test, y_train.ravel(), y_test, train = True)

Train Result: 

Accuracy Score: 0.9891

Classification Report: 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99       927
           1       1.00      0.93      0.96       175

   micro avg       0.99      0.99      0.99      1102
   macro avg       0.99      0.97      0.98      1102
weighted avg       0.99      0.99      0.99      1102
 

Confusion Matrix: 
 [[927   0]
 [ 12 163]] 

Average Accuracy: 	 0.8512
Average SD: 		 0.0164


In [14]:
print_score(rf_clf, X_train, X_test, y_train, y_test, train = False)

Test Result 

Accuracy Score: 0.8397

Classification Report: 
               precision    recall  f1-score   support

           0       0.85      0.98      0.91       306
           1       0.62      0.13      0.21        62

   micro avg       0.84      0.84      0.84       368
   macro avg       0.73      0.56      0.56       368
weighted avg       0.81      0.84      0.79       368
 

Confusion Matrix: 
 [[301   5]
 [ 54   8]] 



## Combine

In [15]:
en_en = pd.DataFrame()

In [16]:
en_en['tree_clf'] = pd.DataFrame(tree_clf.predict_proba(X_train))[1]
en_en['rf_clf'] = pd.DataFrame(rf_clf.predict_proba(X_train))[1]
col_name = en_en.columns
en_en = pd.concat([en_en, pd.DataFrame(y_train).reset_index(drop=True)], axis=1)

In [17]:
en_en.head()

Unnamed: 0,tree_clf,rf_clf,0
0,0.0,0.1,0
1,0.0,0.0,0
2,0.0,0.0,0
3,0.0,0.2,0
4,0.0,0.0,0


In [18]:
tmp = list(col_name)
tmp.append('ind')
en_en.columns = tmp

In [19]:
en_en.head()

Unnamed: 0,tree_clf,rf_clf,ind
0,0.0,0.1,0
1,0.0,0.0,0
2,0.0,0.0,0
3,0.0,0.2,0
4,0.0,0.0,0


## Meta Classifier

In [20]:
from sklearn.linear_model import LogisticRegression

In [21]:
m_clf = LogisticRegression(fit_intercept = False)

In [22]:
m_clf.fit(en_en[['tree_clf', 'rf_clf']], en_en['ind'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [23]:
en_test = pd.DataFrame()

In [24]:
en_test['tree_clf'] = pd.DataFrame(tree_clf.predict_proba(X_test))[1]
en_test['rf_clf'] = pd.DataFrame(rf_clf.predict_proba(X_test))[1]
col_name = en_test.columns
en_test['combined'] = m_clf.predict(en_test[['tree_clf', 'rf_clf']])

In [25]:
col_name = en_test.columns
tmp = list(col_name)
tmp.append('ind')

In [26]:
tmp

['tree_clf', 'rf_clf', 'combined', 'ind']

In [27]:
en_test = pd.concat([en_test, pd.DataFrame(y_test).reset_index(drop=True)], axis = 1)

In [28]:
en_test.columns = tmp

In [29]:
print(pd.crosstab(en_test['ind'], en_test['combined']))

combined    0   1
ind              
0         261  45
1          37  25


In [30]:
print(round(accuracy_score(en_test['ind'], en_test['combined']),4))

0.7772


In [31]:
print(classification_report(en_test['ind'], en_test['combined']))

              precision    recall  f1-score   support

           0       0.88      0.85      0.86       306
           1       0.36      0.40      0.38        62

   micro avg       0.78      0.78      0.78       368
   macro avg       0.62      0.63      0.62       368
weighted avg       0.79      0.78      0.78       368



## Adding AdaBoost

In [32]:
from sklearn.ensemble import AdaBoostClassifier

In [33]:
ada_clf = AdaBoostClassifier()
ada_clf.fit(X_train, y_train.ravel())

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [34]:
print_score(ada_clf, X_train, X_test, y_train.ravel(), y_test, train = True)

Train Result: 

Accuracy Score: 0.9156

Classification Report: 
               precision    recall  f1-score   support

           0       0.92      0.98      0.95       927
           1       0.87      0.55      0.67       175

   micro avg       0.92      0.92      0.92      1102
   macro avg       0.90      0.77      0.81      1102
weighted avg       0.91      0.92      0.91      1102
 

Confusion Matrix: 
 [[913  14]
 [ 79  96]] 

Average Accuracy: 	 0.8684
Average SD: 		 0.0182


In [35]:
print_score(ada_clf, X_train, X_test, y_train, y_test, train = False)

Test Result 

Accuracy Score: 0.8913

Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.97      0.94       306
           1       0.79      0.48      0.60        62

   micro avg       0.89      0.89      0.89       368
   macro avg       0.85      0.73      0.77       368
weighted avg       0.88      0.89      0.88       368
 

Confusion Matrix: 
 [[298   8]
 [ 32  30]] 



In [36]:
en_en_2 = pd.DataFrame()

In [37]:
en_en_2['tree_clf'] = pd.DataFrame(tree_clf.predict_proba(X_train))[1]
en_en_2['rf_clf'] = pd.DataFrame(rf_clf.predict_proba(X_train))[1]
en_en_2['ada_clf'] = pd.DataFrame(ada_clf.predict_proba(X_train))[1]
col_name = en_en_2.columns
en_en_2 = pd.concat([en_en_2, pd.DataFrame(y_train).reset_index(drop=True)], axis=1)

In [38]:
en_en_2.head()

Unnamed: 0,tree_clf,rf_clf,ada_clf,0
0,0.0,0.1,0.466588,0
1,0.0,0.0,0.489088,0
2,0.0,0.0,0.488668,0
3,0.0,0.2,0.493437,0
4,0.0,0.0,0.477497,0


In [39]:
tmp = list(col_name)
tmp.append('ind')
en_en_2.columns = tmp

In [40]:
en_en_2.head()

Unnamed: 0,tree_clf,rf_clf,ada_clf,ind
0,0.0,0.1,0.466588,0
1,0.0,0.0,0.489088,0
2,0.0,0.0,0.488668,0
3,0.0,0.2,0.493437,0
4,0.0,0.0,0.477497,0


In [41]:
m_clf = LogisticRegression(fit_intercept = False)

In [42]:
m_clf.fit(en_en_2[['tree_clf', 'rf_clf', 'ada_clf']], en_en_2['ind'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [43]:
en_test_2 = pd.DataFrame()

In [44]:
en_test_2['tree_clf'] = pd.DataFrame(tree_clf.predict_proba(X_test))[1]
en_test_2['rf_clf'] = pd.DataFrame(rf_clf.predict_proba(X_test))[1]
en_test_2['ada_clf'] = pd.DataFrame(ada_clf.predict_proba(X_test))[1]
col_name = en_test_2.columns
en_test_2['combined'] = m_clf.predict(en_test_2[['tree_clf', 'rf_clf', 'ada_clf']])

In [45]:
col_name = en_test_2.columns
tmp = list(col_name)
tmp.append('ind')

In [46]:
tmp

['tree_clf', 'rf_clf', 'ada_clf', 'combined', 'ind']

In [47]:
en_test_2 = pd.concat([en_test_2, pd.DataFrame(y_test).reset_index(drop=True)], axis = 1)

In [48]:
en_test_2.columns = tmp

In [49]:
print(pd.crosstab(en_test_2['ind'], en_test_2['combined']))

combined    0   1
ind              
0         261  45
1          37  25


In [50]:
print(round(accuracy_score(en_test_2['ind'], en_test_2['combined']),4))

0.7772


In [51]:
print(classification_report(en_test_2['ind'], en_test_2['combined']))

              precision    recall  f1-score   support

           0       0.88      0.85      0.86       306
           1       0.36      0.40      0.38        62

   micro avg       0.78      0.78      0.78       368
   macro avg       0.62      0.63      0.62       368
weighted avg       0.79      0.78      0.78       368



# Single Classifier

In [52]:
df = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')

In [53]:
df.Attrition.value_counts()

No     1233
Yes     237
Name: Attrition, dtype: int64

In [54]:
df.Attrition.value_counts() / df.Attrition.count()

No     0.838776
Yes    0.161224
Name: Attrition, dtype: float64

In [55]:
from sklearn.ensemble import RandomForestClassifier

In [56]:
from sklearn.ensemble import BaggingClassifier

In [57]:
from sklearn.ensemble import AdaBoostClassifier

In [58]:
pd.Series(list(y_train)).value_counts() / pd.Series(list(y_train)).count()

[0]    0.841198
[1]    0.158802
dtype: float64

In [59]:
class_weight = {0: 0.838, 1: 0.162}

In [60]:
forest = RandomForestClassifier(class_weight = class_weight)

In [61]:
ada = AdaBoostClassifier(base_estimator = forest, n_estimators = 100, learning_rate = 0.5, random_state = 42)

In [62]:
ada.fit(X_train, y_train.ravel())

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=RandomForestClassifier(bootstrap=True, class_weight={0: 0.838, 1: 0.162},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators='warn', n_jobs=None, oob_score=False,
            random_state=None, verbose=0, warm_start=False),
          learning_rate=0.5, n_estimators=100, random_state=42)

In [63]:
print_score(ada, X_train, X_test, y_train.ravel(), y_test, train = True)

Train Result: 

Accuracy Score: 1.0000

Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       927
           1       1.00      1.00      1.00       175

   micro avg       1.00      1.00      1.00      1102
   macro avg       1.00      1.00      1.00      1102
weighted avg       1.00      1.00      1.00      1102
 

Confusion Matrix: 
 [[927   0]
 [  0 175]] 

Average Accuracy: 	 0.8575
Average SD: 		 0.0160


In [64]:
print_score(ada, X_train, X_test, y_train, y_test, train = False)

Test Result 

Accuracy Score: 0.8478

Classification Report: 
               precision    recall  f1-score   support

           0       0.85      0.99      0.92       306
           1       0.75      0.15      0.24        62

   micro avg       0.85      0.85      0.85       368
   macro avg       0.80      0.57      0.58       368
weighted avg       0.83      0.85      0.80       368
 

Confusion Matrix: 
 [[303   3]
 [ 53   9]] 



In [65]:
bag_clf = BaggingClassifier(base_estimator=ada, n_estimators=50,
                           max_samples=1.0, max_features=1.0, bootstrap=True,
                           bootstrap_features=False, n_jobs=-1,
                           random_state=42)

In [66]:
bag_clf.fit(X_train, y_train.ravel())

BaggingClassifier(base_estimator=AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=RandomForestClassifier(bootstrap=True, class_weight={0: 0.838, 1: 0.162},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_imp...=None, verbose=0, warm_start=False),
          learning_rate=0.5, n_estimators=100, random_state=42),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=50, n_jobs=-1, oob_score=False,
         random_state=42, verbose=0, warm_start=False)

In [67]:
print_score(bag_clf, X_train, X_test, y_train.ravel(), y_test, train = True)

Train Result: 

Accuracy Score: 1.0000

Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       927
           1       1.00      1.00      1.00       175

   micro avg       1.00      1.00      1.00      1102
   macro avg       1.00      1.00      1.00      1102
weighted avg       1.00      1.00      1.00      1102
 

Confusion Matrix: 
 [[927   0]
 [  0 175]] 

Average Accuracy: 	 0.8539
Average SD: 		 0.0117


In [68]:
print_score(bag_clf, X_train, X_test, y_train.ravel(), y_test, train = False)

Test Result 

Accuracy Score: 0.8587

Classification Report: 
               precision    recall  f1-score   support

           0       0.85      1.00      0.92       306
           1       1.00      0.16      0.28        62

   micro avg       0.86      0.86      0.86       368
   macro avg       0.93      0.58      0.60       368
weighted avg       0.88      0.86      0.81       368
 

Confusion Matrix: 
 [[306   0]
 [ 52  10]] 

