## Import Minimum Library

In [1]:
import pandas as pd
import numpy as np

## Read the Data

In [2]:
data = pd.read_csv("HR_comma_sep.csv")

In [3]:
data.tail()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
14994,0.4,0.57,2,151,3,0,1,0,support,low
14995,0.37,0.48,2,160,3,0,1,0,support,low
14996,0.37,0.53,2,143,3,0,1,0,support,low
14997,0.11,0.96,6,280,4,0,1,0,support,low
14998,0.37,0.52,2,158,3,0,1,0,support,low


## Check for NAN

In [4]:
data.isnull().sum()

satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
left                     0
promotion_last_5years    0
sales                    0
salary                   0
dtype: int64

## Extract the Data

In [5]:
def extractData():
    data = pd.read_csv("HR_comma_sep.csv")
    y = data["left"]
    x = data.drop(["left"], axis = 1)
    
    return x, y, data

In [6]:
x, y, data = extractData()

## The Benchmark

In [7]:
y.value_counts(normalize=True)

0    0.761917
1    0.238083
Name: left, dtype: float64

## Dummy data

In [8]:
def numericalData():
    dataNum = x
    categorial = ["sales", "salary"]
    numerical = dataNum.drop(categorial, axis = 1)
    return numerical

In [9]:
def salaryData():
    Salary = np.where(data["salary"] == "low", 1.0, data["salary"])
    Salary = np.where(Salary == "medium", 2.0, Salary)
    Salary = np.where(Salary == "high", 3.0, Salary)
    Salary = pd.DataFrame(Salary)
    Salary.columns = ['salary numerical']
    
    return Salary

In [10]:
def salesSalaryData():
    sales = pd.get_dummies(data[["salary", "sales"]])
    return sales

In [11]:
numerical = numericalData()
salaryNumerical = salaryData()
categorial = salesSalaryData()
x_data = pd.concat([numerical, salaryNumerical, categorial], axis=1)

# The Classifiers before Feature Engineering

In [12]:
from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_data, y, test_size = 0.25, random_state = 123)



In [14]:
!pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.10.0-py2.py3-none-any.whl (1.3MB)
Installing collected packages: mlxtend
Successfully installed mlxtend-0.10.0


In [15]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import ExtraTreeClassifier, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier

from mlxtend.classifier import StackingClassifier
from sklearn.cross_validation import cross_val_score


In [16]:
knn = KNeighborsClassifier()

logreg = LogisticRegression(random_state = 123)

linearSVM = LinearSVC( random_state = 123)
kernelSVM = SVC( random_state = 123)

extratree = ExtraTreeClassifier(random_state = 123)
decisiontree = DecisionTreeClassifier(random_state = 123)

bagging = BaggingClassifier(random_state= 123, n_estimators= 100)
randomforest = RandomForestClassifier(random_state = 123, n_estimators= 100)
boosting = GradientBoostingClassifier( random_state = 123, n_estimators = 100)
adaboost = AdaBoostClassifier(random_state= 123, n_estimators= 100)



In [17]:
randomforest2 = RandomForestClassifier(random_state = 123, n_estimators= 100, max_features="log2")
randomforest3 = RandomForestClassifier(random_state = 123, n_estimators= 100, max_features=7)


In [18]:
classifiers = [knn, logreg, linearSVM, kernelSVM, decisiontree, extratree, bagging, randomforest, boosting, adaboost]

In [19]:
for i in classifiers:
    scores = cross_val_score(i, x_data, y, cv=7, scoring='accuracy')
    print("Accuracy: %0.4f (+/- %0.4f)" 
          % (scores.mean(), scores.std())), i

Accuracy: 0.9335 (+/- 0.0130)
Accuracy: 0.7831 (+/- 0.0224)
Accuracy: 0.6752 (+/- 0.1697)
Accuracy: 0.9501 (+/- 0.0043)
Accuracy: 0.9813 (+/- 0.0121)
Accuracy: 0.9618 (+/- 0.0270)
Accuracy: 0.9913 (+/- 0.0088)
Accuracy: 0.9913 (+/- 0.0091)
Accuracy: 0.9759 (+/- 0.0024)
Accuracy: 0.9597 (+/- 0.0047)


In [20]:
logregCVStacking = LogisticRegressionCV(random_state= 123)
sclf = StackingClassifier(classifiers=[randomforest, boosting, randomforest2, randomforest3],
                          use_probas=True,
                          average_probas=False,
                          meta_classifier= logregCVStacking)

print('7-fold cross validation:\n')

for clf, label in zip([knn, logreg, linearSVM, kernelSVM, decisiontree, extratree, bagging, randomforest, randomforest2, randomforest3, boosting, adaboost, sclf], 
                      ['KNN',
                       'Logistic Regression', 
                       'Linear SVM', 
                       'Kernel SVM',
                       'Decision Trees',
                       'Extra Trees',
                       'Bagging',
                       'Random Forest 1',
                       'Random Forest 2',
                       'Random Forest 3',
                       'Stochastic Gradient Boosting',
                       'Adaptive Boosting',
                       'Stacking    : Stacking All Model with Logistic Reg']):

    scores = cross_val_score(clf, x_data, y, cv=7, scoring='accuracy')
    print("Accuracy: %0.4f (+/- %0.4f) [%s]" 
          % (scores.mean(), scores.std(), label))

7-fold cross validation:

Accuracy: 0.9335 (+/- 0.0130) [KNN]
Accuracy: 0.7831 (+/- 0.0224) [Logistic Regression]
Accuracy: 0.6752 (+/- 0.1697) [Linear SVM]
Accuracy: 0.9501 (+/- 0.0043) [Kernel SVM]
Accuracy: 0.9813 (+/- 0.0121) [Decision Trees]
Accuracy: 0.9618 (+/- 0.0270) [Extra Trees]
Accuracy: 0.9913 (+/- 0.0088) [Bagging]
Accuracy: 0.9913 (+/- 0.0091) [Random Forest 1]
Accuracy: 0.9913 (+/- 0.0091) [Random Forest 2]
Accuracy: 0.9926 (+/- 0.0079) [Random Forest 3]
Accuracy: 0.9759 (+/- 0.0024) [Stochastic Gradient Boosting]
Accuracy: 0.9597 (+/- 0.0047) [Adaptive Boosting]
Accuracy: 0.9920 (+/- 0.0086) [Stacking    : Stacking All Model with Logistic Reg]


# Feature Engineering

In [21]:
def featureEngineering():
    
    data = x_data
    feature = pd.DataFrame()
    
    feature["satisfaction_hour"] = data["satisfaction_level"] * data["average_montly_hours"]
    feature["disatisfaction_hour"] = data["average_montly_hours"] - feature["satisfaction_hour"]
    
    feature["satisfaction_project"] =  data["satisfaction_level"] * data["number_project"]
    feature["disatisfaction_project"] =  data["number_project"] - feature["satisfaction_project"]
    
    feature["project_hour_average"] = data["average_montly_hours"] / data["number_project"]
    feature["time_per_project"] = 1/data["number_project"]
    
    feature["salary_per_hour"] = data["salary numerical"] / data["average_montly_hours"]
    feature["salary_per_project"] = data["salary numerical"] / data["number_project"]

    return feature
    

In [22]:
featureEngineering = featureEngineering()
x_data_feature_engineering = pd.concat([x_data, featureEngineering], axis=1)

# The Classifiers after Feature Engineering

In [23]:
logregCVStacking = LogisticRegressionCV(random_state= 123)
sclf = StackingClassifier(classifiers=[randomforest, boosting, randomforest2, randomforest3],
                          use_probas=True,
                          average_probas=False,
                          meta_classifier= logregCVStacking)

print('7-fold cross validation:\n')

for clf, label in zip([knn, logreg, linearSVM, kernelSVM, decisiontree, extratree, bagging, randomforest, randomforest2, randomforest3, boosting, adaboost, sclf], 
                      ['KNN',
                       'Logistic Regression', 
                       'Linear SVM', 
                       'Kernel SVM',
                       'Decision Trees',
                       'Extra Trees',
                       'Bagging',
                       'Random Forest 1',
                       'Random Forest 2',
                       'Random Forest 3',
                       'Stochastic Gradient Boosting',
                       'Adaptive Boosting',
                       'Stacking    : Stacking All Model with Logistic Reg']):

    scores = cross_val_score(clf, x_data_feature_engineering, y, cv=7, scoring='accuracy')
    print("Accuracy: %0.4f (+/- %0.4f) [%s]" 
          % (scores.mean(), scores.std(), label))

7-fold cross validation:

Accuracy: 0.9441 (+/- 0.0103) [KNN]
Accuracy: 0.8821 (+/- 0.0114) [Logistic Regression]
Accuracy: 0.5920 (+/- 0.1945) [Linear SVM]
Accuracy: 0.9706 (+/- 0.0143) [Kernel SVM]
Accuracy: 0.9833 (+/- 0.0099) [Decision Trees]
Accuracy: 0.9667 (+/- 0.0202) [Extra Trees]
Accuracy: 0.9911 (+/- 0.0087) [Bagging]
Accuracy: 0.9922 (+/- 0.0078) [Random Forest 1]
Accuracy: 0.9922 (+/- 0.0082) [Random Forest 2]
Accuracy: 0.9923 (+/- 0.0077) [Random Forest 3]
Accuracy: 0.9765 (+/- 0.0022) [Stochastic Gradient Boosting]
Accuracy: 0.9667 (+/- 0.0032) [Adaptive Boosting]
Accuracy: 0.9924 (+/- 0.0075) [Stacking    : Stacking All Model with Logistic Reg]


## GridSearch Optim

In [24]:
param_knn = {'n_neighbors' : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 , 13, 14, 15, 16, 17, 18 , 19, 20]}

In [25]:
from sklearn.grid_search import GridSearchCV



In [26]:
gridKNN = GridSearchCV(knn,param_knn, cv = 7)
gridKNN.fit( x_data_feature_engineering, y)

GridSearchCV(cv=7, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [27]:
gridKNN.best_params_

{'n_neighbors': 2}

In [28]:
gridKNN.best_score_

0.9643309553970265

In [29]:
param_logreg = {'C' : [1750, 2000, 1500, 1200, 1000, 333, 100, 33, 10, 3.33, 1, 0.33, 0.1, 0.033, 0.01, 0.0033, 0.001, 0.00033, 0.0001]}

In [30]:
gridLogReg = GridSearchCV(logreg,param_logreg, cv = 7)
gridLogReg.fit(x_data_feature_engineering, y)

GridSearchCV(cv=7, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=123, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [1750, 2000, 1500, 1200, 1000, 333, 100, 33, 10, 3.33, 1, 0.33, 0.1, 0.033, 0.01, 0.0033, 0.001, 0.00033, 0.0001]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [31]:
gridLogReg.best_score_

0.9013267551170078

In [32]:
gridLogReg.best_params_

{'C': 2000}

In [33]:
param_linSVC = {'C' : [1750, 2000, 1500, 1200, 1000, 333, 100, 33, 10, 3.33, 1, 0.33, 0.1, 0.033, 0.01, 0.0033, 0.001, 0.00033, 0.0001]}

In [34]:
gridLinSVC = GridSearchCV(linearSVM,param_logreg, cv = 7)
gridLinSVC.fit(x_data_feature_engineering, y)

GridSearchCV(cv=7, error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=123, tol=0.0001,
     verbose=0),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [1750, 2000, 1500, 1200, 1000, 333, 100, 33, 10, 3.33, 1, 0.33, 0.1, 0.033, 0.01, 0.0033, 0.001, 0.00033, 0.0001]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [35]:
gridLinSVC.best_score_

0.7924528301886793

In [36]:
gridLinSVC.best_params_

{'C': 0.01}

In [56]:
param_linSVC = {'C' : [ 12, 15, 20, 10, 8, 5]}

In [57]:
gridKernelSVM = GridSearchCV(kernelSVM,param_linSVC, cv = 7)

In [58]:
gridKernelSVM.fit(x_data_feature_engineering, y)

GridSearchCV(cv=7, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=123, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [12, 15, 20, 10, 8, 5]}, pre_dispatch='2*n_jobs',
       refit=True, scoring=None, verbose=0)

In [59]:
gridKernelSVM.best_score_

0.9751983465564371

In [60]:
gridKernelSVM.best_params_

{'C': 12}

In [61]:
param_dectree = {'max_depth' : [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]}

In [62]:
gridDecTree = GridSearchCV(decisiontree, param_dectree, cv = 7)

In [63]:
gridDecTree.fit(x_data_feature_engineering, y)

GridSearchCV(cv=7, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=123, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [64]:
gridDecTree.best_score_

0.9815987732515501

In [65]:
gridDecTree.best_params_

{'max_depth': 11}

In [66]:
?ExtraTreeClassifier

In [67]:
gridExTree = GridSearchCV(extratree, param_dectree, cv = 7)

In [68]:
gridExTree.fit(x_data_feature_engineering, y)

GridSearchCV(cv=7, error_score='raise',
       estimator=ExtraTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          random_state=123, splitter='random'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [69]:
gridDecTree.best_score_

0.9815987732515501

In [70]:
gridExTree.best_params_

{'max_depth': 12}

In [71]:
from sklearn.grid_search import RandomizedSearchCV

In [72]:
param_RF = {'max_depth' : [ 3, 4, 5, 6, 7, 8, 9, 10,
                           11, 12, 13, 14, 15, 16, 17,
                           18, 19, 20, 21, 22, 23, 24,
                           25, 26, 17, 28, 29, 30 ],
           'n_estimators': [100, 150, 200, 250, 300, 350, 400, 500, 600, 700,
                           800, 900, 1000, 1100, 1200, 1500, 2000],
           'min_samples_split': [1, 2, 3, 4, 5, 7, 9, 12, 15],
           'min_samples_leaf': [1, 2, 5, 10]}

In [73]:
randomRF = RandomizedSearchCV(randomforest, param_RF,n_iter = 50 , cv = 7)

In [74]:
randomRF.fit(x_data_feature_engineering, y)

ValueError: min_samples_split must be at least 2 or in (0, 1], got 1

In [None]:
randomRF.best_score_

In [None]:
randomRF.best_params_