## Import the Minimum Libraries

In [1]:
import pandas as pd
import numpy as np

## Read the Data

In [3]:
# Load the dataset; the relative path means the CSV must sit in the current working directory.
data = pd.read_csv("HR_comma_sep.csv")

IOError: File HR_comma_sep.csv does not exist

In [None]:
# Show the last rows of the frame as a quick check that the load worked.
data.tail()

## Check for NaN

In [None]:
# Count missing values per column.
data.isnull().sum()

## Extract the Data

In [None]:
def extractData(path="HR_comma_sep.csv"):
    """Load the dataset and split it into features and target.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to ``"HR_comma_sep.csv"``, so
        calling ``extractData()`` with no argument behaves as before.

    Returns
    -------
    x : pandas.DataFrame
        Every column of the file except ``left``.
    y : pandas.Series
        The ``left`` column, used as the prediction target.
    data : pandas.DataFrame
        The complete frame exactly as read from disk.

    Raises
    ------
    KeyError
        If the file has no ``left`` column.
    """
    data = pd.read_csv(path)
    y = data["left"]
    # ``drop`` returns a new frame, so ``data`` keeps its ``left`` column.
    x = data.drop(["left"], axis=1)
    return x, y, data

In [None]:
# Bind the feature frame, the target and the full frame used by the cells below
# (this re-reads the CSV and rebinds ``data``).
x, y, data = extractData()

## The Benchmark

In [None]:
# Relative frequency of each target class; the largest share is the accuracy
# a constant majority-class prediction would reach.
y.value_counts(normalize=True)

## Dummy data

In [None]:
def numericalData():
    """Return the notebook-level feature frame ``x`` without ``sales`` and ``salary``.

    ``x`` itself is not modified: ``drop`` hands back a new frame.
    """
    categorical_columns = ["sales", "salary"]
    return x.drop(categorical_columns, axis=1)

In [None]:
def salaryData(frame=None):
    """Encode the ``salary`` column as ordinal floats.

    ``"low"``, ``"medium"`` and ``"high"`` become 1.0, 2.0 and 3.0.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame holding a ``salary`` column. When omitted, the notebook-level
        ``data`` frame is used, so ``salaryData()`` keeps working unchanged.

    Returns
    -------
    pandas.DataFrame
        One float64 column named ``'salary numerical'`` that carries the
        index of the source frame, so a later column-wise ``pd.concat``
        lines up row for row.

    Notes
    -----
    Any salary level other than the three above is returned as NaN rather
    than being left behind as a string inside a numeric column.
    """
    source = data if frame is None else frame
    levels = {"low": 1.0, "medium": 2.0, "high": 3.0}
    # A single mapping yields a numeric dtype; chained np.where calls over
    # mixed strings and floats produce an object-dtype array instead.
    encoded = source["salary"].map(levels)
    return encoded.to_frame(name='salary numerical')

In [None]:
def salesSalaryData():
    """Return indicator (dummy) columns for ``salary`` and ``sales``.

    Reads the notebook-level ``data`` frame and passes just those two
    columns to ``pd.get_dummies``.
    """
    dummy_source = data[["salary", "sales"]]
    return pd.get_dummies(dummy_source)

In [None]:
# Build the three feature groups, then join them side by side into the model matrix.
numerical, salaryNumerical, categorial = numericalData(), salaryData(), salesSalaryData()
x_data = pd.concat(
    [numerical, salaryNumerical, categorial],
    axis=1,
)

# The Classifiers before Feature Engineering

In [None]:
# ``sklearn.cross_validation`` was deprecated in scikit-learn 0.18 and removed
# in 0.20; use its replacement and fall back only on older releases.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 25% of the rows; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y, test_size=0.25, random_state=123
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import ExtraTreeClassifier, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier

from mlxtend.classifier import StackingClassifier
# ``sklearn.cross_validation`` was removed in scikit-learn 0.20; prefer its
# replacement and fall back only on releases older than 0.18.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score


In [None]:
# Baseline estimators. Each one that takes a seed is pinned to 123.

# Nearest neighbours, library defaults
knn = KNeighborsClassifier()

# Linear models
logreg = LogisticRegression(random_state=123)
linearSVM = LinearSVC(random_state=123)

# SVC with its default kernel
kernelSVM = SVC(random_state=123)

# Single trees
extratree = ExtraTreeClassifier(random_state=123)
decisiontree = DecisionTreeClassifier(random_state=123)

# Ensembles, 100 estimators each
bagging = BaggingClassifier(n_estimators=100, random_state=123)
randomforest = RandomForestClassifier(n_estimators=100, random_state=123)
boosting = GradientBoostingClassifier(n_estimators=100, random_state=123)
adaboost = AdaBoostClassifier(n_estimators=100, random_state=123)



In [None]:
# Two more 100-tree forests with the same seed, differing only in ``max_features``.
randomforest2 = RandomForestClassifier(
    n_estimators=100, max_features="log2", random_state=123
)
randomforest3 = RandomForestClassifier(
    n_estimators=100, max_features=7, random_state=123
)


In [None]:
# Models scored in the first cross-validation pass, in reporting order.
classifiers = [
    knn,
    logreg,
    linearSVM,
    kernelSVM,
    decisiontree,
    extratree,
    bagging,
    randomforest,
    boosting,
    adaboost,
]

In [None]:
# 7-fold cross-validated accuracy (mean and standard deviation) for every baseline model.
for clf in classifiers:
    scores = cross_val_score(clf, x_data, y, cv=7, scoring='accuracy')
    # The estimator is formatted into the message itself. A trailing
    # ``, clf`` after ``print(...)`` is only printed by the Python 2 print
    # statement; under Python 3 it builds a tuple that is thrown away, so the
    # scores would appear without saying which model they belong to.
    print("Accuracy: %0.4f (+/- %0.4f) %s"
          % (scores.mean(), scores.std(), clf))

In [None]:
# Stacked ensemble: the three forests and the gradient-boosting model feed
# their class probabilities (not averaged) to a cross-validated logistic
# regression that acts as the meta-classifier.
logregCVStacking = LogisticRegressionCV(random_state=123)
sclf = StackingClassifier(
    classifiers=[randomforest, boosting, randomforest2, randomforest3],
    use_probas=True,
    average_probas=False,
    meta_classifier=logregCVStacking,
)

print('7-fold cross validation:\n')

# (report label, estimator) pairs in the order they are printed.
labelled_models = [
    ('KNN', knn),
    ('Logistic Regression', logreg),
    ('Linear SVM', linearSVM),
    ('Kernel SVM', kernelSVM),
    ('Decision Trees', decisiontree),
    ('Extra Trees', extratree),
    ('Bagging', bagging),
    ('Random Forest 1', randomforest),
    ('Random Forest 2', randomforest2),
    ('Random Forest 3', randomforest3),
    ('Stochastic Gradient Boosting', boosting),
    ('Adaptive Boosting', adaboost),
    ('Stacking    : Stacking All Model with Logistic Reg', sclf),
]

for label, clf in labelled_models:
    scores = cross_val_score(clf, x_data, y, cv=7, scoring='accuracy')
    summary = (scores.mean(), scores.std(), label)
    print("Accuracy: %0.4f (+/- %0.4f) [%s]" % summary)

# Feature Engineering

In [None]:
def featureEngineering():
    """Derive ratio and interaction features from the notebook-level ``x_data``.

    Returns a new DataFrame with eight columns, each built from
    ``satisfaction_level``, ``average_montly_hours``, ``number_project`` and
    ``salary numerical``:

    * ``satisfaction_hour``      - satisfaction x monthly hours
    * ``disatisfaction_hour``    - monthly hours minus ``satisfaction_hour``
    * ``satisfaction_project``   - satisfaction x number of projects
    * ``disatisfaction_project`` - number of projects minus ``satisfaction_project``
    * ``project_hour_average``   - monthly hours / number of projects
    * ``time_per_project``       - 1 / number of projects
    * ``salary_per_hour``        - salary level / monthly hours
    * ``salary_per_project``     - salary level / number of projects

    ``x_data`` is only read, never modified.
    """
    # NOTE(review): a zero in number_project or average_montly_hours would put
    # inf/NaN into the ratio columns - confirm the data rules that out.
    satisfaction = x_data["satisfaction_level"]
    hours = x_data["average_montly_hours"]
    projects = x_data["number_project"]
    salary = x_data["salary numerical"]

    satisfied_hours = satisfaction * hours
    satisfied_projects = satisfaction * projects

    feature = pd.DataFrame()
    feature["satisfaction_hour"] = satisfied_hours
    feature["disatisfaction_hour"] = hours - satisfied_hours
    feature["satisfaction_project"] = satisfied_projects
    feature["disatisfaction_project"] = projects - satisfied_projects
    feature["project_hour_average"] = hours / projects
    feature["time_per_project"] = 1 / projects
    feature["salary_per_hour"] = salary / hours
    feature["salary_per_project"] = salary / projects
    return feature
    

In [None]:
# Keep the result under its own name: assigning it back to
# ``featureEngineering`` would replace the function with a DataFrame and make
# this cell fail with "object is not callable" when it is run a second time.
engineered_features = featureEngineering()
# Original columns plus the derived ones, joined side by side.
x_data_feature_engineering = pd.concat([x_data, engineered_features], axis=1)

# The Classifiers after Feature Engineering

In [None]:
# Rebuild the stacked ensemble (three forests plus gradient boosting, class
# probabilities passed un-averaged to a cross-validated logistic regression)
# and score every model again, this time on the engineered feature matrix.
logregCVStacking = LogisticRegressionCV(random_state=123)
sclf = StackingClassifier(
    classifiers=[randomforest, boosting, randomforest2, randomforest3],
    use_probas=True,
    average_probas=False,
    meta_classifier=logregCVStacking,
)

print('7-fold cross validation:\n')

# (report label, estimator) pairs in the order they are printed.
labelled_models = [
    ('KNN', knn),
    ('Logistic Regression', logreg),
    ('Linear SVM', linearSVM),
    ('Kernel SVM', kernelSVM),
    ('Decision Trees', decisiontree),
    ('Extra Trees', extratree),
    ('Bagging', bagging),
    ('Random Forest 1', randomforest),
    ('Random Forest 2', randomforest2),
    ('Random Forest 3', randomforest3),
    ('Stochastic Gradient Boosting', boosting),
    ('Adaptive Boosting', adaboost),
    ('Stacking    : Stacking All Model with Logistic Reg', sclf),
]

for label, clf in labelled_models:
    scores = cross_val_score(clf, x_data_feature_engineering, y, cv=7, scoring='accuracy')
    summary = (scores.mean(), scores.std(), label)
    print("Accuracy: %0.4f (+/- %0.4f) [%s]" % summary)

## Grid Search Optimization

In [None]:
# Candidate neighbourhood sizes for KNN: every integer from 1 to 20.
param_knn = {'n_neighbors': list(range(1, 21))}

In [None]:
# ``sklearn.grid_search`` was deprecated in scikit-learn 0.18 and removed in
# 0.20; use its replacement and fall back only on older releases.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

In [None]:
# Search ``param_knn`` for the KNN model with 7-fold CV on the engineered features.
gridKNN = GridSearchCV(estimator=knn, param_grid=param_knn, cv=7)
gridKNN.fit(x_data_feature_engineering, y)

In [None]:
# Parameter setting selected by the KNN search.
gridKNN.best_params_

In [None]:
# Best cross-validated score found by the KNN search.
gridKNN.best_score_

In [None]:
# Candidate values of C for logistic regression, spanning 1e-4 to 2000.
param_logreg = {
    'C': [
        1750, 2000, 1500, 1200, 1000,
        333, 100, 33, 10, 3.33, 1,
        0.33, 0.1, 0.033, 0.01,
        0.0033, 0.001, 0.00033, 0.0001,
    ],
}

In [None]:
# Search ``param_logreg`` for the logistic regression with 7-fold CV on the engineered features.
gridLogReg = GridSearchCV(estimator=logreg, param_grid=param_logreg, cv=7)
gridLogReg.fit(x_data_feature_engineering, y)

In [None]:
# Best cross-validated score found by the logistic-regression search.
gridLogReg.best_score_

In [None]:
# Parameter setting selected by the logistic-regression search.
gridLogReg.best_params_

In [None]:
# Candidate values of C for the linear SVC, spanning 1e-4 to 2000.
param_linSVC = {
    'C': [
        1750, 2000, 1500, 1200, 1000,
        333, 100, 33, 10, 3.33, 1,
        0.33, 0.1, 0.033, 0.01,
        0.0033, 0.001, 0.00033, 0.0001,
    ],
}

In [None]:
# Search C for the linear SVC with 7-fold CV on the engineered features.
# Uses the grid defined for this model (``param_linSVC``) rather than
# borrowing ``param_logreg``, so later edits to the logistic-regression grid
# cannot silently change this search.
gridLinSVC = GridSearchCV(linearSVM, param_linSVC, cv=7)
gridLinSVC.fit(x_data_feature_engineering, y)

In [None]:
# Best cross-validated score found by the linear-SVC search.
gridLinSVC.best_score_

In [None]:
# Parameter setting selected by the linear-SVC search.
gridLinSVC.best_params_

In [None]:
# Candidate values of C for the kernel SVM search.
# NOTE(review): this rebinds ``param_linSVC``, which already holds the
# linear-SVC grid, so that grid is lost from here on. A dedicated name such as
# ``param_kernelSVM`` would be clearer; the GridSearchCV cell for ``kernelSVM``
# reads this name, so the two must be renamed together.
param_linSVC = {'C' : [ 12, 15, 20, 10, 8, 5]}

In [None]:
# 7-fold grid search over C for the kernel SVM.
gridKernelSVM = GridSearchCV(estimator=kernelSVM, param_grid=param_linSVC, cv=7)

In [None]:
# Run the kernel-SVM search on the engineered feature matrix.
gridKernelSVM.fit(x_data_feature_engineering, y)

In [None]:
# Best cross-validated score found by the kernel-SVM search.
gridKernelSVM.best_score_

In [None]:
# Parameter setting selected by the kernel-SVM search.
gridKernelSVM.best_params_

In [None]:
# Candidate tree depths: every integer from 1 to 12.
param_dectree = {'max_depth': list(range(1, 13))}

In [None]:
# 7-fold grid search over ``max_depth`` for the decision tree.
gridDecTree = GridSearchCV(estimator=decisiontree, param_grid=param_dectree, cv=7)

In [None]:
# Run the decision-tree search on the engineered feature matrix.
gridDecTree.fit(x_data_feature_engineering, y)

In [None]:
# Best cross-validated score found by the decision-tree search.
gridDecTree.best_score_

In [None]:
# Parameter setting selected by the decision-tree search.
gridDecTree.best_params_

In [None]:
?ExtraTreeClassifier

In [None]:
# 7-fold grid search for the extra-tree model, reusing the decision-tree depth grid.
gridExTree = GridSearchCV(estimator=extratree, param_grid=param_dectree, cv=7)

In [None]:
# Run the extra-tree search on the engineered feature matrix.
gridExTree.fit(x_data_feature_engineering, y)

In [None]:
# Best cross-validated score found by the extra-tree search. This reads from
# ``gridExTree``: the cell previously displayed ``gridDecTree.best_score_``,
# i.e. the decision-tree result again, right after fitting the extra-tree search.
gridExTree.best_score_

In [None]:
# Parameter setting selected by the extra-tree search.
gridExTree.best_params_

In [None]:
# ``sklearn.grid_search`` was deprecated in scikit-learn 0.18 and removed in
# 0.20; use its replacement and fall back only on older releases.
try:
    from sklearn.model_selection import RandomizedSearchCV
except ImportError:
    from sklearn.grid_search import RandomizedSearchCV

In [None]:
# Search space sampled by the randomized search over the random forest.
param_RF = {
    # Every depth from 3 to 30. The hand-typed list contained 17 twice and
    # skipped 27, which looks like a typo for 27.
    'max_depth': list(range(3, 31)),
    'n_estimators': [100, 150, 200, 250, 300, 350, 400, 500, 600, 700,
                     800, 900, 1000, 1100, 1200, 1500, 2000],
    # An integer ``min_samples_split`` must be at least 2; scikit-learn 0.18+
    # raises ValueError for 1, so that candidate is dropped.
    'min_samples_split': [2, 3, 4, 5, 7, 9, 12, 15],
    'min_samples_leaf': [1, 2, 5, 10],
}

In [None]:
# Randomized search: 50 parameter settings drawn from ``param_RF``, each scored
# with 7-fold CV. The sampler is seeded with the same value as every other
# component in this notebook; without it each run draws different candidates
# and the reported best score/parameters cannot be reproduced.
randomRF = RandomizedSearchCV(randomforest, param_RF, n_iter=50, cv=7,
                              random_state=123)

In [None]:
# Run the randomized random-forest search on the engineered feature matrix.
randomRF.fit(x_data_feature_engineering, y)

In [None]:
# Best cross-validated score found by the randomized random-forest search.
randomRF.best_score_

In [None]:
# Parameter setting selected by the randomized random-forest search.
randomRF.best_params_