In [1]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
sns.set_palette('muted')
sns.set_context("notebook", font_scale=2.5,
                rc={"lines.linewidth": 2.5})
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [2]:
data = pd.read_csv('Attrition_dummy.csv')
data.drop(columns = ['Unnamed: 0'], inplace=True)
data.head()

Unnamed: 0,Attrition,Age,DailyRate,DistanceFromHome,EnvironmentSatisfaction,Gender,HourlyRate,JobLevel,JobRole,JobSatisfaction,...,cat_JobInvolvement_2,cat_JobInvolvement_3,cat_JobInvolvement_4,cat_MaritalStatus_0,cat_MaritalStatus_1,cat_MaritalStatus_2,cat_StockOptionLevel_0,cat_StockOptionLevel_1,cat_StockOptionLevel_2,cat_StockOptionLevel_3
0,0,41,1102,1,2,0,94,2,0,4,...,0,1,0,1,0,0,1,0,0,0
1,1,49,279,8,3,1,61,2,1,2,...,1,0,0,0,1,0,0,1,0,0
2,0,37,1373,2,4,1,92,1,2,3,...,1,0,0,1,0,0,1,0,0,0
3,1,33,1392,3,4,0,56,1,1,3,...,0,1,0,0,1,0,1,0,0,0
4,1,27,591,2,1,1,40,1,2,2,...,0,1,0,0,1,0,0,1,0,0


In [3]:
data.shape

(1470, 52)

In [4]:
data["Attrition"].value_counts()

1    1233
0     237
Name: Attrition, dtype: int64

### Imbalance learn

In [5]:
from imblearn.datasets import make_imbalance
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [6]:
y = data[['Attrition']]
X = data.drop(columns = ['Attrition'])

In [7]:
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(X, y)
print(sorted(Counter(y_resampled).items()))

[('Attrition', 1)]


In [8]:
print(y_resampled.shape)
print(X_resampled.shape)

(474, 1)
(474, 51)


In [9]:
y_resampled['Attrition'].value_counts()

1    237
0    237
Name: Attrition, dtype: int64

### Nuevo dataframe balanceado

In [10]:
df = pd.concat([y_resampled, X_resampled], axis=1)
df.head()

Unnamed: 0,Attrition,Age,DailyRate,DistanceFromHome,EnvironmentSatisfaction,Gender,HourlyRate,JobLevel,JobRole,JobSatisfaction,...,cat_JobInvolvement_2,cat_JobInvolvement_3,cat_JobInvolvement_4,cat_MaritalStatus_0,cat_MaritalStatus_1,cat_MaritalStatus_2,cat_StockOptionLevel_0,cat_StockOptionLevel_1,cat_StockOptionLevel_2,cat_StockOptionLevel_3
0,0,41,1102,1,2,0,94,2,0,4,...,0,1,0,1,0,0,1,0,0,0
1,0,37,1373,2,4,1,92,1,2,3,...,1,0,0,1,0,0,1,0,0,0
2,0,28,103,24,3,1,50,1,2,3,...,1,0,0,1,0,0,1,0,0,0
3,0,36,1218,9,3,1,82,1,6,1,...,1,0,0,1,0,0,1,0,0,0
4,0,34,699,6,2,1,83,1,1,1,...,0,1,0,1,0,0,1,0,0,0


In [11]:
df.shape

(474, 52)

### Separación de los datos

In [12]:
test_size = 0.4
df_train, df_test = train_test_split(df, test_size=test_size, random_state=42)

In [13]:
df_validate, df_test = train_test_split(df_test, test_size=0.5, random_state=42)

In [14]:
print(len(df_train), len(df_test), len(df_validate))

284 95 95


### Estandarización datos

In [15]:
X_train = df_train.drop("Attrition", axis = 1)
y_train = df_train["Attrition"]
X_test = df_test.drop("Attrition", axis = 1)
y_test = df_test["Attrition"]
X_validate = df_validate.drop("Attrition", axis = 1)
y_validate = df_validate["Attrition"]

In [16]:
col_names = X_train.columns.values.tolist()

In [17]:
num_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('imputer', SimpleImputer(strategy='median'))
])

col_transformer = ColumnTransformer(transformers=[
    ('num', num_pipeline, col_names)
])

In [18]:
col_transformer.fit(X_train)

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('num',
                                 Pipeline(memory=None,
                                          steps=[('scaler',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True)),
                                                 ('imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='median',
                      

In [19]:
X_trained_scaled = pd.DataFrame(col_transformer.transform(X_train), index=X_train.index, columns = col_names)
X_test_scaled = pd.DataFrame(col_transformer.transform(X_test), index=X_test.index, columns = col_names)
X_validate_scaled = pd.DataFrame(col_transformer.transform(X_validate), index=X_validate.index, columns = col_names)

In [20]:
X_trained_scaled.head()

Unnamed: 0,Age,DailyRate,DistanceFromHome,EnvironmentSatisfaction,Gender,HourlyRate,JobLevel,JobRole,JobSatisfaction,MonthlyIncome,...,cat_JobInvolvement_2,cat_JobInvolvement_3,cat_JobInvolvement_4,cat_MaritalStatus_0,cat_MaritalStatus_1,cat_MaritalStatus_2,cat_StockOptionLevel_0,cat_StockOptionLevel_1,cat_StockOptionLevel_2,cat_StockOptionLevel_3
222,-1.448976,-0.955404,-0.302058,1.197559,0.725866,0.480217,-0.797226,-0.666413,-0.542848,-0.732953,...,-0.571929,0.824908,-0.310685,1.239239,-0.843115,-0.484544,0.972217,-0.742908,-0.260133,-0.267765
225,-0.730093,-0.07947,1.726226,0.349413,0.725866,-1.001119,-0.797226,1.445878,-1.433998,-1.01548,...,-0.571929,-1.212256,3.218695,1.239239,-0.843115,-0.484544,0.972217,-0.742908,-0.260133,-0.267765
234,2.042742,-1.155265,-0.302058,1.197559,0.725866,0.332084,-0.797226,-0.243955,0.348301,-0.760162,...,-0.571929,0.824908,-0.310685,-0.806947,1.186077,-0.484544,-1.028577,1.346061,-0.260133,-0.267765
364,-0.216605,-1.387203,-1.017923,-0.498734,0.725866,0.480217,0.066943,-1.088871,1.239451,0.75088,...,-0.571929,-1.212256,3.218695,-0.806947,-0.843115,2.063797,-1.028577,1.346061,-0.260133,-0.267765
290,2.248137,-1.056568,-0.898612,-0.498734,0.725866,-0.655474,1.79528,1.02342,-0.542848,2.094164,...,-0.571929,0.824908,-0.310685,-0.806947,-0.843115,2.063797,-1.028577,1.346061,-0.260133,-0.267765


In [21]:
X_test_scaled.head()

Unnamed: 0,Age,DailyRate,DistanceFromHome,EnvironmentSatisfaction,Gender,HourlyRate,JobLevel,JobRole,JobSatisfaction,MonthlyIncome,...,cat_JobInvolvement_2,cat_JobInvolvement_3,cat_JobInvolvement_4,cat_MaritalStatus_0,cat_MaritalStatus_1,cat_MaritalStatus_2,cat_StockOptionLevel_0,cat_StockOptionLevel_1,cat_StockOptionLevel_2,cat_StockOptionLevel_3
203,-0.627395,-0.439713,-0.182747,0.349413,-1.377664,0.035816,-0.797226,2.290794,1.239451,-0.792691,...,1.748469,-1.212256,-0.310685,-0.806947,-0.843115,2.063797,-1.028577,1.346061,-0.260133,-0.267765
390,-0.01121,0.976586,-0.779302,-1.346881,-1.377664,1.665286,0.066943,-1.088871,-0.542848,0.659432,...,-0.571929,0.824908,-0.310685,1.239239,-0.843115,-0.484544,0.972217,-0.742908,-0.260133,-0.267765
153,0.810371,-0.387898,0.652429,-1.346881,-1.377664,0.381461,0.931111,0.600962,1.239451,0.393475,...,-0.571929,0.824908,-0.310685,-0.806947,1.186077,-0.484544,-1.028577,1.346061,-0.260133,-0.267765
176,-0.216605,0.811269,-0.063436,-1.346881,-1.377664,-0.655474,-0.797226,2.290794,0.348301,-0.677716,...,-0.571929,0.824908,-0.310685,-0.806947,1.186077,-0.484544,0.972217,-0.742908,-0.260133,-0.267765
407,-0.113908,0.058705,0.055874,-0.498734,0.725866,-1.001119,-0.797226,-0.243955,1.239451,-0.437332,...,-0.571929,0.824908,-0.310685,-0.806947,-0.843115,2.063797,-1.028577,1.346061,-0.260133,-0.267765


In [22]:
X_validate_scaled.head()

Unnamed: 0,Age,DailyRate,DistanceFromHome,EnvironmentSatisfaction,Gender,HourlyRate,JobLevel,JobRole,JobSatisfaction,MonthlyIncome,...,cat_JobInvolvement_2,cat_JobInvolvement_3,cat_JobInvolvement_4,cat_MaritalStatus_0,cat_MaritalStatus_1,cat_MaritalStatus_2,cat_StockOptionLevel_0,cat_StockOptionLevel_1,cat_StockOptionLevel_2,cat_StockOptionLevel_3
335,0.091488,-0.123884,-1.017923,-1.346881,-1.377664,0.727107,0.931111,1.868336,1.239451,1.544248,...,-0.571929,0.824908,-0.310685,-0.806947,-0.843115,2.063797,-1.028577,-0.742908,3.844188,-0.267765
154,-0.730093,0.068575,-0.302058,-0.498734,-1.377664,-1.297386,-0.797226,-0.243955,0.348301,-0.55558,...,-0.571929,0.824908,-0.310685,-0.806947,-0.843115,2.063797,-1.028577,-0.742908,3.844188,-0.267765
460,-0.216605,0.717507,-0.898612,1.197559,0.725866,-1.001119,0.066943,0.178503,0.348301,0.714464,...,1.748469,-1.212256,-0.310685,-0.806947,1.186077,-0.484544,0.972217,-0.742908,-0.260133,-0.267765
84,0.604976,0.381938,1.129672,0.349413,0.725866,-0.408585,-0.797226,-0.666413,0.348301,-0.674238,...,-0.571929,-1.212256,3.218695,-0.806947,-0.843115,2.063797,0.972217,-0.742908,-0.260133,-0.267765
465,-0.319303,-0.952937,-0.898612,1.197559,0.725866,1.369019,0.066943,-1.088871,1.239451,-0.112456,...,-0.571929,0.824908,-0.310685,-0.806947,-0.843115,2.063797,-1.028577,-0.742908,3.844188,-0.267765


### Feature selection

In [23]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.feature_selection import SelectFromModel

In [24]:
sel = RFECV(estimator=LogisticRegression(solver='liblinear'), cv=5, scoring='accuracy')
sel.fit(X_trained_scaled, y_train)

RFECV(cv=5,
      estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                   fit_intercept=True, intercept_scaling=1,
                                   l1_ratio=None, max_iter=100,
                                   multi_class='auto', n_jobs=None,
                                   penalty='l2', random_state=None,
                                   solver='liblinear', tol=0.0001, verbose=0,
                                   warm_start=False),
      min_features_to_select=1, n_jobs=None, scoring='accuracy', step=1,
      verbose=0)

In [25]:
# let's add the variable names and order it for clearer visualisation
selected_feat = X_trained_scaled.columns[(sel.get_support())]
len(selected_feat)

21

In [26]:
# let's display the list of features
selected_feat

Index(['Age', 'DistanceFromHome', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'NumCompaniesWorked',
       'OverTime', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager',
       'cat_BusinessTravel_1', 'cat_BusinessTravel_2', 'cat_EducationField_3',
       'cat_EducationField_4', 'cat_JobInvolvement_1',
       'cat_StockOptionLevel_0', 'cat_StockOptionLevel_1'],
      dtype='object')

### GridSearch - Modelos clasificación

In [35]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib
import datetime as dt
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import neighbors
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_curve, auc,roc_auc_score, classification_report, f1_score



In [36]:
models = {
            'linear' : {
                    'mod' : LogisticRegression(solver = 'saga'),
                    'par' : {'penalty' : ('l1','l2','elasticnet','none'),
                                'C': [1,1.5,2],
                            'l1_ratio':[0.2,0.5,0.8]}
                    },                     
            'gradient' : {
                    'mod' : GradientBoostingClassifier(),
                    'par' : {'loss' : ('deviance', 'exponential'),
                             'max_depth' : [3, 4, 5, 6, 7],
                            'n_estimators':[50, 100, 200]}
                        },
           'tree':{'mod': DecisionTreeClassifier(),
                     'par':{'splitter':('best','random'),
                            'max_depth': [None, 2,4,6],
                            'min_samples_leaf':[1,5,8]}},
            'svm' : {
                    'mod' : svm.SVC(probability=True),
                    'par' : {'kernel' : ( 'linear', 'rbf')}
                    },
        'RandomForest' : {
                    'mod' : RandomForestClassifier(),
                    'par' : {'max_depth' :[None, 2,4,6],
                            'min_samples_leaf':[1,5,8],
                            'n_estimators':[50, 100, 200]}
                        },
        'Knn' : {
                    'mod' : neighbors.KNeighborsClassifier(),
                    'par' : {'n_neighbors' :[5, 10, 15],
                            'leaf_size':[15,25,30]}
                        },
        'GCP' : {
                    'mod' : GaussianProcessClassifier(),
                    'par' : {'optimizer' :['fmin_l_bfgs_b'],
                            }
                        },   
        'MLP' : {
                    'mod' : MLPClassifier(max_iter=100, n_iter_no_change = 50, tol=1e-2),
                    'par' : {'hidden_layer_sizes': [(1),(10),(50),(100),(50,50),(10,10),(5,5,5)],
                        'solver': ['sgd','adam'],
                        'learning_rate_init': [0.2, 0.5, 0.9]}
        }
    
                }



In [37]:
def grid(x_name,n_proc, os_X_tt, os_Y_tt, X_test, y_test,  models, score = 'roc_auc', cv = 5):
    
    # Gridsearch
    
    result = dict()
    bestmodels = models.copy()
    for name in models:
        print('*'*80)
        print("Model: " + name)
        t_beg = time.time()

        pipeline = Pipeline([(name,  bestmodels[name]['mod'])])          
        parameters = {}          
        for par in bestmodels[name]['par']:
            aux = name + '__' +  par
            parameters[aux] = bestmodels[name]['par'][par]    
        aux = GridSearchCV(pipeline, parameters, n_jobs = n_proc,\
                          scoring = score, verbose=2, cv = cv)
        
        print(os_X_tt.columns.values)
        
        aux.fit(os_X_tt, os_Y_tt)
        y_true, y_pred = y_test , aux.predict(X_test)
        
        precision = precision_score(y_test, y_pred, average="macro")
        recall = recall_score(y_test, y_pred, average="macro")  
        auc = roc_auc_score(y_test, y_pred, average="macro")
        f1score = f1_score(y_test, y_pred, average="macro")
        
        
        bestmodels[name]['bestModel'] = aux.best_estimator_
        bestmodels[name][score] = aux.best_score_
        bestmodels[name]['cols_order'] = os_X_tt.columns.values
        selection_time = time.time() - t_beg

        bestmodels[name]['selection_time'] = selection_time

        sample_f_path = f'modelos/{x_name}' + f'{name}_{dt.datetime.now().strftime("%Y%m%d-%H%M")}.sav'

        print(f"Saving model at {sample_f_path}")    
        joblib.dump(bestmodels[name]['bestModel'], sample_f_path)

        print(f"El tiempo de seleccion fue: {selection_time:0.3f} s")
        print(f"El error {score} de la familia {name} es: {bestmodels[name][score]:0.3f}")
        print('*'*80)
    
        result[name] = {"auc": round(auc,3), "precision": round(precision,3), "recall": round(recall,3),"f1score": round(f1score,3)}
        
        
    mod_name = None
    best_mae = -np.inf
    for name in models:
        if bestmodels[name][score] > best_mae:
            mod_name = name
            best_mae = bestmodels[name][score]

    print(f"best model: " + mod_name + f" with an error {score} of: " + str(best_mae))
    
    return bestmodels, result

In [38]:
#Bestmodels

def get_max(dictionary, key_val):
    auc_list = []
    auc_dict = {}

    for key in dictionary:
        for key2 in dictionary[key]:
            if key_val in key2:
                auc_list.append(dictionary[key][key_val])

    max_key = ''
    max_val = max(auc_list)

    for key in dictionary:
        for key2 in dictionary[key]:
            if max_val == dictionary[key][key_val]:
                max_key = key
                
    return max_key, max_val

In [39]:
# variables seleccionadas
X1_train= X_trained_scaled[selected_feat]
X1_test = X_test_scaled[selected_feat]
X1_validate = X_validate_scaled[selected_feat]


In [40]:
Bestmodels_X1, result_X1  = grid('X1', -1, X1_train, y_train.values, X1_test, y_test, models, score = 'roc_auc', cv = 5)

********************************************************************************
Model: linear
['Age' 'DistanceFromHome' 'EnvironmentSatisfaction' 'HourlyRate'
 'JobLevel' 'JobSatisfaction' 'MonthlyIncome' 'NumCompaniesWorked'
 'OverTime' 'WorkLifeBalance' 'YearsAtCompany' 'YearsInCurrentRole'
 'YearsSinceLastPromotion' 'YearsWithCurrManager' 'cat_BusinessTravel_1'
 'cat_BusinessTravel_2' 'cat_EducationField_3' 'cat_EducationField_4'
 'cat_JobInvolvement_1' 'cat_StockOptionLevel_0' 'cat_StockOptionLevel_1']
Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 149 out of 180 | elapsed:    2.1s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    2.1s finished
  "(penalty={})".format(self.penalty))
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


Saving model at modelos/X1linear_20200515-2258.sav
El tiempo de seleccion fue: 2.168 s
El error roc_auc de la familia linear es: 0.845
********************************************************************************
********************************************************************************
Model: gradient
['Age' 'DistanceFromHome' 'EnvironmentSatisfaction' 'HourlyRate'
 'JobLevel' 'JobSatisfaction' 'MonthlyIncome' 'NumCompaniesWorked'
 'OverTime' 'WorkLifeBalance' 'YearsAtCompany' 'YearsInCurrentRole'
 'YearsSinceLastPromotion' 'YearsWithCurrManager' 'cat_BusinessTravel_1'
 'cat_BusinessTravel_2' 'cat_EducationField_3' 'cat_EducationField_4'
 'cat_JobInvolvement_1' 'cat_StockOptionLevel_0' 'cat_StockOptionLevel_1']
Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 119 out of 150 | elapsed:    2.4s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    2.9s finished


Saving model at modelos/X1gradient_20200515-2258.sav
El tiempo de seleccion fue: 3.121 s
El error roc_auc de la familia gradient es: 0.813
********************************************************************************
********************************************************************************
Model: tree
['Age' 'DistanceFromHome' 'EnvironmentSatisfaction' 'HourlyRate'
 'JobLevel' 'JobSatisfaction' 'MonthlyIncome' 'NumCompaniesWorked'
 'OverTime' 'WorkLifeBalance' 'YearsAtCompany' 'YearsInCurrentRole'
 'YearsSinceLastPromotion' 'YearsWithCurrManager' 'cat_BusinessTravel_1'
 'cat_BusinessTravel_2' 'cat_EducationField_3' 'cat_EducationField_4'
 'cat_JobInvolvement_1' 'cat_StockOptionLevel_0' 'cat_StockOptionLevel_1']
Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    0.2s finished


Saving model at modelos/X1tree_20200515-2258.sav
El tiempo de seleccion fue: 0.201 s
El error roc_auc de la familia tree es: 0.768
********************************************************************************
********************************************************************************
Model: svm
['Age' 'DistanceFromHome' 'EnvironmentSatisfaction' 'HourlyRate'
 'JobLevel' 'JobSatisfaction' 'MonthlyIncome' 'NumCompaniesWorked'
 'OverTime' 'WorkLifeBalance' 'YearsAtCompany' 'YearsInCurrentRole'
 'YearsSinceLastPromotion' 'YearsWithCurrManager' 'cat_BusinessTravel_1'
 'cat_BusinessTravel_2' 'cat_EducationField_3' 'cat_EducationField_4'
 'cat_JobInvolvement_1' 'cat_StockOptionLevel_0' 'cat_StockOptionLevel_1']
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Saving model at modelos/X1svm_20200515-2258.sav
El tiempo de seleccion fue: 0.068 s
El error roc_auc de la familia svm es: 0.838
********************************************************************************
********

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    3.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  37 out of  45 | elapsed:    0.1s remaining:    0.0s


Saving model at modelos/X1RandomForest_20200515-2258.sav
El tiempo de seleccion fue: 3.658 s
El error roc_auc de la familia RandomForest es: 0.826
********************************************************************************
********************************************************************************
Model: Knn
['Age' 'DistanceFromHome' 'EnvironmentSatisfaction' 'HourlyRate'
 'JobLevel' 'JobSatisfaction' 'MonthlyIncome' 'NumCompaniesWorked'
 'OverTime' 'WorkLifeBalance' 'YearsAtCompany' 'YearsInCurrentRole'
 'YearsSinceLastPromotion' 'YearsWithCurrManager' 'cat_BusinessTravel_1'
 'cat_BusinessTravel_2' 'cat_EducationField_3' 'cat_EducationField_4'
 'cat_JobInvolvement_1' 'cat_StockOptionLevel_0' 'cat_StockOptionLevel_1']
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Saving model at modelos/X1Knn_20200515-2258.sav
El tiempo de seleccion fue: 0.110 s
El error roc_auc de la familia Knn es: 0.811
*************************************************************************

[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


El tiempo de seleccion fue: 0.067 s
El error roc_auc de la familia GCP es: 0.727
********************************************************************************
********************************************************************************
Model: MLP
['Age' 'DistanceFromHome' 'EnvironmentSatisfaction' 'HourlyRate'
 'JobLevel' 'JobSatisfaction' 'MonthlyIncome' 'NumCompaniesWorked'
 'OverTime' 'WorkLifeBalance' 'YearsAtCompany' 'YearsInCurrentRole'
 'YearsSinceLastPromotion' 'YearsWithCurrManager' 'cat_BusinessTravel_1'
 'cat_BusinessTravel_2' 'cat_EducationField_3' 'cat_EducationField_4'
 'cat_JobInvolvement_1' 'cat_StockOptionLevel_0' 'cat_StockOptionLevel_1']
Fitting 5 folds for each of 42 candidates, totalling 210 fits


[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 179 out of 210 | elapsed:    2.2s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done 210 out of 210 | elapsed:    2.4s finished


Saving model at modelos/X1MLP_20200515-2258.sav
El tiempo de seleccion fue: 2.448 s
El error roc_auc de la familia MLP es: 0.816
********************************************************************************
best model: linear with an error roc_auc of: 0.8448914431673054


In [41]:
pd.DataFrame.from_dict(result_X1)

Unnamed: 0,linear,gradient,tree,svm,RandomForest,Knn,GCP,MLP
auc,0.736,0.679,0.717,0.756,0.741,0.72,0.694,0.744
precision,0.736,0.68,0.715,0.757,0.741,0.72,0.693,0.746
recall,0.736,0.679,0.717,0.756,0.741,0.72,0.694,0.744
f1score,0.736,0.674,0.715,0.756,0.737,0.716,0.694,0.745


In [42]:
%matplotlib notebook
sns.set_context("notebook", font_scale=1.0,
                rc={"lines.linewidth": 2.5})
fig, ax = plt.subplots()
for m in Bestmodels_X1:
    model =Bestmodels_X1[m]['bestModel'] # select the model
#     y_pred=model.predict(X_test) # predict the test data
# Compute False postive rate, and True positive rate
    fpr, tpr, thresholds = roc_curve(y_validate, model.predict_proba(X1_validate)[:,1])
# Calculate Area under the curve to display on the plot
    auc = roc_auc_score(y_validate,model.predict(X1_validate))
# Now, plot the computed values
    plt.plot(fpr, tpr, label='%s ROC (area = %0.3f)' % (m, auc))
# Custom settings for the plot 


plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('1-Specificity(False Positive Rate)')
plt.ylabel('Sensitivity(True Positive Rate)')
plt.title('Receiver Operating Characteristic X1')
plt.legend(loc="lower right", fontsize ='xx-small')
plt.show()   # Display
plt.savefig('Bestmodels_X1.png', facecolor=fig.get_facecolor(), bbox_inches='tight', xlabelsize=1)

<IPython.core.display.Javascript object>

### Análisis random forest

In [43]:
Bestmodels_X1['RandomForest']

{'mod': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=None, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=100,
                        n_jobs=None, oob_score=False, random_state=None,
                        verbose=0, warm_start=False),
 'par': {'max_depth': [None, 2, 4, 6],
  'min_samples_leaf': [1, 5, 8],
  'n_estimators': [50, 100, 200]},
 'bestModel': Pipeline(memory=None,
          steps=[('RandomForest',
                  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                         class_weight=None, criterion='gini',
                                         max_depth=None, max_features='auto',
                                

In [45]:
rf = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                         class_weight=None, criterion='gini',
                                         max_depth=None, max_features='auto',
                                         max_leaf_nodes=None, max_samples=None,
                                         min_impurity_decrease=0.0,
                                         min_impurity_split=None,
                                         min_samples_leaf=5, min_samples_split=2,
                                         min_weight_fraction_leaf=0.0,
                                         n_estimators=50, n_jobs=None,
                                         oob_score=False, random_state=None,
                                         verbose=0, warm_start=False)
rf.fit(X1_train, y_train)
pred = rf.predict_proba(X1_train)

In [46]:
print("RF train accuracy: %0.3f" % rf.score(X1_train, y_train))
print("RF test accuracy: %0.3f" % rf.score(X1_test, y_test))
print("RF validate| accuracy: %0.3f" % rf.score(X1_validate, y_validate))

RF train accuracy: 0.923
RF test accuracy: 0.705
RF validate| accuracy: 0.747


In [47]:
### Random forest importance

tree_feature_importances = (
    rf.feature_importances_)
sorted_idx = tree_feature_importances.argsort()

y_ticks = np.arange(0, len(selected_feat))
fig, ax = plt.subplots()
ax.barh(y_ticks, tree_feature_importances[sorted_idx])
ax.set_yticklabels(selected_feat[sorted_idx])
ax.set_yticks(y_ticks)
ax.set_title("Random Forest Feature Importances (MDI)")
fig.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

In [48]:
from sklearn.inspection import permutation_importance
result = permutation_importance(rf, X1_test, y_test, n_repeats=10,
                                random_state=42, n_jobs=2)
sorted_idx = result.importances_mean.argsort()

fig, ax = plt.subplots()
ax.boxplot(result.importances[sorted_idx].T,
           vert=False, labels=X_test.columns[sorted_idx])
ax.set_title("Permutation Importances (test set)")
fig.tight_layout()
plt.show()

<IPython.core.display.Javascript object>