In [62]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier

In [2]:
file1=r'hr_train.csv'
ci=pd.read_csv(file1)
ci.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.42,0.46,2,150,3,0,1,0,sales,medium
1,0.66,0.77,2,171,2,0,0,0,technical,medium
2,0.55,0.49,5,240,3,0,0,0,technical,high
3,0.22,0.88,4,213,3,1,0,0,technical,medium
4,0.2,0.72,6,224,4,0,1,0,technical,medium


In [3]:
ci.left.value_counts()

0    7424
1    3075
Name: left, dtype: int64

In [4]:
7424/3075

2.414308943089431

In [5]:
cat_cols=ci.select_dtypes(['object']).columns
cat_cols=['sales', 'salary']
cat_cols

['sales', 'salary']

In [6]:
for col in cat_cols:
    freqs=ci[col].value_counts()
    selected_cats=freqs.index[freqs>500][:-1]
    
    print (col)
    for cat in selected_cats:
        name=col+'_'+cat
        
        ci[name]=(ci[col]==cat).astype(int)
    del ci[col]

sales
salary


In [7]:
print (ci.shape)

(10499, 18)


In [8]:
x_train=ci.drop('left',axis=1)
y_train=ci['left']
x_train.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'promotion_last_5years', 'sales_sales', 'sales_technical',
       'sales_support', 'sales_IT', 'sales_product_mng', 'sales_marketing',
       'sales_hr', 'sales_RandD', 'salary_low', 'salary_medium'],
      dtype='object')

In [9]:
del ci

In [10]:
def report(results, n_top=3):
    for i in range(1,n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: (0)".format(i))
            print("Mean validation score: {0:.3f}  (std: {1:.5f})".format(results['mean_test_score'][candidate], results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [11]:
xgb_params = {
               "learning_rate":[0.01,0.05,0.1,0.3,0.5],
               "gamma":[i/10.0 for i in range (0,5)], "max_depth": [2,3,4,5,6,7,8], "max_child_weight":[1,2,5,10], "max_delta_step":[0,1,2,5,10], "subsample":[i/10.0 for i in range(5,10)],
               "colsample_bytree":[i/10.0 for i in range(5,10)], "colsample_bylevel":[i/10.0 for i in range(5,10)], "reg_lambda":[1e-5, 1e-2, 0.1, 1, 100], "reg_alpha":[1e-5, 1e-2, 0.1, 1, 100],
               "scale_pos_weight":[1,2,3,4,5,6,7,8,9], "n_estimators":[100,500,700,1000]
}

In [12]:
xgb=XGBClassifier(objective='binary:logistic')

In [13]:
n_iter=10

random_search=RandomizedSearchCV(xgb,n_jobs=-1,cv=5,n_iter=n_iter,scoring='roc_auc',param_distributions=xgb_params)

In [14]:
random_search.fit(x_train,y_train)

Parameters: { "max_child_weight" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [15]:
report(random_search.cv_results_,5)

Model with rank: (0)
Mean validation score: 0.838  (std: 0.01924)
Parameters: {'subsample': 0.5, 'scale_pos_weight': 6, 'reg_lambda': 1e-05, 'reg_alpha': 0.01, 'n_estimators': 100, 'max_depth': 2, 'max_delta_step': 5, 'max_child_weight': 10, 'learning_rate': 0.1, 'gamma': 0.0, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.5}

Model with rank: (0)
Mean validation score: 0.837  (std: 0.01944)
Parameters: {'subsample': 0.9, 'scale_pos_weight': 3, 'reg_lambda': 1, 'reg_alpha': 100, 'n_estimators': 1000, 'max_depth': 8, 'max_delta_step': 1, 'max_child_weight': 5, 'learning_rate': 0.05, 'gamma': 0.3, 'colsample_bytree': 0.9, 'colsample_bylevel': 0.6}

Model with rank: (0)
Mean validation score: 0.837  (std: 0.01735)
Parameters: {'subsample': 0.6, 'scale_pos_weight': 3, 'reg_lambda': 100, 'reg_alpha': 1, 'n_estimators': 1000, 'max_depth': 8, 'max_delta_step': 0, 'max_child_weight': 1, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.8}

Model with rank: (

In [16]:
random_search.best_estimator_

In [17]:
xgb_params = {"n_estimators":[100,500,700,900,1000,1200,1500]}
                
             


In [18]:
xgb1=XGBClassifier(learning_rate=0.1,subsample=0.8,colsample_bylevel=0.8,colsample_bytree=0.8)

In [19]:
from sklearn.model_selection import GridSearchCV

In [20]:
grid_search=GridSearchCV(xgb1,cv=5,param_grid=xgb_params,scoring='roc_auc',verbose=2,n_jobs=-1)

In [21]:
grid_search.fit(x_train,y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


In [22]:
report(grid_search.cv_results_,3)

Model with rank: (0)
Mean validation score: 0.840  (std: 0.01601)
Parameters: {'n_estimators': 100}

Model with rank: (0)
Mean validation score: 0.837  (std: 0.01735)
Parameters: {'n_estimators': 500}

Model with rank: (0)
Mean validation score: 0.835  (std: 0.01730)
Parameters: {'n_estimators': 700}



In [23]:
xgb_params={"gamma":[5,8,10,12,15],
            "max_depth":[6,7,8,9,10,11,12]}

In [24]:
xgb2=XGBClassifier(learning_rate=0.1,n_estimator=100,subsample=0.8,min_child_weight=2, colsample_bylevel=0.8,colsample_tree=0.8)

In [25]:
random_search=RandomizedSearchCV(xgb2,param_distributions=xgb_params,n_iter=20,cv=5,scoring='roc_auc',n_jobs=-1,verbose=2)

In [26]:
random_search.fit(x_train,y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Parameters: { "colsample_tree", "n_estimator" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [27]:
report(random_search.cv_results_,3)

Model with rank: (0)
Mean validation score: 0.841  (std: 0.01875)
Parameters: {'max_depth': 8, 'gamma': 5}

Model with rank: (0)
Mean validation score: 0.840  (std: 0.01936)
Parameters: {'max_depth': 9, 'gamma': 8}

Model with rank: (0)
Mean validation score: 0.840  (std: 0.01881)
Parameters: {'max_depth': 8, 'gamma': 8}



In [28]:
random_search.best_estimator_

In [29]:
y_train.value_counts()

0    7424
1    3075
Name: left, dtype: int64

In [30]:
xgb_params={'max_delta_step':[0,1,3,6,10],'scale_pos_weight':[1,2,3,4]}

In [31]:
xgb3=XGBClassifier(learning_rate=0.1,n_estimator=100,subsample=0.8,min_child_weight=2, colsample_bylevel=0.8,colsample_tree=0.8,gamma=12,max_depth=8)

In [32]:
grid_search=GridSearchCV(xgb3,param_grid=xgb_params,cv=5,scoring='roc_auc',n_jobs=-1,verbose=10)

In [33]:
grid_search.fit(x_train,y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Parameters: { "colsample_tree", "n_estimator" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [34]:
report(grid_search.cv_results_,3)

Model with rank: (0)
Mean validation score: 0.841  (std: 0.02170)
Parameters: {'max_delta_step': 1, 'scale_pos_weight': 1}

Model with rank: (0)
Mean validation score: 0.840  (std: 0.01890)
Parameters: {'max_delta_step': 1, 'scale_pos_weight': 4}

Model with rank: (0)
Mean validation score: 0.839  (std: 0.02005)
Parameters: {'max_delta_step': 0, 'scale_pos_weight': 2}

Model with rank: (0)
Mean validation score: 0.839  (std: 0.02005)
Parameters: {'max_delta_step': 3, 'scale_pos_weight': 2}

Model with rank: (0)
Mean validation score: 0.839  (std: 0.02005)
Parameters: {'max_delta_step': 6, 'scale_pos_weight': 2}

Model with rank: (0)
Mean validation score: 0.839  (std: 0.02005)
Parameters: {'max_delta_step': 10, 'scale_pos_weight': 2}



In [35]:
xgb_params={
    'subsample':[i/10 for i in range(5,11)],
    'colsample_bytree':[i/10 for i in range(5,11)],
    'colsample_bylevel':[i/10 for i in range(5,11)]
}

In [36]:
xgb4=XGBClassifier(learning_rate=0.1,n_estimators=100,min_child_weight=2,gamma=12,max_depth=8,scale_pos_weight=1,max_delta_step=0)

In [37]:
random_search=RandomizedSearchCV(xgb4,param_distributions=xgb_params,n_iter=20,cv=5,scoring='roc_auc',n_jobs=-1,verbose=20)

In [38]:
random_search.fit(x_train,y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [39]:
report(random_search.cv_results_,3)

Model with rank: (0)
Mean validation score: 0.843  (std: 0.02160)
Parameters: {'subsample': 0.9, 'colsample_bytree': 1.0, 'colsample_bylevel': 0.5}

Model with rank: (0)
Mean validation score: 0.841  (std: 0.01995)
Parameters: {'subsample': 0.9, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.7}

Model with rank: (0)
Mean validation score: 0.840  (std: 0.01947)
Parameters: {'subsample': 0.8, 'colsample_bytree': 0.9, 'colsample_bylevel': 0.8}



In [40]:
xgb5=XGBClassifier(learning_rate=0.1,n_estimators=100,min_child_weight=2,gamma=12,max_depth=8,scale_pos_weight=1,max_delta_step=0,colsample_bylevel=1.0,colsample_bytree=0.8,subsample=0.7)

In [41]:
xgb_params={
    'reg_lambda': [i/10 for i in range(0,50)],
    'reg_alpha':[i/10 for i in range (0,50)]
}

In [42]:
random_search=RandomizedSearchCV(xgb5,param_distributions=xgb_params,n_iter=20,cv=5,scoring='roc_auc',n_jobs=-1,verbose=10)

In [43]:
random_search.fit(x_train,y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [44]:
report(random_search.cv_results_,3)

Model with rank: (0)
Mean validation score: 0.839  (std: 0.02008)
Parameters: {'reg_lambda': 1.9, 'reg_alpha': 0.2}

Model with rank: (0)
Mean validation score: 0.839  (std: 0.02016)
Parameters: {'reg_lambda': 1.0, 'reg_alpha': 1.0}

Model with rank: (0)
Mean validation score: 0.839  (std: 0.01931)
Parameters: {'reg_lambda': 1.6, 'reg_alpha': 2.7}



In [45]:
xgb6=XGBClassifier(learning_rate=0.1,n_estimators=100,min_child_weight=2,gamma=12,max_depth=8,scale_pos_weight=1,max_delta_step=0,colsample_bylevel=1.0,colsample_bytree=0.8,subsample=0.7,reg_lambda=1.0,reg_alpha=0)

In [46]:
xgb6.fit(x_train,y_train)

In [63]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(xgb6, x_train,y_train,scoring='accuracy',verbose=10,n_jobs=-1,cv=10)
np.mean(scores), np.std(scores)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    2.1s remaining:    5.0s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    2.1s remaining:    2.1s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    2.1s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.2s finished


(0.8780847973126333, 0.010403036722445828)