In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
data=pd.read_csv('data.csv')
data.drop('empid',axis=1,inplace=True)
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,left
0,0.38,0.53,2,157,3,0,0,low,1
1,0.8,0.86,5,262,6,0,0,medium,1
2,0.11,0.88,7,272,4,0,0,medium,1
3,0.72,0.87,5,223,5,0,0,low,1
4,0.37,0.52,2,159,3,0,0,low,1


In [3]:
df=data.copy()

In [7]:
df.shape

(14999, 9)

## Feature Engineering

In [5]:
df.isnull().sum()

satisfaction_level       2
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
promotion_last_5years    0
salary                   0
left                     0
dtype: int64

In [8]:
# take away : satisfaction_level is having 2 missing values

In [8]:
df['satisfaction_level'].describe()

count    14997.000000
mean         0.612863
std          0.248634
min          0.090000
25%          0.440000
50%          0.640000
75%          0.820000
max          1.000000
Name: satisfaction_level, dtype: float64

In [9]:
data['satisfaction_level'].fillna(data['satisfaction_level'].mean(),inplace=True)

In [10]:
data['satisfaction_level'].isnull().sum()

0

In [11]:
salary_dummies = pd.get_dummies(data['salary'],drop_first=True)

In [12]:
salary_dummies

Unnamed: 0,low,medium
0,1,0
1,0,1
2,0,1
3,1,0
4,1,0
...,...,...
14994,1,0
14995,1,0
14996,1,0
14997,1,0


In [13]:
data2=pd.concat([data,salary_dummies],axis=1)
data2.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,left,low,medium
0,0.38,0.53,2,157,3,0,0,low,1,1,0
1,0.8,0.86,5,262,6,0,0,medium,1,0,1
2,0.11,0.88,7,272,4,0,0,medium,1,0,1
3,0.72,0.87,5,223,5,0,0,low,1,1,0
4,0.37,0.52,2,159,3,0,0,low,1,1,0


In [14]:
data2.drop('salary',axis=1,inplace=True)
data2.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,left,low,medium
0,0.38,0.53,2,157,3,0,0,1,1,0
1,0.8,0.86,5,262,6,0,0,1,0,1
2,0.11,0.88,7,272,4,0,0,1,0,1
3,0.72,0.87,5,223,5,0,0,1,1,0
4,0.37,0.52,2,159,3,0,0,1,1,0


## Cross-Validation

In [15]:
X=data2.drop('left',axis=1)
Y=data2['left']

In [16]:
X.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,low,medium
0,0.38,0.53,2,157,3,0,0,1,0
1,0.8,0.86,5,262,6,0,0,0,1
2,0.11,0.88,7,272,4,0,0,0,1
3,0.72,0.87,5,223,5,0,0,1,0
4,0.37,0.52,2,159,3,0,0,1,0


In [17]:
Y.head()

0    1
1    1
2    1
3    1
4    1
Name: left, dtype: int64

In [18]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=0)

In [21]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [23]:
model_param={
    'RandomForestClassifier' : {
        'model' : RandomForestClassifier(),
        'param':{
            'n_estimators' : [10,50,100,130],
            'criterion': ['gini','entropy'],
            'max_depth' : range(2 ,4, 1),
            'max_features' : ['auto','log2']
            
        }
    },
    'XGBClassifier':{
        'model' : XGBClassifier(objective='binary:logistic'),
        'param':{
            'learning_rate' : [0.5,0.1,0.01,0.001],
            'max_depth' : [3,5,10,20],
            'n_estimators':[10,50,100,200]
        }
    }
}

In [25]:
scores=[]
for model_name, np in model_param.items():
    model_selection = GridSearchCV(estimator=np['model'],param_grid=np['param'],cv=5,return_train_score=False)
    model_selection.fit(X,Y)
    scores.append({
        'model':model_name,
        'best_score':model_selection.best_score_,
        'best_params':model_selection.best_params_
    })





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































In [26]:
scores

[{'model': 'RandomForestClassifier',
  'best_score': 0.9255309325330666,
  'best_params': {'criterion': 'entropy',
   'max_depth': 3,
   'max_features': 'auto',
   'n_estimators': 10}},
 {'model': 'XGBClassifier',
  'best_score': 0.9902661776147605,
  'best_params': {'learning_rate': 0.1, 'max_depth': 20, 'n_estimators': 200}}]

In [27]:
model_xgb = XGBClassifier(objective='binary:logistic',learning_rate=0.1,max_depth=20,n_estimators=200)

In [28]:
model_xgb.fit(x_train,y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=20, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=200, n_jobs=12,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [29]:
model_xgb.score(x_test,y_test)

0.9923333333333333

In [30]:
df.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'promotion_last_5years', 'salary', 'left'],
      dtype='object')