In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
%matplotlib inline

HR Data: Predict the probability that an employee will leave the organization.

In [None]:
# Load the HR sample file into a DataFrame and preview its first five rows.
hr = pd.read_csv(filepath_or_buffer='hr_sample.csv')
hr.head(n=5)

In [None]:
# Sanity check: number of missing values in each column.
hr.isna().sum(axis=0)

In [None]:
# Share of all rows falling in each value of `left`.
hr['left'].value_counts().div(len(hr))

In [None]:
# (rows, columns) of the full dataset.
hr.shape

Train and Test split
1. Train: Used to fit (teach) the model on past data.
2. Test: Used to check how well the trained model performs on unseen (new) data.

In [None]:
# Simple random sampling: every record has the same chance of being drawn.
train = hr.sample(
    frac=0.75,       # keep 75% of the rows for training
    random_state=1,  # fixed seed so the split is reproducible
)
train.shape

In [None]:
# Row labels selected into the training sample; the next cell drops these
# labels from `hr` to form the test set.
train.index

In [None]:
# Test set = every row of `hr` whose label was not sampled into `train`.
in_train = hr.index.isin(train.index)
test = hr.loc[~in_train]

In [None]:
# Show the train and test dimensions, one per line.
print(train.shape, test.shape, sep='\n')

In [None]:
# Share of training rows falling in each value of `left`.
train['left'].value_counts().div(len(train))

In [None]:
# Natural logarithm of 0.498.
# NOTE(review): scratch calculation; the result is not stored or used in the
# visible notebook — confirm its purpose or remove.
np.log(0.498)

In [None]:
# One-hot encode the non-numeric columns. With an empty separator the new
# column names are the original column name immediately followed by the level.
train = pd.get_dummies(data=train, prefix_sep='')
train.columns

In [None]:
# (rows, columns) of the training frame after dummy encoding.
train.shape

In [None]:
# Preview the first rows of the dummy-encoded training frame.
train.head()

In [None]:
import statsmodels.formula.api as smf
import statsmodels.api as sm

In [None]:
# Model 1: binomial GLM of `left` on the numeric predictors plus department
# and salary dummies. Departmenthr is not in the formula and only two salary
# dummies are — presumably the omitted levels act as reference categories;
# confirm against train.columns.
# The adjacent string literals below concatenate into one formula string.
glm1_formula = (
    'left~satisfaction_level+last_evaluation+number_project'
    '+average_montly_hours+time_spend_company+Work_accident'
    '+promotion_last_5years'
    '+Departmentaccounting+DepartmentIT+Departmentmanagement'
    '+Departmentmarketing+Departmentproduct_mng+DepartmentRandD'
    '+Departmentsales+Departmentsupport+Departmenttechnical'
    '+salarylow+salarymedium'
)
glm1_model = smf.glm(glm1_formula, data=train, family=sm.families.Binomial())
glm1 = glm1_model.fit()
glm1.summary()

In [None]:
# AIC of model 1, shown so it can be compared with the other fits.
glm1.aic

In [None]:
# Model 2: same specification as model 1 except that Departmenthr is included
# and Departmenttechnical is left out.
# The adjacent string literals below concatenate into one formula string.
glm2_formula = (
    'left~satisfaction_level+last_evaluation+number_project'
    '+average_montly_hours+time_spend_company+Work_accident'
    '+promotion_last_5years'
    '+Departmentaccounting+DepartmentIT+Departmentmanagement'
    '+Departmentmarketing+Departmentproduct_mng+DepartmentRandD'
    '+Departmentsales+Departmentsupport+Departmenthr'
    '+salarylow+salarymedium'
)
glm2_model = smf.glm(glm2_formula, data=train, family=sm.families.Binomial())
glm2 = glm2_model.fit()
glm2.summary()

In [None]:
# AIC of model 2, shown so it can be compared with the other fits.
glm2.aic

In [None]:
# Model 3: reduced specification — the accounting, support, hr and technical
# department dummies are not in the formula.
# The adjacent string literals below concatenate into one formula string.
glm3_formula = (
    'left~satisfaction_level+last_evaluation+number_project'
    '+average_montly_hours+time_spend_company+Work_accident'
    '+promotion_last_5years'
    '+DepartmentIT+Departmentmanagement+Departmentmarketing'
    '+Departmentproduct_mng+DepartmentRandD+Departmentsales'
    '+salarylow+salarymedium'
)
glm3_model = smf.glm(glm3_formula, data=train, family=sm.families.Binomial())
glm3 = glm3_model.fit()
print(glm3.summary())

In [None]:
# AIC of model 3, shown so it can be compared with the other fits.
glm3.aic

In [None]:
# Model 4: DepartmentRandD is the only department dummy kept; this is the
# model used for the VIF check and the test-set predictions below.
# The adjacent string literals below concatenate into one formula string.
glm4_formula = (
    'left~satisfaction_level+last_evaluation+number_project'
    '+average_montly_hours+time_spend_company+Work_accident'
    '+promotion_last_5years'
    '+DepartmentRandD'
    '+salarylow+salarymedium'
)
glm4_model = smf.glm(glm4_formula, data=train, family=sm.families.Binomial())
glm4 = glm4_model.fit()
print(glm4.summary())

In [None]:
# AIC of model 4, shown so it can be compared with the other fits.
glm4.aic

In [None]:
# Design matrix (exogenous variables) that model 4 was fitted on; the VIF
# cell below reads the same array.
glm4.model.exog

In [None]:
# Multicollinearity check: one variance inflation factor per column of the
# design matrix of model 4.
from statsmodels.stats.outliers_influence import variance_inflation_factor

idv = glm4.model.exog
vif = []
for col_pos in range(idv.shape[1]):
    vif.append(variance_inflation_factor(idv, col_pos))

# Pair each design-matrix column name with its VIF.
pd.DataFrame({'Features': glm4.model.exog_names, 'vif': vif})

In [None]:
# Prediction on new data: apply the same dummy encoding to the test set.
# NOTE(review): train and test are encoded separately, so a level absent from
# one split would give mismatched columns — confirm both expose the same columns.
test = pd.get_dummies(data=test, prefix_sep='')
test.columns

In [None]:
# Score the test set with model 4.
# statsmodels GLM results expose `predict`, not the scikit-learn style
# `predict_proba`; for a Binomial family `predict` returns the fitted mean,
# i.e. the predicted probability that `left` equals 1.
test['prob'] = glm4.predict(test)
test['prob'].head()

In [None]:
# Convert probabilities to class labels: 1 when prob >= 0.5, otherwise 0.
above_cutoff = test['prob'] >= 0.5
pred_y = above_cutoff.astype(int)
pred_y.head()

In [None]:
# Model evaluation: confusion matrix, ROC curve, AUC.
# Confusion matrix of actual `left` values against the predicted labels.
from sklearn import metrics

metrics.confusion_matrix(y_true=test['left'], y_pred=pred_y)

In [None]:
# ROC: false/true positive rates over the score thresholds.
fpr, tpr, thresholds = metrics.roc_curve(y_true=test['left'], y_score=test['prob'])
# Points 0.0, 0.1, ..., 1.0 on both axes, used to draw the diagonal reference line.
x = np.arange(0, 1.1, 0.1)
y = np.arange(0, 1.1, 0.1)

In [None]:
# Plot the ROC curve against the diagonal reference line.
# Axis labels and a legend are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='Model')
ax.plot(x, y, 'b--', label='Chance')
ax.set(title="ROC curve", xlabel="False positive rate", ylabel="True positive rate")
ax.legend();

In [None]:
# AUC: area under the ROC curve for the predicted probabilities.
metrics.roc_auc_score(y_true=test['left'], y_score=test['prob'])

In [None]:
# Two-column array of probabilities: column 0 holds 1 - prob, column 1 holds prob.
prob_one = test['prob']
prob = np.column_stack([1 - prob_one, prob_one])
prob

In [None]:
# Cumulative gains chart built from the actual labels and the two-column
# probability array.
import scikitplot as skplt

skplt.metrics.plot_cumulative_gain(y_true=test['left'], y_probas=prob)