In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
import seaborn as sns 


# Load the HR dataset from the Kaggle input directory and preview the first rows.
df = pd.read_csv("../input/hr-dataset/HR_Dataset.csv")
df.head()




In [None]:
# Summarise the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# (rows, columns) of the raw dataset.
df.shape

In [None]:
# Descriptive statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

In [None]:
# Distinct values present in the Work_accident column.
df["Work_accident"].unique()


In [None]:
# Class balance of the target: number of employees per value of `left`.
sns.countplot(data=df, x="left")
plt.show()

In [None]:
# Number of employees in each salary band.
sns.countplot(data=df, x="salary")
plt.show()

In [None]:
# Count of `left` values, one panel per value of promotion_last_5years.
sns.catplot(data=df, x="left", col="promotion_last_5years", kind="count");

In [None]:
# Count of `left` values, one panel per value of Work_accident.
sns.catplot(data=df, x="left", col="Work_accident", kind="count");
# Observation: more leavers appear among employees who did not have an accident,
# so the author treats Work_accident as a less important feature.
# NOTE(review): these are raw counts; compare leave *rates* per group before
# discarding the feature.

In [None]:
# Distribution of satisfaction_level (histogram with a KDE overlay).
# sns.distplot is deprecated; histplot with kde=True and stat="density"
# reproduces its density-normalised histogram plus KDE curve.
sns.histplot(df["satisfaction_level"], kde=True, stat="density")

In [None]:
# Distribution of number_project for each value of `left`.
sns.boxplot(data=df, x="left", y="number_project")
# Observation: retention is highest among employees who had 3-4 projects.

In [None]:
# Distribution of last_evaluation (histogram with a KDE overlay).
# sns.distplot is deprecated; histplot with kde=True and stat="density"
# reproduces its density-normalised histogram plus KDE curve.
sns.histplot(df["last_evaluation"], kde=True, stat="density")

In [None]:
# Distribution of average_montly_hours for each value of `left`
# (the column name is misspelled in the dataset itself).
sns.boxplot(data=df, x="left", y="average_montly_hours")
# Observation: employees who left tend to have higher average monthly hours.

In [None]:
# Pairwise correlation heatmap of the numeric features.
# Restrict to numeric columns first: with non-numeric columns present (e.g. the
# categorical `salary`), DataFrame.corr() raises on pandas >= 2.0, whereas older
# versions silently dropped them.
cor_mat = df.select_dtypes(include="number").corr()
fig = plt.figure(figsize=(15, 7))
sns.heatmap(cor_mat, annot=True)

# **Feature Engineering**

In [None]:
# Work on a copy so the raw frame `df` stays untouched by the feature engineering below.
df2=df.copy()

In [None]:
# Missing-value check: number of nulls per column.
df2.isna().sum()

# Result reported by the author: no missing values.

In [None]:
# Distinct salary levels, inspected before encoding the column.
df2['salary'].unique()

In [None]:
# Encode the categorical `salary` column.
# One-hot encoding turns a categorical column into binary indicator columns;
# pandas.get_dummies does this directly. drop_first=True omits the first level,
# which is implied when all remaining indicators are 0.

salary_dummies = pd.get_dummies(data=df2['salary'], drop_first=True)

In [None]:
# Preview the encoded indicator columns (first rows only, like the other preview cells).
salary_dummies.head()

In [None]:
# Append the salary indicator columns to the working frame (column-wise, aligned on index).
df2 = pd.concat(objs=[df2, salary_dummies], axis="columns")

In [None]:
# Check that the indicator columns were appended.
df2.head()

In [None]:
# Remove the original `salary` column now that it is one-hot encoded.
# Rebinding instead of inplace=True, together with errors='ignore', keeps the
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
df2 = df2.drop(columns=['salary'], errors='ignore')

In [None]:
# Check the frame after removing the original `salary` column.
df2.head()

In [None]:
# Separate the feature matrix (every column except `left`) from the target `left`.
# NOTE(review): any remaining non-numeric columns in df2 would need encoding
# before modelling - confirm against the dataset schema.
x = df2.drop(columns='left')
y = df2['left']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

In [None]:
# Print the number of rows in the training split, then in the test split.
for split_part in (x_train, x_test):
    print(len(split_part))

**Model Selection**

In [None]:
from sklearn.ensemble import RandomForestClassifier 
from xgboost import XGBClassifier 
from sklearn.model_selection import GridSearchCV # To determine which model to use for given dataset

In [None]:
# Candidate models and the hyper-parameter grids to search for each.
# Each entry maps a display name to {'model': estimator, 'param': grid}.
model_param = {
    'RandomForestClassifier': {
        'model': RandomForestClassifier(),
        'param': {
            'n_estimators': [10, 50, 100, 130],
            'criterion': ['gini', 'entropy'],
            'max_depth': range(2, 4, 1),  # depths 2 and 3
            # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for
            # classifiers it meant sqrt(n_features), so 'sqrt' is the equivalent.
            'max_features': ['sqrt', 'log2'],
        },
    },
    'XGBClassifier': {
        'model': XGBClassifier(objective='binary:logistic'),
        'param': {
            'n_estimators': [10, 50, 100, 200],
            'max_depth': [3, 5, 10, 20],
            'learning_rate': [0.5, 0.1, 0.01, 0.001],
        },
    },
}

In [None]:
# Grid-search every candidate model with 5-fold cross-validation and record,
# per model, the best mean CV score (estimator's default scorer) and the
# parameters that achieved it.
# The search is fitted on the training split only: fitting on the full (x, y)
# lets the held-out test rows influence hyper-parameter selection, which makes
# the later test-set score optimistic.
scores = []
for model_name, mp in model_param.items():
    model_selection = GridSearchCV(
        estimator=mp['model'],
        param_grid=mp['param'],
        cv=5,
        return_train_score=False,
    )
    model_selection.fit(x_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': model_selection.best_score_,
        'best_params': model_selection.best_params_,
    })

In [None]:
# Show the best CV score and parameters found for each candidate model.
scores


In [None]:
# Final XGBoost classifier with hard-coded hyper-parameters.
# NOTE(review): presumably the best combination reported in `scores` - verify
# after re-running the grid search.
model_xgb = XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    max_depth=20,
    learning_rate=0.1,
)

In [None]:
# Train the final model on the training split only.
model_xgb.fit(x_train,y_train)

In [None]:
# Mean accuracy of the trained model on the held-out test split.
model_xgb.score(x_test,y_test)

In [None]:
# Preview the test features; the index shows the original row labels from `df`.
x_test.head()

In [None]:
# First test row, kept as a one-row DataFrame (the input shape the model expects).
x_test.head(1)

In [None]:
# Predict `left` for the first test row.
model_xgb.predict(x_test.iloc[:1])

In [None]:
# Look up the original record (including the true `left` value) for the first
# test row. The label is taken from x_test's index rather than hard-coded, so
# the cell stays correct if the split or the data ever changes.
df.loc[x_test.index[0]]

In [None]:
import pickle

In [None]:
# Output path for the serialized model (relative to the working directory).
filename='churnmodel.sav'

In [None]:
# Persist the trained model to disk. The context manager guarantees the file
# handle is flushed and closed, even if pickling raises.
with open(filename, 'wb') as model_file:
    pickle.dump(model_xgb, model_file)